[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.2-385-gcd846d2

Service Account noreply at mpich.org
Tue Aug 23 09:58:12 CDT 2016


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  cd846d2c0f3349032f981b297a28eef8af9a5995 (commit)
      from  1494e0f44808516588ab7f57d0754cf27556f43c (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/cd846d2c0f3349032f981b297a28eef8af9a5995

commit cd846d2c0f3349032f981b297a28eef8af9a5995
Author: Ken Raffenetti <raffenet at mcs.anl.gov>
Date:   Mon Aug 22 14:10:26 2016 -0500

    mpid: Initial commit of CH4 device
    
    CH4 is a new device layer implementation designed for low software
    overheads. Network modules to be supported at first are OFI, UCX, and
    Portals 4. POSIX shared memory is supported for intranode
    communication. CH4 design was a collaborative effort - contributions
    from (in alphabetical order):
    
      Argonne National Laboratory
      Intel Corporation
      Mellanox Technologies
      RIKEN AICS

diff --git a/.gitignore b/.gitignore
index 6ad5751..6a297db 100644
--- a/.gitignore
+++ b/.gitignore
@@ -599,3 +599,6 @@ Makefile.am-stamp
 # /test/util/timer/
 /test/util/timer/timertest
 
+# /src/mpid/ch4/include
+/src/mpid/ch4/include/netmodpre.h
+/src/mpid/ch4/include/shmpre.h
diff --git a/src/mpid/Makefile.mk b/src/mpid/Makefile.mk
index b1a06ae..84122ef 100644
--- a/src/mpid/Makefile.mk
+++ b/src/mpid/Makefile.mk
@@ -12,5 +12,6 @@ noinst_HEADERS +=                          \
 
 
 include $(top_srcdir)/src/mpid/ch3/Makefile.mk
+include $(top_srcdir)/src/mpid/ch4/Makefile.mk
 include $(top_srcdir)/src/mpid/pamid/Makefile.mk
 include $(top_srcdir)/src/mpid/common/Makefile.mk
diff --git a/src/mpid/ch4/.gitignore b/src/mpid/ch4/.gitignore
new file mode 100644
index 0000000..f0558fd
--- /dev/null
+++ b/src/mpid/ch4/.gitignore
@@ -0,0 +1,3 @@
+# ch4 ignore files
+src/mpid_ch4_net_array.c
+src/mpid_ch4_shm_array.c
diff --git a/src/mpid/ch4/Makefile.mk b/src/mpid/ch4/Makefile.mk
new file mode 100644
index 0000000..8fceac6
--- /dev/null
+++ b/src/mpid/ch4/Makefile.mk
@@ -0,0 +1,20 @@
+## -*- Mode: Makefile; -*-
+## vim: set ft=automake :
+##
+## (C) 2016 by Argonne National Laboratory.
+##     See COPYRIGHT in top-level directory.
+##
+##  Portions of this code were written by Intel Corporation.
+##  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+##  to Argonne National Laboratory subject to Software Grant and Corporate
+##  Contributor License Agreement dated February 8, 2012.
+##
+
+if BUILD_CH4
+
+include $(top_srcdir)/src/mpid/ch4/include/Makefile.mk
+include $(top_srcdir)/src/mpid/ch4/src/Makefile.mk
+include $(top_srcdir)/src/mpid/ch4/netmod/Makefile.mk
+include $(top_srcdir)/src/mpid/ch4/shm/Makefile.mk
+
+endif BUILD_CH4
diff --git a/src/mpid/ch4/cross/gcc-linux-x86-8 b/src/mpid/ch4/cross/gcc-linux-x86-8
new file mode 100644
index 0000000..33e0896
--- /dev/null
+++ b/src/mpid/ch4/cross/gcc-linux-x86-8
@@ -0,0 +1,13 @@
+CROSS_F77_SIZEOF_INTEGER="4"
+CROSS_F77_SIZEOF_REAL="4"
+CROSS_F77_SIZEOF_DOUBLE_PRECISION="8"
+CROSS_F77_TRUE_VALUE="1"
+CROSS_F77_FALSE_VALUE="0"
+CROSS_F90_ADDRESS_KIND="8"
+CROSS_F90_OFFSET_KIND="8"
+CROSS_F90_INTEGER_KIND="4"
+CROSS_F90_REAL_MODEL=" 6 , 37"
+CROSS_F90_DOUBLE_MODEL=" 15 , 307"
+CROSS_F90_INTEGER_MODEL=" 9"
+CROSS_F90_ALL_INTEGER_MODELS=" 2 , 1, 4 , 2, 9 , 4, 18 , 8,"
+CROSS_F90_INTEGER_MODEL_MAP=" {  2 , 1 , 1 }, {  4 , 2 , 2 }, {  9 , 4 , 4 }, {  18 , 8 , 8 },"
diff --git a/src/mpid/ch4/cross/icc-linux-x86-8 b/src/mpid/ch4/cross/icc-linux-x86-8
new file mode 100644
index 0000000..c3800d1
--- /dev/null
+++ b/src/mpid/ch4/cross/icc-linux-x86-8
@@ -0,0 +1,13 @@
+CROSS_F77_SIZEOF_INTEGER="4"
+CROSS_F77_SIZEOF_REAL="4"
+CROSS_F77_SIZEOF_DOUBLE_PRECISION="8"
+CROSS_F77_TRUE_VALUE="-1"
+CROSS_F77_FALSE_VALUE="0"
+CROSS_F90_ADDRESS_KIND="8"
+CROSS_F90_OFFSET_KIND="8"
+CROSS_F90_INTEGER_KIND="4"
+CROSS_F90_REAL_MODEL=" 6 , 37"
+CROSS_F90_DOUBLE_MODEL=" 15 , 307"
+CROSS_F90_INTEGER_MODEL=" 9"
+CROSS_F90_ALL_INTEGER_MODELS=" 2 , 1, 4 , 2, 9 , 4, 18 , 8,"
+CROSS_F90_INTEGER_MODEL_MAP=" {  2 , 1 , 1 }, {  4 , 2 , 2 }, {  9 , 4 , 4 }, {  18 , 8 , 8 },"
diff --git a/src/mpid/ch4/errnames.txt b/src/mpid/ch4/errnames.txt
new file mode 100644
index 0000000..55c9135
--- /dev/null
+++ b/src/mpid/ch4/errnames.txt
@@ -0,0 +1,47 @@
+#
+# CH4 errors
+#
+**ch4|pktarraytoosmall: Size of the array of packet handlers is too small
+**ch4|badmsgtype %d:request contained an invalid message type (%d)
+**ch4|badmsgtype:request contained an invalid message type
+**ch4|badreqtype %d:request contained an invalid request type (%d)
+**ch4|badreqtype:request contained an invalid request type
+**ch4|cancelreq:failure occurred while sending remote cancellation request packet
+**ch4|cancelresp:failure occurred while attempting to send cancel response packet
+**ch4|cancelrndv:failure occurred while performing local cancellation of a rendezvous message
+**ch4|ch4_init:channel initialization failed
+**ch4|ctspkt:failure occurred while attempting to send CTS packet
+**ch4|eagermsg:failure occurred while attempting to send an eager message
+**ch4|loadrecviov:failure occurred while loading the receive I/O vector
+**ch4|loadsendiov:failure occurred while loading the send I/O vector
+**ch4|nopktcontainermem:failed to allocate memory for a packet reorder container
+**ch4|ooocancelreq:UNIMPLEMENTED: unable to process out-of-order cancellation requests
+**ch4|pktordered:failure occurred while processing a reordered packet
+**ch4|postrecv %s:failure occurred while posting a receive for message data (%s)
+**ch4|postrecv:failure occurred while posting a receive for message data
+**ch4|rmamsg:failure occurred while attempting to send an RMA message
+**ch4|rtspkt:failure occurred while attempting to send RTS packet
+**ch4|senddata:failure occurred while attempting to send message data
+**ch4|syncack:failure occurred while attempting to send eager synchronization packet
+**ch4|get_parent_port:spawn process group was unable to obtain parent port name from the channel
+**ch4|conn_parent:spawned process group was unable to connect back to the parent
+**ch4|conn_parent %s:spawned process group was unable to connect back to the parent on port <%s>
+**ch4|unhandled_connection_state:encountered an unexpected connection state
+**ch4|unhandled_connection_state %p %d:encountered an unexpected connection state (vc=%p, state=%d)
+**ch4|send_close_ack:an error occurred when the device attempted to acknowledge the closing of a connection
+**ch4|close_progress:an error occurred while the device was waiting for all open connections to close
+**ch4|pmi_finalize:PMI_Finalize failed
+**ch4|pmi_finalize %d:PMI_Finalize failed, error %d
+**ch4|invalid_shm:Invalid shm specified
+**ch4|invalid_shm %s:Invalid shm specified (%s)
+**ch4|invalid_locality:Invalid locality usage
+**ch4|invalid_locality %s:Invalid locality usage (%s)
+#
+# RMA errors
+#
+**ch4|sync_arg:Invalid RMA synchronization argument
+**ch4|sync_arg %d:Invalid RMA synchronization argument (%d)
+**ch4|rma_flags:Invalid combination of RMA packet flags
+**ch4|nocheck_invalid:MPI_MODE_NOCHECK was specified, but the lock was not available at the target
+**ch4|rma_msg:RMA message operation failed
+**ch4|win_shared_comm:Cannot map shared memory using the given communicator
diff --git a/src/mpid/ch4/include/Makefile.mk b/src/mpid/ch4/include/Makefile.mk
new file mode 100644
index 0000000..bd58048
--- /dev/null
+++ b/src/mpid/ch4/include/Makefile.mk
@@ -0,0 +1,23 @@
+## -*- Mode: Makefile; -*-
+## vim: set ft=automake :
+##
+## (C) 2016 by Argonne National Laboratory.
+##     See COPYRIGHT in top-level directory.
+##
+##  Portions of this code were written by Intel Corporation.
+##  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+##  to Argonne National Laboratory subject to Software Grant and Corporate
+##  Contributor License Agreement dated February 8, 2012.
+##
+
+AM_CPPFLAGS += -I$(top_srcdir)/src/mpid/ch4/include
+AM_CPPFLAGS += -I$(top_builddir)/src/mpid/ch4/include
+noinst_HEADERS += src/mpid/ch4/include/netmodpre.h      \
+                  src/mpid/ch4/include/shmpre.h         \
+                  src/mpid/ch4/include/mpidch4.h        \
+                  src/mpid/ch4/include/mpidch4r.h       \
+                  src/mpid/ch4/include/mpidimpl.h       \
+                  src/mpid/ch4/include/mpidpre.h        \
+                  src/mpid/ch4/include/mpid_sched.h     \
+                  src/mpid/ch4/include/mpid_thread.h    \
+                  src/mpid/ch4/include/mpid_ticketlock.h
diff --git a/src/mpid/ch4/include/mpid_sched.h b/src/mpid/ch4/include/mpid_sched.h
new file mode 100644
index 0000000..818a1cf
--- /dev/null
+++ b/src/mpid/ch4/include/mpid_sched.h
@@ -0,0 +1,30 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2015 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef MPID_SCHED_H_INCLUDED
+#define MPID_SCHED_H_INCLUDED
+#include "mpidu_sched.h"
+
+#define MPIR_Sched_cb MPIDU_Sched_cb
+#define MPIR_Sched_cb2 MPIDU_Sched_cb2
+#define MPIR_Sched_next_tag  MPIDU_Sched_next_tag
+#define MPIR_Sched_create MPIDU_Sched_create
+#define MPIR_Sched_clone MPIDU_Sched_clone
+#define MPIR_Sched_start MPIDU_Sched_start
+#define MPIR_Sched_send MPIDU_Sched_send
+#define MPIR_Sched_send_defer MPIDU_Sched_send_defer
+#define MPIR_Sched_recv MPIDU_Sched_recv
+#define MPIR_Sched_recv_status MPIDU_Sched_recv_status
+#define MPIR_Sched_ssend MPIDU_Sched_ssend
+#define MPIR_Sched_reduce MPIDU_Sched_reduce
+#define MPIR_Sched_copy MPIDU_Sched_copy
+#define MPIR_Sched_barrier MPIDU_Sched_barrier
+
+#endif /* MPID_SCHED_H_INCLUDED */
diff --git a/src/mpid/ch4/include/mpid_thread.h b/src/mpid/ch4/include/mpid_thread.h
new file mode 100644
index 0000000..2122d3d
--- /dev/null
+++ b/src/mpid/ch4/include/mpid_thread.h
@@ -0,0 +1,68 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef MPID_THREAD_H_INCLUDED
+#define MPID_THREAD_H_INCLUDED
+
+#include "mpidu_thread_fallback.h"
+
+/* We simply use the fallback thread functionality and do not define
+ * our own */
+
+typedef MPIDU_Thread_cond_t MPID_Thread_cond_t;
+typedef MPIDU_Thread_id_t MPID_Thread_id_t;
+typedef MPIDU_Thread_tls_t MPID_Thread_tls_t;
+typedef MPIDU_Thread_func_t MPID_Thread_func_t;
+
+#ifdef MPIDI_CH4_USE_TICKET_LOCK
+#include "mpid_ticketlock.h"
+typedef MPIDI_CH4_Ticket_lock MPID_Thread_mutex_t;
+#define MPID_THREAD_CS_ENTER       MPIDI_CH4I_THREAD_CS_ENTER
+#define MPID_THREAD_CS_EXIT        MPIDI_CH4I_THREAD_CS_EXIT
+#define MPID_THREAD_CS_YIELD       MPIDI_CH4I_THREAD_CS_YIELD
+#define MPID_Thread_mutex_create   MPIDI_CH4I_Thread_mutex_create
+#define MPID_Thread_mutex_destroy  MPIDI_CH4I_Thread_mutex_destroy
+#define MPID_Thread_mutex_lock     MPIDI_CH4I_Thread_mutex_lock
+#define MPID_Thread_mutex_unlock   MPIDI_CH4I_Thread_mutex_unlock
+#define MPID_Thread_cond_wait      MPIDI_CH4I_Thread_cond_wait
+#else
+typedef MPIDU_Thread_mutex_t MPID_Thread_mutex_t;
+#define MPID_THREAD_CS_ENTER       MPIDU_THREAD_CS_ENTER
+#define MPID_THREAD_CS_EXIT        MPIDU_THREAD_CS_EXIT
+#define MPID_THREAD_CS_YIELD       MPIDU_THREAD_CS_YIELD
+#define MPID_Thread_mutex_create   MPIDU_Thread_mutex_create
+#define MPID_Thread_mutex_destroy  MPIDU_Thread_mutex_destroy
+#define MPID_Thread_mutex_lock     MPIDU_Thread_mutex_lock
+#define MPID_Thread_mutex_unlock   MPIDU_Thread_mutex_unlock
+#define MPID_Thread_cond_wait      MPIDU_Thread_cond_wait
+#endif /* MPIDI_CH4_USE_TICKET_LOCK */
+/* Thread lifecycle and scheduling names are plain aliases of the MPIDU_Thread_* layer. */
+#define MPID_Thread_create       MPIDU_Thread_create
+#define MPID_Thread_exit         MPIDU_Thread_exit
+#define MPID_Thread_self         MPIDU_Thread_self
+#define MPID_Thread_same         MPIDU_Thread_same
+#define MPID_Thread_yield        MPIDU_Thread_yield
+
+#define MPID_Thread_cond_create MPIDU_Thread_cond_create
+#define MPID_Thread_cond_destroy MPIDU_Thread_cond_destroy
+#define MPID_Thread_cond_broadcast MPIDU_Thread_cond_broadcast
+#define MPID_Thread_cond_signal MPIDU_Thread_cond_signal
+
+#define MPID_Thread_tls_create MPIDU_Thread_tls_create
+#define MPID_Thread_tls_destroy MPIDU_Thread_tls_destroy
+#define MPID_Thread_tls_set MPIDU_Thread_tls_set
+#define MPID_Thread_tls_get MPIDU_Thread_tls_get
+
+#define MPID_THREADPRIV_KEY_CREATE  MPIDU_THREADPRIV_KEY_CREATE
+#define MPID_THREADPRIV_KEY_GET_ADDR MPIDU_THREADPRIV_KEY_GET_ADDR
+#define MPID_THREADPRIV_KEY_DESTROY MPIDU_THREADPRIV_KEY_DESTROY
+
+
+#endif /* MPID_THREAD_H_INCLUDED */
diff --git a/src/mpid/ch4/include/mpid_ticketlock.h b/src/mpid/ch4/include/mpid_ticketlock.h
new file mode 100644
index 0000000..0dc5567
--- /dev/null
+++ b/src/mpid/ch4/include/mpid_ticketlock.h
@@ -0,0 +1,159 @@
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef MPID_TICKETLOCK_H_INCLUDED
+#define MPID_TICKETLOCK_H_INCLUDED
+
+#define __MUTEX_INLINE__ __attribute__((always_inline))static inline
+
+#define MPIDI_CH4_CACHELINE_SIZE 64
+
+typedef union MPIDI_CH4_Ticket_lock {
+    unsigned u;                 /* whole-word view of the lock; written by create/destroy, CAS target in try_acquire */
+    char cacheline[MPIDI_CH4_CACHELINE_SIZE];   /* makes the union at least MPIDI_CH4_CACHELINE_SIZE bytes */
+    struct {
+        unsigned short ticket;  /* ticket number currently allowed in; incremented by release */
+        unsigned short clients; /* next ticket number to hand out; atomically incremented by acquire */
+    } s;
+} MPIDI_CH4_Ticket_lock __attribute__ ((aligned(MPIDI_CH4_CACHELINE_SIZE)));
+
+__MUTEX_INLINE__ void MPIDI_CH4I_Thread_mutex_acquire(MPIDI_CH4_Ticket_lock * m)
+{       /* Blocking acquire: draw a ticket, then spin until that ticket is being served. */
+    uint16_t u = __sync_fetch_and_add(&m->s.clients, 1);        /* u = our ticket (value of clients before the increment) */
+    while (m->s.ticket != u)    /* spin until the served ticket equals ours */
+        asm volatile ("pause\n":::"memory");    /* x86 "pause"; the "memory" clobber makes the loop re-read m->s.ticket */
+}
+
+__MUTEX_INLINE__ void MPIDI_CH4I_Thread_mutex_release(MPIDI_CH4_Ticket_lock * m)
+{       /* Release: advance the served ticket so the next waiter's spin in acquire can exit. */
+    asm volatile ("":::"memory");       /* empty asm with "memory" clobber: compiler-level barrier only */
+    m->s.ticket++;      /* plain (non-atomic) increment of the served ticket */
+}
+
+__MUTEX_INLINE__ int MPIDI_CH4I_Thread_mutex_try_acquire(MPIDI_CH4_Ticket_lock * m)
+{       /* Non-blocking acquire: returns 0 if the lock word was swapped, EBUSY otherwise. */
+    uint16_t u = m->s.clients;  /* snapshot of the next ticket that would be handed out */
+    uint16_t u2 = u + 1;        /* that ticket plus one (16-bit, so it wraps) */
+    uint32_t val = ((uint32_t) u << 16) + u;    /* word with both 16-bit halves equal to u; NOTE(review): presumably "ticket == clients", assuming clients is the upper half of m->u (little-endian) - confirm */
+    uint32_t val2 = ((uint32_t) u2 << 16) + u;  /* same word with only the upper half advanced to u + 1 */
+
+    if (__sync_val_compare_and_swap(&m->u, val, val2) == val)   /* install val2 only if the lock word still equals val */
+        return 0;
+
+    return EBUSY;
+}
+
+__MUTEX_INLINE__ void MPIDI_CH4I_Thread_mutex_lock(MPIDI_CH4_Ticket_lock * m, int *mpi_error)
+{
+    MPIDI_CH4I_Thread_mutex_acquire(m);
+    *mpi_error = 0;
+}
+
+__MUTEX_INLINE__ void MPIDI_CH4I_Thread_mutex_unlock(MPIDI_CH4_Ticket_lock * m, int *mpi_error)
+{
+    MPIDI_CH4I_Thread_mutex_release(m);
+    *mpi_error = 0;
+}
+
+__MUTEX_INLINE__ void MPIDI_CH4I_Thread_mutex_create(MPIDI_CH4_Ticket_lock * m, int *mpi_error)
+{
+    m->u = 0;
+    *mpi_error = 0;
+}
+
+__MUTEX_INLINE__ void MPIDI_CH4I_Thread_mutex_destroy(MPIDI_CH4_Ticket_lock * m, int *mpi_error)
+{
+    m->u = 0;
+    *mpi_error = 0;
+}
+
+/* For this implementation we have two options                                        */
+/* 1)  Split the typedef for condition variable mutexes and call the utility routines */
+/* 2)  Implement it from scratch                                                      */
+/* Currently only async.c is using condition variables, so we should figure out what  */
+/* we really want from the cv implementations                                         */
+__MUTEX_INLINE__ void
+MPIDI_CH4I_Thread_cond_wait(MPIDU_Thread_cond_t * cond, MPIDI_CH4_Ticket_lock * m, int *mpi_error)
+{
+    MPIR_Assert(0);
+}
+
+
+#if MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__GLOBAL
+
+#define MPIDI_CH4I_THREAD_CS_ENTER_POBJ(mutex)
+#define MPIDI_CH4I_THREAD_CS_EXIT_POBJ(mutex)
+#define MPIDI_CH4I_THREAD_CS_TRY_POBJ(mutex)
+#define MPIDI_CH4I_THREAD_CS_YIELD_POBJ(mutex)
+/* GLOBAL sections operate on ticket lock m, and only when MPIR_ThreadInfo.isThreaded is set. */
+#define MPIDI_CH4I_THREAD_CS_ENTER_GLOBAL(m) do { if (MPIR_ThreadInfo.isThreaded) {  MPIDI_CH4I_Thread_mutex_acquire(&m); }} while (0)
+#define MPIDI_CH4I_THREAD_CS_EXIT_GLOBAL(m)  do { if (MPIR_ThreadInfo.isThreaded) {  MPIDI_CH4I_Thread_mutex_release(&m); }} while (0)
+#define MPIDI_CH4I_THREAD_CS_TRY_GLOBAL(m)   do { if (MPIR_ThreadInfo.isThreaded) {  MPIDI_CH4I_Thread_mutex_try_acquire(&m); }} while (0)
+#define MPIDI_CH4I_THREAD_CS_YIELD_GLOBAL(m) do { if (MPIR_ThreadInfo.isThreaded) {  MPIDI_CH4I_Thread_mutex_release(&m); sched_yield(); MPIDI_CH4I_Thread_mutex_acquire(&m); }} while (0)
+/* ALLGRAN sections behave like GLOBAL ones in this mode; each forwards its own argument. */
+#define MPIDI_CH4I_THREAD_CS_ENTER_ALLGRAN(mutex) MPIDI_CH4I_THREAD_CS_ENTER_GLOBAL(mutex)
+#define MPIDI_CH4I_THREAD_CS_EXIT_ALLGRAN(mutex)  MPIDI_CH4I_THREAD_CS_EXIT_GLOBAL(mutex)
+#define MPIDI_CH4I_THREAD_CS_TRY_ALLGRAN(mutex)   MPIDI_CH4I_THREAD_CS_TRY_GLOBAL(mutex)
+#define MPIDI_CH4I_THREAD_CS_YIELD_ALLGRAN(mutex) MPIDI_CH4I_THREAD_CS_YIELD_GLOBAL(mutex)
+
+#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__POBJ
+
+#define MPIDI_CH4I_THREAD_CS_ENTER_POBJ(m)                   \
+        do {                                                \
+                if (likely(MPIR_ThreadInfo.isThreaded)) {   \
+                        MPIDI_CH4I_Thread_mutex_acquire(&m); \
+                }                                           \
+        } while (0)
+
+#define MPIDI_CH4I_THREAD_CS_EXIT_POBJ(m)                    \
+        do {                                                \
+                if (likely(MPIR_ThreadInfo.isThreaded)) {   \
+                        MPIDI_CH4I_Thread_mutex_release(&m); \
+                }                                           \
+        } while (0)
+
+#define MPIDI_CH4I_THREAD_CS_TRY_POBJ(m)                         \
+        do {                                                    \
+                if (likely(MPIR_ThreadInfo.isThreaded)) {       \
+                        MPIDI_CH4I_Thread_mutex_try_acquire(&m); \
+                }                                               \
+        } while (0)
+
+#define MPIDI_CH4I_THREAD_CS_YIELD_POBJ(m)                   \
+        do {                                                \
+                if (likely(MPIR_ThreadInfo.isThreaded)) {   \
+                        MPIDI_CH4I_Thread_mutex_release(&m); \
+                        sched_yield();                      \
+                        MPIDI_CH4I_Thread_mutex_acquire(&m); \
+                }                                           \
+        } while (0)
+
+#define MPIDI_CH4I_THREAD_CS_ENTER_ALLGRAN MPIDI_CH4I_THREAD_CS_ENTER_POBJ
+#define MPIDI_CH4I_THREAD_CS_EXIT_ALLGRAN  MPIDI_CH4I_THREAD_CS_EXIT_POBJ
+#define MPIDI_CH4I_THREAD_CS_TRY_ALLGRAN   MPIDI_CH4I_THREAD_CS_TRY_POBJ
+#define MPIDI_CH4I_THREAD_CS_YIELD_ALLGRAN MPIDI_CH4I_THREAD_CS_YIELD_POBJ
+
+/* GLOBAL locks are all NO-OPs */
+#define MPIDI_CH4I_THREAD_CS_ENTER_GLOBAL(mutex)
+#define MPIDI_CH4I_THREAD_CS_EXIT_GLOBAL(mutex)
+#define MPIDI_CH4I_THREAD_CS_TRY_GLOBAL(mutex)
+#define MPIDI_CH4I_THREAD_CS_YIELD_GLOBAL(mutex)
+
+#else
+
+#error "Ticket locks are only supported in Global or Per-Object Granularity"
+
+#endif /* MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__GLOBAL */
+
+#define MPIDI_CH4I_THREAD_CS_ENTER(name, mutex) MPIDI_CH4I_THREAD_CS_ENTER_##name(mutex)
+#define MPIDI_CH4I_THREAD_CS_EXIT(name, mutex)  MPIDI_CH4I_THREAD_CS_EXIT_##name(mutex)
+#define MPIDI_CH4I_THREAD_CS_TRY(name, mutex)  MPIDI_CH4I_THREAD_CS_TRY_##name(mutex)
+#define MPIDI_CH4I_THREAD_CS_YIELD(name, mutex) MPIDI_CH4I_THREAD_CS_YIELD_##name(mutex)
+
+#endif /* MPID_TICKETLOCK_H_INCLUDED */
diff --git a/src/mpid/ch4/include/mpidch4.h b/src/mpid/ch4/include/mpidch4.h
new file mode 100644
index 0000000..8110d81
--- /dev/null
+++ b/src/mpid/ch4/include/mpidch4.h
@@ -0,0 +1,435 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef MPIDCH4_H_INCLUDED
+#define MPIDCH4_H_INCLUDED
+
+#define __CH4_INLINE__ __attribute__((always_inline)) static inline
+
+/* We need to define the static inlines right away to avoid
+ * any implicit prototype generation and subsequent warnings
+ * This allows us to make ADI up calls from within a direct
+ * netmod.
+ */
+#define MPIDI_CH4I_API(rc,fcnname,...)            \
+  __CH4_INLINE__ rc MPIDI_##fcnname(__VA_ARGS__) \
+  __attribute__((always_inline))
+
+MPIDI_CH4I_API(int, Init, int *, char ***, int, int *, int *, int *);
+MPIDI_CH4I_API(int, InitCompleted, void);
+MPIDI_CH4I_API(int, Abort, MPIR_Comm *, int, int, const char *);
+MPIDI_CH4I_API(int, Cancel_recv, MPIR_Request *);
+MPIDI_CH4I_API(int, Cancel_send, MPIR_Request *);
+MPIDI_CH4I_API(int, Comm_disconnect, MPIR_Comm *);
+MPIDI_CH4I_API(int, Comm_spawn_multiple, int, char *[], char **[], const int[], MPIR_Info *[], int,
+               MPIR_Comm *, MPIR_Comm **, int[]);
+MPIDI_CH4I_API(int, Comm_failure_get_acked, MPIR_Comm *, MPIR_Group **);
+MPIDI_CH4I_API(int, Comm_get_all_failed_procs, MPIR_Comm *, MPIR_Group **, int);
+MPIDI_CH4I_API(int, Comm_revoke, MPIR_Comm *, int);
+MPIDI_CH4I_API(int, Comm_failure_ack, MPIR_Comm *);
+MPIDI_CH4I_API(int, Comm_AS_enabled, MPIR_Comm *);
+MPIDI_CH4I_API(int, Comm_get_lpid, MPIR_Comm *, int, int *, MPL_bool);
+MPIDI_CH4I_API(int, Finalize, void);
+MPIDI_CH4I_API(int, Get_universe_size, int *);
+MPIDI_CH4I_API(int, Get_processor_name, char *, int, int *);
+MPIDI_CH4I_API(int, Iprobe, int, int, MPIR_Comm *, int, int *, MPI_Status *);
+MPIDI_CH4I_API(int, Irecv, void *, int, MPI_Datatype, int, int, MPIR_Comm *, int, MPIR_Request **);
+MPIDI_CH4I_API(int, Isend, const void *, int, MPI_Datatype, int, int, MPIR_Comm *, int,
+               MPIR_Request **);
+MPIDI_CH4I_API(int, Issend, const void *, int, MPI_Datatype, int, int, MPIR_Comm *, int,
+               MPIR_Request **);
+MPIDI_CH4I_API(int, Mrecv, void *, int, MPI_Datatype, MPIR_Request *, MPI_Status *);
+MPIDI_CH4I_API(int, Imrecv, void *, int, MPI_Datatype, MPIR_Request *, MPIR_Request **);
+MPIDI_CH4I_API(int, Open_port, MPIR_Info *, char *);
+MPIDI_CH4I_API(int, Close_port, const char *);
+MPIDI_CH4I_API(int, Comm_accept, const char *, MPIR_Info *, int, MPIR_Comm *, MPIR_Comm **);
+MPIDI_CH4I_API(int, Comm_connect, const char *, MPIR_Info *, int, MPIR_Comm *, MPIR_Comm **);
+MPIDI_CH4I_API(int, Probe, int, int, MPIR_Comm *, int, MPI_Status *);
+MPIDI_CH4I_API(int, Mprobe, int, int, MPIR_Comm *, int, MPIR_Request **, MPI_Status *);
+MPIDI_CH4I_API(int, Improbe, int, int, MPIR_Comm *, int, int *, MPIR_Request **, MPI_Status *);
+MPIDI_CH4I_API(int, Progress_test, void);
+MPIDI_CH4I_API(int, Progress_poke, void);
+MPIDI_CH4I_API(void, Progress_start, MPID_Progress_state *);
+MPIDI_CH4I_API(void, Progress_end, MPID_Progress_state *);
+MPIDI_CH4I_API(int, Progress_wait, MPID_Progress_state *);
+MPIDI_CH4I_API(int, Progress_register, int (*progress_fn) (int *), int *id);
+MPIDI_CH4I_API(int, Progress_deregister, int id);
+MPIDI_CH4I_API(int, Progress_activate, int id);
+MPIDI_CH4I_API(int, Progress_deactivate, int id);
+MPIDI_CH4I_API(int, Recv, void *, int, MPI_Datatype, int, int, MPIR_Comm *, int, MPI_Status *,
+               MPIR_Request **);
+MPIDI_CH4I_API(int, Recv_init, void *, int, MPI_Datatype, int, int, MPIR_Comm *, int,
+               MPIR_Request **);
+MPIDI_CH4I_API(void, Request_set_completed, MPIR_Request *);
+MPIDI_CH4I_API(int, Request_complete, MPIR_Request *);
+MPIDI_CH4I_API(int, Request_is_anysource, MPIR_Request *);
+MPIDI_CH4I_API(int, Send, const void *, int, MPI_Datatype, int, int, MPIR_Comm *, int,
+               MPIR_Request **);
+MPIDI_CH4I_API(int, Ssend, const void *, int, MPI_Datatype, int, int, MPIR_Comm *, int,
+               MPIR_Request **);
+MPIDI_CH4I_API(int, Rsend, const void *, int, MPI_Datatype, int, int, MPIR_Comm *, int,
+               MPIR_Request **);
+MPIDI_CH4I_API(int, Irsend, const void *, int, MPI_Datatype, int, int, MPIR_Comm *, int,
+               MPIR_Request **);
+MPIDI_CH4I_API(int, Send_init, const void *, int, MPI_Datatype, int, int, MPIR_Comm *, int,
+               MPIR_Request **);
+MPIDI_CH4I_API(int, Ssend_init, const void *, int, MPI_Datatype, int, int, MPIR_Comm *, int,
+               MPIR_Request **);
+MPIDI_CH4I_API(int, Bsend_init, const void *, int, MPI_Datatype, int, int, MPIR_Comm *, int,
+               MPIR_Request **);
+MPIDI_CH4I_API(int, Rsend_init, const void *, int, MPI_Datatype, int, int, MPIR_Comm *, int,
+               MPIR_Request **);
+MPIDI_CH4I_API(int, Startall, int, MPIR_Request *[]);
+MPIDI_CH4I_API(int, GPID_Get, MPIR_Comm *, int, MPIR_Gpid *);
+MPIDI_CH4I_API(int, Accumulate, const void *, int, MPI_Datatype, int, MPI_Aint, int, MPI_Datatype,
+               MPI_Op, MPIR_Win *);
+MPIDI_CH4I_API(int, Win_create, void *, MPI_Aint, int, MPIR_Info *, MPIR_Comm *, MPIR_Win **);
+MPIDI_CH4I_API(int, Win_fence, int, MPIR_Win *);
+MPIDI_CH4I_API(int, Win_free, MPIR_Win **);
+MPIDI_CH4I_API(int, Get, void *, int, MPI_Datatype, int, MPI_Aint, int, MPI_Datatype, MPIR_Win *);
+MPIDI_CH4I_API(int, Win_get_info, MPIR_Win *, MPIR_Info **);
+MPIDI_CH4I_API(int, Win_lock, int, int, int, MPIR_Win *);
+MPIDI_CH4I_API(int, Win_unlock, int, MPIR_Win *);
+MPIDI_CH4I_API(int, Win_start, MPIR_Group *, int, MPIR_Win *);
+MPIDI_CH4I_API(int, Win_complete, MPIR_Win *);
+MPIDI_CH4I_API(int, Win_post, MPIR_Group *, int, MPIR_Win *);
+MPIDI_CH4I_API(int, Win_wait, MPIR_Win *);
+MPIDI_CH4I_API(int, Win_test, MPIR_Win *, int *);
+MPIDI_CH4I_API(int, Put, const void *, int, MPI_Datatype, int, MPI_Aint, int, MPI_Datatype,
+               MPIR_Win *);
+MPIDI_CH4I_API(int, Win_set_info, MPIR_Win *, MPIR_Info *);
+MPIDI_CH4I_API(int, Comm_reenable_anysource, MPIR_Comm *, MPIR_Group **);
+MPIDI_CH4I_API(int, Comm_remote_group_failed, MPIR_Comm *, MPIR_Group **);
+MPIDI_CH4I_API(int, Comm_group_failed, MPIR_Comm *, MPIR_Group **);
+MPIDI_CH4I_API(int, Win_attach, MPIR_Win *, void *, MPI_Aint);
+MPIDI_CH4I_API(int, Win_allocate_shared, MPI_Aint, int, MPIR_Info *, MPIR_Comm *, void **,
+               MPIR_Win **);
+MPIDI_CH4I_API(int, Rput, const void *, int, MPI_Datatype, int, MPI_Aint, int, MPI_Datatype,
+               MPIR_Win *, MPIR_Request **);
+MPIDI_CH4I_API(int, Win_flush_local, int, MPIR_Win *);
+MPIDI_CH4I_API(int, Win_detach, MPIR_Win *, const void *);
+MPIDI_CH4I_API(int, Compare_and_swap, const void *, const void *, void *, MPI_Datatype, int,
+               MPI_Aint, MPIR_Win *);
+MPIDI_CH4I_API(int, Raccumulate, const void *, int, MPI_Datatype, int, MPI_Aint, int, MPI_Datatype,
+               MPI_Op, MPIR_Win *, MPIR_Request **);
+MPIDI_CH4I_API(int, Rget_accumulate, const void *, int, MPI_Datatype, void *, int, MPI_Datatype,
+               int, MPI_Aint, int, MPI_Datatype, MPI_Op, MPIR_Win *, MPIR_Request **);
+MPIDI_CH4I_API(int, Fetch_and_op, const void *, void *, MPI_Datatype, int, MPI_Aint, MPI_Op,
+               MPIR_Win *);
+MPIDI_CH4I_API(int, Win_shared_query, MPIR_Win *, int, MPI_Aint *, int *, void *);
+MPIDI_CH4I_API(int, Win_allocate, MPI_Aint, int, MPIR_Info *, MPIR_Comm *, void *, MPIR_Win **);
+MPIDI_CH4I_API(int, Win_flush, int, MPIR_Win *);
+MPIDI_CH4I_API(int, Win_flush_local_all, MPIR_Win *);
+MPIDI_CH4I_API(int, Win_unlock_all, MPIR_Win *);
+MPIDI_CH4I_API(int, Win_create_dynamic, MPIR_Info *, MPIR_Comm *, MPIR_Win **);
+MPIDI_CH4I_API(int, Rget, void *, int, MPI_Datatype, int, MPI_Aint, int, MPI_Datatype, MPIR_Win *,
+               MPIR_Request **);
+MPIDI_CH4I_API(int, Win_sync, MPIR_Win *);
+MPIDI_CH4I_API(int, Win_flush_all, MPIR_Win *);
+MPIDI_CH4I_API(int, Get_accumulate, const void *, int, MPI_Datatype, void *, int, MPI_Datatype, int,
+               MPI_Aint, int, MPI_Datatype, MPI_Op, MPIR_Win *);
+MPIDI_CH4I_API(int, Win_lock_all, int, MPIR_Win *);
+MPIDI_CH4I_API(void *, Alloc_mem, size_t, MPIR_Info *);
+MPIDI_CH4I_API(int, Free_mem, void *);
+MPIDI_CH4I_API(int, Get_node_id, MPIR_Comm *, int rank, MPID_Node_id_t *);
+MPIDI_CH4I_API(int, Get_max_node_id, MPIR_Comm *, MPID_Node_id_t *);
+MPIDI_CH4I_API(int, Request_is_pending_failure, MPIR_Request *);
+MPIDI_CH4I_API(MPI_Aint, Aint_add, MPI_Aint, MPI_Aint);
+MPIDI_CH4I_API(MPI_Aint, Aint_diff, MPI_Aint, MPI_Aint);
+MPIDI_CH4I_API(int, GPID_GetAllInComm, MPIR_Comm *, int, MPIR_Gpid[], int *);
+MPIDI_CH4I_API(int, GPID_ToLpidArray, int, MPIR_Gpid[], int[]);
+MPIDI_CH4I_API(int, Create_intercomm_from_lpids, MPIR_Comm *, int, const int[]);
+MPIDI_CH4I_API(int, Comm_create, MPIR_Comm *);
+MPIDI_CH4I_API(int, Comm_destroy, MPIR_Comm *);
+MPIDI_CH4I_API(int, Barrier, MPIR_Comm *, MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Bcast, void *, int, MPI_Datatype, int, MPIR_Comm *, MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Allreduce, const void *, void *, int, MPI_Datatype, MPI_Op, MPIR_Comm *,
+               MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Allgather, const void *, int, MPI_Datatype, void *, int, MPI_Datatype,
+               MPIR_Comm *, MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Allgatherv, const void *, int, MPI_Datatype, void *, const int *, const int *,
+               MPI_Datatype, MPIR_Comm *, MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Scatter, const void *, int, MPI_Datatype, void *, int, MPI_Datatype, int,
+               MPIR_Comm *, MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Scatterv, const void *, const int *, const int *, MPI_Datatype, void *, int,
+               MPI_Datatype, int, MPIR_Comm *, MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Gather, const void *, int, MPI_Datatype, void *, int, MPI_Datatype, int,
+               MPIR_Comm *, MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Gatherv, const void *, int, MPI_Datatype, void *, const int *, const int *,
+               MPI_Datatype, int, MPIR_Comm *, MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Alltoall, const void *, int, MPI_Datatype, void *, int, MPI_Datatype,
+               MPIR_Comm *, MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Alltoallv, const void *, const int *, const int *, MPI_Datatype, void *,
+               const int *, const int *, MPI_Datatype, MPIR_Comm *, MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Alltoallw, const void *, const int[], const int[], const MPI_Datatype[], void *,
+               const int[], const int[], const MPI_Datatype[], MPIR_Comm *, MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Reduce, const void *, void *, int, MPI_Datatype, MPI_Op, int, MPIR_Comm *,
+               MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Reduce_scatter, const void *, void *, const int[], MPI_Datatype, MPI_Op,
+               MPIR_Comm *, MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Reduce_scatter_block, const void *, void *, int, MPI_Datatype, MPI_Op,
+               MPIR_Comm *, MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Scan, const void *, void *, int, MPI_Datatype, MPI_Op, MPIR_Comm *,
+               MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Exscan, const void *, void *, int, MPI_Datatype, MPI_Op, MPIR_Comm *,
+               MPIR_Errflag_t *);
+MPIDI_CH4I_API(int, Neighbor_allgather, const void *, int, MPI_Datatype, void *, int, MPI_Datatype,
+               MPIR_Comm *);
+MPIDI_CH4I_API(int, Neighbor_allgatherv, const void *, int, MPI_Datatype, void *, const int[],
+               const int[], MPI_Datatype, MPIR_Comm *);
+MPIDI_CH4I_API(int, Neighbor_alltoallv, const void *, const int[], const int[], MPI_Datatype,
+               void *, const int[], const int[], MPI_Datatype, MPIR_Comm *);
+MPIDI_CH4I_API(int, Neighbor_alltoallw, const void *, const int[], const MPI_Aint[],
+               const MPI_Datatype[], void *, const int[], const MPI_Aint[], const MPI_Datatype[],
+               MPIR_Comm *);
+MPIDI_CH4I_API(int, Neighbor_alltoall, const void *, int, MPI_Datatype, void *, int, MPI_Datatype,
+               MPIR_Comm *);
+MPIDI_CH4I_API(int, Ineighbor_allgather, const void *, int, MPI_Datatype, void *, int, MPI_Datatype,
+               MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Ineighbor_allgatherv, const void *, int, MPI_Datatype, void *, const int[],
+               const int[], MPI_Datatype, MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Ineighbor_alltoall, const void *, int, MPI_Datatype, void *, int, MPI_Datatype,
+               MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Ineighbor_alltoallv, const void *, const int[], const int[], MPI_Datatype,
+               void *, const int[], const int[], MPI_Datatype, MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Ineighbor_alltoallw, const void *, const int[], const MPI_Aint[],
+               const MPI_Datatype[], void *, const int[], const MPI_Aint[], const MPI_Datatype[],
+               MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Ibarrier, MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Ibcast, void *, int, MPI_Datatype, int, MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Iallgather, const void *, int, MPI_Datatype, void *, int, MPI_Datatype,
+               MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Iallgatherv, const void *, int, MPI_Datatype, void *, const int *, const int *,
+               MPI_Datatype, MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Iallreduce, const void *, void *, int, MPI_Datatype, MPI_Op, MPIR_Comm *,
+               MPI_Request *);
+MPIDI_CH4I_API(int, Ialltoall, const void *, int, MPI_Datatype, void *, int, MPI_Datatype,
+               MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Ialltoallv, const void *, const int[], const int[], MPI_Datatype, void *,
+               const int[], const int[], MPI_Datatype, MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Ialltoallw, const void *, const int[], const int[], const MPI_Datatype[],
+               void *, const int[], const int[], const MPI_Datatype[], MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Iexscan, const void *, void *, int, MPI_Datatype, MPI_Op, MPIR_Comm *,
+               MPI_Request *);
+MPIDI_CH4I_API(int, Igather, const void *, int, MPI_Datatype, void *, int, MPI_Datatype, int,
+               MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Igatherv, const void *, int, MPI_Datatype, void *, const int *, const int *,
+               MPI_Datatype, int, MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Ireduce_scatter_block, const void *, void *, int, MPI_Datatype, MPI_Op,
+               MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Ireduce_scatter, const void *, void *, const int[], MPI_Datatype, MPI_Op,
+               MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Ireduce, const void *, void *, int, MPI_Datatype, MPI_Op, int, MPIR_Comm *,
+               MPI_Request *);
+MPIDI_CH4I_API(int, Iscan, const void *, void *, int, MPI_Datatype, MPI_Op, MPIR_Comm *,
+               MPI_Request *);
+MPIDI_CH4I_API(int, Iscatter, const void *, int, MPI_Datatype, void *, int, MPI_Datatype, int,
+               MPIR_Comm *, MPI_Request *);
+MPIDI_CH4I_API(int, Iscatterv, const void *, const int *, const int *, MPI_Datatype, void *, int,
+               MPI_Datatype, int, MPIR_Comm *, MPI_Request *);
+
+/* This function is not exposed to the upper layers but functions in a way
+ * similar to the functions above. Other CH4-level functions should call this
+ * function to query locality. This function will determine whether to call the
+ * netmod or CH4U locality functions. */
+__CH4_INLINE__ int MPIDI_CH4_rank_is_local(int rank, MPIR_Comm * comm);
+
+/* Include netmod prototypes */
+#include <netmod.h>
+#ifdef MPIDI_BUILD_CH4_SHM
+#include "shm.h"
+#endif
+
+/* Declare request functions here so netmods can refer to
+   them in the NETMOD_DIRECT mode */
+#include "ch4_request.h"
+
+/* Include netmod and shm implementations  */
+/* Prototypes are split from impl to avoid */
+/* circular dependencies                   */
+#include <netmod_impl.h>
+#ifdef MPIDI_BUILD_CH4_SHM
+#include "shm_impl.h"
+#endif
+
+#include "ch4_init.h"
+#include "ch4_probe.h"
+#include "ch4_send.h"
+#include "ch4_recv.h"
+#include "ch4_comm.h"
+#include "ch4_win.h"
+#include "ch4_rma.h"
+#include "ch4_progress.h"
+#include "ch4_spawn.h"
+#include "ch4_proc.h"
+#include "ch4_coll.h"
+
+#define MPID_Abort                       MPIDI_Abort
+#define MPID_Accumulate                  MPIDI_Accumulate
+#define MPID_Alloc_mem                   MPIDI_Alloc_mem
+#define MPID_Bsend_init                  MPIDI_Bsend_init
+#define MPID_Cancel_recv                 MPIDI_Cancel_recv
+#define MPID_Cancel_send                 MPIDI_Cancel_send
+#define MPID_Close_port                  MPIDI_Close_port
+#define MPID_Comm_accept                 MPIDI_Comm_accept
+#define MPID_Comm_connect                MPIDI_Comm_connect
+#define MPID_Comm_disconnect             MPIDI_Comm_disconnect
+#define MPID_Comm_group_failed           MPIDI_Comm_group_failed
+#define MPID_Comm_reenable_anysource     MPIDI_Comm_reenable_anysource
+#define MPID_Comm_remote_group_failed    MPIDI_Comm_remote_group_failed
+#define MPID_Comm_spawn_multiple         MPIDI_Comm_spawn_multiple
+#define MPID_Comm_failure_get_acked      MPIDI_Comm_failure_get_acked
+#define MPID_Comm_get_all_failed_procs   MPIDI_Comm_get_all_failed_procs
+#define MPID_Comm_revoke                 MPIDI_Comm_revoke
+#define MPID_Comm_failure_ack            MPIDI_Comm_failure_ack
+#define MPID_Comm_AS_enabled             MPIDI_Comm_AS_enabled
+#define MPID_Comm_get_lpid               MPIDI_Comm_get_lpid
+#define MPID_Compare_and_swap            MPIDI_Compare_and_swap
+#define MPID_Fetch_and_op                MPIDI_Fetch_and_op
+#define MPID_Finalize                    MPIDI_Finalize
+#define MPID_Free_mem                    MPIDI_Free_mem
+#define MPID_GPID_Get                    MPIDI_GPID_Get
+#define MPID_Get                         MPIDI_Get
+#define MPID_Get_accumulate              MPIDI_Get_accumulate
+#define MPID_Get_processor_name          MPIDI_Get_processor_name
+#define MPID_Get_universe_size           MPIDI_Get_universe_size
+#define MPID_Improbe                     MPIDI_Improbe
+#define MPID_Imrecv                      MPIDI_Imrecv
+#define MPID_Init                        MPIDI_Init
+#define MPID_InitCompleted               MPIDI_InitCompleted
+#define MPID_Iprobe                      MPIDI_Iprobe
+#define MPID_Irecv                       MPIDI_Irecv
+#define MPID_Irsend                      MPIDI_Irsend
+#define MPID_Isend                       MPIDI_Isend
+#define MPID_Issend                      MPIDI_Issend
+#define MPID_Mprobe                      MPIDI_Mprobe
+#define MPID_Mrecv                       MPIDI_Mrecv
+#define MPID_Open_port                   MPIDI_Open_port
+#define MPID_Probe                       MPIDI_Probe
+#define MPID_Progress_end                MPIDI_Progress_end
+#define MPID_Progress_poke               MPIDI_Progress_poke
+#define MPID_Progress_start              MPIDI_Progress_start
+#define MPID_Progress_test               MPIDI_Progress_test
+#define MPID_Progress_wait               MPIDI_Progress_wait
+#define MPID_Progress_register           MPIDI_Progress_register
+#define MPID_Progress_deregister         MPIDI_Progress_deregister
+#define MPID_Progress_activate           MPIDI_Progress_activate
+#define MPID_Progress_deactivate         MPIDI_Progress_deactivate
+#define MPID_Put                         MPIDI_Put
+#define MPID_Raccumulate                 MPIDI_Raccumulate
+#define MPID_Recv                        MPIDI_Recv
+#define MPID_Recv_init                   MPIDI_Recv_init
+#define MPID_Request_release             MPIDI_Request_release
+#define MPID_Request_complete            MPIDI_Request_complete
+#define MPID_Request_is_anysource        MPIDI_Request_is_anysource
+#define MPID_Request_set_completed       MPIDI_Request_set_completed
+#define MPID_Rget                        MPIDI_Rget
+#define MPID_Rget_accumulate             MPIDI_Rget_accumulate
+#define MPID_Rput                        MPIDI_Rput
+#define MPID_Rsend                       MPIDI_Rsend
+#define MPID_Rsend_init                  MPIDI_Rsend_init
+#define MPID_Send                        MPIDI_Send
+#define MPID_Send_init                   MPIDI_Send_init
+#define MPID_Ssend                       MPIDI_Ssend
+#define MPID_Ssend_init                  MPIDI_Ssend_init
+#define MPID_Startall                    MPIDI_Startall
+#define MPID_Win_allocate                MPIDI_Win_allocate
+#define MPID_Win_allocate_shared         MPIDI_Win_allocate_shared
+#define MPID_Win_attach                  MPIDI_Win_attach
+#define MPID_Win_complete                MPIDI_Win_complete
+#define MPID_Win_create                  MPIDI_Win_create
+#define MPID_Win_create_dynamic          MPIDI_Win_create_dynamic
+#define MPID_Win_detach                  MPIDI_Win_detach
+#define MPID_Win_fence                   MPIDI_Win_fence
+#define MPID_Win_flush                   MPIDI_Win_flush
+#define MPID_Win_flush_all               MPIDI_Win_flush_all
+#define MPID_Win_flush_local             MPIDI_Win_flush_local
+#define MPID_Win_flush_local_all         MPIDI_Win_flush_local_all
+#define MPID_Win_free                    MPIDI_Win_free
+#define MPID_Win_get_info                MPIDI_Win_get_info
+#define MPID_Win_lock                    MPIDI_Win_lock
+#define MPID_Win_lock_all                MPIDI_Win_lock_all
+#define MPID_Win_post                    MPIDI_Win_post
+#define MPID_Win_set_info                MPIDI_Win_set_info
+#define MPID_Win_shared_query            MPIDI_Win_shared_query
+#define MPID_Win_start                   MPIDI_Win_start
+#define MPID_Win_sync                    MPIDI_Win_sync
+#define MPID_Win_test                    MPIDI_Win_test
+#define MPID_Win_unlock                  MPIDI_Win_unlock
+#define MPID_Win_unlock_all              MPIDI_Win_unlock_all
+#define MPID_Win_wait                    MPIDI_Win_wait
+#define MPID_Get_node_id                 MPIDI_Get_node_id
+#define MPID_Get_max_node_id             MPIDI_Get_max_node_id
+#define MPID_Request_is_pending_failure  MPIDI_Request_is_pending_failure
+#define MPID_Aint_add                    MPIDI_Aint_add
+#define MPID_Aint_diff                   MPIDI_Aint_diff
+#define MPID_GPID_GetAllInComm           MPIDI_GPID_GetAllInComm
+#define MPID_GPID_ToLpidArray            MPIDI_GPID_ToLpidArray
+#define MPID_Create_intercomm_from_lpids MPIDI_Create_intercomm_from_lpids
+/* Communicator hooks and collective operations */
+#define MPID_Comm_create                 MPIDI_Comm_create
+#define MPID_Comm_destroy                MPIDI_Comm_destroy
+#define MPID_Barrier                     MPIDI_Barrier
+#define MPID_Bcast                       MPIDI_Bcast
+#define MPID_Allreduce                   MPIDI_Allreduce
+#define MPID_Allgather                   MPIDI_Allgather
+#define MPID_Allgatherv                  MPIDI_Allgatherv
+#define MPID_Scatter                     MPIDI_Scatter
+#define MPID_Scatterv                    MPIDI_Scatterv
+#define MPID_Gather                      MPIDI_Gather
+#define MPID_Gatherv                     MPIDI_Gatherv
+#define MPID_Alltoall                    MPIDI_Alltoall
+#define MPID_Alltoallv                   MPIDI_Alltoallv
+#define MPID_Alltoallw                   MPIDI_Alltoallw
+#define MPID_Reduce                      MPIDI_Reduce
+#define MPID_Reduce_scatter              MPIDI_Reduce_scatter
+#define MPID_Reduce_scatter_block        MPIDI_Reduce_scatter_block
+#define MPID_Scan                        MPIDI_Scan
+#define MPID_Exscan                      MPIDI_Exscan
+#define MPID_Neighbor_allgather          MPIDI_Neighbor_allgather
+#define MPID_Neighbor_allgatherv         MPIDI_Neighbor_allgatherv
+#define MPID_Neighbor_alltoallv          MPIDI_Neighbor_alltoallv
+#define MPID_Neighbor_alltoallw          MPIDI_Neighbor_alltoallw
+#define MPID_Neighbor_alltoall           MPIDI_Neighbor_alltoall
+#define MPID_Ineighbor_allgather         MPIDI_Ineighbor_allgather
+#define MPID_Ineighbor_allgatherv        MPIDI_Ineighbor_allgatherv
+#define MPID_Ineighbor_alltoall          MPIDI_Ineighbor_alltoall
+#define MPID_Ineighbor_alltoallv         MPIDI_Ineighbor_alltoallv
+#define MPID_Ineighbor_alltoallw         MPIDI_Ineighbor_alltoallw
+#define MPID_Ibarrier                    MPIDI_Ibarrier
+#define MPID_Ibcast                      MPIDI_Ibcast
+#define MPID_Iallgather                  MPIDI_Iallgather
+#define MPID_Iallgatherv                 MPIDI_Iallgatherv
+#define MPID_Iallreduce                  MPIDI_Iallreduce
+#define MPID_Ialltoall                   MPIDI_Ialltoall
+#define MPID_Ialltoallv                  MPIDI_Ialltoallv
+#define MPID_Ialltoallw                  MPIDI_Ialltoallw
+#define MPID_Iexscan                     MPIDI_Iexscan
+#define MPID_Igather                     MPIDI_Igather
+#define MPID_Igatherv                    MPIDI_Igatherv
+#define MPID_Ireduce_scatter_block       MPIDI_Ireduce_scatter_block
+#define MPID_Ireduce_scatter             MPIDI_Ireduce_scatter
+#define MPID_Ireduce                     MPIDI_Ireduce
+#define MPID_Iscan                       MPIDI_Iscan
+#define MPID_Iscatter                    MPIDI_Iscatter
+#define MPID_Iscatterv                   MPIDI_Iscatterv
+
+#define MPIDI_MAX_NETMOD_STRING_LEN 64
+extern int MPIDI_num_netmods;
+#if defined(MPL_USE_DBG_LOGGING)
+extern MPL_dbg_class MPIDI_CH4_DBG_GENERAL;
+extern MPL_dbg_class MPIDI_CH4_DBG_MAP;
+extern MPL_dbg_class MPIDI_CH4_DBG_MEMORY;
+#endif /* MPL_USE_DBG_LOGGING */
+
+
+
+
+#endif /* MPIDCH4_H_INCLUDED */
diff --git a/src/mpid/ch4/include/mpidch4r.h b/src/mpid/ch4/include/mpidch4r.h
new file mode 100644
index 0000000..e70f11e
--- /dev/null
+++ b/src/mpid/ch4/include/mpidch4r.h
@@ -0,0 +1,26 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef MPIDCH4R_H_INCLUDED
+#define MPIDCH4R_H_INCLUDED
+
+#include "ch4r_symheap.h"
+#include "ch4r_recvq.h"
+#include "ch4r_proc.h"
+#include "ch4r_init.h"
+#include "ch4r_probe.h"
+#include "ch4r_recv.h"
+#include "ch4r_rma.h"
+#include "ch4r_send.h"
+#include "ch4r_win.h"
+#include "ch4r_buf.h"
+#include "ch4r_request.h"
+
+#endif /* MPIDCH4R_H_INCLUDED */
diff --git a/src/mpid/ch4/include/mpidimpl.h b/src/mpid/ch4/include/mpidimpl.h
new file mode 100644
index 0000000..e32d90e
--- /dev/null
+++ b/src/mpid/ch4/include/mpidimpl.h
@@ -0,0 +1,30 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef MPIDIMPL_H_INCLUDED
+#define MPIDIMPL_H_INCLUDED
+
+#include "mpichconf.h"
+#include <stdio.h>
+
+#if defined(HAVE_ASSERT_H)
+#include <assert.h>
+#endif
+
+#define MPICH_SKIP_MPICXX
+#include "mpiimpl.h"
+
+#if !defined(MPICH_MPIDPRE_H_INCLUDED)
+#include "mpidpre.h"
+#endif
+
+#include "mpidch4.h"
+
+#endif /* MPIDIMPL_H_INCLUDED */
diff --git a/src/mpid/ch4/include/mpidpost.h b/src/mpid/ch4/include/mpidpost.h
new file mode 100644
index 0000000..40edeb7
--- /dev/null
+++ b/src/mpid/ch4/include/mpidpost.h
@@ -0,0 +1,30 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef MPIDPOST_H_INCLUDED
+#define MPIDPOST_H_INCLUDED
+
+#include "mpidu_datatype.h"
+#include "mpidch4.h"
+
+__ALWAYS_INLINE__ void MPID_Request_init(MPIR_Request * req)
+{
+    MPIDI_CH4U_REQUEST(req, req) = NULL;        /* clears the dev.ch4.ch4u.req pointer */
+#ifdef MPIDI_BUILD_CH4_SHM
+    MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(req) = NULL;   /* clears dev.anysource_partner_request */
+#endif
+}
+
+__ALWAYS_INLINE__ void MPID_Request_finalize(MPIR_Request * req)
+{
+    (void) req;         /* no device-level work is performed for this hook */
+}
+
+#endif /* MPIDPOST_H_INCLUDED */
diff --git a/src/mpid/ch4/include/mpidpre.h b/src/mpid/ch4/include/mpidpre.h
new file mode 100644
index 0000000..6fdd3f8
--- /dev/null
+++ b/src/mpid/ch4/include/mpidpre.h
@@ -0,0 +1,465 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef MPIDPRE_H_INCLUDED
+#define MPIDPRE_H_INCLUDED
+
+#if defined(HAVE_SYS_TYPES_H)
+#include <sys/types.h>
+#endif
+
+#include "mpidu_dataloop.h"
+#include "mpid_thread.h"
+#include "mpid_sched.h"
+#include "mpid_timers_fallback.h"
+#include "netmodpre.h"
+#include "shmpre.h"
+#include "mpl_uthash.h"
+
+typedef struct {
+    union {
+    MPIDI_NM_DT_DECL} netmod;
+} MPIDI_Devdt_t;
+#define MPID_DEV_DATATYPE_DECL   MPIDI_Devdt_t   dev;
+#include "mpid_datatype_fallback.h"
+
+typedef int MPID_Progress_state;
+#define HAVE_GPID_ROUTINES
+
+#define __ALWAYS_INLINE__ __attribute__((always_inline)) static inline
+
+#define CH4_COMPILE_TIME_ASSERT(expr_)                                  \
+  do { switch(0) { case 0: case (expr_): default: break; } } while (0)
+
+/* Forward declaration of MPIR_Win so that we can refer to it in this file */
+struct MPIR_Win;
+typedef struct MPIR_Win MPIR_Win;
+
+typedef enum {
+    MPIDI_PTYPE_RECV,
+    MPIDI_PTYPE_SEND,
+    MPIDI_PTYPE_BSEND,
+    MPIDI_PTYPE_SSEND
+} MPIDI_ptype;
+
+#define MPIDI_CH4U_REQ_BUSY 		  (0x1)
+#define MPIDI_CH4U_REQ_PEER_SSEND 	  (0x1 << 1)
+#define MPIDI_CH4U_REQ_UNEXPECTED 	  (0x1 << 2)
+#define MPIDI_CH4U_REQ_UNEXP_DQUED 	  (0x1 << 3)
+#define MPIDI_CH4U_REQ_UNEXP_CLAIMED  (0x1 << 4)
+#define MPIDI_CH4U_REQ_RCV_NON_CONTIG (0x1 << 5)
+#define MPIDI_CH4U_REQ_MATCHED (0x1 << 6)
+#define MPIDI_CH4U_REQ_LONG_RTS (0x1 << 7)
+
+#define MPIDI_PARENT_PORT_KVSKEY "PARENT_ROOT_PORT_NAME"
+#define MPIDI_MAX_KVS_VALUE_LEN  4096
+
+typedef struct MPIDI_CH4U_sreq_t {
+    /* persistent send fields */
+} MPIDI_CH4U_sreq_t;
+
+typedef struct MPIDI_CH4U_lreq_t {
+    /* Long send fields */
+    const void *src_buf;
+    MPI_Count count;
+    MPI_Datatype datatype;
+    uint64_t msg_tag;
+} MPIDI_CH4U_lreq_t;
+
+typedef struct MPIDI_CH4U_rreq_t {
+    /* mrecv fields */
+    void *mrcv_buffer;
+    uint64_t mrcv_count;
+    MPI_Datatype mrcv_datatype;
+
+    uint64_t ignore;
+    uint64_t peer_req_ptr;
+    uint64_t match_req;
+    uint64_t request;
+
+    struct MPIDI_CH4U_rreq_t *prev, *next;
+} MPIDI_CH4U_rreq_t;
+
+typedef struct MPIDI_CH4U_put_req_t {
+    MPIR_Win *win_ptr;
+    uint64_t preq_ptr;
+    void *dt_iov;
+    void *origin_addr;
+    int origin_count;
+    MPI_Datatype origin_datatype;
+    int n_iov;
+} MPIDI_CH4U_put_req_t;
+
+typedef struct MPIDI_CH4U_get_req_t {
+    MPIR_Win *win_ptr;
+    uint64_t greq_ptr;
+    uint64_t addr;
+    MPI_Datatype datatype;
+    int count;
+    int n_iov;
+    void *dt_iov;
+} MPIDI_CH4U_get_req_t;
+
+typedef struct MPIDI_CH4U_cswap_req_t {
+    MPIR_Win *win_ptr;
+    uint64_t creq_ptr;
+    uint64_t addr;
+    MPI_Datatype datatype;
+    void *data;
+    void *result_addr;
+} MPIDI_CH4U_cswap_req_t;
+
+typedef struct MPIDI_CH4U_acc_req_t {
+    MPIR_Win *win_ptr;
+    uint64_t req_ptr;
+    MPI_Datatype origin_datatype;
+    MPI_Datatype target_datatype;
+    int origin_count;
+    int target_count;
+    int n_iov;
+    void *target_addr;
+    void *dt_iov;
+    void *data;
+    size_t data_sz;
+    MPI_Op op;
+    void *result_addr;
+    int result_count;
+    int do_get;
+    void *origin_addr;
+    MPI_Datatype result_datatype;
+} MPIDI_CH4U_acc_req_t;
+
+typedef struct MPIDI_CH4U_req_ext_t {
+    union {
+        MPIDI_CH4U_sreq_t sreq;
+        MPIDI_CH4U_lreq_t lreq;
+        MPIDI_CH4U_rreq_t rreq;
+        MPIDI_CH4U_put_req_t preq;
+        MPIDI_CH4U_get_req_t greq;
+        MPIDI_CH4U_cswap_req_t creq;
+        MPIDI_CH4U_acc_req_t areq;
+    };
+
+    struct iovec *iov;
+    void *cmpl_handler_fn;
+    uint64_t seq_no;
+    uint64_t request;
+    uint64_t status;
+    struct MPIDI_CH4U_req_ext_t *next, *prev;
+
+} MPIDI_CH4U_req_ext_t;
+
+typedef struct MPIDI_CH4U_req_t {
+    union {
+    MPIDI_NM_REQUEST_AM_DECL} netmod_am;
+    MPIDI_CH4U_req_ext_t *req;
+    MPIDI_ptype p_type;
+    void *buffer;
+    uint64_t count;
+    uint64_t tag;
+    int src_rank;
+    MPI_Datatype datatype;
+} MPIDI_CH4U_req_t;
+
+typedef struct {
+#ifdef MPIDI_CH4_EXCLUSIVE_SHM
+    int is_local;
+#endif
+    /* Anysource handling. Netmod and shm specific requests are cross
+     * referenced. The field only exists when MPIDI_BUILD_CH4_SHM is defined;
+     * read it via MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER, which is NULL otherwise. */
+#ifdef MPIDI_BUILD_CH4_SHM
+    struct MPIR_Request *anysource_partner_request;
+#endif
+
+    union {
+        /* The first fields are used by the CH4U apis */
+        MPIDI_CH4U_req_t ch4u;
+
+        /* Used by the netmod direct apis */
+        union {
+        MPIDI_NM_REQUEST_DECL} netmod;
+
+        union {
+        MPIDI_SHM_REQUEST_DECL} shm;
+    } ch4;
+} MPIDI_Devreq_t;
+#define MPIDI_REQUEST_HDR_SIZE              offsetof(struct MPIR_Request, dev.ch4.netmod)
+#define MPIDI_REQUEST_CH4U_HDR_SIZE         offsetof(struct MPIR_Request, dev.ch4.ch4u.netmod_am)   /* netmod_am is a member of ch4u */
+#define MPIDI_CH4I_REQUEST(req,field)       (((req)->dev).field)
+#define MPIDI_CH4U_REQUEST(req,field)       (((req)->dev.ch4.ch4u).field)
+
+#ifdef MPIDI_BUILD_CH4_SHM
+#define MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(req)  (((req)->dev).anysource_partner_request)
+#else
+#define MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(req)  NULL
+#endif
+
+__ALWAYS_INLINE__ void MPID_Request_init(struct MPIR_Request *req);
+
+__ALWAYS_INLINE__ void MPID_Request_finalize(struct MPIR_Request *req);
+
+typedef struct MPIDI_CH4U_win_shared_info {
+    uint32_t disp_unit;
+    size_t size;
+} __attribute__ ((packed)) MPIDI_CH4U_win_shared_info_t;
+
+#define MPIDI_CH4I_ACCU_ORDER_RAR (1)
+#define MPIDI_CH4I_ACCU_ORDER_RAW (1 << 1)
+#define MPIDI_CH4I_ACCU_ORDER_WAR (1 << 2)
+#define MPIDI_CH4I_ACCU_ORDER_WAW (1 << 3)
+
+typedef enum {
+    MPIDI_CH4I_ACCU_SAME_OP,
+    MPIDI_CH4I_ACCU_SAME_OP_NO_OP
+} MPIDI_CH4U_win_info_accumulate_ops;
+
+typedef struct MPIDI_CH4U_win_info_args_t {
+    int no_locks;
+    int same_size;
+    int accumulate_ordering;
+    int alloc_shared_noncontig;
+    MPIDI_CH4U_win_info_accumulate_ops accumulate_ops;
+} MPIDI_CH4U_win_info_args_t;
+
+struct MPIDI_CH4U_win_lock {
+    struct MPIDI_CH4U_win_lock *next;
+    int rank;
+    uint16_t mtype;
+    uint16_t type;
+};
+
+struct MPIDI_CH4U_win_queue {
+    struct MPIDI_CH4U_win_lock *head;
+    struct MPIDI_CH4U_win_lock *tail;
+};
+
+typedef struct MPIDI_CH4U_win_lock_info {
+    unsigned peer;
+    int lock_type;
+    struct MPIR_Win *win;
+    volatile unsigned done;
+} MPIDI_CH4U_win_lock_info;
+
+typedef struct MPIDI_CH4U_win_sync_lock {
+    struct {
+        volatile unsigned locked;
+        volatile unsigned allLocked;
+    } remote;
+    struct {
+        struct MPIDI_CH4U_win_queue requested;
+        int type;
+        unsigned count;
+    } local;
+} MPIDI_CH4U_win_sync_lock;
+
+typedef struct MPIDI_CH4U_win_sync_pscw {
+    struct MPIR_Group *group;
+    volatile unsigned count;
+} MPIDI_CH4U_win_sync_pscw;
+
+typedef struct MPIDI_CH4U_win_sync_t {
+    volatile int origin_epoch_type;
+    volatile int target_epoch_type;
+    MPIDI_CH4U_win_sync_pscw sc, pw;
+    MPIDI_CH4U_win_sync_lock lock;
+} MPIDI_CH4U_win_sync_t;
+
+typedef struct MPIDI_CH4U_win_t {
+    uint64_t win_id;
+    void *mmap_addr;
+    int64_t mmap_sz;
+    OPA_int_t outstanding_ops;
+    MPI_Aint *sizes;
+    MPIDI_CH4U_win_lock_info *lockQ;
+    MPIDI_CH4U_win_sync_t sync;
+    MPIDI_CH4U_win_info_args_t info_args;
+    MPIDI_CH4U_win_shared_info_t *shared_table;
+    MPL_UT_hash_handle hash_handle;
+} MPIDI_CH4U_win_t;
+
+typedef struct {
+    MPIDI_CH4U_win_t ch4u;
+    union {
+    MPIDI_NM_WIN_DECL} netmod;
+} MPIDI_Devwin_t;
+
+#define MPIDI_CH4U_WIN(win,field)        (((win)->dev.ch4u).field)
+#define MPIDI_CH4U_WINFO(win,rank) ((MPIDI_CH4U_win_info_t*) &(MPIDI_CH4U_WIN(win, info_table)[(rank)]))
+
+typedef unsigned MPIDII_locality_t;
+
+typedef struct MPIDI_CH4U_comm_t {
+    MPIDI_CH4U_rreq_t *posted_list;
+    MPIDI_CH4U_rreq_t *unexp_list;
+    uint32_t window_instance;
+} MPIDI_CH4U_comm_t;
+
+#define MPIDII_CALC_STRIDE(rank, stride, blocksize, offset) \
+    ((rank) / (blocksize) * ((stride) - (blocksize)) + (rank) + (offset))
+
+#define MPIDII_CALC_STRIDE_SIMPLE(rank, stride, offset) \
+    ((rank) * (stride) + (offset))
+
+typedef enum {
+    MPIDII_RANK_MAP_DIRECT,
+    MPIDII_RANK_MAP_DIRECT_INTRA,
+    MPIDII_RANK_MAP_OFFSET,
+    MPIDII_RANK_MAP_OFFSET_INTRA,
+    MPIDII_RANK_MAP_STRIDE,
+    MPIDII_RANK_MAP_STRIDE_INTRA,
+    MPIDII_RANK_MAP_STRIDE_BLOCK,
+    MPIDII_RANK_MAP_STRIDE_BLOCK_INTRA,
+    MPIDII_RANK_MAP_LUT,
+    MPIDII_RANK_MAP_LUT_INTRA,
+    MPIDII_RANK_MAP_MLUT,
+    MPIDII_RANK_MAP_NONE
+} MPIDII_rank_map_mode;
+
+typedef int MPIDII_lpid_t;
+typedef struct {
+    int avtid;
+    int lpid;
+} MPIDII_gpid_t;
+
+typedef struct {
+    MPIR_OBJECT_HEADER;
+    MPIDII_lpid_t lpid[0];
+} MPIDII_rank_map_lut_t;
+
+typedef struct {
+    MPIR_OBJECT_HEADER;
+    MPIDII_gpid_t gpid[0];
+} MPIDII_rank_map_mlut_t;
+
+typedef struct {
+    MPIDII_rank_map_mode mode;
+    int avtid;
+    int size;
+
+    union {
+        int offset;
+        struct {
+            int offset;
+            int stride;
+            int blocksize;
+        } stride;
+    } reg;
+
+    union {
+        struct {
+            MPIDII_rank_map_lut_t *t;
+            MPIDII_lpid_t *lpid;
+        } lut;
+        struct {
+            MPIDII_rank_map_mlut_t *t;
+            MPIDII_gpid_t *gpid;
+        } mlut;
+    } irreg;
+} MPIDII_rank_map_t;
+
+typedef struct MPIDI_Devcomm_t {
+    struct {
+        /* The first fields are used by the CH4U apis */
+        MPIDI_CH4U_comm_t ch4u;
+
+        /* Used by the netmod direct apis */
+        union {
+        MPIDI_NM_COMM_DECL} netmod;
+
+        union {
+        MPIDI_SHM_COMM_DECL} shm;
+
+        MPIDII_rank_map_t map;
+        MPIDII_rank_map_t local_map;
+    } ch4;
+} MPIDI_Devcomm_t;
+#define MPIDI_CH4U_COMM(comm,field) (((comm)->dev.ch4.ch4u).field)
+#define MPIDII_COMM(comm,field) (((comm)->dev.ch4).field)
+
+
+#define MPID_USE_NODE_IDS
+typedef uint16_t MPID_Node_id_t;
+
+typedef struct {
+    union {
+    MPIDI_NM_OP_DECL} netmod;
+} MPIDI_Devop_t;
+
+typedef struct {
+    union {
+    MPIDI_NM_GPID_DECL} netmod;
+    MPID_Node_id_t node;
+} MPIDI_Devgpid_t;
+
+#define MPID_DEV_REQUEST_DECL    MPIDI_Devreq_t  dev;
+#define MPID_DEV_WIN_DECL        MPIDI_Devwin_t  dev;
+#define MPID_DEV_COMM_DECL       MPIDI_Devcomm_t dev;
+#define MPID_DEV_OP_DECL         MPIDI_Devop_t   dev;
+#define MPID_DEV_GPID_DECL       MPIDI_Devgpid_t dev;
+
+#define MPIDII_GPID(gpid) ((gpid)->dev)
+
+typedef struct {
+    union {
+    MPIDI_NM_ADDR_DECL} netmod;
+#ifdef MPIDI_BUILD_CH4_LOCALITY_INFO
+    MPIDII_locality_t is_local;
+#endif
+} MPIDII_av_entry_t;
+
+typedef struct {
+    MPIR_OBJECT_HEADER;
+    int size;
+    MPIDII_av_entry_t table[0];
+} MPIDII_av_table_t;
+
+extern MPIDII_av_table_t **MPIDII_av_table;
+extern MPIDII_av_table_t *MPIDII_av_table0;
+
+#define MPIDIU_get_av_table(avtid) (MPIDII_av_table[(avtid)])
+#define MPIDIU_get_av(avtid, lpid) (MPIDII_av_table[(avtid)]->table[(lpid)])
+
+#define MPIDIU_get_node_map(avtid)   (MPIDI_CH4_Global.node_map[(avtid)])
+
+#define MPID_Progress_register_hook(fn_, id_) MPID_Progress_register(fn_, id_)
+#define MPID_Progress_deregister_hook(id_) MPID_Progress_deregister(id_)
+#define MPID_Progress_activate_hook(id_) MPID_Progress_activate(id_)
+#define MPID_Progress_deactivate_hook(id_) MPID_Progress_deactivate(id_)
+
+#define HAVE_DEV_COMM_HOOK
+#define MPID_Dev_comm_create_hook(a)  (MPID_Comm_create(a))
+#define MPID_Dev_comm_destroy_hook(a) (MPID_Comm_destroy(a))
+
+#define MPID_Dev_datatype_commit_hook   MPIDI_NM_datatype_commit
+#define MPID_Dev_datatype_destroy_hook  MPIDI_NM_datatype_destroy
+
+#define MPID_Dev_op_commit_hook          MPIDI_NM_op_commit
+#define MPID_Dev_op_destroy_hook         MPIDI_NM_op_destroy
+
+/* operation for (avtid, lpid) to/from "lpid64" */
+/* hard code limit on number of live comm worlds. This should be fixed by future
+ * LUPID patch */
+#define MPIDIU_AVTID_BITS                    (8)
+#define MPIDIU_LPID_BITS                     (24)
+#define MPIDIU_LPID_MASK                     (0x00FFFFFFU)
+#define MPIDIU_AVTID_MASK                    (0xFF000000U)
+#define MPIDIU_NEW_AVT_MARK                  (0x80000000U)
+#define MPIDIU_LPID_CREATE(avtid, lpid)      (((avtid) << MPIDIU_LPID_BITS) | (lpid))
+#define MPIDIU_LPID_GET_AVTID(lpid)          ((((lpid) & MPIDIU_AVTID_MASK) >> MPIDIU_LPID_BITS))
+#define MPIDIU_LPID_GET_LPID(lpid)           (((lpid) & MPIDIU_LPID_MASK))
+#define MPIDIU_LPID_SET_NEW_AVT_MARK(lpid)   ((lpid) |= MPIDIU_NEW_AVT_MARK)
+#define MPIDIU_LPID_CLEAR_NEW_AVT_MARK(lpid) ((lpid) &= (~MPIDIU_NEW_AVT_MARK))
+#define MPIDIU_LPID_IS_NEW_AVT(lpid)         ((lpid) & MPIDIU_NEW_AVT_MARK)
+
+
+#include "mpidu_pre.h"
+
+#endif /* MPIDPRE_H_INCLUDED */
diff --git a/src/mpid/ch4/include/netmodpre.h.in b/src/mpid/ch4/include/netmodpre.h.in
new file mode 100644
index 0000000..086714e
--- /dev/null
+++ b/src/mpid/ch4/include/netmodpre.h.in
@@ -0,0 +1,26 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMODPRE_H_INCLUDED
+#define NETMODPRE_H_INCLUDED
+
+@ch4_netmod_pre_include@
+
+#define MPIDI_NM_REQUEST_AM_DECL @ch4_netmod_amrequest_decl@
+#define MPIDI_NM_REQUEST_DECL    @ch4_netmod_request_decl@
+
+#define MPIDI_NM_COMM_DECL       @ch4_netmod_comm_decl@
+#define MPIDI_NM_DT_DECL         @ch4_netmod_dt_decl@
+#define MPIDI_NM_WIN_DECL        @ch4_netmod_win_decl@
+#define MPIDI_NM_GPID_DECL    @ch4_netmod_gpid_decl@
+#define MPIDI_NM_ADDR_DECL    @ch4_netmod_addr_decl@
+#define MPIDI_NM_OP_DECL         @ch4_netmod_op_decl@
+
+#endif /* NETMODPRE_H_INCLUDED */
diff --git a/src/mpid/ch4/include/shmpre.h.in b/src/mpid/ch4/include/shmpre.h.in
new file mode 100644
index 0000000..c6bcbdb
--- /dev/null
+++ b/src/mpid/ch4/include/shmpre.h.in
@@ -0,0 +1,19 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHMPRE_H_INCLUDED
+#define SHMPRE_H_INCLUDED
+/* Template for shmpre.h: each token delimited by at-signs is a placeholder filled in when the header is generated. */
+@ch4_shm_pre_include@
+
+#define MPIDI_SHM_REQUEST_DECL       @ch4_shm_request_decl@
+#define MPIDI_SHM_COMM_DECL          @ch4_shm_comm_decl@
+
+#endif /* SHMPRE_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/Makefile.mk b/src/mpid/ch4/netmod/Makefile.mk
new file mode 100644
index 0000000..2b875a6
--- /dev/null
+++ b/src/mpid/ch4/netmod/Makefile.mk
@@ -0,0 +1,22 @@
+## -*- Mode: Makefile; -*-
+## vim: set ft=automake :
+##
+## (C) 2016 by Argonne National Laboratory.
+## (C) 2014 by Mellanox Technologies, Inc.
+##     See COPYRIGHT in top-level directory.
+##
+##  Portions of this code were written by Intel Corporation.
+##  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+##  to Argonne National Laboratory subject to Software Grant and Corporate
+##  Contributor License Agreement dated February 8, 2012.
+##
+
+AM_CPPFLAGS += -I$(top_srcdir)/src/mpid/ch4/netmod/include
+## The two headers found through the -I path above; the noinst_ prefix keeps them out of "make install".
+noinst_HEADERS += src/mpid/ch4/netmod/include/netmod.h
+noinst_HEADERS += src/mpid/ch4/netmod/include/netmod_impl.h
+## Per-netmod build fragments, one per subdirectory: ofi, ucx, portals4, stubnm.
+include $(top_srcdir)/src/mpid/ch4/netmod/ofi/Makefile.mk
+include $(top_srcdir)/src/mpid/ch4/netmod/ucx/Makefile.mk
+include $(top_srcdir)/src/mpid/ch4/netmod/portals4/Makefile.mk
+include $(top_srcdir)/src/mpid/ch4/netmod/stubnm/Makefile.mk
diff --git a/src/mpid/ch4/netmod/include/netmod.h b/src/mpid/ch4/netmod/include/netmod.h
new file mode 100644
index 0000000..bd33b0c
--- /dev/null
+++ b/src/mpid/ch4/netmod/include/netmod.h
@@ -0,0 +1,1133 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+/* ch4 netmod functions */
+#ifndef NETMOD_PROTOTYPES_H_INCLUDED
+#define NETMOD_PROTOTYPES_H_INCLUDED
+
+#include <mpidimpl.h>
+
+#define MPIDI_MAX_NETMOD_STRING_LEN 64 /* NOTE(review): presumably a buffer length for netmod name strings; no user is visible here, confirm */
+
+typedef int (*MPIDI_NM_am_completion_handler_fn) (MPIR_Request * req);        /* handed back by a target handler through cmpl_handler_fn */
+typedef int (*MPIDI_NM_am_origin_handler_fn) (MPIR_Request * req);    /* origin-side callback; passed to MPIDI_NM_reg_hdr_handler_t */
+
+/* Target-side callback, passed to the handler register function (MPIDI_NM_reg_hdr_handler_t) alongside an origin handler. */
+/* For short cases, the output arguments are NULL. */
+typedef int (*MPIDI_NM_am_target_handler_fn)
+ (void *am_hdr, void **data,    /* data should be iovs if *is_contig is false */
+  size_t * data_sz, int *is_contig, MPIDI_NM_am_completion_handler_fn * cmpl_handler_fn,        /* out: completion handler */
+  MPIR_Request ** req);         /* if allocated, need pointer to completion function */
+
+typedef int (*MPIDI_NM_init_t) (int rank, int size, int appnum, int *tag_ub, MPIR_Comm * comm_world,
+                                MPIR_Comm * comm_self, int spawned, int num_contexts,
+                                void **netmod_contexts);        /* type of the `init` slot of MPIDI_NM_funcs_t */
+typedef int (*MPIDI_NM_finalize_t) (void);
+typedef int (*MPIDI_NM_progress_t) (void *netmod_context, int blocking);
+typedef int (*MPIDI_NM_reg_hdr_handler_t) (int handler_id,
+                                           MPIDI_NM_am_origin_handler_fn origin_handler_fn,
+                                           MPIDI_NM_am_target_handler_fn target_handler_fn);    /* one handler id plus its origin- and target-side callbacks */
+typedef int (*MPIDI_NM_comm_connect_t) (const char *port_name, MPIR_Info * info, int root,
+                                        MPIR_Comm * comm, MPIR_Comm ** newcomm_ptr);    /* same parameter list as MPIDI_NM_comm_accept_t */
+typedef int (*MPIDI_NM_comm_disconnect_t) (MPIR_Comm * comm_ptr);
+typedef int (*MPIDI_NM_open_port_t) (MPIR_Info * info_ptr, char *port_name);  /* port_name is non-const here, const in close_port; presumably an output buffer */
+typedef int (*MPIDI_NM_close_port_t) (const char *port_name);
+typedef int (*MPIDI_NM_comm_accept_t) (const char *port_name, MPIR_Info * info, int root,
+                                       MPIR_Comm * comm, MPIR_Comm ** newcomm_ptr);
+typedef int (*MPIDI_NM_send_am_hdr_t) (int rank, MPIR_Comm * comm, int handler_id,
+                                       const void *am_hdr, size_t am_hdr_sz, MPIR_Request * sreq,
+                                       void *netmod_context);   /* header only: no data/count/datatype parameters */
+typedef int (*MPIDI_NM_inject_am_hdr_t) (int rank, MPIR_Comm * comm, int handler_id,
+                                         const void *am_hdr, size_t am_hdr_sz,
+                                         void *netmod_context); /* send_am_hdr's parameters minus sreq */
+typedef int (*MPIDI_NM_send_am_t) (int rank, MPIR_Comm * comm, int handler_id, const void *am_hdr,
+                                   size_t am_hdr_sz, const void *data, MPI_Count count,
+                                   MPI_Datatype datatype, MPIR_Request * sreq,
+                                   void *netmod_context);       /* header plus a (data, count, datatype) payload */
+typedef int (*MPIDI_NM_send_amv_t) (int rank, MPIR_Comm * comm, int handler_id,
+                                    struct iovec * am_hdrs, size_t iov_len, const void *data,
+                                    MPI_Count count, MPI_Datatype datatype, MPIR_Request * sreq,
+                                    void *netmod_context);      /* header given as struct iovec; iov_len presumably counts its entries */
+typedef int (*MPIDI_NM_send_amv_hdr_t) (int rank, MPIR_Comm * comm, int handler_id,
+                                        struct iovec * am_hdrs, size_t iov_len, MPIR_Request * sreq,
+                                        void *netmod_context);  /* iovec header, no payload parameters */
+typedef int (*MPIDI_NM_send_am_hdr_reply_t) (MPIR_Context_id_t context_id, int src_rank,
+                                             int handler_id, const void *am_hdr, size_t am_hdr_sz,
+                                             MPIR_Request * sreq);      /* the *_reply typedefs take (context_id, src_rank) instead of (rank, comm) and no netmod_context */
+typedef int (*MPIDI_NM_inject_am_hdr_reply_t) (MPIR_Context_id_t context_id, int src_rank,
+                                               int handler_id, const void *am_hdr,
+                                               size_t am_hdr_sz);
+typedef int (*MPIDI_NM_send_am_reply_t) (MPIR_Context_id_t context_id, int src_rank, int handler_id,
+                                         const void *am_hdr, size_t am_hdr_sz, const void *data,
+                                         MPI_Count count, MPI_Datatype datatype,
+                                         MPIR_Request * sreq);
+typedef int (*MPIDI_NM_send_amv_reply_t) (MPIR_Context_id_t context_id, int src_rank,
+                                          int handler_id, struct iovec * am_hdr, size_t iov_len,
+                                          const void *data, MPI_Count count, MPI_Datatype datatype,
+                                          MPIR_Request * sreq);
+typedef size_t(*MPIDI_NM_am_hdr_max_sz_t) (void);      /* the two *_max_sz typedefs return size_t and take no arguments */
+typedef size_t(*MPIDI_NM_am_inject_max_sz_t) (void);
+typedef int (*MPIDI_NM_am_recv_t) (MPIR_Request * req);
+typedef int (*MPIDI_NM_comm_get_lpid_t) (MPIR_Comm * comm_ptr, int idx, int *lpid_ptr,
+                                         MPL_bool is_remote);   /* first slot under "Routines that handle addressing" in MPIDI_NM_funcs_t */
+typedef int (*MPIDI_NM_gpid_get_t) (MPIR_Comm * comm_ptr, int rank, MPIR_Gpid * gpid);
+typedef int (*MPIDI_NM_get_node_id_t) (MPIR_Comm * comm, int rank, MPID_Node_id_t * id_p);
+typedef int (*MPIDI_NM_get_max_node_id_t) (MPIR_Comm * comm, MPID_Node_id_t * max_id_p);
+typedef int (*MPIDI_NM_getallincomm_t) (MPIR_Comm * comm_ptr, int local_size,
+                                        MPIR_Gpid local_gpids[], int *singleAVT);
+typedef int (*MPIDI_NM_gpid_tolpidarray_t) (int size, MPIR_Gpid gpid[], int lpid[]);
+typedef int (*MPIDI_NM_create_intercomm_from_lpids_t) (MPIR_Comm * newcomm_ptr, int size,
+                                                       const int lpids[]);
+typedef int (*MPIDI_NM_comm_create_t) (MPIR_Comm * comm);
+typedef int (*MPIDI_NM_comm_destroy_t) (MPIR_Comm * comm);
+typedef void (*MPIDI_NM_am_request_init_t) (MPIR_Request * req);      /* the two am_request typedefs return void, not int */
+typedef void (*MPIDI_NM_am_request_finalize_t) (MPIR_Request * req);
+typedef int (*MPIDI_NM_send_t) (const void *buf, int count, MPI_Datatype datatype, int rank,
+                                int tag, MPIR_Comm * comm, int context_offset,
+                                MPIR_Request ** request);       /* ssend, the four *_init send variants, isend and issend repeat this parameter list */
+typedef int (*MPIDI_NM_ssend_t) (const void *buf, int count, MPI_Datatype datatype, int rank,
+                                 int tag, MPIR_Comm * comm, int context_offset,
+                                 MPIR_Request ** request);
+typedef int (*MPIDI_NM_startall_t) (int count, MPIR_Request * requests[]);
+typedef int (*MPIDI_NM_send_init_t) (const void *buf, int count, MPI_Datatype datatype, int rank,
+                                     int tag, MPIR_Comm * comm, int context_offset,
+                                     MPIR_Request ** request);
+typedef int (*MPIDI_NM_ssend_init_t) (const void *buf, int count, MPI_Datatype datatype, int rank,
+                                      int tag, MPIR_Comm * comm, int context_offset,
+                                      MPIR_Request ** request);
+typedef int (*MPIDI_NM_rsend_init_t) (const void *buf, int count, MPI_Datatype datatype, int rank,
+                                      int tag, MPIR_Comm * comm, int context_offset,
+                                      MPIR_Request ** request);
+typedef int (*MPIDI_NM_bsend_init_t) (const void *buf, int count, MPI_Datatype datatype, int rank,
+                                      int tag, MPIR_Comm * comm, int context_offset,
+                                      MPIR_Request ** request);
+typedef int (*MPIDI_NM_isend_t) (const void *buf, int count, MPI_Datatype datatype, int rank,
+                                 int tag, MPIR_Comm * comm, int context_offset,
+                                 MPIR_Request ** request);
+typedef int (*MPIDI_NM_issend_t) (const void *buf, int count, MPI_Datatype datatype, int rank,
+                                  int tag, MPIR_Comm * comm, int context_offset,
+                                  MPIR_Request ** request);
+typedef int (*MPIDI_NM_cancel_send_t) (MPIR_Request * sreq);
+typedef int (*MPIDI_NM_recv_init_t) (void *buf, int count, MPI_Datatype datatype, int rank, int tag,
+                                     MPIR_Comm * comm, int context_offset, MPIR_Request ** request);
+typedef int (*MPIDI_NM_recv_t) (void *buf, int count, MPI_Datatype datatype, int rank, int tag,
+                                MPIR_Comm * comm, int context_offset, MPI_Status * status,
+                                MPIR_Request ** request);       /* MPIDI_NM_irecv_t's parameters with an MPI_Status * before the request */
+typedef int (*MPIDI_NM_irecv_t) (void *buf, int count, MPI_Datatype datatype, int rank, int tag,
+                                 MPIR_Comm * comm, int context_offset, MPIR_Request ** request);
+typedef int (*MPIDI_NM_imrecv_t) (void *buf, int count, MPI_Datatype datatype,
+                                  MPIR_Request * message, MPIR_Request ** rreqp);
+typedef int (*MPIDI_NM_cancel_recv_t) (MPIR_Request * rreq);
+typedef void *(*MPIDI_NM_alloc_mem_t) (size_t size, MPIR_Info * info_ptr);    /* returns void * rather than int */
+typedef int (*MPIDI_NM_free_mem_t) (void *ptr);
+typedef int (*MPIDI_NM_improbe_t) (int source, int tag, MPIR_Comm * comm, int context_offset,
+                                   int *flag, MPIR_Request ** message, MPI_Status * status);    /* MPIDI_NM_iprobe_t's parameters plus the `message` pointer */
+typedef int (*MPIDI_NM_iprobe_t) (int source, int tag, MPIR_Comm * comm, int context_offset,
+                                  int *flag, MPI_Status * status);
+typedef int (*MPIDI_NM_win_set_info_t) (MPIR_Win * win, MPIR_Info * info);    /* the typedefs from here through MPIDI_NM_win_lock_all_t all involve an MPIR_Win */
+typedef int (*MPIDI_NM_win_shared_query_t) (MPIR_Win * win, int rank, MPI_Aint * size,
+                                            int *disp_unit, void *baseptr);
+typedef int (*MPIDI_NM_put_t) (const void *origin_addr, int origin_count,
+                               MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp,
+                               int target_count, MPI_Datatype target_datatype, MPIR_Win * win);
+typedef int (*MPIDI_NM_win_start_t) (MPIR_Group * group, int assert, MPIR_Win * win);
+typedef int (*MPIDI_NM_win_complete_t) (MPIR_Win * win);
+typedef int (*MPIDI_NM_win_post_t) (MPIR_Group * group, int assert, MPIR_Win * win);
+typedef int (*MPIDI_NM_win_wait_t) (MPIR_Win * win);
+typedef int (*MPIDI_NM_win_test_t) (MPIR_Win * win, int *flag);
+typedef int (*MPIDI_NM_win_lock_t) (int lock_type, int rank, int assert, MPIR_Win * win);
+typedef int (*MPIDI_NM_win_unlock_t) (int rank, MPIR_Win * win);
+typedef int (*MPIDI_NM_win_get_info_t) (MPIR_Win * win, MPIR_Info ** info_p_p);
+typedef int (*MPIDI_NM_get_t) (void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+                               int target_rank, MPI_Aint target_disp, int target_count,
+                               MPI_Datatype target_datatype, MPIR_Win * win);
+typedef int (*MPIDI_NM_win_free_t) (MPIR_Win ** win_ptr);
+typedef int (*MPIDI_NM_win_fence_t) (int assert, MPIR_Win * win);
+typedef int (*MPIDI_NM_win_create_t) (void *base, MPI_Aint length, int disp_unit, MPIR_Info * info,
+                                      MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr);
+typedef int (*MPIDI_NM_accumulate_t) (const void *origin_addr, int origin_count,
+                                      MPI_Datatype origin_datatype, int target_rank,
+                                      MPI_Aint target_disp, int target_count,
+                                      MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win);
+typedef int (*MPIDI_NM_win_attach_t) (MPIR_Win * win, void *base, MPI_Aint size);
+typedef int (*MPIDI_NM_win_allocate_shared_t) (MPI_Aint size, int disp_unit, MPIR_Info * info_ptr,
+                                               MPIR_Comm * comm_ptr, void **base_ptr,
+                                               MPIR_Win ** win_ptr);
+typedef int (*MPIDI_NM_rput_t) (const void *origin_addr, int origin_count,
+                                MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp,
+                                int target_count, MPI_Datatype target_datatype, MPIR_Win * win,
+                                MPIR_Request ** request);       /* MPIDI_NM_put_t's parameters plus a trailing MPIR_Request ** */
+typedef int (*MPIDI_NM_win_flush_local_t) (int rank, MPIR_Win * win);
+typedef int (*MPIDI_NM_win_detach_t) (MPIR_Win * win, const void *base);
+typedef int (*MPIDI_NM_compare_and_swap_t) (const void *origin_addr, const void *compare_addr,
+                                            void *result_addr, MPI_Datatype datatype,
+                                            int target_rank, MPI_Aint target_disp, MPIR_Win * win);
+typedef int (*MPIDI_NM_raccumulate_t) (const void *origin_addr, int origin_count,
+                                       MPI_Datatype origin_datatype, int target_rank,
+                                       MPI_Aint target_disp, int target_count,
+                                       MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win,
+                                       MPIR_Request ** request);        /* MPIDI_NM_accumulate_t's parameters plus a trailing MPIR_Request ** */
+typedef int (*MPIDI_NM_rget_accumulate_t) (const void *origin_addr, int origin_count,
+                                           MPI_Datatype origin_datatype, void *result_addr,
+                                           int result_count, MPI_Datatype result_datatype,
+                                           int target_rank, MPI_Aint target_disp, int target_count,
+                                           MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win,
+                                           MPIR_Request ** request);    /* MPIDI_NM_get_accumulate_t's parameters plus a trailing MPIR_Request ** */
+typedef int (*MPIDI_NM_fetch_and_op_t) (const void *origin_addr, void *result_addr,
+                                        MPI_Datatype datatype, int target_rank,
+                                        MPI_Aint target_disp, MPI_Op op, MPIR_Win * win);
+typedef int (*MPIDI_NM_win_allocate_t) (MPI_Aint size, int disp_unit, MPIR_Info * info,
+                                        MPIR_Comm * comm, void *baseptr, MPIR_Win ** win);      /* NOTE(review): baseptr is void * here, void **base_ptr in win_allocate_shared; confirm intended */
+typedef int (*MPIDI_NM_win_flush_t) (int rank, MPIR_Win * win);
+typedef int (*MPIDI_NM_win_flush_local_all_t) (MPIR_Win * win);
+typedef int (*MPIDI_NM_win_unlock_all_t) (MPIR_Win * win);
+typedef int (*MPIDI_NM_win_create_dynamic_t) (MPIR_Info * info, MPIR_Comm * comm, MPIR_Win ** win);
+typedef int (*MPIDI_NM_rget_t) (void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+                                int target_rank, MPI_Aint target_disp, int target_count,
+                                MPI_Datatype target_datatype, MPIR_Win * win,
+                                MPIR_Request ** request);       /* MPIDI_NM_get_t's parameters plus a trailing MPIR_Request ** */
+typedef int (*MPIDI_NM_win_sync_t) (MPIR_Win * win);
+typedef int (*MPIDI_NM_win_flush_all_t) (MPIR_Win * win);
+typedef int (*MPIDI_NM_get_accumulate_t) (const void *origin_addr, int origin_count,
+                                          MPI_Datatype origin_datatype, void *result_addr,
+                                          int result_count, MPI_Datatype result_datatype,
+                                          int target_rank, MPI_Aint target_disp, int target_count,
+                                          MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win);
+typedef int (*MPIDI_NM_win_lock_all_t) (int assert, MPIR_Win * win);
+typedef int (*MPIDI_NM_rank_is_local_t) (int target, MPIR_Comm * comm);
+typedef int (*MPIDI_NM_barrier_t) (MPIR_Comm * comm, MPIR_Errflag_t * errflag);       /* from here through MPIDI_NM_exscan_t every typedef ends with MPIR_Errflag_t * errflag */
+typedef int (*MPIDI_NM_bcast_t) (void *buffer, int count, MPI_Datatype datatype, int root,
+                                 MPIR_Comm * comm, MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_NM_allreduce_t) (const void *sendbuf, void *recvbuf, int count,
+                                     MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                     MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_NM_allgather_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm, MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_NM_allgatherv_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                      void *recvbuf, const int *recvcounts, const int *displs,
+                                      MPI_Datatype recvtype, MPIR_Comm * comm,
+                                      MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_NM_scatter_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, int recvcount, MPI_Datatype recvtype, int root,
+                                   MPIR_Comm * comm, MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_NM_scatterv_t) (const void *sendbuf, const int *sendcounts, const int *displs,
+                                    MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                    MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr,
+                                    MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_NM_gather_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                  void *recvbuf, int recvcount, MPI_Datatype recvtype, int root,
+                                  MPIR_Comm * comm, MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_NM_gatherv_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, const int *recvcounts, const int *displs,
+                                   MPI_Datatype recvtype, int root, MPIR_Comm * comm,
+                                   MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_NM_alltoall_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                    MPIR_Comm * comm, MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_NM_alltoallv_t) (const void *sendbuf, const int *sendcounts, const int *sdispls,
+                                     MPI_Datatype sendtype, void *recvbuf, const int *recvcounts,
+                                     const int *rdispls, MPI_Datatype recvtype, MPIR_Comm * comm,
+                                     MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_NM_alltoallw_t) (const void *sendbuf, const int *sendcounts, const int *sdispls,
+                                     const MPI_Datatype sendtypes[], void *recvbuf,
+                                     const int *recvcounts, const int *rdispls,
+                                     const MPI_Datatype recvtypes[], MPIR_Comm * comm,
+                                     MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_NM_reduce_t) (const void *sendbuf, void *recvbuf, int count,
+                                  MPI_Datatype datatype, MPI_Op op, int root, MPIR_Comm * comm_ptr,
+                                  MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_NM_reduce_scatter_t) (const void *sendbuf, void *recvbuf, const int *recvcounts,
+                                          MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                          MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_NM_reduce_scatter_block_t) (const void *sendbuf, void *recvbuf, int recvcount,
+                                                MPI_Datatype datatype, MPI_Op op,
+                                                MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_NM_scan_t) (const void *sendbuf, void *recvbuf, int count,
+                                MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_NM_exscan_t) (const void *sendbuf, void *recvbuf, int count,
+                                  MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                  MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_NM_neighbor_allgather_t) (const void *sendbuf, int sendcount,
+                                              MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                              MPI_Datatype recvtype, MPIR_Comm * comm); /* the neighbor_* typedefs take no errflag parameter */
+typedef int (*MPIDI_NM_neighbor_allgatherv_t) (const void *sendbuf, int sendcount,
+                                               MPI_Datatype sendtype, void *recvbuf,
+                                               const int *recvcounts, const int *displs,
+                                               MPI_Datatype recvtype, MPIR_Comm * comm);
+typedef int (*MPIDI_NM_neighbor_alltoallv_t) (const void *sendbuf, const int *sendcounts,
+                                              const int *sdispls, MPI_Datatype sendtype,
+                                              void *recvbuf, const int *recvcounts,
+                                              const int *rdispls, MPI_Datatype recvtype,
+                                              MPIR_Comm * comm);
+typedef int (*MPIDI_NM_neighbor_alltoallw_t) (const void *sendbuf, const int *sendcounts,
+                                              const MPI_Aint * sdispls,
+                                              const MPI_Datatype * sendtypes, void *recvbuf,
+                                              const int *recvcounts, const MPI_Aint * rdispls,
+                                              const MPI_Datatype * recvtypes, MPIR_Comm * comm);        /* displacements are MPI_Aint here, int in neighbor_alltoallv */
+typedef int (*MPIDI_NM_neighbor_alltoall_t) (const void *sendbuf, int sendcount,
+                                             MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                             MPI_Datatype recvtype, MPIR_Comm * comm);
+typedef int (*MPIDI_NM_ineighbor_allgather_t) (const void *sendbuf, int sendcount,
+                                               MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                               MPI_Datatype recvtype, MPIR_Comm * comm,
+                                               MPI_Request * req);      /* each ineighbor_* typedef is its neighbor_* counterpart plus MPI_Request * req */
+typedef int (*MPIDI_NM_ineighbor_allgatherv_t) (const void *sendbuf, int sendcount,
+                                                MPI_Datatype sendtype, void *recvbuf,
+                                                const int *recvcounts, const int *displs,
+                                                MPI_Datatype recvtype, MPIR_Comm * comm,
+                                                MPI_Request * req);
+typedef int (*MPIDI_NM_ineighbor_alltoall_t) (const void *sendbuf, int sendcount,
+                                              MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                              MPI_Datatype recvtype, MPIR_Comm * comm,
+                                              MPI_Request * req);
+typedef int (*MPIDI_NM_ineighbor_alltoallv_t) (const void *sendbuf, const int *sendcounts,
+                                               const int *sdispls, MPI_Datatype sendtype,
+                                               void *recvbuf, const int *recvcounts,
+                                               const int *rdispls, MPI_Datatype recvtype,
+                                               MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_NM_ineighbor_alltoallw_t) (const void *sendbuf, const int *sendcounts,
+                                               const MPI_Aint * sdispls,
+                                               const MPI_Datatype * sendtypes, void *recvbuf,
+                                               const int *recvcounts, const MPI_Aint * rdispls,
+                                               const MPI_Datatype * recvtypes, MPIR_Comm * comm,
+                                               MPI_Request * req);
+typedef int (*MPIDI_NM_ibarrier_t) (MPIR_Comm * comm, MPI_Request * req);     /* the i* typedefs through iscatterv end with MPI_Request * req (an MPI_Request, not MPIR_Request) and take no errflag */
+typedef int (*MPIDI_NM_ibcast_t) (void *buffer, int count, MPI_Datatype datatype, int root,
+                                  MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_NM_iallgather_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                      void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_NM_iallgatherv_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                       void *recvbuf, const int *recvcounts, const int *displs,
+                                       MPI_Datatype recvtype, MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_NM_iallreduce_t) (const void *sendbuf, void *recvbuf, int count,
+                                      MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                      MPI_Request * req);
+typedef int (*MPIDI_NM_ialltoall_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_NM_ialltoallv_t) (const void *sendbuf, const int *sendcounts,
+                                      const int *sdispls, MPI_Datatype sendtype, void *recvbuf,
+                                      const int *recvcounts, const int *rdispls,
+                                      MPI_Datatype recvtype, MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_NM_ialltoallw_t) (const void *sendbuf, const int *sendcounts,
+                                      const int *sdispls, const MPI_Datatype sendtypes[],
+                                      void *recvbuf, const int *recvcounts, const int *rdispls,
+                                      const MPI_Datatype recvtypes[], MPIR_Comm * comm,
+                                      MPI_Request * req);
+typedef int (*MPIDI_NM_iexscan_t) (const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                   MPI_Request * req);
+typedef int (*MPIDI_NM_igather_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, int recvcount, MPI_Datatype recvtype, int root,
+                                   MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_NM_igatherv_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, const int *recvcounts, const int *displs,
+                                    MPI_Datatype recvtype, int root, MPIR_Comm * comm,
+                                    MPI_Request * req);
+typedef int (*MPIDI_NM_ireduce_scatter_block_t) (const void *sendbuf, void *recvbuf, int recvcount,
+                                                 MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                                 MPI_Request * req);
+typedef int (*MPIDI_NM_ireduce_scatter_t) (const void *sendbuf, void *recvbuf,
+                                           const int *recvcounts, MPI_Datatype datatype, MPI_Op op,
+                                           MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_NM_ireduce_t) (const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, int root, MPIR_Comm * comm_ptr,
+                                   MPI_Request * req);
+typedef int (*MPIDI_NM_iscan_t) (const void *sendbuf, void *recvbuf, int count,
+                                 MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                 MPI_Request * req);
+typedef int (*MPIDI_NM_iscatter_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount, MPI_Datatype recvtype, int root,
+                                    MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_NM_iscatterv_t) (const void *sendbuf, const int *sendcounts, const int *displs,
+                                     MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                     MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr,
+                                     MPI_Request * req);
+typedef void (*MPIDI_NM_datatype_commit_t) (MPIR_Datatype * datatype_p);      /* the datatype and op typedefs below all return void */
+typedef void (*MPIDI_NM_datatype_dup_t) (MPIR_Datatype * old_datatype_p,
+                                         MPIR_Datatype * new_datatype_p);
+typedef void (*MPIDI_NM_datatype_destroy_t) (MPIR_Datatype * datatype_p);
+typedef void (*MPIDI_NM_op_commit_t) (MPIR_Op * op_p);
+typedef void (*MPIDI_NM_op_destroy_t) (MPIR_Op * op_p);
+
+typedef struct MPIDI_NM_funcs {        /* table of function pointers; each member uses one of the MPIDI_NM_*_t typedefs above */
+    MPIDI_NM_init_t init;       /* init/finalize, progress, then the port and connect/accept slots */
+    MPIDI_NM_finalize_t finalize;
+    MPIDI_NM_progress_t progress;
+    MPIDI_NM_comm_connect_t comm_connect;
+    MPIDI_NM_comm_disconnect_t comm_disconnect;
+    MPIDI_NM_open_port_t open_port;
+    MPIDI_NM_close_port_t close_port;
+    MPIDI_NM_comm_accept_t comm_accept;
+    /* Routines that handle addressing */
+    MPIDI_NM_comm_get_lpid_t comm_get_lpid;
+    MPIDI_NM_gpid_get_t gpid_get;
+    MPIDI_NM_get_node_id_t get_node_id;
+    MPIDI_NM_get_max_node_id_t get_max_node_id;
+    MPIDI_NM_getallincomm_t getallincomm;
+    MPIDI_NM_gpid_tolpidarray_t gpid_tolpidarray;
+    MPIDI_NM_create_intercomm_from_lpids_t create_intercomm_from_lpids;
+    MPIDI_NM_comm_create_t comm_create;
+    MPIDI_NM_comm_destroy_t comm_destroy;
+    /* Request allocation routines (both slots take an MPIR_Request * and return void) */
+    MPIDI_NM_am_request_init_t am_request_init;
+    MPIDI_NM_am_request_finalize_t am_request_finalize;
+    /* Active Message Routines */
+    MPIDI_NM_reg_hdr_handler_t reg_hdr_handler;
+    MPIDI_NM_send_am_hdr_t send_am_hdr;
+    MPIDI_NM_inject_am_hdr_t inject_am_hdr;
+    MPIDI_NM_send_am_t send_am;
+    MPIDI_NM_send_amv_t send_amv;
+    MPIDI_NM_send_amv_hdr_t send_amv_hdr;
+    MPIDI_NM_send_am_hdr_reply_t send_am_hdr_reply;
+    MPIDI_NM_inject_am_hdr_reply_t inject_am_hdr_reply;
+    MPIDI_NM_send_am_reply_t send_am_reply;
+    MPIDI_NM_send_amv_reply_t send_amv_reply;
+    MPIDI_NM_am_hdr_max_sz_t am_hdr_max_sz;
+    MPIDI_NM_am_inject_max_sz_t am_inject_max_sz;
+    MPIDI_NM_am_recv_t am_recv;
+} MPIDI_NM_funcs_t;
+
+typedef struct MPIDI_NM_native_funcs { /* routine table; see MPIDI_NM_native_func(s) externs */
+    MPIDI_NM_send_t send;      /* send/recv, memory, probe, then win_* and RMA slots */
+    MPIDI_NM_ssend_t ssend;
+    MPIDI_NM_startall_t startall;
+    MPIDI_NM_send_init_t send_init;
+    MPIDI_NM_ssend_init_t ssend_init;
+    MPIDI_NM_rsend_init_t rsend_init;
+    MPIDI_NM_bsend_init_t bsend_init;
+    MPIDI_NM_isend_t isend;
+    MPIDI_NM_issend_t issend;
+    MPIDI_NM_cancel_send_t cancel_send;
+    MPIDI_NM_recv_init_t recv_init;
+    MPIDI_NM_recv_t recv;
+    MPIDI_NM_irecv_t irecv;
+    MPIDI_NM_imrecv_t imrecv;
+    MPIDI_NM_cancel_recv_t cancel_recv;
+    MPIDI_NM_alloc_mem_t alloc_mem;
+    MPIDI_NM_free_mem_t free_mem;
+    MPIDI_NM_improbe_t improbe;
+    MPIDI_NM_iprobe_t iprobe;
+    MPIDI_NM_win_set_info_t win_set_info;
+    MPIDI_NM_win_shared_query_t win_shared_query;
+    MPIDI_NM_put_t put;
+    MPIDI_NM_win_start_t win_start;
+    MPIDI_NM_win_complete_t win_complete;
+    MPIDI_NM_win_post_t win_post;
+    MPIDI_NM_win_wait_t win_wait;
+    MPIDI_NM_win_test_t win_test;
+    MPIDI_NM_win_lock_t win_lock;
+    MPIDI_NM_win_unlock_t win_unlock;
+    MPIDI_NM_win_get_info_t win_get_info;
+    MPIDI_NM_get_t get;
+    MPIDI_NM_win_free_t win_free;
+    MPIDI_NM_win_fence_t win_fence;
+    MPIDI_NM_win_create_t win_create;
+    MPIDI_NM_accumulate_t accumulate;
+    MPIDI_NM_win_attach_t win_attach;
+    MPIDI_NM_win_allocate_shared_t win_allocate_shared;
+    MPIDI_NM_rput_t rput;
+    MPIDI_NM_win_flush_local_t win_flush_local;
+    MPIDI_NM_win_detach_t win_detach;
+    MPIDI_NM_compare_and_swap_t compare_and_swap;
+    MPIDI_NM_raccumulate_t raccumulate;
+    MPIDI_NM_rget_accumulate_t rget_accumulate;
+    MPIDI_NM_fetch_and_op_t fetch_and_op;
+    MPIDI_NM_win_allocate_t win_allocate;
+    MPIDI_NM_win_flush_t win_flush;
+    MPIDI_NM_win_flush_local_all_t win_flush_local_all;
+    MPIDI_NM_win_unlock_all_t win_unlock_all;
+    MPIDI_NM_win_create_dynamic_t win_create_dynamic;
+    MPIDI_NM_rget_t rget;
+    MPIDI_NM_win_sync_t win_sync;
+    MPIDI_NM_win_flush_all_t win_flush_all;
+    MPIDI_NM_get_accumulate_t get_accumulate;
+    MPIDI_NM_win_lock_all_t win_lock_all;
+    MPIDI_NM_rank_is_local_t rank_is_local;
+    /* Collectives: blocking, neighbor, then the i-prefixed variants */
+    MPIDI_NM_barrier_t barrier;
+    MPIDI_NM_bcast_t bcast;
+    MPIDI_NM_allreduce_t allreduce;
+    MPIDI_NM_allgather_t allgather;
+    MPIDI_NM_allgatherv_t allgatherv;
+    MPIDI_NM_scatter_t scatter;
+    MPIDI_NM_scatterv_t scatterv;
+    MPIDI_NM_gather_t gather;
+    MPIDI_NM_gatherv_t gatherv;
+    MPIDI_NM_alltoall_t alltoall;
+    MPIDI_NM_alltoallv_t alltoallv;
+    MPIDI_NM_alltoallw_t alltoallw;
+    MPIDI_NM_reduce_t reduce;
+    MPIDI_NM_reduce_scatter_t reduce_scatter;
+    MPIDI_NM_reduce_scatter_block_t reduce_scatter_block;
+    MPIDI_NM_scan_t scan;
+    MPIDI_NM_exscan_t exscan;
+    MPIDI_NM_neighbor_allgather_t neighbor_allgather;
+    MPIDI_NM_neighbor_allgatherv_t neighbor_allgatherv;
+    MPIDI_NM_neighbor_alltoall_t neighbor_alltoall;
+    MPIDI_NM_neighbor_alltoallv_t neighbor_alltoallv;
+    MPIDI_NM_neighbor_alltoallw_t neighbor_alltoallw;
+    MPIDI_NM_ineighbor_allgather_t ineighbor_allgather;
+    MPIDI_NM_ineighbor_allgatherv_t ineighbor_allgatherv;
+    MPIDI_NM_ineighbor_alltoall_t ineighbor_alltoall;
+    MPIDI_NM_ineighbor_alltoallv_t ineighbor_alltoallv;
+    MPIDI_NM_ineighbor_alltoallw_t ineighbor_alltoallw;
+    MPIDI_NM_ibarrier_t ibarrier;
+    MPIDI_NM_ibcast_t ibcast;
+    MPIDI_NM_iallgather_t iallgather;
+    MPIDI_NM_iallgatherv_t iallgatherv;
+    MPIDI_NM_iallreduce_t iallreduce;
+    MPIDI_NM_ialltoall_t ialltoall;
+    MPIDI_NM_ialltoallv_t ialltoallv;
+    MPIDI_NM_ialltoallw_t ialltoallw;
+    MPIDI_NM_iexscan_t iexscan;
+    MPIDI_NM_igather_t igather;
+    MPIDI_NM_igatherv_t igatherv;
+    MPIDI_NM_ireduce_scatter_block_t ireduce_scatter_block;
+    MPIDI_NM_ireduce_scatter_t ireduce_scatter;
+    MPIDI_NM_ireduce_t ireduce;
+    MPIDI_NM_iscan_t iscan;
+    MPIDI_NM_iscatter_t iscatter;
+    MPIDI_NM_iscatterv_t iscatterv;
+    /* Datatype hooks (void-returning function pointers) */
+    MPIDI_NM_datatype_commit_t datatype_commit;
+    MPIDI_NM_datatype_dup_t datatype_dup;
+    MPIDI_NM_datatype_destroy_t datatype_destroy;
+    /* Op hooks (void-returning function pointers) */
+    MPIDI_NM_op_commit_t op_commit;
+    MPIDI_NM_op_destroy_t op_destroy;
+} MPIDI_NM_native_funcs_t;
+
+extern MPIDI_NM_funcs_t *MPIDI_NM_funcs[];     /* array of table pointers; size not given here */
+extern MPIDI_NM_funcs_t *MPIDI_NM_func;        /* single table pointer */
+extern MPIDI_NM_native_funcs_t *MPIDI_NM_native_funcs[];       /* array of table pointers */
+extern MPIDI_NM_native_funcs_t *MPIDI_NM_native_func;  /* single table pointer */
+extern int MPIDI_num_netmods;  /* presumably the entry count of the arrays above - confirm */
+extern char MPIDI_NM_strings[][MPIDI_MAX_NETMOD_STRING_LEN];   /* fixed-width char rows */
+
+#ifndef MPIDI_NM_STATIC_INLINE_PREFIX  /* default applies only if not already defined */
+#define MPIDI_NM_STATIC_INLINE_PREFIX __attribute__((always_inline)) static inline
+#endif /* MPIDI_NM_STATIC_INLINE_PREFIX */
+
+#ifndef MPIDI_NM_STATIC_INLINE_SUFFIX  /* NOTE(review): repeats always_inline from PREFIX */
+#define MPIDI_NM_STATIC_INLINE_SUFFIX __attribute__((always_inline))
+#endif /* MPIDI_NM_STATIC_INLINE_SUFFIX */
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_init(int rank, int size, int appnum, int *tag_ub,
+                                                MPIR_Comm * comm_world, MPIR_Comm * comm_self,
+                                                int spawned, int num_contexts,
+                                                void **netmod_contexts)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;     /* forward declaration; name matches MPIDI_NM_funcs_t.init */
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_finalize(void) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_progress(void *netmod_context,
+                                                    int blocking) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_reg_hdr_handler(int handler_id,
+                                                           MPIDI_NM_am_origin_handler_fn
+                                                           origin_handler_fn,
+                                                           MPIDI_NM_am_target_handler_fn
+                                                           target_handler_fn)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;     /* takes one origin-side and one target-side handler */
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_comm_connect(const char *port_name, MPIR_Info * info,
+                                                        int root, MPIR_Comm * comm,
+                                                        MPIR_Comm **
+                                                        newcomm_ptr) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_comm_disconnect(MPIR_Comm *
+                                                           comm_ptr) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_open_port(MPIR_Info * info_ptr,
+                                                     char *port_name) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_close_port(const char *port_name)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;     /* port_name is const here, writable in open_port */
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_comm_accept(const char *port_name, MPIR_Info * info,
+                                                       int root, MPIR_Comm * comm,
+                                                       MPIR_Comm **
+                                                       newcomm_ptr) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send_am_hdr(int rank, MPIR_Comm * comm, int handler_id,
+                                                       const void *am_hdr, size_t am_hdr_sz,
+                                                       MPIR_Request * sreq,
+                                                       void *netmod_context)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;     /* takes sreq; inject_am_hdr below takes no request */
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_inject_am_hdr(int rank, MPIR_Comm * comm, int handler_id,
+                                                         const void *am_hdr, size_t am_hdr_sz,
+                                                         void *netmod_context)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send_am(int rank, MPIR_Comm * comm, int handler_id,
+                                                   const void *am_hdr, size_t am_hdr_sz,
+                                                   const void *data, MPI_Count count,
+                                                   MPI_Datatype datatype, MPIR_Request * sreq,
+                                                   void *netmod_context)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send_amv(int rank, MPIR_Comm * comm, int handler_id,
+                                                    struct iovec *am_hdrs, size_t iov_len,
+                                                    const void *data, MPI_Count count,
+                                                    MPI_Datatype datatype, MPIR_Request * sreq,
+                                                    void *netmod_context)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;     /* presumably iov_len = number of am_hdrs entries */
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send_amv_hdr(int rank, MPIR_Comm * comm, int handler_id,
+                                                        struct iovec *am_hdrs, size_t iov_len,
+                                                        MPIR_Request * sreq,
+                                                        void *netmod_context)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send_am_hdr_reply(MPIR_Context_id_t context_id,
+                                                             int src_rank, int handler_id,
+                                                             const void *am_hdr, size_t am_hdr_sz,
+                                                             MPIR_Request *
+                                                             sreq) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_inject_am_hdr_reply(MPIR_Context_id_t context_id,
+                                                               int src_rank, int handler_id,
+                                                               const void *am_hdr,
+                                                               size_t am_hdr_sz)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;     /* reply forms take context_id/src_rank, not rank/comm */
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send_am_reply(MPIR_Context_id_t context_id, int src_rank,
+                                                         int handler_id, const void *am_hdr,
+                                                         size_t am_hdr_sz, const void *data,
+                                                         MPI_Count count, MPI_Datatype datatype,
+                                                         MPIR_Request *
+                                                         sreq) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send_amv_reply(MPIR_Context_id_t context_id,
+                                                          int src_rank, int handler_id,
+                                                          struct iovec *am_hdr, size_t iov_len,
+                                                          const void *data, MPI_Count count,
+                                                          MPI_Datatype datatype,
+                                                          MPIR_Request *
+                                                          sreq) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX size_t MPIDI_NM_am_hdr_max_sz(void) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX size_t MPIDI_NM_am_inject_max_sz(void) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_am_recv(MPIR_Request *
+                                                   req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_comm_get_lpid(MPIR_Comm * comm_ptr, int idx,
+                                                         int *lpid_ptr,
+                                                         MPL_bool is_remote)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;     /* order here mirrors the MPIDI_NM_funcs_t members */
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_gpid_get(MPIR_Comm * comm_ptr, int rank,
+                                                    MPIR_Gpid * gpid) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_get_node_id(MPIR_Comm * comm, int rank,
+                                                       MPID_Node_id_t *
+                                                       id_p) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_get_max_node_id(MPIR_Comm * comm,
+                                                           MPID_Node_id_t *
+                                                           max_id_p) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_getallincomm(MPIR_Comm * comm_ptr, int local_size,
+                                                        MPIR_Gpid local_gpids[],
+                                                        int *singleAVT)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_gpid_tolpidarray(int size, MPIR_Gpid gpid[],
+                                                            int lpid[])
+    MPIDI_NM_STATIC_INLINE_SUFFIX;     /* presumably gpid[] and lpid[] both hold size entries */
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr,
+                                                                       int size,
+                                                                       const int lpids[])
+    MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_comm_create(MPIR_Comm *
+                                                       comm) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_comm_destroy(MPIR_Comm *
+                                                        comm) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX void MPIDI_NM_am_request_init(MPIR_Request *
+                                                            req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX void MPIDI_NM_am_request_finalize(MPIR_Request *
+                                                                req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send(const void *buf, int count, MPI_Datatype datatype,
+                                                int rank, int tag, MPIR_Comm * comm,
+                                                int context_offset,
+                                                MPIR_Request **
+                                                request) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ssend(const void *buf, int count, MPI_Datatype datatype,
+                                                 int rank, int tag, MPIR_Comm * comm,
+                                                 int context_offset,
+                                                 MPIR_Request **
+                                                 request) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_startall(int count,  /* takes an array of requests */
+                                                    MPIR_Request *
+                                                    requests[]) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send_init(const void *buf, int count,
+                                                     MPI_Datatype datatype, int rank, int tag,
+                                                     MPIR_Comm * comm, int context_offset,
+                                                     MPIR_Request **
+                                                     request) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ssend_init(const void *buf, int count,
+                                                      MPI_Datatype datatype, int rank, int tag,
+                                                      MPIR_Comm * comm, int context_offset,
+                                                      MPIR_Request **
+                                                      request) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_rsend_init(const void *buf, int count,
+                                                      MPI_Datatype datatype, int rank, int tag,
+                                                      MPIR_Comm * comm, int context_offset,
+                                                      MPIR_Request **
+                                                      request) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_bsend_init(const void *buf, int count,
+                                                      MPI_Datatype datatype, int rank, int tag,
+                                                      MPIR_Comm * comm, int context_offset,
+                                                      MPIR_Request **
+                                                      request) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_isend(const void *buf, int count, MPI_Datatype datatype,
+                                                 int rank, int tag, MPIR_Comm * comm,
+                                                 int context_offset,
+                                                 MPIR_Request **
+                                                 request) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_issend(const void *buf, int count, MPI_Datatype datatype,
+                                                  int rank, int tag, MPIR_Comm * comm,
+                                                  int context_offset,
+                                                  MPIR_Request **
+                                                  request) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_cancel_send(MPIR_Request *
+                                                       sreq) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_recv_init(void *buf, int count, MPI_Datatype datatype,
+                                                     int rank, int tag, MPIR_Comm * comm,
+                                                     int context_offset,
+                                                     MPIR_Request **
+                                                     request) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_recv(void *buf, int count, MPI_Datatype datatype,
+                                                int rank, int tag, MPIR_Comm * comm,
+                                                int context_offset, MPI_Status * status,
+                                                MPIR_Request **
+                                                request) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_irecv(void *buf, int count, MPI_Datatype datatype,
+                                                 int rank, int tag, MPIR_Comm * comm,
+                                                 int context_offset,
+                                                 MPIR_Request **
+                                                 request) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_imrecv(void *buf, int count, MPI_Datatype datatype,
+                                                  MPIR_Request * message,
+                                                  MPIR_Request **
+                                                  rreqp) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_cancel_recv(MPIR_Request *
+                                                       rreq) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX void *MPIDI_NM_alloc_mem(size_t size,   /* returns void *, not int */
+                                                       MPIR_Info *
+                                                       info_ptr) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_free_mem(void *ptr) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_improbe(int source, int tag, MPIR_Comm * comm,
+                                                   int context_offset, int *flag,
+                                                   MPIR_Request ** message,
+                                                   MPI_Status *
+                                                   status) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_iprobe(int source, int tag, MPIR_Comm * comm,
+                                                  int context_offset, int *flag,
+                                                  MPI_Status *
+                                                  status) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_set_info(MPIR_Win * win,
+                                                        MPIR_Info *
+                                                        info) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_shared_query(MPIR_Win * win, int rank,
+                                                            MPI_Aint * size, int *disp_unit,
+                                                            void *baseptr)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;     /* baseptr is typed void * (as in win_allocate) */
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_put(const void *origin_addr, int origin_count,
+                                               MPI_Datatype origin_datatype, int target_rank,
+                                               MPI_Aint target_disp, int target_count,
+                                               MPI_Datatype target_datatype,
+                                               MPIR_Win * win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_start(MPIR_Group * group, int assert,
+                                                     MPIR_Win * win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_complete(MPIR_Win *
+                                                        win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_post(MPIR_Group * group, int assert,
+                                                    MPIR_Win * win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_wait(MPIR_Win * win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_test(MPIR_Win * win,
+                                                    int *flag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_lock(int lock_type, int rank, int assert,
+                                                    MPIR_Win * win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_unlock(int rank,
+                                                      MPIR_Win * win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_get_info(MPIR_Win * win,
+                                                        MPIR_Info **
+                                                        info_p_p) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_get(void *origin_addr, int origin_count,
+                                               MPI_Datatype origin_datatype, int target_rank,
+                                               MPI_Aint target_disp, int target_count,
+                                               MPI_Datatype target_datatype,
+                                               MPIR_Win * win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_free(MPIR_Win **
+                                                    win_ptr) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_fence(int assert,
+                                                     MPIR_Win * win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_create(void *base, MPI_Aint length, int disp_unit,
+                                                      MPIR_Info * info, MPIR_Comm * comm_ptr,
+                                                      MPIR_Win **
+                                                      win_ptr) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_accumulate(const void *origin_addr, int origin_count,
+                                                      MPI_Datatype origin_datatype, int target_rank,
+                                                      MPI_Aint target_disp, int target_count,
+                                                      MPI_Datatype target_datatype, MPI_Op op,
+                                                      MPIR_Win * win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_attach(MPIR_Win * win, void *base,
+                                                      MPI_Aint size) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_allocate_shared(MPI_Aint size, int disp_unit,
+                                                               MPIR_Info * info_ptr,
+                                                               MPIR_Comm * comm_ptr,
+                                                               void **base_ptr,
+                                                               MPIR_Win **
+                                                               win_ptr)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;     /* NOTE: void **base_ptr; win_allocate uses void *baseptr */
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_rput(const void *origin_addr, int origin_count,
+                                                MPI_Datatype origin_datatype, int target_rank,
+                                                MPI_Aint target_disp, int target_count,
+                                                MPI_Datatype target_datatype, MPIR_Win * win,
+                                                MPIR_Request **
+                                                request) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_flush_local(int rank,
+                                                           MPIR_Win *
+                                                           win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_detach(MPIR_Win * win,
+                                                      const void *base)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;     /* base is const here, non-const in win_attach */
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_compare_and_swap(const void *origin_addr,
+                                                            const void *compare_addr,
+                                                            void *result_addr,
+                                                            MPI_Datatype datatype, int target_rank,
+                                                            MPI_Aint target_disp,
+                                                            MPIR_Win *
+                                                            win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_raccumulate(const void *origin_addr, int origin_count,
+                                                       MPI_Datatype origin_datatype,
+                                                       int target_rank, MPI_Aint target_disp,
+                                                       int target_count,
+                                                       MPI_Datatype target_datatype, MPI_Op op,
+                                                       MPIR_Win * win,
+                                                       MPIR_Request **
+                                                       request) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_rget_accumulate(const void *origin_addr,
+                                                           int origin_count,
+                                                           MPI_Datatype origin_datatype,
+                                                           void *result_addr, int result_count,
+                                                           MPI_Datatype result_datatype,
+                                                           int target_rank, MPI_Aint target_disp,
+                                                           int target_count,
+                                                           MPI_Datatype target_datatype, MPI_Op op,
+                                                           MPIR_Win * win,
+                                                           MPIR_Request **
+                                                           request) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_fetch_and_op(const void *origin_addr, void *result_addr,
+                                                        MPI_Datatype datatype, int target_rank,
+                                                        MPI_Aint target_disp, MPI_Op op,
+                                                        MPIR_Win *
+                                                        win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_allocate(MPI_Aint size, int disp_unit,
+                                                        MPIR_Info * info, MPIR_Comm * comm,
+                                                        void *baseptr,
+                                                        MPIR_Win **
+                                                        win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_flush(int rank,
+                                                     MPIR_Win * win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_flush_local_all(MPIR_Win *
+                                                               win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_unlock_all(MPIR_Win *
+                                                          win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_create_dynamic(MPIR_Info * info, MPIR_Comm * comm,
+                                                              MPIR_Win **
+                                                              win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_rget(void *origin_addr, int origin_count,
+                                                MPI_Datatype origin_datatype, int target_rank,
+                                                MPI_Aint target_disp, int target_count,
+                                                MPI_Datatype target_datatype, MPIR_Win * win,
+                                                MPIR_Request **
+                                                request) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_sync(MPIR_Win * win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_flush_all(MPIR_Win *
+                                                         win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_get_accumulate(const void *origin_addr, int origin_count,
+                                                          MPI_Datatype origin_datatype,
+                                                          void *result_addr, int result_count,
+                                                          MPI_Datatype result_datatype,
+                                                          int target_rank, MPI_Aint target_disp,
+                                                          int target_count,
+                                                          MPI_Datatype target_datatype, MPI_Op op,
+                                                          MPIR_Win *
+                                                          win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_lock_all(int assert,
+                                                        MPIR_Win *
+                                                        win) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_rank_is_local(int target,
+                                                         MPIR_Comm *
+                                                         comm) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_barrier(MPIR_Comm * comm,
+                                                   MPIR_Errflag_t *
+                                                   errflag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_bcast(void *buffer, int count, MPI_Datatype datatype,
+                                                 int root, MPIR_Comm * comm,
+                                                 MPIR_Errflag_t *
+                                                 errflag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_allreduce(const void *sendbuf, void *recvbuf, int count,
+                                                     MPI_Datatype datatype, MPI_Op op,
+                                                     MPIR_Comm * comm,
+                                                     MPIR_Errflag_t *
+                                                     errflag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_allgather(const void *sendbuf, int sendcount,
+                                                     MPI_Datatype sendtype, void *recvbuf,
+                                                     int recvcount, MPI_Datatype recvtype,
+                                                     MPIR_Comm * comm,
+                                                     MPIR_Errflag_t *
+                                                     errflag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_allgatherv(const void *sendbuf, int sendcount,
+                                                      MPI_Datatype sendtype, void *recvbuf,
+                                                      const int *recvcounts, const int *displs,
+                                                      MPI_Datatype recvtype, MPIR_Comm * comm,
+                                                      MPIR_Errflag_t *
+                                                      errflag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_scatter(const void *sendbuf, int sendcount,
+                                                   MPI_Datatype sendtype, void *recvbuf,
+                                                   int recvcount, MPI_Datatype recvtype, int root,
+                                                   MPIR_Comm * comm,
+                                                   MPIR_Errflag_t *
+                                                   errflag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_scatterv(const void *sendbuf, const int *sendcounts,
+                                                    const int *displs, MPI_Datatype sendtype,
+                                                    void *recvbuf, int recvcount,
+                                                    MPI_Datatype recvtype, int root,
+                                                    MPIR_Comm * comm_ptr,
+                                                    MPIR_Errflag_t *
+                                                    errflag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_gather(const void *sendbuf, int sendcount,
+                                                  MPI_Datatype sendtype, void *recvbuf,
+                                                  int recvcount, MPI_Datatype recvtype, int root,
+                                                  MPIR_Comm * comm,
+                                                  MPIR_Errflag_t *
+                                                  errflag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_gatherv(const void *sendbuf, int sendcount,
+                                                   MPI_Datatype sendtype, void *recvbuf,
+                                                   const int *recvcounts, const int *displs,
+                                                   MPI_Datatype recvtype, int root,
+                                                   MPIR_Comm * comm,
+                                                   MPIR_Errflag_t *
+                                                   errflag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_alltoall(const void *sendbuf, int sendcount,
+                                                    MPI_Datatype sendtype, void *recvbuf,
+                                                    int recvcount, MPI_Datatype recvtype,
+                                                    MPIR_Comm * comm,
+                                                    MPIR_Errflag_t *
+                                                    errflag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_alltoallv(const void *sendbuf, const int *sendcounts,
+                                                     const int *sdispls, MPI_Datatype sendtype,
+                                                     void *recvbuf, const int *recvcounts,
+                                                     const int *rdispls, MPI_Datatype recvtype,
+                                                     MPIR_Comm * comm,
+                                                     MPIR_Errflag_t *
+                                                     errflag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_alltoallw(const void *sendbuf, const int *sendcounts,
+                                                     const int *sdispls,
+                                                     const MPI_Datatype sendtypes[], void *recvbuf,
+                                                     const int *recvcounts, const int *rdispls,
+                                                     const MPI_Datatype recvtypes[],
+                                                     MPIR_Comm * comm,
+                                                     MPIR_Errflag_t *
+                                                     errflag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_reduce(const void *sendbuf, void *recvbuf, int count,
+                                                  MPI_Datatype datatype, MPI_Op op, int root,
+                                                  MPIR_Comm * comm_ptr,
+                                                  MPIR_Errflag_t *
+                                                  errflag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_reduce_scatter(const void *sendbuf, void *recvbuf,
+                                                          const int *recvcounts,
+                                                          MPI_Datatype datatype, MPI_Op op,
+                                                          MPIR_Comm * comm_ptr,
+                                                          MPIR_Errflag_t *
+                                                          errflag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_reduce_scatter_block(const void *sendbuf, void *recvbuf,
+                                                                int recvcount,
+                                                                MPI_Datatype datatype, MPI_Op op,
+                                                                MPIR_Comm * comm_ptr,
+                                                                MPIR_Errflag_t *
+                                                                errflag)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_scan(const void *sendbuf, void *recvbuf, int count,
+                                                MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                                MPIR_Errflag_t *
+                                                errflag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_exscan(const void *sendbuf, void *recvbuf, int count,
+                                                  MPI_Datatype datatype, MPI_Op op,
+                                                  MPIR_Comm * comm,
+                                                  MPIR_Errflag_t *
+                                                  errflag) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_neighbor_allgather(const void *sendbuf, int sendcount,
+                                                              MPI_Datatype sendtype, void *recvbuf,
+                                                              int recvcount, MPI_Datatype recvtype,
+                                                              MPIR_Comm *
+                                                              comm) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_neighbor_allgatherv(const void *sendbuf, int sendcount,
+                                                               MPI_Datatype sendtype, void *recvbuf,
+                                                               const int *recvcounts,
+                                                               const int *displs,
+                                                               MPI_Datatype recvtype,
+                                                               MPIR_Comm *
+                                                               comm) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_neighbor_alltoallv(const void *sendbuf,
+                                                              const int *sendcounts,
+                                                              const int *sdispls,
+                                                              MPI_Datatype sendtype, void *recvbuf,
+                                                              const int *recvcounts,
+                                                              const int *rdispls,
+                                                              MPI_Datatype recvtype,
+                                                              MPIR_Comm *
+                                                              comm) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_neighbor_alltoallw(const void *sendbuf,
+                                                              const int *sendcounts,
+                                                              const MPI_Aint * sdispls,
+                                                              const MPI_Datatype * sendtypes,
+                                                              void *recvbuf, const int *recvcounts,
+                                                              const MPI_Aint * rdispls,
+                                                              const MPI_Datatype * recvtypes,
+                                                              MPIR_Comm *
+                                                              comm) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_neighbor_alltoall(const void *sendbuf, int sendcount,
+                                                             MPI_Datatype sendtype, void *recvbuf,
+                                                             int recvcount, MPI_Datatype recvtype,
+                                                             MPIR_Comm *
+                                                             comm) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ineighbor_allgather(const void *sendbuf, int sendcount,
+                                                               MPI_Datatype sendtype, void *recvbuf,
+                                                               int recvcount, MPI_Datatype recvtype,
+                                                               MPIR_Comm * comm,
+                                                               MPI_Request *
+                                                               req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ineighbor_allgatherv(const void *sendbuf, int sendcount,
+                                                                MPI_Datatype sendtype,
+                                                                void *recvbuf,
+                                                                const int *recvcounts,
+                                                                const int *displs,
+                                                                MPI_Datatype recvtype,
+                                                                MPIR_Comm * comm,
+                                                                MPI_Request *
+                                                                req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ineighbor_alltoall(const void *sendbuf, int sendcount,
+                                                              MPI_Datatype sendtype, void *recvbuf,
+                                                              int recvcount, MPI_Datatype recvtype,
+                                                              MPIR_Comm * comm,
+                                                              MPI_Request *
+                                                              req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ineighbor_alltoallv(const void *sendbuf,
+                                                               const int *sendcounts,
+                                                               const int *sdispls,
+                                                               MPI_Datatype sendtype, void *recvbuf,
+                                                               const int *recvcounts,
+                                                               const int *rdispls,
+                                                               MPI_Datatype recvtype,
+                                                               MPIR_Comm * comm,
+                                                               MPI_Request *
+                                                               req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ineighbor_alltoallw(const void *sendbuf,
+                                                               const int *sendcounts,
+                                                               const MPI_Aint * sdispls,
+                                                               const MPI_Datatype * sendtypes,
+                                                               void *recvbuf, const int *recvcounts,
+                                                               const MPI_Aint * rdispls,
+                                                               const MPI_Datatype * recvtypes,
+                                                               MPIR_Comm * comm,
+                                                               MPI_Request *
+                                                               req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ibarrier(MPIR_Comm * comm,
+                                                    MPI_Request *
+                                                    req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ibcast(void *buffer, int count, MPI_Datatype datatype,
+                                                  int root, MPIR_Comm * comm,
+                                                  MPI_Request * req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_iallgather(const void *sendbuf, int sendcount,
+                                                      MPI_Datatype sendtype, void *recvbuf,
+                                                      int recvcount, MPI_Datatype recvtype,
+                                                      MPIR_Comm * comm,
+                                                      MPI_Request *
+                                                      req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_iallgatherv(const void *sendbuf, int sendcount,
+                                                       MPI_Datatype sendtype, void *recvbuf,
+                                                       const int *recvcounts, const int *displs,
+                                                       MPI_Datatype recvtype, MPIR_Comm * comm,
+                                                       MPI_Request *
+                                                       req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_iallreduce(const void *sendbuf, void *recvbuf, int count,
+                                                      MPI_Datatype datatype, MPI_Op op,
+                                                      MPIR_Comm * comm,
+                                                      MPI_Request *
+                                                      req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ialltoall(const void *sendbuf, int sendcount,
+                                                     MPI_Datatype sendtype, void *recvbuf,
+                                                     int recvcount, MPI_Datatype recvtype,
+                                                     MPIR_Comm * comm,
+                                                     MPI_Request *
+                                                     req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ialltoallv(const void *sendbuf, const int *sendcounts,
+                                                      const int *sdispls, MPI_Datatype sendtype,
+                                                      void *recvbuf, const int *recvcounts,
+                                                      const int *rdispls, MPI_Datatype recvtype,
+                                                      MPIR_Comm * comm,
+                                                      MPI_Request *
+                                                      req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ialltoallw(const void *sendbuf, const int *sendcounts,
+                                                      const int *sdispls,
+                                                      const MPI_Datatype sendtypes[], void *recvbuf,
+                                                      const int *recvcounts, const int *rdispls,
+                                                      const MPI_Datatype recvtypes[],
+                                                      MPIR_Comm * comm,
+                                                      MPI_Request *
+                                                      req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_iexscan(const void *sendbuf, void *recvbuf, int count,
+                                                   MPI_Datatype datatype, MPI_Op op,
+                                                   MPIR_Comm * comm,
+                                                   MPI_Request * req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_igather(const void *sendbuf, int sendcount,
+                                                   MPI_Datatype sendtype, void *recvbuf,
+                                                   int recvcount, MPI_Datatype recvtype, int root,
+                                                   MPIR_Comm * comm,
+                                                   MPI_Request * req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_igatherv(const void *sendbuf, int sendcount,
+                                                    MPI_Datatype sendtype, void *recvbuf,
+                                                    const int *recvcounts, const int *displs,
+                                                    MPI_Datatype recvtype, int root,
+                                                    MPIR_Comm * comm,
+                                                    MPI_Request *
+                                                    req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ireduce_scatter_block(const void *sendbuf, void *recvbuf,
+                                                                 int recvcount,
+                                                                 MPI_Datatype datatype, MPI_Op op,
+                                                                 MPIR_Comm * comm,
+                                                                 MPI_Request *
+                                                                 req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ireduce_scatter(const void *sendbuf, void *recvbuf,
+                                                           const int *recvcounts,
+                                                           MPI_Datatype datatype, MPI_Op op,
+                                                           MPIR_Comm * comm,
+                                                           MPI_Request *
+                                                           req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ireduce(const void *sendbuf, void *recvbuf, int count,
+                                                   MPI_Datatype datatype, MPI_Op op, int root,
+                                                   MPIR_Comm * comm_ptr,
+                                                   MPI_Request * req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_iscan(const void *sendbuf, void *recvbuf, int count,
+                                                 MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                                 MPI_Request * req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_iscatter(const void *sendbuf, int sendcount,
+                                                    MPI_Datatype sendtype, void *recvbuf,
+                                                    int recvcount, MPI_Datatype recvtype, int root,
+                                                    MPIR_Comm * comm,
+                                                    MPI_Request *
+                                                    req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_iscatterv(const void *sendbuf, const int *sendcounts,
+                                                     const int *displs, MPI_Datatype sendtype,
+                                                     void *recvbuf, int recvcount,
+                                                     MPI_Datatype recvtype, int root,
+                                                     MPIR_Comm * comm_ptr,
+                                                     MPI_Request *
+                                                     req) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX void MPIDI_NM_datatype_commit(MPIR_Datatype *
+                                                            datatype_p)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX void MPIDI_NM_datatype_dup(MPIR_Datatype * old_datatype_p,
+                                                         MPIR_Datatype *
+                                                         new_datatype_p)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX void MPIDI_NM_datatype_destroy(MPIR_Datatype *
+                                                             datatype_p)
+    MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX void MPIDI_NM_op_commit(MPIR_Op * op_p) MPIDI_NM_STATIC_INLINE_SUFFIX;
+MPIDI_NM_STATIC_INLINE_PREFIX void MPIDI_NM_op_destroy(MPIR_Op *
+                                                       op_p) MPIDI_NM_STATIC_INLINE_SUFFIX;
+
+#endif
diff --git a/src/mpid/ch4/netmod/include/netmod_impl.h b/src/mpid/ch4/netmod/include/netmod_impl.h
new file mode 100644
index 0000000..96cb922
--- /dev/null
+++ b/src/mpid/ch4/netmod/include/netmod_impl.h
@@ -0,0 +1,1083 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+/* ch4 netmod functions */
+#ifndef NETMOD_IMPL_PROTOTYPES_H_INCLUDED
+#define NETMOD_IMPL_PROTOTYPES_H_INCLUDED
+
+
+#ifndef NETMOD_DIRECT
+#ifndef NETMOD_DISABLE_INLINES
+
+#ifndef MPIDI_NM_STATIC_INLINE_PREFIX   /* an earlier definition, if any, takes precedence */
+#define MPIDI_NM_STATIC_INLINE_PREFIX __attribute__((always_inline)) static inline
+#endif  /* MPIDI_NM_STATIC_INLINE_PREFIX */
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_init(int rank, int size, int appnum, int *tag_ub,
+                                                MPIR_Comm * comm_world, MPIR_Comm * comm_self,
+                                                int spawned, int num_contexts,
+                                                void **netmod_contexts)
+{   /* Dispatches to MPIDI_NM_func->init; result returned unchanged. */
+    return MPIDI_NM_func->init(rank, size, appnum, tag_ub, comm_world, comm_self, spawned,
+                               num_contexts, netmod_contexts);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_finalize(void)
+{   /* Dispatches to MPIDI_NM_func->finalize; result returned unchanged. */
+    return MPIDI_NM_func->finalize();
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_progress(void *netmod_context, int blocking)
+{   /* Dispatches to MPIDI_NM_func->progress; result returned unchanged. */
+    return MPIDI_NM_func->progress(netmod_context, blocking);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_reg_hdr_handler(int handler_id,
+                                                           MPIDI_NM_am_origin_handler_fn
+                                                           origin_handler_fn,
+                                                           MPIDI_NM_am_target_handler_fn
+                                                           target_handler_fn)
+{   /* Dispatches to MPIDI_NM_func->reg_hdr_handler; result returned unchanged. */
+    return MPIDI_NM_func->reg_hdr_handler(handler_id, origin_handler_fn, target_handler_fn);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_comm_connect(const char *port_name, MPIR_Info * info,
+                                                        int root, MPIR_Comm * comm,
+                                                        MPIR_Comm ** newcomm_ptr)
+{   /* Dispatches to MPIDI_NM_func->comm_connect; result returned unchanged. */
+    return MPIDI_NM_func->comm_connect(port_name, info, root, comm, newcomm_ptr);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_comm_disconnect(MPIR_Comm * comm_ptr)
+{   /* Dispatches to MPIDI_NM_func->comm_disconnect; result returned unchanged. */
+    return MPIDI_NM_func->comm_disconnect(comm_ptr);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_open_port(MPIR_Info * info_ptr, char *port_name)
+{   /* Dispatches to MPIDI_NM_func->open_port; result returned unchanged. */
+    return MPIDI_NM_func->open_port(info_ptr, port_name);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_close_port(const char *port_name)
+{   /* Dispatches to MPIDI_NM_func->close_port; result returned unchanged. */
+    return MPIDI_NM_func->close_port(port_name);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_comm_accept(const char *port_name, MPIR_Info * info,
+                                                       int root, MPIR_Comm * comm,
+                                                       MPIR_Comm ** newcomm_ptr)
+{   /* Dispatches to MPIDI_NM_func->comm_accept; result returned unchanged. */
+    return MPIDI_NM_func->comm_accept(port_name, info, root, comm, newcomm_ptr);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send_am_hdr(int rank, MPIR_Comm * comm, int handler_id,
+                                                       const void *am_hdr, size_t am_hdr_sz,
+                                                       MPIR_Request * sreq, void *netmod_context)
+{   /* Dispatches to MPIDI_NM_func->send_am_hdr; result returned unchanged. */
+    return MPIDI_NM_func->send_am_hdr(rank, comm, handler_id, am_hdr, am_hdr_sz, sreq,
+                                      netmod_context);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_inject_am_hdr(int rank, MPIR_Comm * comm, int handler_id,
+                                                         const void *am_hdr, size_t am_hdr_sz,
+                                                         void *netmod_context)
+{   /* Dispatches to MPIDI_NM_func->inject_am_hdr; result returned unchanged. */
+    return MPIDI_NM_func->inject_am_hdr(rank, comm, handler_id, am_hdr, am_hdr_sz, netmod_context);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send_am(int rank, MPIR_Comm * comm, int handler_id,
+                                                   const void *am_hdr, size_t am_hdr_sz,
+                                                   const void *data, MPI_Count count,
+                                                   MPI_Datatype datatype, MPIR_Request * sreq,
+                                                   void *netmod_context)
+{   /* Dispatches to MPIDI_NM_func->send_am; result returned unchanged. */
+    return MPIDI_NM_func->send_am(rank, comm, handler_id, am_hdr, am_hdr_sz, data, count, datatype,
+                                  sreq, netmod_context);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send_amv(int rank, MPIR_Comm * comm, int handler_id,
+                                                    struct iovec *am_hdrs, size_t iov_len,
+                                                    const void *data, MPI_Count count,
+                                                    MPI_Datatype datatype, MPIR_Request * sreq,
+                                                    void *netmod_context)
+{   /* Dispatches to MPIDI_NM_func->send_amv; result returned unchanged. */
+    return MPIDI_NM_func->send_amv(rank, comm, handler_id, am_hdrs, iov_len, data, count, datatype,
+                                   sreq, netmod_context);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send_amv_hdr(int rank, MPIR_Comm * comm, int handler_id,
+                                                        struct iovec *am_hdrs, size_t iov_len,
+                                                        MPIR_Request * sreq, void *netmod_context)
+{   /* Dispatches to MPIDI_NM_func->send_amv_hdr; result returned unchanged. */
+    return MPIDI_NM_func->send_amv_hdr(rank, comm, handler_id, am_hdrs, iov_len, sreq,
+                                       netmod_context);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send_am_hdr_reply(MPIR_Context_id_t context_id,
+                                                             int src_rank, int handler_id,
+                                                             const void *am_hdr, size_t am_hdr_sz,
+                                                             MPIR_Request * sreq)
+{   /* Dispatches to MPIDI_NM_func->send_am_hdr_reply; result returned unchanged. */
+    return MPIDI_NM_func->send_am_hdr_reply(context_id, src_rank, handler_id, am_hdr, am_hdr_sz,
+                                            sreq);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_inject_am_hdr_reply(MPIR_Context_id_t context_id,
+                                                               int src_rank, int handler_id,
+                                                               const void *am_hdr, size_t am_hdr_sz)
+{   /* Dispatches to MPIDI_NM_func->inject_am_hdr_reply; result returned unchanged. */
+    return MPIDI_NM_func->inject_am_hdr_reply(context_id, src_rank, handler_id, am_hdr, am_hdr_sz);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send_am_reply(MPIR_Context_id_t context_id, int src_rank,
+                                                         int handler_id, const void *am_hdr,
+                                                         size_t am_hdr_sz, const void *data,
+                                                         MPI_Count count, MPI_Datatype datatype,
+                                                         MPIR_Request * sreq)
+{   /* Dispatches to MPIDI_NM_func->send_am_reply; result returned unchanged. */
+    return MPIDI_NM_func->send_am_reply(context_id, src_rank, handler_id, am_hdr, am_hdr_sz, data,
+                                        count, datatype, sreq);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send_amv_reply(MPIR_Context_id_t context_id,
+                                                          int src_rank, int handler_id,
+                                                          struct iovec *am_hdr, size_t iov_len,
+                                                          const void *data, MPI_Count count,
+                                                          MPI_Datatype datatype,
+                                                          MPIR_Request * sreq)
+{   /* Dispatches to MPIDI_NM_func->send_amv_reply; result returned unchanged. */
+    return MPIDI_NM_func->send_amv_reply(context_id, src_rank, handler_id, am_hdr, iov_len, data,
+                                         count, datatype, sreq);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX size_t MPIDI_NM_am_hdr_max_sz(void)
+{   /* Dispatches to MPIDI_NM_func->am_hdr_max_sz; result returned unchanged. */
+    return MPIDI_NM_func->am_hdr_max_sz();
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX size_t MPIDI_NM_am_inject_max_sz(void)
+{   /* Dispatches to MPIDI_NM_func->am_inject_max_sz; result returned unchanged. */
+    return MPIDI_NM_func->am_inject_max_sz();
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_am_recv(MPIR_Request * req)
+{   /* Calls the am_recv entry of MPIDI_NM_func and hands back its return code. */
+    return (*MPIDI_NM_func->am_recv) (req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_comm_get_lpid(MPIR_Comm * comm_ptr, int idx,
+                                                         int *lpid_ptr, MPL_bool is_remote)
+{   /* Dispatches to MPIDI_NM_func->comm_get_lpid; result returned unchanged. */
+    return MPIDI_NM_func->comm_get_lpid(comm_ptr, idx, lpid_ptr, is_remote);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_gpid_get(MPIR_Comm * comm_ptr, int rank,
+                                                    MPIR_Gpid * gpid)
+{   /* Dispatches to MPIDI_NM_func->gpid_get; result returned unchanged. */
+    return MPIDI_NM_func->gpid_get(comm_ptr, rank, gpid);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_get_node_id(MPIR_Comm * comm, int rank,
+                                                       MPID_Node_id_t * id_p)
+{   /* Dispatches to MPIDI_NM_func->get_node_id; result returned unchanged. */
+    return MPIDI_NM_func->get_node_id(comm, rank, id_p);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_get_max_node_id(MPIR_Comm * comm,
+                                                           MPID_Node_id_t * max_id_p)
+{   /* Dispatches to MPIDI_NM_func->get_max_node_id; result returned unchanged. */
+    return MPIDI_NM_func->get_max_node_id(comm, max_id_p);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_getallincomm(MPIR_Comm * comm_ptr, int local_size,
+                                                        MPIR_Gpid local_gpid[], int *singleAVT)
+{   /* Dispatches to MPIDI_NM_func->getallincomm; result returned unchanged. */
+    return MPIDI_NM_func->getallincomm(comm_ptr, local_size, local_gpid, singleAVT);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_gpid_tolpidarray(int size, MPIR_Gpid gpid[], int lpid[])
+{   /* Dispatches to MPIDI_NM_func->gpid_tolpidarray; result returned unchanged. */
+    return MPIDI_NM_func->gpid_tolpidarray(size, gpid, lpid);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr,
+                                                                       int size, const int lpids[])
+{   /* Dispatches to MPIDI_NM_func->create_intercomm_from_lpids; result returned unchanged. */
+    return MPIDI_NM_func->create_intercomm_from_lpids(newcomm_ptr, size, lpids);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_comm_create(MPIR_Comm * comm)
+{   /* Dispatches to MPIDI_NM_func->comm_create; result returned unchanged. */
+    return MPIDI_NM_func->comm_create(comm);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_comm_destroy(MPIR_Comm * comm)
+{   /* Dispatches to MPIDI_NM_func->comm_destroy; result returned unchanged. */
+    return MPIDI_NM_func->comm_destroy(comm);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX void MPIDI_NM_am_request_init(MPIR_Request * req)
+{   /* Dispatches to MPIDI_NM_func->am_request_init; void, so no `return <expr>` (ISO C). */
+    MPIDI_NM_func->am_request_init(req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX void MPIDI_NM_am_request_finalize(MPIR_Request * req)
+{   /* Dispatches to MPIDI_NM_func->am_request_finalize; void, so no `return <expr>` (ISO C). */
+    MPIDI_NM_func->am_request_finalize(req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send(const void *buf, int count, MPI_Datatype datatype,
+                                                int rank, int tag, MPIR_Comm * comm,
+                                                int context_offset, MPIR_Request ** request)
+{   /* Dispatches to MPIDI_NM_native_func->send; result returned unchanged. */
+    return MPIDI_NM_native_func->send(buf, count, datatype, rank, tag, comm, context_offset,
+                                      request);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ssend(const void *buf, int count, MPI_Datatype datatype,
+                                                 int rank, int tag, MPIR_Comm * comm,
+                                                 int context_offset, MPIR_Request ** request)
+{   /* Dispatches to MPIDI_NM_native_func->ssend; result returned unchanged. */
+    return MPIDI_NM_native_func->ssend(buf, count, datatype, rank, tag, comm, context_offset,
+                                       request);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_startall(int count, MPIR_Request * requests[])
+{   /* Dispatches to MPIDI_NM_native_func->startall; result returned unchanged. */
+    return MPIDI_NM_native_func->startall(count, requests);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_send_init(const void *buf, int count,
+                                                     MPI_Datatype datatype, int rank, int tag,
+                                                     MPIR_Comm * comm, int context_offset,
+                                                     MPIR_Request ** request)
+{   /* Dispatches to MPIDI_NM_native_func->send_init; result returned unchanged. */
+    return MPIDI_NM_native_func->send_init(buf, count, datatype, rank, tag, comm, context_offset,
+                                           request);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ssend_init(const void *buf, int count,
+                                                      MPI_Datatype datatype, int rank, int tag,
+                                                      MPIR_Comm * comm, int context_offset,
+                                                      MPIR_Request ** request)
+{   /* Dispatches to MPIDI_NM_native_func->ssend_init; result returned unchanged. */
+    return MPIDI_NM_native_func->ssend_init(buf, count, datatype, rank, tag, comm, context_offset,
+                                            request);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_rsend_init(const void *buf, int count,
+                                                      MPI_Datatype datatype, int rank, int tag,
+                                                      MPIR_Comm * comm, int context_offset,
+                                                      MPIR_Request ** request)
+{   /* Dispatches to MPIDI_NM_native_func->rsend_init; result returned unchanged. */
+    return MPIDI_NM_native_func->rsend_init(buf, count, datatype, rank, tag, comm, context_offset,
+                                            request);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_bsend_init(const void *buf, int count,
+                                                      MPI_Datatype datatype, int rank, int tag,
+                                                      MPIR_Comm * comm, int context_offset,
+                                                      MPIR_Request ** request)
+{   /* Dispatches to MPIDI_NM_native_func->bsend_init; result returned unchanged. */
+    return MPIDI_NM_native_func->bsend_init(buf, count, datatype, rank, tag, comm, context_offset,
+                                            request);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_isend(const void *buf, int count, MPI_Datatype datatype,
+                                                 int rank, int tag, MPIR_Comm * comm,
+                                                 int context_offset, MPIR_Request ** request)
+{   /* Dispatches to MPIDI_NM_native_func->isend; result returned unchanged. */
+    return MPIDI_NM_native_func->isend(buf, count, datatype, rank, tag, comm, context_offset,
+                                       request);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_issend(const void *buf, int count, MPI_Datatype datatype,
+                                                  int rank, int tag, MPIR_Comm * comm,
+                                                  int context_offset, MPIR_Request ** request)
+{   /* Dispatches to MPIDI_NM_native_func->issend; result returned unchanged. */
+    return MPIDI_NM_native_func->issend(buf, count, datatype, rank, tag, comm, context_offset,
+                                        request);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_cancel_send(MPIR_Request * sreq)
+{   /* Dispatches to MPIDI_NM_native_func->cancel_send; result returned unchanged. */
+    return MPIDI_NM_native_func->cancel_send(sreq);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_recv_init(void *buf, int count, MPI_Datatype datatype,
+                                                     int rank, int tag, MPIR_Comm * comm,
+                                                     int context_offset, MPIR_Request ** request)
+{   /* Dispatches to MPIDI_NM_native_func->recv_init; result returned unchanged. */
+    return MPIDI_NM_native_func->recv_init(buf, count, datatype, rank, tag, comm, context_offset,
+                                           request);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_recv(void *buf, int count, MPI_Datatype datatype,
+                                                int rank, int tag, MPIR_Comm * comm,
+                                                int context_offset, MPI_Status * status,
+                                                MPIR_Request ** request)
+{   /* Dispatches to MPIDI_NM_native_func->recv; result returned unchanged. */
+    return MPIDI_NM_native_func->recv(buf, count, datatype, rank, tag, comm, context_offset, status,
+                                      request);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_irecv(void *buf, int count, MPI_Datatype datatype,
+                                                 int rank, int tag, MPIR_Comm * comm,
+                                                 int context_offset, MPIR_Request ** request)
+{   /* Dispatches to MPIDI_NM_native_func->irecv; result returned unchanged. */
+    return MPIDI_NM_native_func->irecv(buf, count, datatype, rank, tag, comm, context_offset,
+                                       request);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_imrecv(void *buf, int count, MPI_Datatype datatype,
+                                                  MPIR_Request * message, MPIR_Request ** rreqp)
+{   /* Dispatches to MPIDI_NM_native_func->imrecv; result returned unchanged. */
+    return MPIDI_NM_native_func->imrecv(buf, count, datatype, message, rreqp);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_cancel_recv(MPIR_Request * rreq)
+{   /* Dispatches to MPIDI_NM_native_func->cancel_recv; result returned unchanged. */
+    return MPIDI_NM_native_func->cancel_recv(rreq);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX void *MPIDI_NM_alloc_mem(size_t size, MPIR_Info * info_ptr)
+{   /* Dispatches to MPIDI_NM_native_func->alloc_mem; pointer returned unchanged. */
+    return MPIDI_NM_native_func->alloc_mem(size, info_ptr);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_free_mem(void *ptr)
+{   /* Dispatches to MPIDI_NM_native_func->free_mem; result returned unchanged. */
+    return MPIDI_NM_native_func->free_mem(ptr);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_improbe(int source, int tag, MPIR_Comm * comm,
+                                                   int context_offset, int *flag,
+                                                   MPIR_Request ** message, MPI_Status * status)
+{   /* Dispatches to MPIDI_NM_native_func->improbe; result returned unchanged. */
+    return MPIDI_NM_native_func->improbe(source, tag, comm, context_offset, flag, message, status);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_iprobe(int source, int tag, MPIR_Comm * comm,
+                                                  int context_offset, int *flag,
+                                                  MPI_Status * status)
+{   /* Dispatches to MPIDI_NM_native_func->iprobe; result returned unchanged. */
+    return MPIDI_NM_native_func->iprobe(source, tag, comm, context_offset, flag, status);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_set_info(MPIR_Win * win, MPIR_Info * info)
+{   /* Dispatches to MPIDI_NM_native_func->win_set_info; result returned unchanged. */
+    return MPIDI_NM_native_func->win_set_info(win, info);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_shared_query(MPIR_Win * win, int rank,
+                                                            MPI_Aint * size, int *disp_unit,
+                                                            void *baseptr)
+{   /* Dispatches to MPIDI_NM_native_func->win_shared_query; result returned unchanged. */
+    return MPIDI_NM_native_func->win_shared_query(win, rank, size, disp_unit, baseptr);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_put(const void *origin_addr, int origin_count,
+                                               MPI_Datatype origin_datatype, int target_rank,
+                                               MPI_Aint target_disp, int target_count,
+                                               MPI_Datatype target_datatype, MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->put; result returned unchanged. */
+    return MPIDI_NM_native_func->put(origin_addr, origin_count, origin_datatype, target_rank,
+                                     target_disp, target_count, target_datatype, win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_start(MPIR_Group * group, int assert, MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->win_start; result returned unchanged. */
+    return MPIDI_NM_native_func->win_start(group, assert, win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_complete(MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->win_complete; result returned unchanged. */
+    return MPIDI_NM_native_func->win_complete(win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_post(MPIR_Group * group, int assert, MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->win_post; result returned unchanged. */
+    return MPIDI_NM_native_func->win_post(group, assert, win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_wait(MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->win_wait; result returned unchanged. */
+    return MPIDI_NM_native_func->win_wait(win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_test(MPIR_Win * win, int *flag)
+{   /* Dispatches to MPIDI_NM_native_func->win_test; result returned unchanged. */
+    return MPIDI_NM_native_func->win_test(win, flag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_lock(int lock_type, int rank, int assert,
+                                                    MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->win_lock; result returned unchanged. */
+    return MPIDI_NM_native_func->win_lock(lock_type, rank, assert, win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_unlock(int rank, MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->win_unlock; result returned unchanged. */
+    return MPIDI_NM_native_func->win_unlock(rank, win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_get_info(MPIR_Win * win, MPIR_Info ** info_p_p)
+{   /* Dispatches to MPIDI_NM_native_func->win_get_info; result returned unchanged. */
+    return MPIDI_NM_native_func->win_get_info(win, info_p_p);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_get(void *origin_addr, int origin_count,
+                                               MPI_Datatype origin_datatype, int target_rank,
+                                               MPI_Aint target_disp, int target_count,
+                                               MPI_Datatype target_datatype, MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->get; result returned unchanged. */
+    return MPIDI_NM_native_func->get(origin_addr, origin_count, origin_datatype, target_rank,
+                                     target_disp, target_count, target_datatype, win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_free(MPIR_Win ** win_ptr)
+{   /* Dispatches to MPIDI_NM_native_func->win_free; result returned unchanged. */
+    return MPIDI_NM_native_func->win_free(win_ptr);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_fence(int assert, MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->win_fence; result returned unchanged. */
+    return MPIDI_NM_native_func->win_fence(assert, win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_create(void *base, MPI_Aint length, int disp_unit,
+                                                      MPIR_Info * info, MPIR_Comm * comm_ptr,
+                                                      MPIR_Win ** win_ptr)
+{   /* Dispatches to MPIDI_NM_native_func->win_create; result returned unchanged. */
+    return MPIDI_NM_native_func->win_create(base, length, disp_unit, info, comm_ptr, win_ptr);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_accumulate(const void *origin_addr, int origin_count,
+                                                      MPI_Datatype origin_datatype, int target_rank,
+                                                      MPI_Aint target_disp, int target_count,
+                                                      MPI_Datatype target_datatype, MPI_Op op,
+                                                      MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->accumulate; result returned unchanged. */
+    return MPIDI_NM_native_func->accumulate(origin_addr, origin_count, origin_datatype, target_rank,
+                                            target_disp, target_count, target_datatype, op, win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_attach(MPIR_Win * win, void *base, MPI_Aint size)
+{   /* Dispatches to MPIDI_NM_native_func->win_attach; result returned unchanged. */
+    return MPIDI_NM_native_func->win_attach(win, base, size);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_allocate_shared(MPI_Aint size, int disp_unit,
+                                                               MPIR_Info * info_ptr,
+                                                               MPIR_Comm * comm_ptr,
+                                                               void **base_ptr, MPIR_Win ** win_ptr)
+{   /* Dispatches to MPIDI_NM_native_func->win_allocate_shared; result returned unchanged. */
+    return MPIDI_NM_native_func->win_allocate_shared(size, disp_unit, info_ptr, comm_ptr, base_ptr,
+                                                     win_ptr);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_rput(const void *origin_addr, int origin_count,
+                                                MPI_Datatype origin_datatype, int target_rank,
+                                                MPI_Aint target_disp, int target_count,
+                                                MPI_Datatype target_datatype, MPIR_Win * win,
+                                                MPIR_Request ** request)
+{   /* Dispatches to MPIDI_NM_native_func->rput; result returned unchanged. */
+    return MPIDI_NM_native_func->rput(origin_addr, origin_count, origin_datatype, target_rank,
+                                      target_disp, target_count, target_datatype, win, request);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_flush_local(int rank, MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->win_flush_local; result returned unchanged. */
+    return MPIDI_NM_native_func->win_flush_local(rank, win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_detach(MPIR_Win * win, const void *base)
+{   /* Dispatches to MPIDI_NM_native_func->win_detach; result returned unchanged. */
+    return MPIDI_NM_native_func->win_detach(win, base);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_compare_and_swap(const void *origin_addr,
+                                                            const void *compare_addr,
+                                                            void *result_addr,
+                                                            MPI_Datatype datatype, int target_rank,
+                                                            MPI_Aint target_disp, MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->compare_and_swap; result returned unchanged. */
+    return MPIDI_NM_native_func->compare_and_swap(origin_addr, compare_addr, result_addr, datatype,
+                                                  target_rank, target_disp, win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_raccumulate(const void *origin_addr, int origin_count,
+                                                       MPI_Datatype origin_datatype,
+                                                       int target_rank, MPI_Aint target_disp,
+                                                       int target_count,
+                                                       MPI_Datatype target_datatype, MPI_Op op,
+                                                       MPIR_Win * win, MPIR_Request ** request)
+{   /* Dispatches to MPIDI_NM_native_func->raccumulate; result returned unchanged. */
+    return MPIDI_NM_native_func->raccumulate(origin_addr, origin_count, origin_datatype,
+                                             target_rank, target_disp, target_count,
+                                             target_datatype, op, win, request);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_rget_accumulate(const void *origin_addr,
+                                                           int origin_count,
+                                                           MPI_Datatype origin_datatype,
+                                                           void *result_addr, int result_count,
+                                                           MPI_Datatype result_datatype,
+                                                           int target_rank, MPI_Aint target_disp,
+                                                           int target_count,
+                                                           MPI_Datatype target_datatype, MPI_Op op,
+                                                           MPIR_Win * win, MPIR_Request ** request)
+{   /* Dispatches to MPIDI_NM_native_func->rget_accumulate; result returned unchanged. */
+    return MPIDI_NM_native_func->rget_accumulate(origin_addr, origin_count, origin_datatype,
+                                                 result_addr, result_count, result_datatype,
+                                                 target_rank, target_disp, target_count,
+                                                 target_datatype, op, win, request);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_fetch_and_op(const void *origin_addr, void *result_addr,
+                                                        MPI_Datatype datatype, int target_rank,
+                                                        MPI_Aint target_disp, MPI_Op op,
+                                                        MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->fetch_and_op; result returned unchanged. */
+    return MPIDI_NM_native_func->fetch_and_op(origin_addr, result_addr, datatype, target_rank,
+                                              target_disp, op, win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_allocate(MPI_Aint size, int disp_unit,
+                                                        MPIR_Info * info, MPIR_Comm * comm,
+                                                        void *baseptr, MPIR_Win ** win)
+{   /* Dispatches to MPIDI_NM_native_func->win_allocate; result returned unchanged. */
+    return MPIDI_NM_native_func->win_allocate(size, disp_unit, info, comm, baseptr, win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_flush(int rank, MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->win_flush; result returned unchanged. */
+    return MPIDI_NM_native_func->win_flush(rank, win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_flush_local_all(MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->win_flush_local_all; result returned unchanged. */
+    return MPIDI_NM_native_func->win_flush_local_all(win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_unlock_all(MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->win_unlock_all; result returned unchanged. */
+    return MPIDI_NM_native_func->win_unlock_all(win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_create_dynamic(MPIR_Info * info, MPIR_Comm * comm,
+                                                              MPIR_Win ** win)
+{   /* Dispatches to MPIDI_NM_native_func->win_create_dynamic; result returned unchanged. */
+    return MPIDI_NM_native_func->win_create_dynamic(info, comm, win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_rget(void *origin_addr, int origin_count,
+                                                MPI_Datatype origin_datatype, int target_rank,
+                                                MPI_Aint target_disp, int target_count,
+                                                MPI_Datatype target_datatype, MPIR_Win * win,
+                                                MPIR_Request ** request)
+{   /* Dispatches to MPIDI_NM_native_func->rget; result returned unchanged. */
+    return MPIDI_NM_native_func->rget(origin_addr, origin_count, origin_datatype, target_rank,
+                                      target_disp, target_count, target_datatype, win, request);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_sync(MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->win_sync; result returned unchanged. */
+    return MPIDI_NM_native_func->win_sync(win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_flush_all(MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->win_flush_all; result returned unchanged. */
+    return MPIDI_NM_native_func->win_flush_all(win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_get_accumulate(const void *origin_addr, int origin_count,
+                                                          MPI_Datatype origin_datatype,
+                                                          void *result_addr, int result_count,
+                                                          MPI_Datatype result_datatype,
+                                                          int target_rank, MPI_Aint target_disp,
+                                                          int target_count,
+                                                          MPI_Datatype target_datatype, MPI_Op op,
+                                                          MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->get_accumulate; result returned unchanged. */
+    return MPIDI_NM_native_func->get_accumulate(origin_addr, origin_count, origin_datatype,
+                                                result_addr, result_count, result_datatype,
+                                                target_rank, target_disp, target_count,
+                                                target_datatype, op, win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_win_lock_all(int assert, MPIR_Win * win)
+{   /* Dispatches to MPIDI_NM_native_func->win_lock_all; result returned unchanged. */
+    return MPIDI_NM_native_func->win_lock_all(assert, win);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_rank_is_local(int target, MPIR_Comm * comm)
+{   /* Dispatches to MPIDI_NM_native_func->rank_is_local; result returned unchanged. */
+    return MPIDI_NM_native_func->rank_is_local(target, comm);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_barrier(MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* Dispatches to MPIDI_NM_native_func->barrier; result returned unchanged. */
+    return MPIDI_NM_native_func->barrier(comm, errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_bcast(void *buffer, int count, MPI_Datatype datatype,
+                                                 int root, MPIR_Comm * comm,
+                                                 MPIR_Errflag_t * errflag)
+{   /* Dispatches to MPIDI_NM_native_func->bcast; result returned unchanged. */
+    return MPIDI_NM_native_func->bcast(buffer, count, datatype, root, comm, errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_allreduce(const void *sendbuf, void *recvbuf, int count,
+                                                     MPI_Datatype datatype, MPI_Op op,
+                                                     MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* Dispatches to MPIDI_NM_native_func->allreduce; result returned unchanged. */
+    return MPIDI_NM_native_func->allreduce(sendbuf, recvbuf, count, datatype, op, comm, errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_allgather(const void *sendbuf, int sendcount,
+                                                     MPI_Datatype sendtype, void *recvbuf,
+                                                     int recvcount, MPI_Datatype recvtype,
+                                                     MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* Dispatches to MPIDI_NM_native_func->allgather; result returned unchanged. */
+    return MPIDI_NM_native_func->allgather(sendbuf, sendcount, sendtype, recvbuf, recvcount,
+                                           recvtype, comm, errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_allgatherv(const void *sendbuf, int sendcount,
+                                                      MPI_Datatype sendtype, void *recvbuf,
+                                                      const int *recvcounts, const int *displs,
+                                                      MPI_Datatype recvtype, MPIR_Comm * comm,
+                                                      MPIR_Errflag_t * errflag)
+{   /* Dispatches to MPIDI_NM_native_func->allgatherv; result returned unchanged. */
+    return MPIDI_NM_native_func->allgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts,
+                                            displs, recvtype, comm, errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_scatter(const void *sendbuf, int sendcount,
+                                                   MPI_Datatype sendtype, void *recvbuf,
+                                                   int recvcount, MPI_Datatype recvtype, int root,
+                                                   MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* thin forwarder to MPIDI_NM_native_func->scatter; its result is returned as is */
+    return MPIDI_NM_native_func->scatter(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
+                                         root, comm, errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_scatterv(const void *sendbuf, const int *sendcounts,
+                                                    const int *displs, MPI_Datatype sendtype,
+                                                    void *recvbuf, int recvcount,
+                                                    MPI_Datatype recvtype, int root,
+                                                    MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{   /* thin forwarder to MPIDI_NM_native_func->scatterv; its result is returned as is */
+    return MPIDI_NM_native_func->scatterv(sendbuf, sendcounts, displs, sendtype, recvbuf, recvcount,
+                                          recvtype, root, comm_ptr, errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_gather(const void *sendbuf, int sendcount,
+                                                  MPI_Datatype sendtype, void *recvbuf,
+                                                  int recvcount, MPI_Datatype recvtype, int root,
+                                                  MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* thin forwarder to MPIDI_NM_native_func->gather; its result is returned as is */
+    return MPIDI_NM_native_func->gather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
+                                        root, comm, errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_gatherv(const void *sendbuf, int sendcount,
+                                                   MPI_Datatype sendtype, void *recvbuf,
+                                                   const int *recvcounts, const int *displs,
+                                                   MPI_Datatype recvtype, int root,
+                                                   MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* thin forwarder to MPIDI_NM_native_func->gatherv; its result is returned as is */
+    return MPIDI_NM_native_func->gatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs,
+                                         recvtype, root, comm, errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_alltoall(const void *sendbuf, int sendcount,
+                                                    MPI_Datatype sendtype, void *recvbuf,
+                                                    int recvcount, MPI_Datatype recvtype,
+                                                    MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* thin forwarder to MPIDI_NM_native_func->alltoall; its result is returned as is */
+    return MPIDI_NM_native_func->alltoall(sendbuf, sendcount, sendtype, recvbuf, recvcount,
+                                          recvtype, comm, errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_alltoallv(const void *sendbuf, const int *sendcounts,
+                                                     const int *sdispls, MPI_Datatype sendtype,
+                                                     void *recvbuf, const int *recvcounts,
+                                                     const int *rdispls, MPI_Datatype recvtype,
+                                                     MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* thin forwarder to MPIDI_NM_native_func->alltoallv; its result is returned as is */
+    return MPIDI_NM_native_func->alltoallv(sendbuf, sendcounts, sdispls, sendtype, recvbuf,
+                                           recvcounts, rdispls, recvtype, comm, errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_alltoallw(const void *sendbuf, const int *sendcounts,
+                                                     const int *sdispls,
+                                                     const MPI_Datatype sendtypes[], void *recvbuf,
+                                                     const int *recvcounts, const int *rdispls,
+                                                     const MPI_Datatype recvtypes[],
+                                                     MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* thin forwarder to MPIDI_NM_native_func->alltoallw; its result is returned as is */
+    return MPIDI_NM_native_func->alltoallw(sendbuf, sendcounts, sdispls, sendtypes, recvbuf,
+                                           recvcounts, rdispls, recvtypes, comm, errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_reduce(const void *sendbuf, void *recvbuf, int count,
+                                                  MPI_Datatype datatype, MPI_Op op, int root,
+                                                  MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{   /* thin forwarder to MPIDI_NM_native_func->reduce; its result is returned as is */
+    return MPIDI_NM_native_func->reduce(sendbuf, recvbuf, count, datatype, op, root, comm_ptr,
+                                        errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_reduce_scatter(const void *sendbuf, void *recvbuf,
+                                                          const int *recvcounts,
+                                                          MPI_Datatype datatype, MPI_Op op,
+                                                          MPIR_Comm * comm_ptr,
+                                                          MPIR_Errflag_t * errflag)
+{   /* thin forwarder to MPIDI_NM_native_func->reduce_scatter; its result is returned as is */
+    return MPIDI_NM_native_func->reduce_scatter(sendbuf, recvbuf, recvcounts, datatype, op,
+                                                comm_ptr, errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_reduce_scatter_block(const void *sendbuf, void *recvbuf,
+                                                                int recvcount,
+                                                                MPI_Datatype datatype, MPI_Op op,
+                                                                MPIR_Comm * comm_ptr,
+                                                                MPIR_Errflag_t * errflag)
+{   /* thin forwarder to MPIDI_NM_native_func->reduce_scatter_block; result returned as is */
+    return MPIDI_NM_native_func->reduce_scatter_block(sendbuf, recvbuf, recvcount, datatype, op,
+                                                      comm_ptr, errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_scan(const void *sendbuf, void *recvbuf, int count,
+                                                MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                                MPIR_Errflag_t * errflag)
+{   /* thin forwarder to MPIDI_NM_native_func->scan; its result is returned as is */
+    return MPIDI_NM_native_func->scan(sendbuf, recvbuf, count, datatype, op, comm, errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_exscan(const void *sendbuf, void *recvbuf, int count,
+                                                  MPI_Datatype datatype, MPI_Op op,
+                                                  MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* thin forwarder to MPIDI_NM_native_func->exscan; its result is returned as is */
+    return MPIDI_NM_native_func->exscan(sendbuf, recvbuf, count, datatype, op, comm, errflag);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_neighbor_allgather(const void *sendbuf, int sendcount,
+                                                              MPI_Datatype sendtype, void *recvbuf,
+                                                              int recvcount, MPI_Datatype recvtype,
+                                                              MPIR_Comm * comm)
+{   /* thin forwarder to MPIDI_NM_native_func->neighbor_allgather; result returned as is */
+    return MPIDI_NM_native_func->neighbor_allgather(sendbuf, sendcount, sendtype, recvbuf,
+                                                    recvcount, recvtype, comm);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_neighbor_allgatherv(const void *sendbuf, int sendcount,
+                                                               MPI_Datatype sendtype, void *recvbuf,
+                                                               const int *recvcounts,
+                                                               const int *displs,
+                                                               MPI_Datatype recvtype,
+                                                               MPIR_Comm * comm)
+{   /* thin forwarder to MPIDI_NM_native_func->neighbor_allgatherv; result returned as is */
+    return MPIDI_NM_native_func->neighbor_allgatherv(sendbuf, sendcount, sendtype, recvbuf,
+                                                     recvcounts, displs, recvtype, comm);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_neighbor_alltoallv(const void *sendbuf,
+                                                              const int *sendcounts,
+                                                              const int *sdispls,
+                                                              MPI_Datatype sendtype, void *recvbuf,
+                                                              const int *recvcounts,
+                                                              const int *rdispls,
+                                                              MPI_Datatype recvtype,
+                                                              MPIR_Comm * comm)
+{   /* thin forwarder to MPIDI_NM_native_func->neighbor_alltoallv; result returned as is */
+    return MPIDI_NM_native_func->neighbor_alltoallv(sendbuf, sendcounts, sdispls, sendtype, recvbuf,
+                                                    recvcounts, rdispls, recvtype, comm);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_neighbor_alltoallw(const void *sendbuf,
+                                                              const int *sendcounts,
+                                                              const MPI_Aint * sdispls,
+                                                              const MPI_Datatype * sendtypes,
+                                                              void *recvbuf, const int *recvcounts,
+                                                              const MPI_Aint * rdispls,
+                                                              const MPI_Datatype * recvtypes,
+                                                              MPIR_Comm * comm)
+{   /* thin forwarder to MPIDI_NM_native_func->neighbor_alltoallw; result returned as is */
+    return MPIDI_NM_native_func->neighbor_alltoallw(sendbuf, sendcounts, sdispls, sendtypes,
+                                                    recvbuf, recvcounts, rdispls, recvtypes, comm);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_neighbor_alltoall(const void *sendbuf, int sendcount,
+                                                             MPI_Datatype sendtype, void *recvbuf,
+                                                             int recvcount, MPI_Datatype recvtype,
+                                                             MPIR_Comm * comm)
+{   /* thin forwarder to MPIDI_NM_native_func->neighbor_alltoall; result returned as is */
+    return MPIDI_NM_native_func->neighbor_alltoall(sendbuf, sendcount, sendtype, recvbuf, recvcount,
+                                                   recvtype, comm);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ineighbor_allgather(const void *sendbuf, int sendcount,
+                                                               MPI_Datatype sendtype, void *recvbuf,
+                                                               int recvcount, MPI_Datatype recvtype,
+                                                               MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->ineighbor_allgather; result returned as is */
+    return MPIDI_NM_native_func->ineighbor_allgather(sendbuf, sendcount, sendtype, recvbuf,
+                                                     recvcount, recvtype, comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ineighbor_allgatherv(const void *sendbuf, int sendcount,
+                                                                MPI_Datatype sendtype,
+                                                                void *recvbuf,
+                                                                const int *recvcounts,
+                                                                const int *displs,
+                                                                MPI_Datatype recvtype,
+                                                                MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->ineighbor_allgatherv; result returned as is */
+    return MPIDI_NM_native_func->ineighbor_allgatherv(sendbuf, sendcount, sendtype, recvbuf,
+                                                      recvcounts, displs, recvtype, comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ineighbor_alltoall(const void *sendbuf, int sendcount,
+                                                              MPI_Datatype sendtype, void *recvbuf,
+                                                              int recvcount, MPI_Datatype recvtype,
+                                                              MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->ineighbor_alltoall; result returned as is */
+    return MPIDI_NM_native_func->ineighbor_alltoall(sendbuf, sendcount, sendtype, recvbuf,
+                                                    recvcount, recvtype, comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ineighbor_alltoallv(const void *sendbuf,
+                                                               const int *sendcounts,
+                                                               const int *sdispls,
+                                                               MPI_Datatype sendtype, void *recvbuf,
+                                                               const int *recvcounts,
+                                                               const int *rdispls,
+                                                               MPI_Datatype recvtype,
+                                                               MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->ineighbor_alltoallv; result returned as is */
+    return MPIDI_NM_native_func->ineighbor_alltoallv(sendbuf, sendcounts, sdispls, sendtype,
+                                                     recvbuf, recvcounts, rdispls, recvtype, comm,
+                                                     req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ineighbor_alltoallw(const void *sendbuf,
+                                                               const int *sendcounts,
+                                                               const MPI_Aint * sdispls,
+                                                               const MPI_Datatype * sendtypes,
+                                                               void *recvbuf, const int *recvcounts,
+                                                               const MPI_Aint * rdispls,
+                                                               const MPI_Datatype * recvtypes,
+                                                               MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->ineighbor_alltoallw; result returned as is */
+    return MPIDI_NM_native_func->ineighbor_alltoallw(sendbuf, sendcounts, sdispls, sendtypes,
+                                                     recvbuf, recvcounts, rdispls, recvtypes, comm,
+                                                     req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ibarrier(MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->ibarrier; its result is returned as is */
+    return MPIDI_NM_native_func->ibarrier(comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ibcast(void *buffer, int count, MPI_Datatype datatype,
+                                                  int root, MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->ibcast; its result is returned as is */
+    return MPIDI_NM_native_func->ibcast(buffer, count, datatype, root, comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_iallgather(const void *sendbuf, int sendcount,
+                                                      MPI_Datatype sendtype, void *recvbuf,
+                                                      int recvcount, MPI_Datatype recvtype,
+                                                      MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->iallgather; its result is returned as is */
+    return MPIDI_NM_native_func->iallgather(sendbuf, sendcount, sendtype, recvbuf, recvcount,
+                                            recvtype, comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_iallgatherv(const void *sendbuf, int sendcount,
+                                                       MPI_Datatype sendtype, void *recvbuf,
+                                                       const int *recvcounts, const int *displs,
+                                                       MPI_Datatype recvtype, MPIR_Comm * comm,
+                                                       MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->iallgatherv; its result is returned as is */
+    return MPIDI_NM_native_func->iallgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts,
+                                             displs, recvtype, comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_iallreduce(const void *sendbuf, void *recvbuf, int count,
+                                                      MPI_Datatype datatype, MPI_Op op,
+                                                      MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->iallreduce; its result is returned as is */
+    return MPIDI_NM_native_func->iallreduce(sendbuf, recvbuf, count, datatype, op, comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ialltoall(const void *sendbuf, int sendcount,
+                                                     MPI_Datatype sendtype, void *recvbuf,
+                                                     int recvcount, MPI_Datatype recvtype,
+                                                     MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->ialltoall; its result is returned as is */
+    return MPIDI_NM_native_func->ialltoall(sendbuf, sendcount, sendtype, recvbuf, recvcount,
+                                           recvtype, comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ialltoallv(const void *sendbuf, const int *sendcounts,
+                                                      const int *sdispls, MPI_Datatype sendtype,
+                                                      void *recvbuf, const int *recvcounts,
+                                                      const int *rdispls, MPI_Datatype recvtype,
+                                                      MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->ialltoallv; its result is returned as is */
+    return MPIDI_NM_native_func->ialltoallv(sendbuf, sendcounts, sdispls, sendtype, recvbuf,
+                                            recvcounts, rdispls, recvtype, comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ialltoallw(const void *sendbuf, const int *sendcounts,
+                                                      const int *sdispls,
+                                                      const MPI_Datatype sendtypes[], void *recvbuf,
+                                                      const int *recvcounts, const int *rdispls,
+                                                      const MPI_Datatype recvtypes[],
+                                                      MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->ialltoallw; its result is returned as is */
+    return MPIDI_NM_native_func->ialltoallw(sendbuf, sendcounts, sdispls, sendtypes, recvbuf,
+                                            recvcounts, rdispls, recvtypes, comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_iexscan(const void *sendbuf, void *recvbuf, int count,
+                                                   MPI_Datatype datatype, MPI_Op op,
+                                                   MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->iexscan; its result is returned as is */
+    return MPIDI_NM_native_func->iexscan(sendbuf, recvbuf, count, datatype, op, comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_igather(const void *sendbuf, int sendcount,
+                                                   MPI_Datatype sendtype, void *recvbuf,
+                                                   int recvcount, MPI_Datatype recvtype, int root,
+                                                   MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->igather; its result is returned as is */
+    return MPIDI_NM_native_func->igather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
+                                         root, comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_igatherv(const void *sendbuf, int sendcount,
+                                                    MPI_Datatype sendtype, void *recvbuf,
+                                                    const int *recvcounts, const int *displs,
+                                                    MPI_Datatype recvtype, int root,
+                                                    MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->igatherv; its result is returned as is */
+    return MPIDI_NM_native_func->igatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs,
+                                          recvtype, root, comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ireduce_scatter_block(const void *sendbuf, void *recvbuf,
+                                                                 int recvcount,
+                                                                 MPI_Datatype datatype, MPI_Op op,
+                                                                 MPIR_Comm * comm,
+                                                                 MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->ireduce_scatter_block; result returned as is */
+    return MPIDI_NM_native_func->ireduce_scatter_block(sendbuf, recvbuf, recvcount, datatype, op,
+                                                       comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ireduce_scatter(const void *sendbuf, void *recvbuf,
+                                                           const int *recvcounts,
+                                                           MPI_Datatype datatype, MPI_Op op,
+                                                           MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->ireduce_scatter; its result is returned as is */
+    return MPIDI_NM_native_func->ireduce_scatter(sendbuf, recvbuf, recvcounts, datatype, op, comm,
+                                                 req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_ireduce(const void *sendbuf, void *recvbuf, int count,
+                                                   MPI_Datatype datatype, MPI_Op op, int root,
+                                                   MPIR_Comm * comm_ptr, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->ireduce; its result is returned as is */
+    return MPIDI_NM_native_func->ireduce(sendbuf, recvbuf, count, datatype, op, root, comm_ptr,
+                                         req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_iscan(const void *sendbuf, void *recvbuf, int count,
+                                                 MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                                 MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->iscan; its result is returned as is */
+    return MPIDI_NM_native_func->iscan(sendbuf, recvbuf, count, datatype, op, comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_iscatter(const void *sendbuf, int sendcount,
+                                                    MPI_Datatype sendtype, void *recvbuf,
+                                                    int recvcount, MPI_Datatype recvtype, int root,
+                                                    MPIR_Comm * comm, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->iscatter; its result is returned as is */
+    return MPIDI_NM_native_func->iscatter(sendbuf, sendcount, sendtype, recvbuf, recvcount,
+                                          recvtype, root, comm, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX int MPIDI_NM_iscatterv(const void *sendbuf, const int *sendcounts,
+                                                     const int *displs, MPI_Datatype sendtype,
+                                                     void *recvbuf, int recvcount,
+                                                     MPI_Datatype recvtype, int root,
+                                                     MPIR_Comm * comm_ptr, MPI_Request * req)
+{   /* thin forwarder to MPIDI_NM_native_func->iscatterv; its result is returned as is */
+    return MPIDI_NM_native_func->iscatterv(sendbuf, sendcounts, displs, sendtype, recvbuf,
+                                           recvcount, recvtype, root, comm_ptr, req);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX void MPIDI_NM_datatype_commit(MPIR_Datatype * datatype_p)
+{   /* thin forwarder to MPIDI_NM_native_func->datatype_commit; nothing is returned */
+    MPIDI_NM_native_func->datatype_commit(datatype_p);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX void MPIDI_NM_datatype_dup(MPIR_Datatype * old_datatype_p,
+                                                         MPIR_Datatype * new_datatype_p)
+{   /* thin forwarder to MPIDI_NM_native_func->datatype_dup; nothing is returned */
+    MPIDI_NM_native_func->datatype_dup(old_datatype_p, new_datatype_p);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX void MPIDI_NM_datatype_destroy(MPIR_Datatype * datatype_p)
+{   /* thin forwarder to MPIDI_NM_native_func->datatype_destroy; nothing is returned */
+    MPIDI_NM_native_func->datatype_destroy(datatype_p);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX void MPIDI_NM_op_commit(MPIR_Op * op_p)
+{   /* thin forwarder to MPIDI_NM_native_func->op_commit; nothing is returned */
+    MPIDI_NM_native_func->op_commit(op_p);
+}
+
+MPIDI_NM_STATIC_INLINE_PREFIX void MPIDI_NM_op_destroy(MPIR_Op * op_p)
+{   /* thin forwarder to MPIDI_NM_native_func->op_destroy; nothing is returned */
+    MPIDI_NM_native_func->op_destroy(op_p);
+}
+
+#endif /* NETMOD_DISABLE_INLINES  */
+
+#else
+#define __netmod_direct_stubnm__   0
+#define __netmod_direct_ofi__    1
+#define __netmod_direct_shm__    2
+#define __netmod_direct_ucx__    3
+#define __netmod_direct_portals4__ 4
+
+#if NETMOD_DIRECT==__netmod_direct_stubnm__
+#include "../stubnm/netmod_direct.h"
+#elif NETMOD_DIRECT==__netmod_direct_ofi__
+#include "../ofi/netmod_direct.h"
+#elif NETMOD_DIRECT==__netmod_direct_shm__
+#include "../shm/netmod_direct.h"
+#elif NETMOD_DIRECT==__netmod_direct_ucx__
+#include "../ucx/netmod_direct.h"
+#elif NETMOD_DIRECT==__netmod_direct_portals4__
+#include "../portals4/netmod_direct.h"
+#else
+#error "No direct netmod included"
+#endif
+#endif /* NETMOD_DIRECT           */
+
+#endif
diff --git a/src/mpid/ch4/netmod/ofi/Makefile.mk b/src/mpid/ch4/netmod/ofi/Makefile.mk
new file mode 100644
index 0000000..df2a564
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/Makefile.mk
@@ -0,0 +1,23 @@
+## -*- Mode: Makefile; -*-
+## vim: set ft=automake :
+##
+## (C) 2016 by Argonne National Laboratory.
+##     See COPYRIGHT in top-level directory.
+##
+##  Portions of this code were written by Intel Corporation.
+##  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+##  to Argonne National Laboratory subject to Software Grant and Corporate
+##  Contributor License Agreement dated February 8, 2012.
+##
+if BUILD_CH4_NETMOD_OFI
+
+noinst_HEADERS     +=
+mpi_core_sources   += src/mpid/ch4/netmod/ofi/func_table.c \
+                      src/mpid/ch4/netmod/ofi/globals.c \
+                      src/mpid/ch4/netmod/ofi/util.c
+errnames_txt_files += src/mpid/ch4/netmod/ofi/errnames.txt
+external_ldflags   += -ldl -lpthread
+external_subdirs   += @ofisrcdir@
+pmpi_convenience_libs += @ofilib@
+
+endif
diff --git a/src/mpid/ch4/netmod/ofi/catalog.c b/src/mpid/ch4/netmod/ofi/catalog.c
new file mode 100644
index 0000000..0905a25
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/catalog.c
@@ -0,0 +1,61 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+static inline void MPIDI_OFI_unused_gen_catalog(void)
+{
+    /* Never compiled (#if 0): the body only lists each OFI error name once, paired with its
+     * "%s %d %s %s" long form.  NOTE(review): presumably this exists so the error-message
+     * tooling can find the names by scanning this file -- confirm against the build scripts. */
+#if 0
+    char *a;
+    int b, e;
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_pmi", "**ofid_pmi %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_addrinfo", "**ofid_addrinfo %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_opendomain", "**ofid_opendomain %s %d %s %s", a, b, a,
+                  a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_bind", "**ofid_bind %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_fabric", "**ofid_fabric %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_opencq", "**ofid_opencq %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_openct", "**ofid_openct %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_ep_enable", "**ofid_ep_enable %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_ep", "**ofid_ep %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_avopen", "**ofid_avopen %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_getname", "**ofid_getname %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_avmap", "**ofid_avmap %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_avlookup", "**ofid_avlookup %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_avsync", "**ofid_avsync %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_epclose", "**ofid_epclose %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_cqclose", "**ofid_cqclose %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_epsync", "**ofid_epsync %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_alias", "**ofid_alias %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_getopt", "**ofid_getopt %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_setopt", "**ofid_setopt %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_domainclose", "**ofid_domainclose %s %d %s %s", a, b, a,
+                  a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_avclose", "**ofid_avclose %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_tsend", "**ofid_tsend %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_tinject", "**ofid_tinject %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_tsendsync", "**ofid_tsendsync %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_trecv", "**ofid_trecv %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_trecvsync", "**ofid_trecvsync %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_poll", "**ofid_poll %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_peek", "**ofid_peek %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_send", "**ofid_send %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_inject", "**ofid_inject %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_rdma_write", "**ofid_rdma_write %s %d %s %s", a, b, a,
+                  a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_rdma_inject_write",
+                  "**ofid_rdma_inject_write %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_rdma_atomicto", "**ofid_rdma_atomicto %s %d %s %s", a,
+                  b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_rdma_cswap", "**ofid_rdma_cswap %s %d %s %s", a, b, a,
+                  a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_rdma_readfrom", "**ofid_rdma_readfrom %s %d %s %s", a,
+                  b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_mr_reg", "**ofid_mr_reg %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_mr_unreg", "**ofid_mr_unreg %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_prepost", "**ofid_prepost %s %d %s %s", a, b, a, a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_ctrlcancel", "**ofid_ctrlcancel %s %d %s %s", a, b, a,
+                  a);
+    MPIR_ERR_SET2(e, MPI_ERR_OTHER, "**ofid_cntr_wait", "**ofid_cntr_wait %s %d %s %s", a, b, a, a);
+
+#endif
+}
diff --git a/src/mpid/ch4/netmod/ofi/errnames.txt b/src/mpid/ch4/netmod/ofi/errnames.txt
new file mode 100644
index 0000000..2648e10
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/errnames.txt
@@ -0,0 +1,97 @@
+**ofid_pmi:PMI_Init() failure
+**ofid_pmi %s %d %s %s:pmi failed (%s:%d:%s:%s)
+**ofid_addrinfo:OFI addrinfo() failure
+**ofid_addrinfo %s %d %s %s:OFI addrinfo() failed (%s:%d:%s:%s)
+**ofid_addrinfo %s:Addrinfo failure ('%s')
+**ofid_opendomain:OFI fi_open domain failure
+**ofid_opendomain %s %d %s %s:OFI fi_open domain failed (%s:%d:%s:%s)
+**ofid_fabric:OFI fi_fabric failure
+**ofid_fabric %s %d %s %s:OFI fi_fabric failed (%s:%d:%s:%s)
+**ofid_opencq:OFI event queue create failure
+**ofid_opencq %s %d %s %s:OFI event queue create failed (%s:%d:%s:%s)
+**ofid_openct:OFI event counter create failure
+**ofid_openct %s %d %s %s:OFI event counter create failed (%s:%d:%s:%s)
+**ofid_bind:OFI resource bind failure
+**ofid_bind %s %d %s %s:OFI resource bind failed (%s:%d:%s:%s)
+**ofid_ep_enable:OFI EP enable failure
+**ofid_ep_enable %s %d %s %s:OFI EP enable failed (%s:%d:%s:%s)
+**ofid_avopen:OFI address vector open failed
+**ofid_avopen %s %d %s %s:OFI address vector open failed (%s:%d:%s:%s)
+**ofid_ep:OFI endpoint open failed
+**ofid_ep %s %d %s %s:OFI endpoint open failed (%s:%d:%s:%s)
+**ofid_getname:OFI get endpoint name failed
+**ofid_getname %s %d %s %s:OFI get endpoint name failed (%s:%d:%s:%s)
+**ofid_avmap:OFI get address vector map failed
+**ofid_avmap %s %d %s %s:OFI address vector map failed (%s:%d:%s:%s)
+**ofid_avlookup:OFI get address vector lookup failed
+**ofid_avlookup %s %d %s %s:OFI address vector lookup failed (%s:%d:%s:%s)
+**ofid_avsync:OFI get address vector sync failed
+**ofid_avsync %s %d %s %s:OFI address vector sync failed (%s:%d:%s:%s)
+**ofid_epclose:OFI endpoint close failed
+**ofid_epclose %s %d %s %s:OFI endpoint close failed (%s:%d:%s:%s)
+**ofid_cqclose:OFI cq close failed
+**ofid_cqclose %s %d %s %s:OFI cq close failed (%s:%d:%s:%s)
+**ofid_epsync:OFI synchronization failed
+**ofid_epsync %s %d %s %s:OFI endpoint synchronization failed (%s:%d:%s:%s)
+**ofid_alias:OFI cq alias failed
+**ofid_alias %s %d %s %s:OFI cq alias failed (%s:%d:%s:%s)
+**ofid_getopt:OFI getopt failed
+**ofid_getopt %s %d %s %s:OFI getopt failed (%s:%d:%s:%s)
+**ofid_setopt:OFI setopt failed
+**ofid_setopt %s %d %s %s:OFI setopt failed (%s:%d:%s:%s)
+**ofid_domainclose:OFI domain close failed
+**ofid_domainclose %s %d %s %s:OFI domain close failed (%s:%d:%s:%s)
+**ofid_avclose:OFI av close failed
+**ofid_avclose %s %d %s %s:OFI av close failed (%s:%d:%s:%s)
+**ofid_tsend:OFI tagged send failed
+**ofid_tsend %s %d %s %s:OFI tagged send failed (%s:%d:%s:%s)
+**ofid_tsenddata:OFI tagged senddata failed
+**ofid_tsenddata %s %d %s %s:OFI tagged senddata failed (%s:%d:%s:%s)
+**ofid_tsendmsg:OFI tagged sendmsg failed
+**ofid_tsendmsg %s %d %s %s:OFI tagged sendmsg failed (%s:%d:%s:%s)
+**ofid_tinject:OFI tagged inject failed
+**ofid_tinject %s %d %s %s:OFI tagged inject failed (%s:%d:%s:%s)
+**ofid_tinjectdata:OFI tagged injectdata failed
+**ofid_tinjectdata %s %d %s %s:OFI tagged injectdata failed (%s:%d:%s:%s)
+**ofid_tsendsync:OFI tagged send sync failed
+**ofid_tsendsync %s %d %s %s:OFI tagged send sync failed (%s:%d:%s:%s)
+**ofid_trecv:OFI tagged recv failed
+**ofid_trecv %s %d %s %s:OFI tagged recv failed (%s:%d:%s:%s)
+**ofid_trecvmsg:OFI tagged recvmsg failed
+**ofid_trecvmsg %s %d %s %s:OFI tagged recvmsg failed (%s:%d:%s:%s)
+**ofid_trecvsync:OFI tagged recv sync failed
+**ofid_trecvsync %s %d %s %s:OFI tagged recv sync failed (%s:%d:%s:%s)
+**ofid_poll:OFI poll failed
+**ofid_poll %s %d %s %s:OFI poll failed (%s:%d:%s:%s)
+**ofid_peek:OFI peek failed
+**ofid_peek %s %d %s %s:OFI peek failed (%s:%d:%s:%s)
+**ofid_send:OFI send failed
+**ofid_send %s %d %s %s:OFI send failed (%s:%d:%s:%s)
+**ofid_inject:OFI inject failed
+**ofid_inject %s %d %s %s:OFI inject failed (%s:%d:%s:%s)
+**ofid_rdma_write:OFI rdma write failed
+**ofid_rdma_write %s %d %s %s:OFI rdma write failed (%s:%d:%s:%s)
+**ofid_rdma_inject_write:OFI rdma write immediate failed
+**ofid_rdma_inject_write %s %d %s %s:OFI rdma write immediate failed (%s:%d:%s:%s)
+**ofid_rdma_atomicto:OFI rdma atomicto failed
+**ofid_rdma_atomicto %s %d %s %s:OFI rdma atomicto failed (%s:%d:%s:%s)
+**ofid_rdma_cswap:OFI rdma cswap failed
+**ofid_rdma_cswap %s %d %s %s:OFI rdma cswap failed (%s:%d:%s:%s)
+**ofid_rdma_readfrom:OFI rdma read failed
+**ofid_rdma_readfrom %s %d %s %s:OFI rdma read failed (%s:%d:%s:%s)
+**ofid_mr_reg:OFI memory registration failed
+**ofid_mr_reg %s %d %s %s:OFI memory registration failed (%s:%d:%s:%s)
+**ofid_mr_unreg:OFI memory deregistration failed
+**ofid_mr_unreg %s %d %s %s:OFI memory deregistration failed (%s:%d:%s:%s)
+**ofid_prepost:OFI preposting receives failed
+**ofid_prepost %s %d %s %s:OFI preposting receives failed (%s:%d:%s:%s)
+**ofid_ctrlcancel:OFI Control cancel failed
+**ofid_ctrlcancel %s %d %s %s:OFI control cancel failed (%s:%d:%s:%s)
+**ofid_cntr_wait:OFI Counter wait failed
+**ofid_cntr_wait %s %d %s %s:OFI Counter wait failed (%s:%d:%s:%s)
+**ofid_rma_init:OFI RMA Initialization failed
+**ofid_rma_init %s %d %s %s:OFI RMA Initialization failed (%s:%d:%s:%s)
+**ofid_stx_ctx:OFI fi_stx_context failed
+**ofid_stx_ctx %s %d %s %s:OFI fi_stx_context failed (%s:%d:%s:%s)
+**ofid_stx_ctx_close:OFI stx context close failed
+**ofid_stx_ctx_close %s %d %s %s:OFI stx context close failed (%s:%d:%s:%s)
diff --git a/src/mpid/ch4/netmod/ofi/fi_list.h b/src/mpid/ch4/netmod/ofi/fi_list.h
new file mode 100644
index 0000000..488aed8
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/fi_list.h
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2011-2016 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#if !defined(LIST_H)
+#define LIST_H
+
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+
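+/*
+ * Doubly-linked circular list; the list head is itself a dlist_entry that
+ * acts as a sentinel (an empty list has head->next == head).
+ */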
+/* Link node of the doubly-linked list; also used as the list head. */
+struct dlist_entry {
+    struct dlist_entry *next;   /* following entry (the head itself when last) */
+    struct dlist_entry *prev;   /* preceding entry (the head itself when first) */
+};
+
+/* Static initializer for a list head located at `addr`: both links point at addr. */
+#define DLIST_INIT(addr) { addr, addr }
+/* Define a list head variable `name`, initialized to the empty list. */
+#define DEFINE_LIST(name) struct dlist_entry name = DLIST_INIT(&name)
+
+/* Make `head` an empty list: both of its links refer back to itself. */
+static inline void dlist_init(struct dlist_entry *head)
+{
+    head->prev = head;
+    head->next = head;
+}
+
+/* Return non-zero when the list contains no entries besides the head. */
+static inline int dlist_empty(struct dlist_entry *head)
+{
+    const struct dlist_entry *first = head->next;
+
+    return first == head;
+}
+
+/* Link `item` into the list immediately after `head`. */
+static inline void dlist_insert_after(struct dlist_entry *item, struct dlist_entry *head)
+{
+    struct dlist_entry *succ = head->next;
+
+    item->next = succ;
+    item->prev = head;
+    succ->prev = item;
+    head->next = item;
+}
+
+/* Link `item` into the list immediately before `head`, i.e. after head's
+ * current predecessor. */
+static inline void dlist_insert_before(struct dlist_entry *item, struct dlist_entry *head)
+{
+    struct dlist_entry *pred = head->prev;
+
+    dlist_insert_after(item, pred);
+}
+
+/* Given the list head as second argument: inserting after the head places the
+ * item first, inserting before the head places it last. */
+#define dlist_insert_head dlist_insert_after
+#define dlist_insert_tail dlist_insert_before
+
+/* Unlink `item` from its list by joining its neighbours.  The links stored
+ * in `item` itself are left untouched. */
+static inline void dlist_remove(struct dlist_entry *item)
+{
+    struct dlist_entry *pred = item->prev;
+    struct dlist_entry *succ = item->next;
+
+    pred->next = succ;
+    succ->prev = pred;
+}
+
+/*
+ * Iterate `item` over every entry of the list rooted at `head`.
+ * Arguments are parenthesized so that expression arguments (e.g. `&obj->list`)
+ * expand correctly; note that `head` is evaluated on every iteration.
+ * The current entry must not be unlinked inside the loop body, because the
+ * step reads item->next afterwards.
+ */
+#define dlist_foreach(head, item) \
+	for ((item) = (head)->next; (item) != (head); (item) = (item)->next)
+
+/* Match predicate used by the dlist search helpers: a non-zero return selects `item`. */
+typedef int dlist_func_t(struct dlist_entry *item, const void *arg);
+
+/*
+ * Walk the list from the front and return the first entry for which
+ * match(entry, arg) is non-zero, or NULL when no entry matches.
+ */
+static inline struct dlist_entry *dlist_find_first_match(struct dlist_entry *head,
+                                                         dlist_func_t * match, const void *arg)
+{
+    struct dlist_entry *cur;
+
+    for (cur = head->next; cur != head; cur = cur->next) {
+        if (match(cur, arg))
+            return cur;
+    }
+
+    return NULL;
+}
+
+/*
+ * Find the first entry accepted by `match`, unlink it from the list and
+ * return it.  Returns NULL (and leaves the list unchanged) when nothing matches.
+ */
+static inline struct dlist_entry *dlist_remove_first_match(struct dlist_entry *head,
+                                                           dlist_func_t * match, const void *arg)
+{
+    struct dlist_entry *hit = dlist_find_first_match(head, match, arg);
+
+    if (hit != NULL)
+        dlist_remove(hit);
+
+    return hit;
+}
+
+/*
+ * Singly-linked list with explicit head and tail pointers
+ * (both NULL when the list is empty).
+ */
+/* Link node of the singly-linked list. */
+struct slist_entry {
+    struct slist_entry *next;   /* following entry; slist_foreach stops when this is NULL */
+};
+
+/* Singly-linked list descriptor. */
+struct slist {
+    struct slist_entry *head;   /* first entry, NULL when the list is empty */
+    struct slist_entry *tail;   /* last entry, NULL when the list is empty */
+};
+
+/* Reset `list` to the empty state (no head, no tail). */
+static inline void slist_init(struct slist *list)
+{
+    list->tail = NULL;
+    list->head = NULL;
+}
+
+/* Return non-zero when the list has no entries. */
+static inline int slist_empty(struct slist *list)
+{
+    return list->head == NULL;
+}
+
+/*
+ * Insert `item` at the front of `list`.
+ *
+ * item->next is always written: it becomes the previous head, which is NULL
+ * when the list was empty.  Leaving it untouched in the empty case would let
+ * a stale pointer from a previous use of `item` survive, and slist_foreach /
+ * slist_remove_first_match rely on the last entry having next == NULL.
+ */
+static inline void slist_insert_head(struct slist_entry *item, struct slist *list)
+{
+    if (slist_empty(list))
+        list->tail = item;
+
+    item->next = list->head;
+    list->head = item;
+}
+
+/*
+ * Append `item` at the end of `list`.
+ *
+ * The new tail's next pointer is explicitly cleared: slist_foreach and
+ * slist_remove_first_match detect the end of the list by next == NULL, so an
+ * entry that still carries a stale link (e.g. after being removed from
+ * another position) must not be appended as-is.
+ */
+static inline void slist_insert_tail(struct slist_entry *item, struct slist *list)
+{
+    if (slist_empty(list))
+        list->head = item;
+    else
+        list->tail->next = item;
+
+    item->next = NULL;
+    list->tail = item;
+}
+
+/*
+ * Detach and return the first entry of `list`.
+ * When head and tail coincide (empty list or a single entry) the list is
+ * reset to empty; the returned pointer is NULL for an empty list.
+ */
+static inline struct slist_entry *slist_remove_head(struct slist *list)
+{
+    struct slist_entry *first = list->head;
+
+    if (first == list->tail) {
+        list->head = NULL;
+        list->tail = NULL;
+    }
+    else {
+        list->head = first->next;
+    }
+
+    return first;
+}
+
+/*
+ * Iterate `item` over every entry of `list` (a struct slist pointer), keeping
+ * `prev` pointing at the entry visited just before `item` (NULL for the first).
+ * Arguments are parenthesized so that expression arguments such as `&obj->q`
+ * expand correctly.
+ */
+#define slist_foreach(list, item, prev) \
+	for ((prev) = NULL, (item) = (list)->head; (item); \
+	     (prev) = (item), (item) = (item)->next)
+
+/* Match predicate used by slist_remove_first_match: a non-zero return selects `item`. */
+typedef int slist_func_t(struct slist_entry *item, const void *arg);
+
+/*
+ * Remove and return the first entry of `list` for which match(entry, arg) is
+ * non-zero; returns NULL when no entry matches.  Head and tail pointers are
+ * fixed up when the removed entry was the first and/or the last one.
+ */
+static inline struct slist_entry *slist_remove_first_match(struct slist *list, slist_func_t * match,
+                                                           const void *arg)
+{
+    struct slist_entry *before = NULL;
+    struct slist_entry *cur = list->head;
+
+    while (cur) {
+        if (match(cur, arg)) {
+            /* Bypass the matching entry. */
+            if (before)
+                before->next = cur->next;
+            else
+                list->head = cur->next;
+
+            /* The match was the last entry: its predecessor is the new tail. */
+            if (!cur->next)
+                list->tail = before;
+
+            return cur;
+        }
+
+        before = cur;
+        cur = cur->next;
+    }
+
+    return NULL;
+}
+
+#endif /* LIST_H */
diff --git a/src/mpid/ch4/netmod/ofi/func_table.c b/src/mpid/ch4/netmod/ofi/func_table.c
new file mode 100644
index 0000000..777c73e
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/func_table.c
@@ -0,0 +1,157 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#ifndef NETMOD_DIRECT
+#define NETMOD_DISABLE_INLINES
+#include <mpidimpl.h>
+#include "netmod_direct.h"
+/*
+ * Function table exposing the OFI netmod's setup, connection, communicator
+ * and active-message entry points.  The initializers are positional, so the
+ * order of the entries below must follow the member order of
+ * MPIDI_NM_funcs_t (declared outside this file).
+ */
+MPIDI_NM_funcs_t MPIDI_NM_ofi_funcs = {
+    MPIDI_NM_init,
+    MPIDI_NM_finalize,
+    MPIDI_NM_progress,
+    MPIDI_NM_comm_connect,
+    MPIDI_NM_comm_disconnect,
+    MPIDI_NM_open_port,
+    MPIDI_NM_close_port,
+    MPIDI_NM_comm_accept,
+    MPIDI_NM_comm_get_lpid,
+    MPIDI_NM_gpid_get,
+    MPIDI_NM_get_node_id,
+    MPIDI_NM_get_max_node_id,
+    MPIDI_NM_getallincomm,
+    MPIDI_NM_gpid_tolpidarray,
+    MPIDI_NM_create_intercomm_from_lpids,
+    MPIDI_NM_comm_create,
+    MPIDI_NM_comm_destroy,
+    MPIDI_NM_am_request_init,
+    MPIDI_NM_am_request_finalize,
+    MPIDI_NM_reg_hdr_handler,
+    MPIDI_NM_send_am_hdr,
+    MPIDI_NM_inject_am_hdr,
+    MPIDI_NM_send_am,
+    MPIDI_NM_send_amv,
+    MPIDI_NM_send_amv_hdr,
+    MPIDI_NM_send_am_hdr_reply,
+    MPIDI_NM_inject_am_hdr_reply,
+    MPIDI_NM_send_am_reply,
+    MPIDI_NM_send_amv_reply,
+    MPIDI_NM_am_hdr_max_sz,
+    MPIDI_NM_am_inject_max_sz,
+    MPIDI_NM_am_recv
+};
+
+/*
+ * Function table exposing the OFI netmod's native point-to-point, RMA
+ * (window), collective, datatype and op entry points.  The initializers are
+ * positional, so the order of the entries below must follow the member order
+ * of MPIDI_NM_native_funcs_t (declared outside this file).
+ */
+MPIDI_NM_native_funcs_t MPIDI_NM_native_ofi_funcs = {
+    MPIDI_NM_send,
+    MPIDI_NM_ssend,
+    MPIDI_NM_startall,
+    MPIDI_NM_send_init,
+    MPIDI_NM_ssend_init,
+    MPIDI_NM_rsend_init,
+    MPIDI_NM_bsend_init,
+    MPIDI_NM_isend,
+    MPIDI_NM_issend,
+    MPIDI_NM_cancel_send,
+    MPIDI_NM_recv_init,
+    MPIDI_NM_recv,
+    MPIDI_NM_irecv,
+    MPIDI_NM_imrecv,
+    MPIDI_NM_cancel_recv,
+    MPIDI_NM_alloc_mem,
+    MPIDI_NM_free_mem,
+    MPIDI_NM_improbe,
+    MPIDI_NM_iprobe,
+    MPIDI_NM_win_set_info,
+    MPIDI_NM_win_shared_query,
+    MPIDI_NM_put,
+    MPIDI_NM_win_start,
+    MPIDI_NM_win_complete,
+    MPIDI_NM_win_post,
+    MPIDI_NM_win_wait,
+    MPIDI_NM_win_test,
+    MPIDI_NM_win_lock,
+    MPIDI_NM_win_unlock,
+    MPIDI_NM_win_get_info,
+    MPIDI_NM_get,
+    MPIDI_NM_win_free,
+    MPIDI_NM_win_fence,
+    MPIDI_NM_win_create,
+    MPIDI_NM_accumulate,
+    MPIDI_NM_win_attach,
+    MPIDI_NM_win_allocate_shared,
+    MPIDI_NM_rput,
+    MPIDI_NM_win_flush_local,
+    MPIDI_NM_win_detach,
+    MPIDI_NM_compare_and_swap,
+    MPIDI_NM_raccumulate,
+    MPIDI_NM_rget_accumulate,
+    MPIDI_NM_fetch_and_op,
+    MPIDI_NM_win_allocate,
+    MPIDI_NM_win_flush,
+    MPIDI_NM_win_flush_local_all,
+    MPIDI_NM_win_unlock_all,
+    MPIDI_NM_win_create_dynamic,
+    MPIDI_NM_rget,
+    MPIDI_NM_win_sync,
+    MPIDI_NM_win_flush_all,
+    MPIDI_NM_get_accumulate,
+    MPIDI_NM_win_lock_all,
+    MPIDI_NM_rank_is_local,
+    MPIDI_NM_barrier,
+    MPIDI_NM_bcast,
+    MPIDI_NM_allreduce,
+    MPIDI_NM_allgather,
+    MPIDI_NM_allgatherv,
+    MPIDI_NM_scatter,
+    MPIDI_NM_scatterv,
+    MPIDI_NM_gather,
+    MPIDI_NM_gatherv,
+    MPIDI_NM_alltoall,
+    MPIDI_NM_alltoallv,
+    MPIDI_NM_alltoallw,
+    MPIDI_NM_reduce,
+    MPIDI_NM_reduce_scatter,
+    MPIDI_NM_reduce_scatter_block,
+    MPIDI_NM_scan,
+    MPIDI_NM_exscan,
+    MPIDI_NM_neighbor_allgather,
+    MPIDI_NM_neighbor_allgatherv,
+    MPIDI_NM_neighbor_alltoall,
+    MPIDI_NM_neighbor_alltoallv,
+    MPIDI_NM_neighbor_alltoallw,
+    MPIDI_NM_ineighbor_allgather,
+    MPIDI_NM_ineighbor_allgatherv,
+    MPIDI_NM_ineighbor_alltoall,
+    MPIDI_NM_ineighbor_alltoallv,
+    MPIDI_NM_ineighbor_alltoallw,
+    MPIDI_NM_ibarrier,
+    MPIDI_NM_ibcast,
+    MPIDI_NM_iallgather,
+    MPIDI_NM_iallgatherv,
+    MPIDI_NM_iallreduce,
+    MPIDI_NM_ialltoall,
+    MPIDI_NM_ialltoallv,
+    MPIDI_NM_ialltoallw,
+    MPIDI_NM_iexscan,
+    MPIDI_NM_igather,
+    MPIDI_NM_igatherv,
+    MPIDI_NM_ireduce_scatter_block,
+    MPIDI_NM_ireduce_scatter,
+    MPIDI_NM_ireduce,
+    MPIDI_NM_iscan,
+    MPIDI_NM_iscatter,
+    MPIDI_NM_iscatterv,
+    MPIDI_NM_datatype_commit,
+    MPIDI_NM_datatype_dup,
+    MPIDI_NM_datatype_destroy,
+    MPIDI_NM_op_commit,
+    MPIDI_NM_op_destroy,
+};
+#endif
diff --git a/src/mpid/ch4/netmod/ofi/globals.c b/src/mpid/ch4/netmod/ofi/globals.c
new file mode 100644
index 0000000..cf74fd9
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/globals.c
@@ -0,0 +1,13 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#include <mpidimpl.h>
+#include "ofi_impl.h"
+/* Definition of the OFI netmod's global state object; every member starts out zero-initialized. */
+MPIDI_OFI_global_t MPIDI_Global = { 0 };
diff --git a/src/mpid/ch4/netmod/ofi/netmod_direct.h b/src/mpid/ch4/netmod/ofi/netmod_direct.h
new file mode 100644
index 0000000..0d77db2
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/netmod_direct.h
@@ -0,0 +1,41 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_DIRECT_H_INCLUDED
+#define NETMOD_DIRECT_H_INCLUDED
+
+#include "ofi_am.h"
+#include "ofi_events.h"
+#include "ofi_comm.h"
+#include "ofi_proc.h"
+#include "ofi_progress.h"
+#include "ofi_unimpl.h"
+#include "ofi_init.h"
+#include "ofi_coll.h"
+#include "ofi_datatype.h"
+#include "ofi_op.h"
+
+#ifdef USE_OFI_TAGGED
+#include "ofi_probe.h"
+#include "ofi_recv.h"
+#include "ofi_send.h"
+#include "ofi_win.h"
+#include "ofi_rma.h"
+#include "ofi_spawn.h"
+#else
+#include "ofi_am_probe.h"
+#include "ofi_am_recv.h"
+#include "ofi_am_send.h"
+#include "ofi_am_win.h"
+#include "ofi_am_rma.h"
+#include "ofi_am_spawn.h"
+#endif /* USE_OFI_TAGGED */
+
+#endif /* NETMOD_DIRECT_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_am.h b/src/mpid/ch4/netmod/ofi/ofi_am.h
new file mode 100644
index 0000000..f3425dd
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_am.h
@@ -0,0 +1,383 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_AM_H_INCLUDED
+#define NETMOD_OFI_AM_H_INCLUDED
+#include "ofi_impl.h"
+#include "ofi_am_impl.h"
+#include "ofi_am_events.h"
+
+static inline int MPIDI_OFI_progress_do_queue(void *netmod_context);
+
+/* Initialize the netmod active-message part of `req`: the req_hdr slot
+ * reached through MPIDI_OFI_AMREQUEST() is set to NULL. */
+static inline void MPIDI_NM_am_request_init(MPIR_Request * req)
+{
+    MPIDI_OFI_AMREQUEST(req, req_hdr) = NULL;
+}
+
+/* Tear down the netmod active-message part of `req` by delegating to
+ * MPIDI_OFI_am_clear_request(). */
+static inline void MPIDI_NM_am_request_finalize(MPIR_Request * req)
+{
+    MPIDI_OFI_am_clear_request(req);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_reg_hdr_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Register the callbacks for active-message handler slot `handler_id`.
+ *
+ *   handler_id        - slot index into the global handler tables
+ *   origin_handler_fn - stored in MPIDI_Global.am_send_cmpl_handlers[handler_id]
+ *   target_handler_fn - stored in MPIDI_Global.am_handlers[handler_id]
+ *
+ * Returns MPI_SUCCESS, or MPI_ERR_OTHER when handler_id is out of range.
+ */
+static inline int MPIDI_NM_reg_hdr_handler(int handler_id,
+                                           MPIDI_NM_am_origin_handler_fn origin_handler_fn,
+                                           MPIDI_NM_am_target_handler_fn target_handler_fn)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_REG_HDR_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_REG_HDR_HANDLER);
+
+    /* Valid slots are 0 .. MPIDI_OFI_MAX_AM_HANDLERS - 1; reject negative ids
+     * and the one-past-the-end id before indexing the tables.
+     * NOTE(review): assumes both tables hold MPIDI_OFI_MAX_AM_HANDLERS
+     * entries - confirm against the MPIDI_OFI_global_t declaration. */
+    if (handler_id < 0 || handler_id >= MPIDI_OFI_MAX_AM_HANDLERS) {
+        mpi_errno = MPI_ERR_OTHER;
+        goto fn_fail;
+    }
+
+    MPIDI_Global.am_handlers[handler_id] = target_handler_fn;
+    MPIDI_Global.am_send_cmpl_handlers[handler_id] = origin_handler_fn;
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_REG_HDR_HANDLER);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_send_am_hdr
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Send a header-only active message to `rank` in `comm`.
+ * Thin wrapper over MPIDI_OFI_do_send_am_header() with the trailing flag set
+ * to FALSE (the *_reply variant passes TRUE); `netmod_context` is not used
+ * here.  Returns the MPI error code of the underlying call.
+ */
+static inline int MPIDI_NM_send_am_hdr(int rank,
+                                       MPIR_Comm * comm,
+                                       int handler_id,
+                                       const void *am_hdr,
+                                       size_t am_hdr_sz, MPIR_Request * sreq, void *netmod_context)
+{
+    int rc = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_SEND_AM_HDR);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_SEND_AM_HDR);
+
+    rc = MPIDI_OFI_do_send_am_header(rank, comm, handler_id,
+                                     am_hdr, am_hdr_sz, sreq, FALSE);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_SEND_AM_HDR);
+    return rc;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_send_am
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Send an active message (header plus `count` elements of `datatype` from
+ * `data`) to `rank` in `comm`.
+ * Thin wrapper over MPIDI_OFI_do_send_am() with the trailing flag set to
+ * FALSE (the *_reply variant passes TRUE); `netmod_context` is not used here.
+ * Returns the MPI error code of the underlying call.
+ */
+static inline int MPIDI_NM_send_am(int rank,
+                                   MPIR_Comm * comm,
+                                   int handler_id,
+                                   const void *am_hdr,
+                                   size_t am_hdr_sz,
+                                   const void *data,
+                                   MPI_Count count,
+                                   MPI_Datatype datatype, MPIR_Request * sreq, void *netmod_context)
+{
+    int rc = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_SEND_AM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_SEND_AM);
+
+    rc = MPIDI_OFI_do_send_am(rank, comm, handler_id, am_hdr, am_hdr_sz,
+                              data, count, datatype, sreq, FALSE);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_SEND_AM);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_send_amv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Send an active message whose header is scattered over `iov_len` iovec
+ * segments.  The segments are packed into one contiguous staging buffer,
+ * which is handed to MPIDI_NM_send_am() and released right after that call
+ * returns.  The staging buffer comes from the global AM buffer pool unless
+ * the packed header is larger than MPIDI_OFI_BUF_POOL_SIZE, in which case it
+ * is heap-allocated.  Returns the MPI error code of MPIDI_NM_send_am().
+ */
+static inline int MPIDI_NM_send_amv(int rank,
+                                    MPIR_Comm * comm,
+                                    int handler_id,
+                                    struct iovec *am_hdr,
+                                    size_t iov_len,
+                                    const void *data,
+                                    MPI_Count count,
+                                    MPI_Datatype datatype,
+                                    MPIR_Request * sreq, void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS, from_heap;
+    size_t total_sz = 0, offset = 0, seg;
+    char *packed_hdr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_SEND_AMV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_SEND_AMV);
+
+    /* Pass 1: size of the packed header. */
+    for (seg = 0; seg < iov_len; seg++)
+        total_sz += am_hdr[seg].iov_len;
+
+    from_heap = (total_sz > MPIDI_OFI_BUF_POOL_SIZE) ? 1 : 0;
+    if (from_heap)
+        packed_hdr = (char *) MPL_malloc(total_sz);
+    else
+        packed_hdr = (char *) MPIDI_CH4R_get_buf(MPIDI_Global.am_buf_pool);
+
+    MPIR_Assert(packed_hdr);
+
+    /* Pass 2: concatenate the segments; offset ends up equal to total_sz. */
+    for (seg = 0; seg < iov_len; seg++) {
+        MPIR_Memcpy(packed_hdr + offset, am_hdr[seg].iov_base, am_hdr[seg].iov_len);
+        offset += am_hdr[seg].iov_len;
+    }
+
+    mpi_errno = MPIDI_NM_send_am(rank, comm, handler_id, packed_hdr, offset,
+                                 data, count, datatype, sreq, netmod_context);
+
+    /* Give the staging buffer back to wherever it came from. */
+    if (from_heap)
+        MPL_free(packed_hdr);
+    else
+        MPIDI_CH4R_release_buf(packed_hdr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_SEND_AMV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_send_amv_hdr
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Send a header-only active message whose header is scattered over `iov_len`
+ * iovec segments.  The segments are packed into one contiguous staging
+ * buffer, which is handed to MPIDI_NM_send_am_hdr() and released right after
+ * that call returns.  The staging buffer comes from the global AM buffer pool
+ * unless the packed header is larger than MPIDI_OFI_BUF_POOL_SIZE, in which
+ * case it is heap-allocated.  Returns the MPI error code of
+ * MPIDI_NM_send_am_hdr().
+ */
+static inline int MPIDI_NM_send_amv_hdr(int rank,
+                                        MPIR_Comm * comm,
+                                        int handler_id,
+                                        struct iovec *am_hdr,
+                                        size_t iov_len, MPIR_Request * sreq, void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS, from_heap;
+    size_t total_sz = 0, offset = 0, seg;
+    char *packed_hdr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_SEND_AMV_HDR);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_SEND_AMV_HDR);
+
+    /* Pass 1: size of the packed header. */
+    for (seg = 0; seg < iov_len; seg++)
+        total_sz += am_hdr[seg].iov_len;
+
+    /* TODO: avoid the malloc here, use the am_hdr directly */
+    from_heap = (total_sz > MPIDI_OFI_BUF_POOL_SIZE) ? 1 : 0;
+    if (from_heap)
+        packed_hdr = (char *) MPL_malloc(total_sz);
+    else
+        packed_hdr = (char *) MPIDI_CH4R_get_buf(MPIDI_Global.am_buf_pool);
+
+    MPIR_Assert(packed_hdr);
+
+    /* Pass 2: concatenate the segments; offset ends up equal to total_sz. */
+    for (seg = 0; seg < iov_len; seg++) {
+        MPIR_Memcpy(packed_hdr + offset, am_hdr[seg].iov_base, am_hdr[seg].iov_len);
+        offset += am_hdr[seg].iov_len;
+    }
+
+    mpi_errno = MPIDI_NM_send_am_hdr(rank, comm, handler_id, packed_hdr, offset,
+                                     sreq, netmod_context);
+
+    /* Give the staging buffer back to wherever it came from. */
+    if (from_heap)
+        MPL_free(packed_hdr);
+    else
+        MPIDI_CH4R_release_buf(packed_hdr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_SEND_AMV_HDR);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_send_am_hdr_reply
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Reply variant of the header-only active-message send.  The communicator is
+ * looked up from `context_id` with MPIDI_CH4U_context_id_to_comm() and the
+ * message is sent to `src_rank` through MPIDI_OFI_do_send_am_header() with
+ * the trailing flag set to TRUE.  Returns the MPI error code of that call.
+ */
+static inline int MPIDI_NM_send_am_hdr_reply(MPIR_Context_id_t context_id,
+                                             int src_rank,
+                                             int handler_id,
+                                             const void *am_hdr,
+                                             size_t am_hdr_sz, MPIR_Request * sreq)
+{
+    int rc = MPI_SUCCESS;
+    MPIR_Comm *reply_comm;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_SEND_AM_HDR_REPLY);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_SEND_AM_HDR_REPLY);
+
+    reply_comm = MPIDI_CH4U_context_id_to_comm(context_id);
+    rc = MPIDI_OFI_do_send_am_header(src_rank, reply_comm, handler_id,
+                                     am_hdr, am_hdr_sz, sreq, TRUE);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_SEND_AM_HDR_REPLY);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_send_am_reply
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Reply variant of the active-message send (header plus data).  The
+ * communicator is looked up from `context_id` with
+ * MPIDI_CH4U_context_id_to_comm() and the message is sent to `src_rank`
+ * through MPIDI_OFI_do_send_am() with the trailing flag set to TRUE.
+ * Returns the MPI error code of that call.
+ */
+static inline int MPIDI_NM_send_am_reply(MPIR_Context_id_t context_id,
+                                         int src_rank,
+                                         int handler_id,
+                                         const void *am_hdr,
+                                         size_t am_hdr_sz,
+                                         const void *data,
+                                         MPI_Count count,
+                                         MPI_Datatype datatype, MPIR_Request * sreq)
+{
+    int rc = MPI_SUCCESS;
+    MPIR_Comm *reply_comm;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_SEND_AM_REPLY);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_SEND_AM_REPLY);
+
+    reply_comm = MPIDI_CH4U_context_id_to_comm(context_id);
+    rc = MPIDI_OFI_do_send_am(src_rank, reply_comm, handler_id,
+                              am_hdr, am_hdr_sz, data, count, datatype, sreq, TRUE);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_SEND_AM_REPLY);
+    return rc;
+}
+
+/*
+ * Reply variant of the vectored active-message send.  The `iov_len` header
+ * segments are packed into one contiguous staging buffer, which is handed to
+ * MPIDI_NM_send_am_reply() and released right after that call returns.  The
+ * staging buffer comes from the global AM buffer pool unless the packed
+ * header is larger than MPIDI_OFI_BUF_POOL_SIZE, in which case it is
+ * heap-allocated.  Returns the MPI error code of MPIDI_NM_send_am_reply().
+ */
+static inline int MPIDI_NM_send_amv_reply(MPIR_Context_id_t context_id,
+                                          int src_rank,
+                                          int handler_id,
+                                          struct iovec *am_hdr,
+                                          size_t iov_len,
+                                          const void *data,
+                                          MPI_Count count,
+                                          MPI_Datatype datatype, MPIR_Request * sreq)
+{
+    int mpi_errno = MPI_SUCCESS, from_heap;
+    size_t total_sz = 0, offset = 0, seg;
+    char *packed_hdr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_SEND_AMV_REPLY);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_SEND_AMV_REPLY);
+
+    /* Pass 1: size of the packed header. */
+    for (seg = 0; seg < iov_len; seg++)
+        total_sz += am_hdr[seg].iov_len;
+
+    /* TODO: avoid the malloc here, use the am_hdr directly */
+    from_heap = (total_sz > MPIDI_OFI_BUF_POOL_SIZE) ? 1 : 0;
+    if (from_heap)
+        packed_hdr = (char *) MPL_malloc(total_sz);
+    else
+        packed_hdr = (char *) MPIDI_CH4R_get_buf(MPIDI_Global.am_buf_pool);
+
+    MPIR_Assert(packed_hdr);
+
+    /* Pass 2: concatenate the segments; offset ends up equal to total_sz. */
+    for (seg = 0; seg < iov_len; seg++) {
+        MPIR_Memcpy(packed_hdr + offset, am_hdr[seg].iov_base, am_hdr[seg].iov_len);
+        offset += am_hdr[seg].iov_len;
+    }
+
+    mpi_errno = MPIDI_NM_send_am_reply(context_id, src_rank, handler_id, packed_hdr, offset,
+                                       data, count, datatype, sreq);
+
+    /* Give the staging buffer back to wherever it came from. */
+    if (from_heap)
+        MPL_free(packed_hdr);
+    else
+        MPIDI_CH4R_release_buf(packed_hdr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_SEND_AMV_REPLY);
+    return mpi_errno;
+}
+
+/*
+ * Largest active-message header size supported by this netmod: the smaller of
+ *   - what is left of a short send after the AM header and the LMT payload
+ *     descriptor, and
+ *   - the largest value that fits in MPIDI_OFI_AM_HDR_SZ_BITS bits (the width
+ *     of MPIDI_OFI_am_header_t::am_hdr_sz).
+ */
+static inline size_t MPIDI_NM_am_hdr_max_sz(void)
+{
+    /* Bytes of a short send consumed by fixed per-message structures. */
+    const size_t fixed_overhead =
+        sizeof(MPIDI_OFI_am_header_t) + sizeof(MPIDI_OFI_lmt_msg_payload_t);
+    const size_t short_send_limit = MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE - fixed_overhead;
+    const size_t field_limit = (1 << MPIDI_OFI_AM_HDR_SZ_BITS) - 1;
+
+    return MPL_MIN(short_send_limit, field_limit);
+}
+
+static inline int MPIDI_NM_inject_am_hdr(int rank,
+                                         MPIR_Comm * comm,
+                                         int handler_id,
+                                         const void *am_hdr, size_t am_hdr_sz, void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_INJECT_AM_HDR);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_INJECT_AM_HDR);
+    mpi_errno = MPIDI_OFI_do_inject(rank, comm,
+                                    handler_id, am_hdr, am_hdr_sz,
+                                    netmod_context, FALSE, TRUE, TRUE);
+
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_INJECT_AM_HDR);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+static inline int MPIDI_NM_inject_am_hdr_reply(MPIR_Context_id_t context_id,
+                                               int src_rank,
+                                               int handler_id, const void *am_hdr, size_t am_hdr_sz)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_INJECT_AM_HDR_REPLY);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_INJECT_AM_HDR_REPLY);
+
+    mpi_errno = MPIDI_OFI_do_inject(src_rank, MPIDI_CH4U_context_id_to_comm(context_id),
+                                    handler_id, am_hdr, am_hdr_sz, NULL, TRUE, TRUE, FALSE);
+
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_INJECT_AM_HDR);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/*
+ * Largest payload that can be injected as an active message: the global
+ * max_buffered_send limit minus the size of the AM header, or 0 when the
+ * limit is smaller than the header itself.
+ */
+static inline size_t MPIDI_NM_am_inject_max_sz(void)
+{
+    const size_t hdr_sz = sizeof(MPIDI_OFI_am_header_t);
+
+    /* Guard against wrapping below zero in the subtraction. */
+    if (unlikely(MPIDI_Global.max_buffered_send < hdr_sz))
+        return 0;
+
+    return MPIDI_Global.max_buffered_send - hdr_sz;
+}
+
+static inline int MPIDI_NM_am_recv(MPIR_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_CH4U_send_long_ack_msg_t msg;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_AM_MATCHED);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_AM_MATCHED);
+
+    msg.sreq_ptr = (MPIDI_CH4U_REQUEST(req, req->rreq.peer_req_ptr));
+    msg.rreq_ptr = (uint64_t) req;
+    MPIR_Assert((void *) msg.sreq_ptr != NULL);
+    mpi_errno = MPIDI_NM_inject_am_hdr_reply(MPIDI_CH4U_get_context(MPIDI_CH4U_REQUEST(req, tag)),
+                                             MPIDI_CH4U_REQUEST(req, src_rank),
+                                             MPIDI_CH4U_SEND_LONG_ACK, &msg, sizeof(msg));
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_AM_MATCHED);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#endif /* NETMOD_OFI_AM_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_am_events.h b/src/mpid/ch4/netmod/ofi/ofi_am_events.h
new file mode 100644
index 0000000..fdc920c
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_am_events.h
@@ -0,0 +1,414 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_AM_OFI_EVENTS_H_INCLUDED
+#define NETMOD_AM_OFI_EVENTS_H_INCLUDED
+
+#include "ofi_am_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_handle_short_am
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Deliver a short active message whose AM header and data both travel
+ * inline in msg_hdr->payload (data starts am_hdr_sz bytes into the payload).
+ *
+ * The handler registered for msg_hdr->handler_id is called with the AM
+ * header.  Through its out-parameters it reports the destination
+ * (p_data/data_sz), whether the destination is a contiguous buffer or an
+ * array of struct iovec, an optional completion callback and the receive
+ * request.  The inline data is then copied to the destination, the request
+ * status (count plus MPI_SUCCESS or MPI_ERR_TRUNCATE) is filled in and the
+ * completion callback, if any, is invoked.
+ *
+ * Returns MPI_SUCCESS (mpi_errno is never modified here).
+ */
+static inline int MPIDI_OFI_handle_short_am(MPIDI_OFI_am_header_t * msg_hdr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    /* Handler out-parameters are given defined defaults so that a handler
+     * which leaves one untouched cannot make us branch on garbage; this
+     * mirrors MPIDI_OFI_handle_short_am_hdr. */
+    MPIR_Request *rreq = NULL;
+    void *p_data;
+    void *in_data;
+
+    size_t data_sz, in_data_sz;
+    MPIDI_NM_am_completion_handler_fn cmpl_handler_fn = NULL;
+    struct iovec *iov;
+    int i, is_contig = 0, iov_len;
+    size_t done, curr_len, rem;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_HANDLE_SHORT_AM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_HANDLE_SHORT_AM);
+
+    /* Inline data follows the AM header inside the payload. */
+    p_data = in_data = (char *) msg_hdr->payload + msg_hdr->am_hdr_sz;
+    in_data_sz = data_sz = msg_hdr->data_sz;
+
+    MPIDI_Global.am_handlers[msg_hdr->handler_id] (msg_hdr->payload,
+                                                   &p_data, &data_sz,
+                                                   &is_contig, &cmpl_handler_fn, &rreq);
+
+    /* No request: the handler left nothing for us to copy or complete. */
+    if (!rreq)
+        goto fn_exit;
+
+    /* Nothing to copy: just record the count and complete. */
+    if ((!p_data || !data_sz) && cmpl_handler_fn) {
+        MPIR_STATUS_SET_COUNT(rreq->status, data_sz);
+        cmpl_handler_fn(rreq);
+        goto fn_exit;
+    }
+
+    if (is_contig) {
+        /* data_sz is the destination capacity in bytes. */
+        if (in_data_sz > data_sz) {
+            rreq->status.MPI_ERROR = MPI_ERR_TRUNCATE;
+        }
+        else {
+            rreq->status.MPI_ERROR = MPI_SUCCESS;
+        }
+
+        data_sz = MPL_MIN(data_sz, in_data_sz);
+        MPIR_Memcpy(p_data, in_data, data_sz);
+        MPIR_STATUS_SET_COUNT(rreq->status, data_sz);
+    }
+    else {
+        /* p_data is an iovec array and data_sz its number of entries. */
+        done = 0;
+        rem = in_data_sz;
+        iov = (struct iovec *) p_data;
+        iov_len = data_sz;
+
+        for (i = 0; i < iov_len && rem > 0; i++) {
+            curr_len = MPL_MIN(rem, iov[i].iov_len);
+            MPIR_Memcpy(iov[i].iov_base, (char *) in_data + done, curr_len);
+            rem -= curr_len;
+            done += curr_len;
+        }
+
+        /* Bytes left over after the last iovec entry did not fit. */
+        if (rem) {
+            rreq->status.MPI_ERROR = MPI_ERR_TRUNCATE;
+        }
+        else {
+            rreq->status.MPI_ERROR = MPI_SUCCESS;
+        }
+
+        MPIR_STATUS_SET_COUNT(rreq->status, done);
+    }
+
+    if (cmpl_handler_fn) {
+        cmpl_handler_fn(rreq);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_HANDLE_SHORT_AM);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_handle_short_am_hdr
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Dispatch a header-only active message: run the handler registered for
+ * msg_hdr->handler_id on am_hdr (no data buffer is requested) and, when the
+ * handler hands back both a request and a completion callback, invoke the
+ * callback on that request.  Returns MPI_SUCCESS. */
+static inline int MPIDI_OFI_handle_short_am_hdr(MPIDI_OFI_am_header_t * msg_hdr, void *am_hdr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_NM_am_completion_handler_fn on_complete = NULL;
+    MPIR_Request *req = NULL;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_HANDLE_SHORT_AM_HDR);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_HANDLE_SHORT_AM_HDR);
+
+    MPIDI_Global.am_handlers[msg_hdr->handler_id] (am_hdr, NULL, NULL, NULL, &on_complete, &req);
+
+    if (req != NULL && on_complete != NULL)
+        on_complete(req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_HANDLE_SHORT_AM_HDR);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_do_rdma_read
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Pull data_sz bytes from remote address `src` on `src_rank` into `dst`,
+ * issuing one fi_read per chunk of at most MPIDI_Global.max_send bytes.
+ *
+ * Each chunk gets its own MPIDI_OFI_am_request_t from the AM buffer pool;
+ * it is tagged MPIDI_OFI_EVENT_AM_READ and points back at rreq's request
+ * header.  The RMA key comes from rreq's lmt_info.
+ *
+ * Returns MPI_SUCCESS, or the error set by MPIDI_OFI_CALL_RETRY_AM.
+ */
+static inline int MPIDI_OFI_do_rdma_read(void *dst,
+                                         uint64_t src,
+                                         size_t data_sz,
+                                         MPIR_Context_id_t context_id,
+                                         int src_rank, MPIR_Request * rreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    size_t offset, chunk, left;
+    MPIDI_OFI_am_request_t *read_req;
+    MPIR_Comm *comm;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_DO_RDMA_READ);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_DO_RDMA_READ);
+
+    for (offset = 0, left = data_sz; offset != data_sz; offset += chunk, left -= chunk) {
+        chunk = MPL_MIN(left, MPIDI_Global.max_send);
+
+        /* One pool buffer per outstanding read. */
+        MPIR_Assert(sizeof(MPIDI_OFI_am_request_t) <= MPIDI_OFI_BUF_POOL_SIZE);
+        read_req = (MPIDI_OFI_am_request_t *) MPIDI_CH4R_get_buf(MPIDI_Global.am_buf_pool);
+        MPIR_Assert(read_req);
+
+        read_req->req_hdr = MPIDI_OFI_AMREQUEST(rreq, req_hdr);
+        read_req->event_id = MPIDI_OFI_EVENT_AM_READ;
+
+        comm = MPIDI_CH4U_context_id_to_comm(context_id);
+        MPIR_Assert(comm);
+
+        MPIDI_OFI_conditional_cntr_incr();
+        MPIDI_OFI_CALL_RETRY_AM(fi_read(MPIDI_OFI_EP_TX_RMA(0),
+                                        (char *) dst + offset,
+                                        chunk, NULL,
+                                        MPIDI_OFI_comm_to_phys(comm, src_rank, MPIDI_OFI_API_TAG),
+                                        src + offset,
+                                        MPIDI_OFI_AMREQUEST_HDR(rreq, lmt_info).rma_key,
+                                        &read_req->context), FALSE /* no lock */ , read);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_DO_RDMA_READ);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_handle_long_am_hdr
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Handle an active message whose AM header was not sent inline: create a
+ * receive request, remember the LMT descriptor and message header on it,
+ * allocate am_hdr_sz bytes for the header and start RDMA reads that fetch
+ * it from the address advertised in the LMT payload.
+ *
+ * lmt_cntr is set to the number of max_send-sized reads needed for the
+ * header.
+ *
+ * Returns MPI_SUCCESS, or the error from request initialisation or from
+ * posting the RDMA reads.
+ */
+static inline int MPIDI_OFI_handle_long_am_hdr(MPIDI_OFI_am_header_t * msg_hdr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_lmt_msg_payload_t *lmt_msg;
+    MPIR_Request *rreq;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_HANDLE_LONG_AM_HDR);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_HANDLE_LONG_AM_HDR);
+
+    rreq = MPIR_Request_create(MPIR_REQUEST_KIND__RECV);
+    MPIR_Assert(rreq);
+
+    /* Start from a clean slate so MPIDI_OFI_am_init_request allocates a
+     * fresh header, as the other callers in this netmod do. */
+    MPIDI_OFI_AMREQUEST(rreq, req_hdr) = NULL;
+    mpi_errno = MPIDI_OFI_am_init_request(NULL, 0, rreq);
+
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* For this message type the payload is the LMT descriptor itself. */
+    lmt_msg = (MPIDI_OFI_lmt_msg_payload_t *) msg_hdr->payload;
+    MPIDI_OFI_AMREQUEST_HDR(rreq, lmt_info) = *lmt_msg;
+    MPIDI_OFI_AMREQUEST_HDR(rreq, msg_hdr) = *msg_hdr;
+    MPIDI_OFI_AMREQUEST_HDR(rreq, rreq_ptr) = (void *) rreq;
+    MPIDI_OFI_AMREQUEST_HDR(rreq, am_hdr) = MPL_malloc(msg_hdr->am_hdr_sz);
+    /* ceil(am_hdr_sz / max_send) reads will be posted. */
+    MPIDI_OFI_AMREQUEST_HDR(rreq, lmt_cntr) =
+        ((msg_hdr->am_hdr_sz - 1) / MPIDI_Global.max_send) + 1;
+    mpi_errno =
+        MPIDI_OFI_do_rdma_read(MPIDI_OFI_AMREQUEST_HDR(rreq, am_hdr), lmt_msg->am_hdr_src,
+                               msg_hdr->am_hdr_sz, lmt_msg->context_id, lmt_msg->src_rank, rreq);
+
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_HANDLE_LONG_AM_HDR);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_do_handle_long_am
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Receive the data of a long (LMT) active message by RDMA-reading it from
+ * the sender.
+ *
+ * msg_hdr  - AM header of the incoming message (handler id, data size).
+ * lmt_msg  - LMT descriptor: source offset, context id and source rank.
+ * am_hdr   - user AM header passed to the registered handler.
+ *
+ * The handler reports the destination (contiguous buffer or iovec array),
+ * an optional completion callback and the receive request.  The request's
+ * lmt_cntr is set to the number of reads that will be posted, then the
+ * reads are issued and the status count/error are recorded.
+ *
+ * Returns MPI_SUCCESS, or the first error from request initialisation or
+ * from posting an RDMA read.
+ */
+static inline int MPIDI_OFI_do_handle_long_am(MPIDI_OFI_am_header_t * msg_hdr,
+                                              MPIDI_OFI_lmt_msg_payload_t * lmt_msg, void *am_hdr)
+{
+    int num_reads, i, iov_len, c, mpi_errno = MPI_SUCCESS, is_contig = 0;
+    /* Defined defaults for the handler's out-parameters, so an untouched
+     * one is never read as garbage. */
+    MPIR_Request *rreq = NULL;
+    void *p_data = NULL;
+    size_t data_sz, rem, done, curr_len, in_data_sz;
+    MPIDI_NM_am_completion_handler_fn cmpl_handler_fn = NULL;
+    struct iovec *iov;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_DO_HANDLE_LONG_AM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_DO_HANDLE_LONG_AM);
+
+    in_data_sz = data_sz = msg_hdr->data_sz;
+    MPIDI_Global.am_handlers[msg_hdr->handler_id] (am_hdr,
+                                                   &p_data, &data_sz, &is_contig,
+                                                   &cmpl_handler_fn, &rreq);
+
+    if (!rreq)
+        goto fn_exit;
+
+    MPIDI_OFI_AMREQUEST(rreq, req_hdr) = NULL;
+    mpi_errno = MPIDI_OFI_am_init_request(NULL, 0, rreq);
+
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* Extra completion count taken for the netmod. */
+    MPIR_cc_incr(rreq->cc_ptr, &c);
+
+    MPIDI_OFI_AMREQUEST_HDR(rreq, cmpl_handler_fn) = cmpl_handler_fn;
+
+    /* Nothing to fetch: complete immediately. */
+    if ((!p_data || !data_sz) && cmpl_handler_fn) {
+        cmpl_handler_fn(rreq);
+        MPIDI_OFI_am_request_complete(rreq);
+        goto fn_exit;
+    }
+
+    MPIDI_OFI_AMREQUEST_HDR(rreq, msg_hdr) = *msg_hdr;
+    MPIDI_OFI_AMREQUEST_HDR(rreq, lmt_info) = *lmt_msg;
+    MPIDI_OFI_AMREQUEST_HDR(rreq, rreq_ptr) = (void *) rreq;
+
+    if (is_contig) {
+        /* data_sz is the destination capacity in bytes. */
+        if (in_data_sz > data_sz) {
+            rreq->status.MPI_ERROR = MPI_ERR_TRUNCATE;
+        }
+        else {
+            rreq->status.MPI_ERROR = MPI_SUCCESS;
+        }
+
+        data_sz = MPL_MIN(data_sz, in_data_sz);
+        MPIDI_OFI_AMREQUEST_HDR(rreq, lmt_cntr) = ((data_sz - 1) / MPIDI_Global.max_send) + 1;
+        mpi_errno = MPIDI_OFI_do_rdma_read(p_data,
+                                           lmt_msg->src_offset,
+                                           data_sz, lmt_msg->context_id, lmt_msg->src_rank,
+                                           rreq);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIR_ERR_POP(mpi_errno);
+
+        MPIR_STATUS_SET_COUNT(rreq->status, data_sz);
+    }
+    else {
+        /* p_data is an iovec array and data_sz its number of entries. */
+        rem = in_data_sz;
+        iov = (struct iovec *) p_data;
+        iov_len = data_sz;
+
+        /* FIXME: optimize iov processing part */
+
+        /* set lmt counter: total number of reads over all iovec entries,
+         * computed up front before any read is posted */
+        MPIDI_OFI_AMREQUEST_HDR(rreq, lmt_cntr) = 0;
+
+        for (i = 0; i < iov_len && rem > 0; i++) {
+            curr_len = MPL_MIN(rem, iov[i].iov_len);
+            num_reads = ((curr_len - 1) / MPIDI_Global.max_send) + 1;
+            MPIDI_OFI_AMREQUEST_HDR(rreq, lmt_cntr) += num_reads;
+            rem -= curr_len;
+        }
+
+        done = 0;
+        rem = in_data_sz;
+
+        for (i = 0; i < iov_len && rem > 0; i++) {
+            curr_len = MPL_MIN(rem, iov[i].iov_len);
+            mpi_errno = MPIDI_OFI_do_rdma_read(iov[i].iov_base, lmt_msg->src_offset + done,
+                                               curr_len, lmt_msg->context_id,
+                                               lmt_msg->src_rank, rreq);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIR_ERR_POP(mpi_errno);
+
+            rem -= curr_len;
+            done += curr_len;
+        }
+
+        /* Bytes left over after the last iovec entry did not fit. */
+        if (rem) {
+            rreq->status.MPI_ERROR = MPI_ERR_TRUNCATE;
+        }
+        else {
+            rreq->status.MPI_ERROR = MPI_SUCCESS;
+        }
+
+        MPIR_STATUS_SET_COUNT(rreq->status, done);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_DO_HANDLE_LONG_AM);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_handle_long_am
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Entry point for a long AM whose user header arrived inline: the payload
+ * holds the AM header followed, am_hdr_sz bytes in, by the LMT descriptor.
+ * Both are forwarded to MPIDI_OFI_do_handle_long_am, whose result is
+ * returned. */
+static inline int MPIDI_OFI_handle_long_am(MPIDI_OFI_am_header_t * msg_hdr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_lmt_msg_payload_t *lmt_info;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_HANDLE_LONG_AM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_HANDLE_LONG_AM);
+
+    /* The LMT descriptor sits right behind the inline AM header. */
+    lmt_info =
+        (MPIDI_OFI_lmt_msg_payload_t *) ((char *) msg_hdr->payload + msg_hdr->am_hdr_sz);
+
+    mpi_errno = MPIDI_OFI_do_handle_long_am(msg_hdr, lmt_info, msg_hdr->payload);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_HANDLE_LONG_AM);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_handle_lmt_ack
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Sender-side handling of an LMT acknowledgement.
+ *
+ * The ack payload carries the send request pointer.  The memory region
+ * registered for that request is torn down (its RMA index is returned to
+ * the COMM_WORLD allocator, the MR is closed and the in-flight MR counter
+ * is decremented), any pack buffer is freed, the request is completed and
+ * the send-completion handler for the request's handler id is invoked.
+ *
+ * Returns MPI_SUCCESS, or the error from closing the MR or from the
+ * send-completion handler.
+ */
+static inline int MPIDI_OFI_handle_lmt_ack(MPIDI_OFI_am_header_t * msg_hdr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *sreq;
+    MPIDI_OFI_ack_msg_payload_t *ack_msg;
+    int handler_id;
+    uint64_t index;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_HANDLE_LMT_ACK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_HANDLE_LMT_ACK);
+
+    /* Recover the originating send request from the ack payload. */
+    ack_msg = (MPIDI_OFI_ack_msg_payload_t *) msg_hdr->payload;
+    sreq = (MPIR_Request *) ack_msg->sreq_ptr;
+
+    /* Derive the allocator index from the MR key and release it, then close the MR. */
+    index = fi_mr_key(MPIDI_OFI_AMREQUEST_HDR(sreq, lmt_mr)) >> MPIDI_Global.huge_rma_shift;
+    MPIDI_OFI_index_allocator_free(MPIDI_OFI_COMM(MPIR_Process.comm_world).rma_id_allocator, index);
+    MPIDI_OFI_CALL_NOLOCK(fi_close(&MPIDI_OFI_AMREQUEST_HDR(sreq, lmt_mr)->fid), mr_unreg);
+    OPA_decr_int(&MPIDI_Global.am_inflight_rma_send_mrs);
+
+    /* A pack buffer exists only when one was allocated for this send. */
+    if (MPIDI_OFI_AMREQUEST_HDR(sreq, pack_buffer)) {
+        MPL_free(MPIDI_OFI_AMREQUEST_HDR(sreq, pack_buffer));
+    }
+
+    /* handler_id is read before the request is completed.
+     * NOTE(review): presumably because completion may release the request
+     * header - confirm against MPIDI_OFI_am_request_complete. */
+    handler_id = MPIDI_OFI_AMREQUEST_HDR(sreq, msg_hdr).handler_id;
+    MPIDI_OFI_am_request_complete(sreq);
+    mpi_errno = MPIDI_Global.am_send_cmpl_handlers[handler_id] (sreq);
+
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_HANDLE_LMT_ACK);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_dispatch_ack
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Inject a small acknowledgement message of type am_type to `rank` in the
+ * communicator identified by context_id.  The payload is the peer's send
+ * request pointer; no data follows it.  netmod_context is not used.
+ *
+ * NOTE(review): hdr.handler_id is not assigned here - confirm the receiver
+ * never reads it for ack messages.
+ *
+ * Returns MPI_SUCCESS, or the error set by MPIDI_OFI_CALL_RETRY_AM.
+ */
+static inline int MPIDI_OFI_dispatch_ack(int rank,
+                                         int context_id,
+                                         uint64_t sreq_ptr, int am_type, void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Comm *peer_comm;
+    MPIDI_OFI_ack_msg_t ack;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_DISPATCH_ACK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_DISPATCH_ACK);
+
+    /* Header: the "AM header" is just the ack payload, with no data. */
+    ack.hdr.am_type = am_type;
+    ack.hdr.am_hdr_sz = sizeof(ack.pyld);
+    ack.hdr.data_sz = 0;
+    ack.pyld.sreq_ptr = sreq_ptr;
+
+    peer_comm = MPIDI_CH4U_context_id_to_comm(context_id);
+
+    MPIDI_OFI_CALL_RETRY_AM(fi_inject(MPIDI_OFI_EP_TX_MSG(0), &ack, sizeof(ack),
+                                      MPIDI_OFI_comm_to_phys(peer_comm, rank, MPIDI_OFI_API_TAG)),
+                            FALSE /* no lock */ , inject);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_DISPATCH_ACK);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#endif /* NETMOD_AM_OFI_EVENTS_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_am_impl.h b/src/mpid/ch4/netmod/ofi/ofi_am_impl.h
new file mode 100644
index 0000000..1a9cba6
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_am_impl.h
@@ -0,0 +1,565 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_AM_IMPL_H_INCLUDED
+#define NETMOD_OFI_AM_IMPL_H_INCLUDED
+
+#include "ofi_impl.h"
+
+static inline int MPIDI_OFI_progress_do_queue(void *netmod_context);
+
+/*
+  Per-object lock for OFI
+
+  * When calling OFI function MPIDI_OFI_THREAD_FI_MUTEX must be held.
+  * When being called from the MPI layer (app), we must grab the lock.
+    This is the case for regular (non-reply) functions such as send_am.
+  * When being called from callback function or progress engine, we must
+    not grab the lock because the progress engine is already holding the lock.
+    This is the case for reply functions such as send_am_reply.
+
+  MPIDI_OFI_CALL_RETRY_AM(FUNC, LOCK, STR)
+
+  * FUNC is the OFI call expression; it is re-evaluated on every attempt.
+  * LOCK, when nonzero, wraps both the call and the progress step in
+    MPIDI_OFI_THREAD_FI_MUTEX.
+  * STR is appended to "**ofi_" to form the error message key.
+
+  A return of 0 ends the loop.  On -FI_EAGAIN the macro runs
+  MPIDI_OFI_progress_do_queue(NULL) and tries again.  Any other return
+  value, or a failure of the progress step, is reported through the
+  MPIR_ERR_* macros.  The enclosing function must provide `mpi_errno` and
+  FCNAME; the MPIR_ERR_* macros are expected to jump to its fn_fail label.
+*/
+#define MPIDI_OFI_CALL_RETRY_AM(FUNC,LOCK,STR)                  \
+    do {                                                                \
+        ssize_t _ret;                                                   \
+        do {                                                            \
+            if (LOCK) MPID_THREAD_CS_ENTER(POBJ,MPIDI_OFI_THREAD_FI_MUTEX); \
+            _ret = FUNC;                                                \
+            if (LOCK) MPID_THREAD_CS_EXIT(POBJ,MPIDI_OFI_THREAD_FI_MUTEX); \
+            if (likely(_ret==0)) break;                                  \
+            MPIR_ERR_##CHKANDJUMP4(_ret != -FI_EAGAIN,                  \
+                                   mpi_errno,                           \
+                                   MPI_ERR_OTHER,                       \
+                                   "**ofi_"#STR,                        \
+                                   "**ofi_"#STR" %s %d %s %s",          \
+                                   __SHORT_FILE__,                      \
+                                   __LINE__,                            \
+                                   FCNAME,                              \
+                                   fi_strerror(-_ret));                 \
+            if (LOCK) MPID_THREAD_CS_ENTER(POBJ,MPIDI_OFI_THREAD_FI_MUTEX); \
+            mpi_errno = MPIDI_OFI_progress_do_queue(NULL);      \
+            if (LOCK) MPID_THREAD_CS_EXIT(POBJ,MPIDI_OFI_THREAD_FI_MUTEX); \
+            if (mpi_errno != MPI_SUCCESS)                                \
+                MPIR_ERR_POP(mpi_errno);                                \
+        } while (_ret == -FI_EAGAIN);                                   \
+    } while (0)
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_am_clear_request
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Release the netmod AM request header attached to sreq, if any.
+ *
+ * A separately allocated AM header (one that does not point at the
+ * header's built-in am_hdr_buf) is freed first, then the header itself is
+ * returned to its buffer pool and the request's req_hdr is reset to NULL.
+ * Calling this on a request without a header is a no-op.
+ */
+static inline void MPIDI_OFI_am_clear_request(MPIR_Request * sreq)
+{
+    MPIDI_OFI_am_request_header_t *req_hdr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_AM_OFI_CLEAR_REQ);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_AM_OFI_CLEAR_REQ);
+
+    req_hdr = MPIDI_OFI_AMREQUEST(sreq, req_hdr);
+
+    /* Single exit path: every ENTER is matched by an EXIT, including the
+     * case where there is nothing to release. */
+    if (req_hdr) {
+        if (req_hdr->am_hdr != &req_hdr->am_hdr_buf[0]) {
+            MPL_free(req_hdr->am_hdr);
+        }
+
+        MPIDI_CH4R_release_buf(req_hdr);
+        MPIDI_OFI_AMREQUEST(sreq, req_hdr) = NULL;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_AM_OFI_CLEAR_REQ);
+    return;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_am_init_request
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Make sure sreq has a netmod AM request header able to hold am_hdr_sz
+ * bytes of AM header and, when am_hdr is non-NULL, copy the header in.
+ *
+ * A missing request header is taken from MPIDI_Global.am_buf_pool and
+ * starts out using its built-in am_hdr_buf (capacity
+ * MPIDI_OFI_MAX_AM_HDR_SIZE).  If the requested size exceeds the current
+ * capacity, a heap buffer of exactly am_hdr_sz bytes replaces the old one.
+ *
+ * Returns MPI_SUCCESS.
+ */
+static inline int MPIDI_OFI_am_init_request(const void *am_hdr,
+                                            size_t am_hdr_sz, MPIR_Request * sreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_am_request_header_t *hdr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_AM_OFI_INIT_REQ);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_AM_OFI_INIT_REQ);
+
+    hdr = MPIDI_OFI_AMREQUEST(sreq, req_hdr);
+
+    if (hdr == NULL) {
+        /* First use: grab a pooled header and point it at its inline buffer. */
+        hdr = (MPIDI_OFI_am_request_header_t *) MPIDI_CH4R_get_buf(MPIDI_Global.am_buf_pool);
+        MPIR_Assert(hdr);
+        MPIDI_OFI_AMREQUEST(sreq, req_hdr) = hdr;
+
+        hdr->am_hdr = (void *) &hdr->am_hdr_buf[0];
+        hdr->am_hdr_sz = MPIDI_OFI_MAX_AM_HDR_SIZE;
+    }
+
+    if (hdr->am_hdr_sz < am_hdr_sz) {
+        /* Too small: drop a previous heap buffer (never the inline one)
+         * and allocate one of the requested size. */
+        if (hdr->am_hdr != &hdr->am_hdr_buf[0])
+            MPL_free(hdr->am_hdr);
+
+        hdr->am_hdr = MPL_malloc(am_hdr_sz);
+        MPIR_Assert(hdr->am_hdr);
+        hdr->am_hdr_sz = am_hdr_sz;
+    }
+
+    if (am_hdr != NULL)
+        MPIR_Memcpy(hdr->am_hdr, am_hdr, am_hdr_sz);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_AM_OFI_INIT_REQ);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_repost_buffer
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Re-post the multi-receive buffer described by MPIDI_Global.am_msg[index],
+ * where index is taken from req viewed as a repost request.  `buf` is not
+ * used.  The FI mutex is not taken here.
+ * Returns MPI_SUCCESS, or the error set by MPIDI_OFI_CALL_RETRY_AM. */
+static inline int MPIDI_OFI_repost_buffer(void *buf, MPIR_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_am_repost_request_t *repost;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_REPOST_BUFFER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_REPOST_BUFFER);
+
+    repost = (MPIDI_OFI_am_repost_request_t *) req;
+
+    MPIDI_OFI_CALL_RETRY_AM(fi_recvmsg(MPIDI_OFI_EP_RX_MSG(0),
+                                       &MPIDI_Global.am_msg[repost->index],
+                                       FI_MULTI_RECV | FI_COMPLETION),
+                            FALSE /* no lock */ , repost);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_REPOST_BUFFER);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_progress_do_queue
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Read at most one completion from MPIDI_Global.p2p_cq and stash it for
+ * later processing instead of handling it here.
+ *
+ * The entry goes into the fixed-size cq_buffered array unless that array
+ * is full or the overflow list already holds entries, in which case it is
+ * appended to the heap-allocated cq_buff_list.  If the completion is a
+ * receive flagged FI_MULTI_RECV, the corresponding buffer is re-posted.
+ *
+ * netmod_context is not used.
+ *
+ * Returns MPI_SUCCESS (also when the CQ is empty), or the error from CQ
+ * error handling or from re-posting the buffer.
+ */
+static inline int MPIDI_OFI_progress_do_queue(void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS, ret;
+    struct fi_cq_tagged_entry cq_entry;
+
+    /* Caller must hold MPIDI_OFI_THREAD_FI_MUTEX */
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_PROGRESS_DO_QUEUE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_PROGRESS_DO_QUEUE);
+
+    ret = fi_cq_read(MPIDI_Global.p2p_cq, &cq_entry, 1);
+
+    /* Nothing available right now: not an error. */
+    if (unlikely(ret == -FI_EAGAIN))
+        goto fn_exit;
+
+    if (ret < 0) {
+        mpi_errno = MPIDI_OFI_handle_cq_error_util(ret);
+        goto fn_fail;
+    }
+
+    /* Use the overflow list when advancing head would reach tail, or when
+     * the list is already non-empty. */
+    if (((MPIDI_Global.cq_buff_head + 1) %
+         MPIDI_OFI_NUM_CQ_BUFFERED == MPIDI_Global.cq_buff_tail) ||
+        !slist_empty(&MPIDI_Global.cq_buff_list)) {
+        MPIDI_OFI_cq_list_t *list_entry =
+            (MPIDI_OFI_cq_list_t *) MPL_malloc(sizeof(MPIDI_OFI_cq_list_t));
+        MPIR_Assert(list_entry);
+        list_entry->cq_entry = cq_entry;
+        slist_insert_tail(&list_entry->entry, &MPIDI_Global.cq_buff_list);
+    }
+    else {
+        /* Store at head and advance it, wrapping at MPIDI_OFI_NUM_CQ_BUFFERED. */
+        MPIDI_Global.cq_buffered[MPIDI_Global.cq_buff_head].cq_entry = cq_entry;
+        MPIDI_Global.cq_buff_head = (MPIDI_Global.cq_buff_head + 1) % MPIDI_OFI_NUM_CQ_BUFFERED;
+    }
+
+    /* A receive completion carrying FI_MULTI_RECV triggers a re-post of
+     * the buffer identified by the completion's op_context. */
+    if ((cq_entry.flags & FI_RECV) && (cq_entry.flags & FI_MULTI_RECV)) {
+        mpi_errno = MPIDI_OFI_repost_buffer(cq_entry.op_context,
+                                            MPIDI_OFI_context_to_request(cq_entry.op_context));
+
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_PROGRESS_DO_QUEUE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_do_send_am_header
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Send a header-only active message (type MPIDI_AMTYPE_SHORT_HDR) to
+ * `rank` in `comm`.
+ *
+ * The request gets a fresh netmod header holding a copy of am_hdr, its
+ * completion counter is incremented once, and the message is sent as two
+ * iovec entries: the OFI AM header followed by the user AM header.  The FI
+ * mutex is taken around the send only when is_reply is zero.
+ *
+ * Returns MPI_SUCCESS, or the error from request initialisation or from
+ * MPIDI_OFI_CALL_RETRY_AM.
+ */
+static inline int MPIDI_OFI_do_send_am_header(int rank,
+                                              MPIR_Comm * comm,
+                                              int handler_id,
+                                              const void *am_hdr,
+                                              size_t am_hdr_sz, MPIR_Request * sreq, int is_reply)
+{
+    int mpi_errno = MPI_SUCCESS, cc_prev;
+    int need_lock = !is_reply;
+    MPIDI_OFI_am_header_t *hdr;
+    struct iovec vec[2];
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_DO_SEND_AM_HDR);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_DO_SEND_AM_HDR);
+
+    /* Always start with a newly allocated request header. */
+    MPIDI_OFI_AMREQUEST(sreq, req_hdr) = NULL;
+    mpi_errno = MPIDI_OFI_am_init_request(am_hdr, am_hdr_sz, sreq);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* Values must fit the bit widths reserved for them. */
+    MPIR_Assert(handler_id < (1 << MPIDI_OFI_AM_HANDLER_ID_BITS));
+    MPIR_Assert(am_hdr_sz < (1ULL << MPIDI_OFI_AM_HDR_SZ_BITS));
+    MPIR_Assert((uint64_t) comm->rank < (1ULL << MPIDI_OFI_AM_RANK_BITS));
+
+    hdr = &MPIDI_OFI_AMREQUEST_HDR(sreq, msg_hdr);
+    hdr->am_type = MPIDI_AMTYPE_SHORT_HDR;
+    hdr->handler_id = handler_id;
+    hdr->am_hdr_sz = am_hdr_sz;
+    hdr->data_sz = 0;
+
+    MPIDI_OFI_AMREQUEST_HDR(sreq, pack_buffer) = NULL;
+    MPIR_cc_incr(sreq->cc_ptr, &cc_prev);
+
+    MPIR_Assert((sizeof(*hdr) + am_hdr_sz) <= MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE);
+
+    vec[0].iov_base = hdr;
+    vec[0].iov_len = sizeof(*hdr);
+    vec[1].iov_base = MPIDI_OFI_AMREQUEST_HDR(sreq, am_hdr);
+    vec[1].iov_len = am_hdr_sz;
+
+    MPIDI_OFI_AMREQUEST(sreq, event_id) = MPIDI_OFI_EVENT_AM_SEND;
+    MPIDI_OFI_CALL_RETRY_AM(fi_sendv(MPIDI_OFI_EP_TX_MSG(0), vec, NULL, 2,
+                                     MPIDI_OFI_comm_to_phys(comm, rank, MPIDI_OFI_API_TAG),
+                                     &MPIDI_OFI_AMREQUEST(sreq, context)), need_lock, sendv);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_DO_SEND_AM_HDR);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_send_am_long
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Send a long active message (type MPIDI_AMTYPE_LMT_REQ): instead of
+ * shipping the data, register it for remote read and send a descriptor.
+ *
+ * rank, comm   - destination.
+ * handler_id   - AM handler to run at the target.
+ * am_hdr       - not referenced here; the header sent is the copy already
+ *                stored on sreq's request header.
+ * am_hdr_sz    - size of that header in bytes.
+ * data/data_sz - buffer registered with FI_REMOTE_READ access.
+ * sreq         - send request; its completion counter is incremented twice.
+ * need_lock    - selects the locking variant for registration and send.
+ *
+ * The message consists of three iovec entries: OFI AM header, user AM
+ * header and the LMT descriptor.
+ *
+ * Returns MPI_SUCCESS, or the error set by the OFI call macros.
+ */
+static inline int MPIDI_OFI_send_am_long(int rank,
+                                         MPIR_Comm * comm,
+                                         int handler_id,
+                                         const void *am_hdr,
+                                         size_t am_hdr_sz,
+                                         const void *data,
+                                         size_t data_sz, MPIR_Request * sreq, int need_lock)
+{
+    int mpi_errno = MPI_SUCCESS, c;
+    MPIDI_OFI_am_header_t *msg_hdr;
+    MPIDI_OFI_lmt_msg_payload_t *lmt_info;
+    struct iovec iov[3];
+    uint64_t index;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_SEND_AM_LONG);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_SEND_AM_LONG);
+
+    /* Values must fit the bit widths reserved for them. */
+    MPIR_Assert(handler_id < (1 << MPIDI_OFI_AM_HANDLER_ID_BITS));
+    MPIR_Assert(am_hdr_sz < (1ULL << MPIDI_OFI_AM_HDR_SZ_BITS));
+    MPIR_Assert(data_sz < (1ULL << MPIDI_OFI_AM_DATA_SZ_BITS));
+    MPIR_Assert((uint64_t) comm->rank < (1ULL << MPIDI_OFI_AM_RANK_BITS));
+
+    msg_hdr = &MPIDI_OFI_AMREQUEST_HDR(sreq, msg_hdr);
+    msg_hdr->handler_id = handler_id;
+    msg_hdr->am_hdr_sz = am_hdr_sz;
+    msg_hdr->data_sz = data_sz;
+    msg_hdr->am_type = MPIDI_AMTYPE_LMT_REQ;
+
+    /* Descriptor telling the target where and how to read the data. */
+    lmt_info = &MPIDI_OFI_AMREQUEST_HDR(sreq, lmt_info);
+    lmt_info->context_id = comm->context_id;
+    lmt_info->src_rank = comm->rank;
+    lmt_info->src_offset = MPIDI_OFI_ENABLE_MR_SCALABLE ? (uint64_t) 0 /* MR_SCALABLE */ : (uint64_t) data;     /* MR_BASIC */
+    lmt_info->sreq_ptr = (uint64_t) sreq;
+    /* Always allocates RMA ID from COMM_WORLD as the actual associated communicator
+     * is not available here */
+    index =
+        MPIDI_OFI_index_allocator_alloc(MPIDI_OFI_COMM(MPIR_Process.comm_world).rma_id_allocator);
+    MPIR_Assert((int) index < MPIDI_Global.max_huge_rmas);
+    /* With scalable MRs the key is derived from the index; otherwise it is
+     * filled in after registration below. */
+    lmt_info->rma_key = MPIDI_OFI_ENABLE_MR_SCALABLE ? index << MPIDI_Global.huge_rma_shift : 0;
+
+    MPIR_cc_incr(sreq->cc_ptr, &c);     /* send completion */
+    MPIR_cc_incr(sreq->cc_ptr, &c);     /* lmt ack handler */
+    MPIR_Assert((sizeof(*msg_hdr) + sizeof(*lmt_info) + am_hdr_sz) <=
+                MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE);
+    /* Register the data buffer for remote read; the two branches differ
+     * only in which call macro (locking or not) is used. */
+    if (need_lock)
+        MPIDI_OFI_CALL(fi_mr_reg(MPIDI_Global.domain,
+                                 data,
+                                 data_sz,
+                                 FI_REMOTE_READ,
+                                 0ULL,
+                                 lmt_info->rma_key,
+                                 0ULL, &MPIDI_OFI_AMREQUEST_HDR(sreq, lmt_mr), NULL), mr_reg);
+    else
+        MPIDI_OFI_CALL_NOLOCK(fi_mr_reg(MPIDI_Global.domain,
+                                        data,
+                                        data_sz,
+                                        FI_REMOTE_READ,
+                                        0ULL,
+                                        lmt_info->rma_key,
+                                        0ULL,
+                                        &MPIDI_OFI_AMREQUEST_HDR(sreq, lmt_mr), NULL), mr_reg);
+    /* Count of registered send MRs; decremented in MPIDI_OFI_handle_lmt_ack. */
+    OPA_incr_int(&MPIDI_Global.am_inflight_rma_send_mrs);
+
+    if (!MPIDI_OFI_ENABLE_MR_SCALABLE) {
+        /* MR_BASIC */
+        lmt_info->rma_key = fi_mr_key(MPIDI_OFI_AMREQUEST_HDR(sreq, lmt_mr));
+    }
+
+    iov[0].iov_base = msg_hdr;
+    iov[0].iov_len = sizeof(*msg_hdr);
+
+    iov[1].iov_base = MPIDI_OFI_AMREQUEST_HDR(sreq, am_hdr);
+    iov[1].iov_len = am_hdr_sz;
+
+    iov[2].iov_base = lmt_info;
+    iov[2].iov_len = sizeof(*lmt_info);
+    MPIDI_OFI_AMREQUEST(sreq, event_id) = MPIDI_OFI_EVENT_AM_SEND;
+    MPIDI_OFI_CALL_RETRY_AM(fi_sendv(MPIDI_OFI_EP_TX_MSG(0), iov, NULL, 3,
+                                     MPIDI_OFI_comm_to_phys(comm, rank, MPIDI_OFI_API_TAG),
+                                     &MPIDI_OFI_AMREQUEST(sreq, context)), need_lock, sendv);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_SEND_AM_LONG);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_send_am_short
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Send a short active message (type MPIDI_AMTYPE_SHORT) with its data
+ * inline.
+ *
+ * The message is three iovec entries: the OFI AM header, the user AM
+ * header stored on sreq's request header (the am_hdr argument itself is
+ * not referenced) and `count` bytes starting at `data`.  sreq's completion
+ * counter is incremented once before the send.  need_lock selects whether
+ * the FI mutex is taken around the send.
+ *
+ * Returns MPI_SUCCESS, or the error set by MPIDI_OFI_CALL_RETRY_AM.
+ */
+static inline int MPIDI_OFI_send_am_short(int rank,
+                                          MPIR_Comm * comm,
+                                          int handler_id,
+                                          const void *am_hdr,
+                                          size_t am_hdr_sz,
+                                          const void *data,
+                                          MPI_Count count, MPIR_Request * sreq, int need_lock)
+{
+    struct iovec vec[3];
+    MPIDI_OFI_am_header_t *hdr;
+    int mpi_errno = MPI_SUCCESS, cc_prev;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_SEND_AM_SHORT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_SEND_AM_SHORT);
+
+    /* Values must fit the bit widths reserved for them. */
+    MPIR_Assert(handler_id < (1 << MPIDI_OFI_AM_HANDLER_ID_BITS));
+    MPIR_Assert(am_hdr_sz < (1ULL << MPIDI_OFI_AM_HDR_SZ_BITS));
+    MPIR_Assert((uint64_t) count < (1ULL << MPIDI_OFI_AM_DATA_SZ_BITS));
+    MPIR_Assert((uint64_t) comm->rank < (1ULL << MPIDI_OFI_AM_RANK_BITS));
+
+    hdr = &MPIDI_OFI_AMREQUEST_HDR(sreq, msg_hdr);
+    hdr->am_type = MPIDI_AMTYPE_SHORT;
+    hdr->handler_id = handler_id;
+    hdr->am_hdr_sz = am_hdr_sz;
+    hdr->data_sz = count;
+
+    /* [0] OFI AM header, [1] user AM header, [2] inline data. */
+    vec[0].iov_base = hdr;
+    vec[0].iov_len = sizeof(*hdr);
+    vec[1].iov_base = MPIDI_OFI_AMREQUEST_HDR(sreq, am_hdr);
+    vec[1].iov_len = am_hdr_sz;
+    vec[2].iov_base = (void *) data;
+    vec[2].iov_len = count;
+
+    MPIR_cc_incr(sreq->cc_ptr, &cc_prev);
+    MPIDI_OFI_AMREQUEST(sreq, event_id) = MPIDI_OFI_EVENT_AM_SEND;
+    MPIDI_OFI_CALL_RETRY_AM(fi_sendv(MPIDI_OFI_EP_TX_MSG(0), vec, NULL, 3,
+                                     MPIDI_OFI_comm_to_phys(comm, rank, MPIDI_OFI_API_TAG),
+                                     &MPIDI_OFI_AMREQUEST(sreq, context)), need_lock, sendv);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_SEND_AM_SHORT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_do_send_am
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Send an active message made of an AM header (am_hdr, am_hdr_sz) and a
+ * payload described by (buf, count, datatype) to `rank`.
+ *
+ * Paths, in the order they are tested:
+ *   1. handler_id == MPIDI_CH4U_SEND and AM header + payload + OFI AM header
+ *      exceed MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE: only a
+ *      MPIDI_CH4U_SEND_LONG_REQ header is injected; the source buffer, count,
+ *      datatype, message tag and target rank are recorded in sreq.
+ *   2. the total fits in MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE:
+ *      MPIDI_OFI_send_am_short().
+ *   3. anything else: MPIDI_OFI_send_am_long().
+ * On paths 2 and 3 a non-contiguous payload is first packed into a heap
+ * buffer that is stored in the request (pack_buffer); for contiguous data
+ * pack_buffer is set to NULL and the user buffer is sent directly.
+ *
+ * The short/long senders receive need_lock = !is_reply.
+ *
+ * Returns mpi_errno: MPI_SUCCESS unless one of the steps reported an error.
+ */
+static inline int MPIDI_OFI_do_send_am(int rank,
+                                       MPIR_Comm * comm,
+                                       int handler_id,
+                                       const void *am_hdr,
+                                       size_t am_hdr_sz,
+                                       const void *buf,
+                                       size_t count,
+                                       MPI_Datatype datatype, MPIR_Request * sreq, int is_reply)
+{
+    int dt_contig, mpi_errno = MPI_SUCCESS;
+    char *send_buf;
+    size_t data_sz;
+    MPI_Aint dt_true_lb, last;
+    MPIR_Datatype *dt_ptr;
+    int need_lock = !is_reply;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_DO_SEND_AM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_DO_SEND_AM);
+
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
+    /* Default send location: the user buffer offset by dt_true_lb.  It is
+     * replaced by the pack buffer below when the datatype is not contiguous. */
+    send_buf = (char *) buf + dt_true_lb;
+
+    if (handler_id == MPIDI_CH4U_SEND &&
+        am_hdr_sz + data_sz + sizeof(MPIDI_OFI_am_header_t) > MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE) {
+        MPIDI_CH4U_send_long_req_msg_t lreq_hdr;
+
+        /* NOTE(review): am_hdr_sz bytes are copied into lreq_hdr.hdr without a
+         * size check here - confirm callers never pass a larger header. */
+        MPIR_Memcpy(&lreq_hdr.hdr, am_hdr, am_hdr_sz);
+        lreq_hdr.data_sz = data_sz;
+        lreq_hdr.sreq_ptr = (uint64_t) sreq;
+        MPIDI_CH4U_REQUEST(sreq, req->lreq).src_buf = buf;
+        MPIDI_CH4U_REQUEST(sreq, req->lreq).count = count;
+        dtype_add_ref_if_not_builtin(datatype);
+        MPIDI_CH4U_REQUEST(sreq, req->lreq).datatype = datatype;
+        MPIDI_CH4U_REQUEST(sreq, req->lreq).msg_tag = lreq_hdr.hdr.msg_tag;
+        MPIDI_CH4U_REQUEST(sreq, src_rank) = rank;
+        mpi_errno = MPIDI_NM_inject_am_hdr(rank, comm, MPIDI_CH4U_SEND_LONG_REQ,
+                                           &lreq_hdr, sizeof(lreq_hdr), NULL);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+        goto fn_exit;
+    }
+
+    MPIDI_OFI_AMREQUEST(sreq, req_hdr) = NULL;
+    mpi_errno = MPIDI_OFI_am_init_request(am_hdr, am_hdr_sz, sreq);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    if (!dt_contig) {
+        size_t segment_first;
+        struct MPIDU_Segment *segment_ptr;
+        segment_ptr = MPIDU_Segment_alloc();
+        MPIR_ERR_CHKANDJUMP1(segment_ptr == NULL, mpi_errno,
+                             MPI_ERR_OTHER, "**nomem", "**nomem %s", "Send MPIDU_Segment_alloc");
+        MPIDU_Segment_init(buf, count, datatype, segment_ptr, 0);
+        segment_first = 0;
+        last = data_sz;
+        MPIDI_OFI_AMREQUEST_HDR(sreq, pack_buffer) = (char *) MPL_malloc(data_sz);
+        /* Release the segment before bailing out so that a failed pack-buffer
+         * allocation does not leak it. */
+        if (MPIDI_OFI_AMREQUEST_HDR(sreq, pack_buffer) == NULL)
+            MPIDU_Segment_free(segment_ptr);
+        MPIR_ERR_CHKANDJUMP1(MPIDI_OFI_AMREQUEST_HDR(sreq, pack_buffer) == NULL, mpi_errno,
+                             MPI_ERR_OTHER, "**nomem", "**nomem %s", "Send Pack buffer alloc");
+        MPIDU_Segment_pack(segment_ptr, segment_first, &last,
+                           MPIDI_OFI_AMREQUEST_HDR(sreq, pack_buffer));
+        MPIDU_Segment_free(segment_ptr);
+        send_buf = (char *) MPIDI_OFI_AMREQUEST_HDR(sreq, pack_buffer);
+    }
+    else {
+        MPIDI_OFI_AMREQUEST_HDR(sreq, pack_buffer) = NULL;
+    }
+
+    if (am_hdr_sz + data_sz + sizeof(MPIDI_OFI_am_header_t) <= MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE) {
+        mpi_errno =
+            MPIDI_OFI_send_am_short(rank, comm, handler_id, MPIDI_OFI_AMREQUEST_HDR(sreq, am_hdr),
+                                    am_hdr_sz, send_buf, data_sz, sreq, need_lock);
+    }
+    else {
+        mpi_errno =
+            MPIDI_OFI_send_am_long(rank, comm, handler_id, MPIDI_OFI_AMREQUEST_HDR(sreq, am_hdr),
+                                   am_hdr_sz, send_buf, data_sz, sreq, need_lock);
+    }
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_DO_SEND_AM);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+static inline int MPIDI_OFI_do_inject(int rank,
+                                      MPIR_Comm * comm,
+                                      int handler_id,
+                                      const void *am_hdr,
+                                      size_t am_hdr_sz,
+                                      void *netmod_context,
+                                      int is_reply, int use_comm_table, int need_lock)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_am_header_t msg_hdr;
+    struct fi_msg msg;
+    struct iovec msg_iov[2];
+    uint64_t send_flag = FI_INJECT;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_DO_INJECT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_DO_INJECT);
+
+    MPIR_Assert(handler_id < (1 << MPIDI_OFI_AM_HANDLER_ID_BITS));
+    MPIR_Assert(am_hdr_sz < (1ULL << MPIDI_OFI_AM_HDR_SZ_BITS));
+
+    msg_hdr.handler_id = handler_id;
+    msg_hdr.am_hdr_sz = am_hdr_sz;
+    msg_hdr.data_sz = 0;
+    msg_hdr.am_type = MPIDI_AMTYPE_SHORT_HDR;
+
+    MPIR_Assert((uint64_t) comm->rank < (1ULL << MPIDI_OFI_AM_RANK_BITS));
+
+    msg_iov[0].iov_base = (void *) &msg_hdr;
+    msg_iov[0].iov_len = sizeof(msg_hdr);
+
+    msg_iov[1].iov_base = (void *) am_hdr;
+    msg_iov[1].iov_len = am_hdr_sz;
+
+    msg.msg_iov = &msg_iov[0];
+    msg.desc = NULL;
+    msg.iov_count = 2;
+    msg.context = NULL;
+    msg.addr = use_comm_table ?
+        MPIDI_OFI_comm_to_phys(comm, rank, MPIDI_OFI_API_MSG) :
+        MPIDI_OFI_to_phys(rank, MPIDI_OFI_API_MSG);
+
+    if (unlikely(am_hdr_sz + sizeof(msg_hdr) > MPIDI_Global.max_buffered_send)) {
+        MPIR_Request *sreq;
+        char *ibuf;
+
+        sreq = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
+        MPIR_Assert(sreq);
+        ibuf = (char *) MPL_malloc(am_hdr_sz + sizeof(msg_hdr));
+        MPIR_Assert(ibuf);
+        memcpy(ibuf, &msg_hdr, sizeof(msg_hdr));
+        memcpy(ibuf + sizeof(msg_hdr), am_hdr, am_hdr_sz);
+        msg_iov[0].iov_base = ibuf;
+        msg_iov[0].iov_len = am_hdr_sz + sizeof(msg_hdr);
+        msg.iov_count = 1;
+
+        MPIDI_OFI_REQUEST(sreq, event_id) = MPIDI_OFI_EVENT_INJECT_EMU;
+        MPIDI_OFI_REQUEST(sreq, util.inject_buf) = ibuf;
+        /* Cancel FI_INJECT and ask for completion event */
+        send_flag = FI_COMPLETION;
+        msg.context = (void *) &(MPIDI_OFI_REQUEST(sreq, context));
+        OPA_incr_int(&MPIDI_Global.am_inflight_inject_emus);
+    }
+
+    MPIDI_OFI_CALL_RETRY_AM(fi_sendmsg(MPIDI_OFI_EP_TX_MSG(0), &msg, send_flag), need_lock, send);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_DO_INJECT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+/* Decrement the completion counter referenced by req->cc_ptr and, once it
+ * reports that nothing is left incomplete, hand the request to
+ * MPIDI_CH4U_request_release(). */
+static inline void MPIDI_OFI_am_request_complete(MPIR_Request * req)
+{
+    int still_pending;
+
+    MPIR_cc_decr(req->cc_ptr, &still_pending);
+    if (still_pending)
+        return;
+
+    MPIDI_CH4U_request_release(req);
+}
+
+#endif /*NETMOD_OFI_AM_IMPL_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_am_probe.h b/src/mpid/ch4/netmod/ofi/ofi_am_probe.h
new file mode 100644
index 0000000..e8575ec
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_am_probe.h
@@ -0,0 +1,39 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_AM_OFI_PROBE_H_INCLUDED
+#define NETMOD_AM_OFI_PROBE_H_INCLUDED
+
+#include "ofi_impl.h"
+
+/* Netmod probe entry point: passes every argument unchanged to
+ * MPIDI_CH4U_probe() and returns its result. */
+static inline int MPIDI_NM_probe(int source, int tag, MPIR_Comm * comm,
+                                 int context_offset, MPI_Status * status)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_probe(source, tag, comm, context_offset, status);
+    return mpi_errno;
+}
+
+/* Netmod improbe entry point: passes every argument unchanged to
+ * MPIDI_CH4U_improbe() and returns its result. */
+static inline int MPIDI_NM_improbe(int source, int tag, MPIR_Comm * comm,
+                                   int context_offset, int *flag,
+                                   MPIR_Request ** message, MPI_Status * status)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_improbe(source, tag, comm, context_offset, flag, message, status);
+    return mpi_errno;
+}
+
+/* Netmod iprobe entry point: passes every argument unchanged to
+ * MPIDI_CH4U_iprobe() and returns its result. */
+static inline int MPIDI_NM_iprobe(int source, int tag, MPIR_Comm * comm,
+                                  int context_offset, int *flag, MPI_Status * status)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_iprobe(source, tag, comm, context_offset, flag, status);
+    return mpi_errno;
+}
+
+#endif /* NETMOD_AM_OFI_PROBE_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_am_recv.h b/src/mpid/ch4/netmod/ofi/ofi_am_recv.h
new file mode 100644
index 0000000..0e844db
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_am_recv.h
@@ -0,0 +1,61 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_AM_OFI_RECV_H_INCLUDED
+#define NETMOD_AM_OFI_RECV_H_INCLUDED
+
+#include "ofi_impl.h"
+
+/* Netmod recv entry point: passes every argument unchanged to
+ * MPIDI_CH4U_recv() and returns its result. */
+static inline int MPIDI_NM_recv(void *buf, int count, MPI_Datatype datatype,
+                                int rank, int tag, MPIR_Comm * comm, int context_offset,
+                                MPI_Status * status, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_recv(buf, count, datatype, rank, tag, comm,
+                                context_offset, status, request);
+    return mpi_errno;
+}
+
+/* Netmod recv_init entry point: passes every argument unchanged to
+ * MPIDI_CH4U_recv_init() and returns its result. */
+static inline int MPIDI_NM_recv_init(void *buf, int count, MPI_Datatype datatype,
+                                     int rank, int tag, MPIR_Comm * comm,
+                                     int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_recv_init(buf, count, datatype, rank, tag, comm,
+                                     context_offset, request);
+    return mpi_errno;
+}
+
+/* Netmod imrecv entry point: passes every argument unchanged to
+ * MPIDI_CH4U_imrecv() and returns its result. */
+static inline int MPIDI_NM_imrecv(void *buf, int count, MPI_Datatype datatype,
+                                  MPIR_Request * message, MPIR_Request ** rreqp)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_imrecv(buf, count, datatype, message, rreqp);
+    return mpi_errno;
+}
+
+/* Netmod irecv entry point: passes every argument unchanged to
+ * MPIDI_CH4U_irecv() and returns its result. */
+static inline int MPIDI_NM_irecv(void *buf, int count, MPI_Datatype datatype,
+                                 int rank, int tag, MPIR_Comm * comm,
+                                 int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_irecv(buf, count, datatype, rank, tag, comm,
+                                 context_offset, request);
+    return mpi_errno;
+}
+
+/* Netmod cancel_recv entry point: delegates to MPIDI_CH4U_cancel_recv(). */
+static inline int MPIDI_NM_cancel_recv(MPIR_Request * rreq)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_cancel_recv(rreq);
+    return mpi_errno;
+}
+
+#endif /* NETMOD_AM_OFI_RECV_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_am_rma.h b/src/mpid/ch4/netmod/ofi/ofi_am_rma.h
new file mode 100644
index 0000000..e93b892
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_am_rma.h
@@ -0,0 +1,148 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_AM_OFI_RMA_H_INCLUDED
+#define NETMOD_AM_OFI_RMA_H_INCLUDED
+
+#include "ofi_impl.h"
+
+/* Netmod put entry point: passes every argument unchanged to
+ * MPIDI_CH4U_put() and returns its result. */
+static inline int MPIDI_NM_put(const void *origin_addr, int origin_count,
+                               MPI_Datatype origin_datatype, int target_rank,
+                               MPI_Aint target_disp, int target_count,
+                               MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_put(origin_addr, origin_count, origin_datatype, target_rank,
+                               target_disp, target_count, target_datatype, win);
+    return mpi_errno;
+}
+
+/* Netmod get entry point: passes every argument unchanged to
+ * MPIDI_CH4U_get() and returns its result. */
+static inline int MPIDI_NM_get(void *origin_addr, int origin_count,
+                               MPI_Datatype origin_datatype, int target_rank,
+                               MPI_Aint target_disp, int target_count,
+                               MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_get(origin_addr, origin_count, origin_datatype, target_rank,
+                               target_disp, target_count, target_datatype, win);
+    return mpi_errno;
+}
+
+/* Netmod rput entry point: passes every argument unchanged to
+ * MPIDI_CH4U_rput() and returns its result. */
+static inline int MPIDI_NM_rput(const void *origin_addr, int origin_count,
+                                MPI_Datatype origin_datatype, int target_rank,
+                                MPI_Aint target_disp, int target_count,
+                                MPI_Datatype target_datatype,
+                                MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_rput(origin_addr, origin_count, origin_datatype, target_rank,
+                                target_disp, target_count, target_datatype, win, request);
+    return mpi_errno;
+}
+
+
+/* Netmod compare_and_swap entry point: passes every argument unchanged to
+ * MPIDI_CH4U_compare_and_swap() and returns its result. */
+static inline int MPIDI_NM_compare_and_swap(const void *origin_addr, const void *compare_addr,
+                                            void *result_addr, MPI_Datatype datatype,
+                                            int target_rank, MPI_Aint target_disp,
+                                            MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_compare_and_swap(origin_addr, compare_addr, result_addr,
+                                            datatype, target_rank, target_disp, win);
+    return mpi_errno;
+}
+
+/* Netmod raccumulate entry point: passes every argument unchanged to
+ * MPIDI_CH4U_raccumulate() and returns its result. */
+static inline int MPIDI_NM_raccumulate(const void *origin_addr, int origin_count,
+                                       MPI_Datatype origin_datatype, int target_rank,
+                                       MPI_Aint target_disp, int target_count,
+                                       MPI_Datatype target_datatype, MPI_Op op,
+                                       MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_raccumulate(origin_addr, origin_count, origin_datatype,
+                                       target_rank, target_disp, target_count,
+                                       target_datatype, op, win, request);
+    return mpi_errno;
+}
+
+/* Netmod rget_accumulate entry point: passes every argument unchanged to
+ * MPIDI_CH4U_rget_accumulate() and returns its result. */
+static inline int MPIDI_NM_rget_accumulate(const void *origin_addr, int origin_count,
+                                           MPI_Datatype origin_datatype,
+                                           void *result_addr, int result_count,
+                                           MPI_Datatype result_datatype,
+                                           int target_rank, MPI_Aint target_disp,
+                                           int target_count, MPI_Datatype target_datatype,
+                                           MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_rget_accumulate(origin_addr, origin_count, origin_datatype,
+                                           result_addr, result_count, result_datatype,
+                                           target_rank, target_disp, target_count,
+                                           target_datatype, op, win, request);
+    return mpi_errno;
+}
+
+/* Netmod fetch_and_op entry point: passes every argument unchanged to
+ * MPIDI_CH4U_fetch_and_op() and returns its result. */
+static inline int MPIDI_NM_fetch_and_op(const void *origin_addr, void *result_addr,
+                                        MPI_Datatype datatype, int target_rank,
+                                        MPI_Aint target_disp, MPI_Op op, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_fetch_and_op(origin_addr, result_addr, datatype,
+                                        target_rank, target_disp, op, win);
+    return mpi_errno;
+}
+
+
+/* Netmod rget entry point: passes every argument unchanged to
+ * MPIDI_CH4U_rget() and returns its result. */
+static inline int MPIDI_NM_rget(void *origin_addr, int origin_count,
+                                MPI_Datatype origin_datatype, int target_rank,
+                                MPI_Aint target_disp, int target_count,
+                                MPI_Datatype target_datatype,
+                                MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_rget(origin_addr, origin_count, origin_datatype, target_rank,
+                                target_disp, target_count, target_datatype, win, request);
+    return mpi_errno;
+}
+
+
+/* Netmod get_accumulate entry point: passes every argument unchanged to
+ * MPIDI_CH4U_get_accumulate() and returns its result. */
+static inline int MPIDI_NM_get_accumulate(const void *origin_addr, int origin_count,
+                                          MPI_Datatype origin_datatype,
+                                          void *result_addr, int result_count,
+                                          MPI_Datatype result_datatype,
+                                          int target_rank, MPI_Aint target_disp,
+                                          int target_count, MPI_Datatype target_datatype,
+                                          MPI_Op op, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_get_accumulate(origin_addr, origin_count, origin_datatype,
+                                          result_addr, result_count, result_datatype,
+                                          target_rank, target_disp, target_count,
+                                          target_datatype, op, win);
+    return mpi_errno;
+}
+
+/* Netmod accumulate entry point: passes every argument unchanged to
+ * MPIDI_CH4U_accumulate() and returns its result. */
+static inline int MPIDI_NM_accumulate(const void *origin_addr, int origin_count,
+                                      MPI_Datatype origin_datatype, int target_rank,
+                                      MPI_Aint target_disp, int target_count,
+                                      MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_accumulate(origin_addr, origin_count, origin_datatype,
+                                      target_rank, target_disp, target_count,
+                                      target_datatype, op, win);
+    return mpi_errno;
+}
+
+#endif /* NETMOD_AM_OFI_RMA_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_am_send.h b/src/mpid/ch4/netmod/ofi/ofi_am_send.h
new file mode 100644
index 0000000..777377a
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_am_send.h
@@ -0,0 +1,128 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_AM_OFI_SEND_H_INCLUDED
+#define NETMOD_AM_OFI_SEND_H_INCLUDED
+
+#include "ofi_impl.h"
+
+/* Netmod send entry point: passes every argument unchanged to
+ * MPIDI_CH4U_send() and returns its result. */
+static inline int MPIDI_NM_send(const void *buf, int count, MPI_Datatype datatype,
+                                int rank, int tag, MPIR_Comm * comm,
+                                int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_send(buf, count, datatype, rank, tag, comm,
+                                context_offset, request);
+    return mpi_errno;
+}
+
+/* Netmod rsend entry point: passes every argument unchanged to
+ * MPIDI_CH4U_rsend() and returns its result. */
+static inline int MPIDI_NM_rsend(const void *buf, int count, MPI_Datatype datatype,
+                                 int rank, int tag, MPIR_Comm * comm,
+                                 int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_rsend(buf, count, datatype, rank, tag, comm,
+                                 context_offset, request);
+    return mpi_errno;
+}
+
+
+
+/* Netmod irsend entry point: passes every argument unchanged to
+ * MPIDI_CH4U_irsend() and returns its result. */
+static inline int MPIDI_NM_irsend(const void *buf, int count, MPI_Datatype datatype,
+                                  int rank, int tag, MPIR_Comm * comm,
+                                  int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_irsend(buf, count, datatype, rank, tag, comm,
+                                  context_offset, request);
+    return mpi_errno;
+}
+
+/* Netmod ssend entry point: passes every argument unchanged to
+ * MPIDI_CH4U_ssend() and returns its result. */
+static inline int MPIDI_NM_ssend(const void *buf, int count, MPI_Datatype datatype,
+                                 int rank, int tag, MPIR_Comm * comm,
+                                 int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_ssend(buf, count, datatype, rank, tag, comm,
+                                 context_offset, request);
+    return mpi_errno;
+}
+
+/* Netmod startall entry point: delegates to MPIDI_CH4U_startall() with the
+ * same count and request array. */
+static inline int MPIDI_NM_startall(int count, MPIR_Request * requests[])
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_startall(count, requests);
+    return mpi_errno;
+}
+
+/* Netmod send_init entry point: passes every argument unchanged to
+ * MPIDI_CH4U_send_init() and returns its result. */
+static inline int MPIDI_NM_send_init(const void *buf, int count, MPI_Datatype datatype,
+                                     int rank, int tag, MPIR_Comm * comm,
+                                     int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_send_init(buf, count, datatype, rank, tag, comm,
+                                     context_offset, request);
+    return mpi_errno;
+}
+
+/* Netmod ssend_init entry point: passes every argument unchanged to
+ * MPIDI_CH4U_ssend_init() and returns its result. */
+static inline int MPIDI_NM_ssend_init(const void *buf, int count, MPI_Datatype datatype,
+                                      int rank, int tag, MPIR_Comm * comm,
+                                      int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_ssend_init(buf, count, datatype, rank, tag, comm,
+                                      context_offset, request);
+    return mpi_errno;
+}
+
+/* Netmod bsend_init entry point: passes every argument unchanged to
+ * MPIDI_CH4U_bsend_init() and returns its result. */
+static inline int MPIDI_NM_bsend_init(const void *buf, int count, MPI_Datatype datatype,
+                                      int rank, int tag, MPIR_Comm * comm,
+                                      int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_bsend_init(buf, count, datatype, rank, tag, comm,
+                                      context_offset, request);
+    return mpi_errno;
+}
+
+/* Netmod rsend_init entry point: passes every argument unchanged to
+ * MPIDI_CH4U_rsend_init() and returns its result. */
+static inline int MPIDI_NM_rsend_init(const void *buf, int count, MPI_Datatype datatype,
+                                      int rank, int tag, MPIR_Comm * comm,
+                                      int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_rsend_init(buf, count, datatype, rank, tag, comm,
+                                      context_offset, request);
+    return mpi_errno;
+}
+
+/* Netmod isend entry point: passes every argument unchanged to
+ * MPIDI_CH4U_isend() and returns its result. */
+static inline int MPIDI_NM_isend(const void *buf, int count, MPI_Datatype datatype,
+                                 int rank, int tag, MPIR_Comm * comm,
+                                 int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_isend(buf, count, datatype, rank, tag, comm,
+                                 context_offset, request);
+    return mpi_errno;
+}
+
+/* Netmod issend entry point: passes every argument unchanged to
+ * MPIDI_CH4U_issend() and returns its result. */
+static inline int MPIDI_NM_issend(const void *buf, int count, MPI_Datatype datatype,
+                                  int rank, int tag, MPIR_Comm * comm,
+                                  int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_issend(buf, count, datatype, rank, tag, comm,
+                                  context_offset, request);
+    return mpi_errno;
+}
+
+/* Netmod cancel_send entry point: delegates to MPIDI_CH4U_cancel_send(). */
+static inline int MPIDI_NM_cancel_send(MPIR_Request * sreq)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_cancel_send(sreq);
+    return mpi_errno;
+}
+
+#endif /* NETMOD_AM_OFI_SEND_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_am_spawn.h b/src/mpid/ch4/netmod/ofi/ofi_am_spawn.h
new file mode 100644
index 0000000..75a5ef5
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_am_spawn.h
@@ -0,0 +1,50 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_AM_OFI_DYNPROC_H_INCLUDED
+#define NETMOD_AM_OFI_DYNPROC_H_INCLUDED
+
+#include "ofi_impl.h"
+
+/* Stub: uses none of its arguments.  MPIR_Assert(0) is hit on every call and
+ * MPI_SUCCESS is returned if execution continues past it. */
+static inline int MPIDI_NM_comm_connect(const char *port_name, MPIR_Info * info, int root,
+                                        MPIR_Comm * comm, MPIR_Comm ** newcomm_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+    return mpi_errno;
+}
+
+/* Stub: comm_ptr is not used.  MPIR_Assert(0) is hit on every call and
+ * MPI_SUCCESS is returned if execution continues past it. */
+static inline int MPIDI_NM_comm_disconnect(MPIR_Comm * comm_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+    return mpi_errno;
+}
+
+/* Stub: uses none of its arguments (port_name is not written).
+ * MPIR_Assert(0) is hit on every call and MPI_SUCCESS is returned if
+ * execution continues past it. */
+static inline int MPIDI_NM_open_port(MPIR_Info * info_ptr, char *port_name)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+    return mpi_errno;
+}
+
+/* Stub: port_name is not used.  MPIR_Assert(0) is hit on every call and
+ * MPI_SUCCESS is returned if execution continues past it. */
+static inline int MPIDI_NM_close_port(const char *port_name)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+    return mpi_errno;
+}
+
+/* Stub: uses none of its arguments.  MPIR_Assert(0) is hit on every call and
+ * MPI_SUCCESS is returned if execution continues past it. */
+static inline int MPIDI_NM_comm_accept(const char *port_name, MPIR_Info * info, int root,
+                                       MPIR_Comm * comm, MPIR_Comm ** newcomm_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+    return mpi_errno;
+}
+
+#endif /* NETMOD_AM_OFI_DYNPROC_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_am_win.h b/src/mpid/ch4/netmod/ofi/ofi_am_win.h
new file mode 100644
index 0000000..89e5081
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_am_win.h
@@ -0,0 +1,160 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_AM_OFI_WIN_H_INCLUDED
+#define NETMOD_AM_OFI_WIN_H_INCLUDED
+
+#include "ofi_impl.h"
+
+/* Delegates to MPIDI_CH4R_win_set_info() with the arguments unchanged. */
+static inline int MPIDI_NM_win_set_info(MPIR_Win * win, MPIR_Info * info)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_set_info(win, info);
+    return mpi_errno;
+}
+
+
+/* Delegates to MPIDI_CH4R_win_start() with the arguments unchanged. */
+static inline int MPIDI_NM_win_start(MPIR_Group * group, int assert, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_start(group, assert, win);
+    return mpi_errno;
+}
+
+
+/* Delegates to MPIDI_CH4R_win_complete() for the given window. */
+static inline int MPIDI_NM_win_complete(MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_complete(win);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_post() with the arguments unchanged. */
+static inline int MPIDI_NM_win_post(MPIR_Group * group, int assert, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_post(group, assert, win);
+    return mpi_errno;
+}
+
+
+/* Delegates to MPIDI_CH4R_win_wait() for the given window. */
+static inline int MPIDI_NM_win_wait(MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_wait(win);
+    return mpi_errno;
+}
+
+
+/* Delegates to MPIDI_CH4R_win_test() with the arguments unchanged. */
+static inline int MPIDI_NM_win_test(MPIR_Win * win, int *flag)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_test(win, flag);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_lock() with the arguments unchanged. */
+static inline int MPIDI_NM_win_lock(int lock_type, int rank, int assert, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_lock(lock_type, rank, assert, win);
+    return mpi_errno;
+}
+
+
+/* Delegates to MPIDI_CH4R_win_unlock() with the arguments unchanged. */
+static inline int MPIDI_NM_win_unlock(int rank, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_unlock(rank, win);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_get_info() with the arguments unchanged. */
+static inline int MPIDI_NM_win_get_info(MPIR_Win * win, MPIR_Info ** info_p_p)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_get_info(win, info_p_p);
+    return mpi_errno;
+}
+
+
+/* Delegates to MPIDI_CH4R_win_free() with the same window pointer. */
+static inline int MPIDI_NM_win_free(MPIR_Win ** win_ptr)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_free(win_ptr);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_fence() with the arguments unchanged. */
+static inline int MPIDI_NM_win_fence(int assert, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_fence(assert, win);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_create() with the arguments unchanged. */
+static inline int MPIDI_NM_win_create(void *base, MPI_Aint length, int disp_unit,
+                                      MPIR_Info * info, MPIR_Comm * comm_ptr,
+                                      MPIR_Win ** win_ptr)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_create(base, length, disp_unit, info, comm_ptr, win_ptr);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_attach() with the arguments unchanged. */
+static inline int MPIDI_NM_win_attach(MPIR_Win * win, void *base, MPI_Aint size)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_attach(win, base, size);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_allocate_shared() with the arguments unchanged. */
+static inline int MPIDI_NM_win_allocate_shared(MPI_Aint size, int disp_unit,
+                                               MPIR_Info * info_ptr, MPIR_Comm * comm_ptr,
+                                               void **base_ptr, MPIR_Win ** win_ptr)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_allocate_shared(size, disp_unit, info_ptr, comm_ptr,
+                                               base_ptr, win_ptr);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_detach() with the arguments unchanged. */
+static inline int MPIDI_NM_win_detach(MPIR_Win * win, const void *base)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_detach(win, base);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_shared_query() with the arguments unchanged. */
+static inline int MPIDI_NM_win_shared_query(MPIR_Win * win, int rank, MPI_Aint * size,
+                                            int *disp_unit, void *baseptr)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_shared_query(win, rank, size, disp_unit, baseptr);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_allocate() with the arguments unchanged. */
+static inline int MPIDI_NM_win_allocate(MPI_Aint size, int disp_unit, MPIR_Info * info,
+                                        MPIR_Comm * comm, void *baseptr, MPIR_Win ** win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_allocate(size, disp_unit, info, comm, baseptr, win);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_flush() with the arguments unchanged. */
+static inline int MPIDI_NM_win_flush(int rank, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_flush(rank, win);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_flush_local_all() for the given window. */
+static inline int MPIDI_NM_win_flush_local_all(MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_flush_local_all(win);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_unlock_all() for the given window. */
+static inline int MPIDI_NM_win_unlock_all(MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_unlock_all(win);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_create_dynamic() with the arguments unchanged. */
+static inline int MPIDI_NM_win_create_dynamic(MPIR_Info * info, MPIR_Comm * comm,
+                                              MPIR_Win ** win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_create_dynamic(info, comm, win);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_flush_local() with the arguments unchanged. */
+static inline int MPIDI_NM_win_flush_local(int rank, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_flush_local(rank, win);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_sync() for the given window. */
+static inline int MPIDI_NM_win_sync(MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_sync(win);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_flush_all() for the given window. */
+static inline int MPIDI_NM_win_flush_all(MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_flush_all(win);
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4R_win_lock_all() with the arguments unchanged. */
+static inline int MPIDI_NM_win_lock_all(int assert, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_lock_all(assert, win);
+    return mpi_errno;
+}
+
+
+#endif /* NETMOD_AM_OFI_WIN_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_coll.h b/src/mpid/ch4/netmod/ofi/ofi_coll.h
new file mode 100644
index 0000000..1f87481
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_coll.h
@@ -0,0 +1,869 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_COLL_H_INCLUDED
+#define NETMOD_OFI_COLL_H_INCLUDED
+
+#include "ofi_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_barrier
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_barrier(MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_BARRIER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_BARRIER);
+
+    /* Forward every argument to MPIR_Barrier and return its error code. */
+    mpi_errno = MPIR_Barrier(comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_BARRIER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_bcast
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_bcast(void *buffer, int count, MPI_Datatype datatype,
+                                 int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_BCAST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_BCAST);
+
+    /* Forward every argument to MPIR_Bcast and return its error code. */
+    mpi_errno = MPIR_Bcast(buffer, count, datatype, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_BCAST);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_allreduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_allreduce(const void *sendbuf, void *recvbuf, int count,
+                                     MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                     MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLREDUCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLREDUCE);
+
+    /* Forward every argument to MPIR_Allreduce and return its error code. */
+    mpi_errno = MPIR_Allreduce(sendbuf, recvbuf, count, datatype, op, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLREDUCE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLGATHER);
+
+    /* Forward every argument to MPIR_Allgather and return its error code. */
+    mpi_errno = MPIR_Allgather(sendbuf, sendcount, sendtype,
+                               recvbuf, recvcount, recvtype, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLGATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                      void *recvbuf, const int *recvcounts, const int *displs,
+                                      MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                      MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLGATHERV);
+
+    /* Forward every argument to MPIR_Allgatherv and return its error code. */
+    mpi_errno = MPIR_Allgatherv(sendbuf, sendcount, sendtype, recvbuf,
+                                recvcounts, displs, recvtype, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLGATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_gather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                  void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                  int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_GATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_GATHER);
+
+    /* Forward every argument to MPIR_Gather and return its error code. */
+    mpi_errno = MPIR_Gather(sendbuf, sendcount, sendtype,
+                            recvbuf, recvcount, recvtype, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_GATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_gatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, const int *recvcounts, const int *displs,
+                                   MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr,
+                                   MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_GATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_GATHERV);
+
+    /* Forward every argument to MPIR_Gatherv and return its error code. */
+    mpi_errno = MPIR_Gatherv(sendbuf, sendcount, sendtype, recvbuf,
+                             recvcounts, displs, recvtype, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_GATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                   int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_SCATTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_SCATTER);
+
+    /* Forward every argument to MPIR_Scatter and return its error code. */
+    mpi_errno = MPIR_Scatter(sendbuf, sendcount, sendtype, recvbuf,
+                             recvcount, recvtype, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_SCATTER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_scatterv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_scatterv(const void *sendbuf, const int *sendcounts,
+                                    const int *displs, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                    int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_SCATTERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_SCATTERV);
+
+    /* Forward every argument to MPIR_Scatterv and return its error code. */
+    mpi_errno = MPIR_Scatterv(sendbuf, sendcounts, displs, sendtype,
+                              recvbuf, recvcount, recvtype, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_SCATTERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                    MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLTOALL);
+
+    /* Forward every argument to MPIR_Alltoall and return its error code. */
+    mpi_errno = MPIR_Alltoall(sendbuf, sendcount, sendtype,
+                              recvbuf, recvcount, recvtype, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLTOALL);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_alltoallv(const void *sendbuf, const int *sendcounts,
+                                     const int *sdispls, MPI_Datatype sendtype,
+                                     void *recvbuf, const int *recvcounts,
+                                     const int *rdispls, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLTOALLV);
+    /* Forward every argument to MPIR_Alltoallv and return its error code. */
+    mpi_errno = MPIR_Alltoallv(sendbuf, sendcounts, sdispls, sendtype,
+                               recvbuf, recvcounts, rdispls, recvtype, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLTOALLV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_alltoallw(const void *sendbuf, const int sendcounts[],
+                                     const int sdispls[], const MPI_Datatype sendtypes[],
+                                     void *recvbuf, const int recvcounts[],
+                                     const int rdispls[], const MPI_Datatype recvtypes[],
+                                     MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLTOALLW);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLTOALLW);
+
+    /* Forward every argument to MPIR_Alltoallw and return its error code. */
+    mpi_errno = MPIR_Alltoallw(sendbuf, sendcounts, sdispls, sendtypes,
+                               recvbuf, recvcounts, rdispls, recvtypes, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLTOALLW);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_reduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_reduce(const void *sendbuf, void *recvbuf, int count,
+                                  MPI_Datatype datatype, MPI_Op op, int root,
+                                  MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_REDUCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_REDUCE);
+
+    /* Forward every argument to MPIR_Reduce and return its error code. */
+    mpi_errno = MPIR_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_REDUCE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_reduce_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_reduce_scatter(const void *sendbuf, void *recvbuf,
+                                          const int recvcounts[], MPI_Datatype datatype,
+                                          MPI_Op op, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_REDUCE_SCATTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_REDUCE_SCATTER);
+
+    /* Forward every argument to MPIR_Reduce_scatter and return its error code. */
+    mpi_errno = MPIR_Reduce_scatter(sendbuf, recvbuf, recvcounts, datatype, op, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_REDUCE_SCATTER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_reduce_scatter_block
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_reduce_scatter_block(const void *sendbuf, void *recvbuf,
+                                                int recvcount, MPI_Datatype datatype,
+                                                MPI_Op op, MPIR_Comm * comm_ptr,
+                                                MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_REDUCE_SCATTER_BLOCK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_REDUCE_SCATTER_BLOCK);
+
+    /* Forward every argument to MPIR_Reduce_scatter_block and return its error code. */
+    mpi_errno = MPIR_Reduce_scatter_block(sendbuf, recvbuf, recvcount, datatype,
+                                          op, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_REDUCE_SCATTER_BLOCK);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_scan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_scan(const void *sendbuf, void *recvbuf, int count,
+                                MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_SCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_SCAN);
+
+    /* Forward every argument to MPIR_Scan and return its error code. */
+    mpi_errno = MPIR_Scan(sendbuf, recvbuf, count, datatype, op, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_SCAN);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_exscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_exscan(const void *sendbuf, void *recvbuf, int count,
+                                  MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                  MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_EXSCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_EXSCAN);
+
+    /* Forward every argument to MPIR_Exscan and return its error code. */
+    mpi_errno = MPIR_Exscan(sendbuf, recvbuf, count, datatype, op, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_EXSCAN);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_neighbor_allgather(const void *sendbuf, int sendcount,
+                                              MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                              MPI_Datatype recvtype, MPIR_Comm * comm_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLGATHER);
+
+    /* Forward every argument to MPIR_Neighbor_allgather_impl and return its error code. */
+    mpi_errno = MPIR_Neighbor_allgather_impl(sendbuf, sendcount, sendtype,
+                                             recvbuf, recvcount, recvtype, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLGATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_neighbor_allgatherv(const void *sendbuf, int sendcount,
+                                               MPI_Datatype sendtype, void *recvbuf,
+                                               const int recvcounts[], const int displs[],
+                                               MPI_Datatype recvtype, MPIR_Comm * comm_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLGATHERV);
+
+    /* Forward every argument to MPIR_Neighbor_allgatherv_impl and return its error code. */
+    mpi_errno = MPIR_Neighbor_allgatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                              recvcounts, displs, recvtype, comm_ptr);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLGATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_neighbor_alltoall(const void *sendbuf, int sendcount,
+                                             MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                             MPI_Datatype recvtype, MPIR_Comm * comm_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLTOALL);
+
+    /* Forward every argument to MPIR_Neighbor_alltoall_impl and return its error code. */
+    mpi_errno = MPIR_Neighbor_alltoall_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                            recvcount, recvtype, comm_ptr);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLTOALL);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_neighbor_alltoallv(const void *sendbuf, const int sendcounts[],
+                                              const int sdispls[], MPI_Datatype sendtype,
+                                              void *recvbuf, const int recvcounts[],
+                                              const int rdispls[], MPI_Datatype recvtype,
+                                              MPIR_Comm * comm_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLTOALLV);
+
+    /* Forward every argument to MPIR_Neighbor_alltoallv_impl and return its error code. */
+    mpi_errno = MPIR_Neighbor_alltoallv_impl(sendbuf, sendcounts, sdispls, sendtype, recvbuf,
+                                             recvcounts, rdispls, recvtype, comm_ptr);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLTOALLV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_neighbor_alltoallw(const void *sendbuf, const int sendcounts[],
+                                              const MPI_Aint sdispls[],
+                                              const MPI_Datatype sendtypes[], void *recvbuf,
+                                              const int recvcounts[], const MPI_Aint rdispls[],
+                                              const MPI_Datatype recvtypes[], MPIR_Comm * comm_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLTOALLW);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLTOALLW);
+
+    /* Forward every argument to MPIR_Neighbor_alltoallw_impl and return its error code. */
+    mpi_errno = MPIR_Neighbor_alltoallw_impl(sendbuf, sendcounts, sdispls, sendtypes, recvbuf,
+                                             recvcounts, rdispls, recvtypes, comm_ptr);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLTOALLW);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ineighbor_allgather(const void *sendbuf, int sendcount,
+                                               MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                               MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                               MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLGATHER);
+
+    /* Forward every argument to MPIR_Ineighbor_allgather_impl and return its error code. */
+    mpi_errno = MPIR_Ineighbor_allgather_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                              recvcount, recvtype, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLGATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ineighbor_allgatherv(const void *sendbuf, int sendcount,
+                                                MPI_Datatype sendtype, void *recvbuf,
+                                                const int recvcounts[], const int displs[],
+                                                MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                                MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLGATHERV);
+
+    /* Forward every argument to MPIR_Ineighbor_allgatherv_impl and return its error code. */
+    mpi_errno = MPIR_Ineighbor_allgatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                               recvcounts, displs, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLGATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ineighbor_alltoall(const void *sendbuf, int sendcount,
+                                              MPI_Datatype sendtype, void *recvbuf,
+                                              int recvcount, MPI_Datatype recvtype,
+                                              MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLTOALL);
+
+    /* Forward every argument to MPIR_Ineighbor_alltoall_impl and return its error code. */
+    mpi_errno = MPIR_Ineighbor_alltoall_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                             recvcount, recvtype, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLTOALL);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ineighbor_alltoallv(const void *sendbuf, const int sendcounts[],
+                                               const int sdispls[], MPI_Datatype sendtype,
+                                               void *recvbuf, const int recvcounts[],
+                                               const int rdispls[], MPI_Datatype recvtype,
+                                               MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLTOALLV);
+
+    /* Forward every argument to MPIR_Ineighbor_alltoallv_impl and return its error code. */
+    mpi_errno = MPIR_Ineighbor_alltoallv_impl(sendbuf, sendcounts, sdispls, sendtype, recvbuf,
+                                              recvcounts, rdispls, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLTOALLV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ineighbor_alltoallw(const void *sendbuf, const int sendcounts[],
+                                               const MPI_Aint sdispls[],
+                                               const MPI_Datatype sendtypes[], void *recvbuf,
+                                               const int recvcounts[], const MPI_Aint rdispls[],
+                                               const MPI_Datatype recvtypes[], MPIR_Comm * comm_ptr,
+                                               MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLTOALLW);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLTOALLW);
+
+    /* Forward every argument to MPIR_Ineighbor_alltoallw_impl and return its error code. */
+    mpi_errno = MPIR_Ineighbor_alltoallw_impl(sendbuf, sendcounts, sdispls, sendtypes, recvbuf,
+                                              recvcounts, rdispls, recvtypes, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLTOALLW);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ibarrier
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ibarrier(MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IBARRIER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IBARRIER);
+
+    /* Forward every argument to MPIR_Ibarrier_impl and return its error code. */
+    mpi_errno = MPIR_Ibarrier_impl(comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IBARRIER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ibcast
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ibcast(void *buffer, int count, MPI_Datatype datatype,
+                                  int root, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IBCAST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IBCAST);
+
+    /* Forward every argument to MPIR_Ibcast_impl and return its error code. */
+    mpi_errno = MPIR_Ibcast_impl(buffer, count, datatype, root, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IBCAST);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iallgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                      void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLGATHER);
+
+    /* Forward every argument to MPIR_Iallgather_impl and return its error code. */
+    mpi_errno = MPIR_Iallgather_impl(sendbuf, sendcount, sendtype,
+                                     recvbuf, recvcount, recvtype, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLGATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iallgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_iallgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                       void *recvbuf, const int *recvcounts, const int *displs,
+                                       MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                       MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLGATHERV);
+
+    /* Forward every argument to MPIR_Iallgatherv_impl and return its error code. */
+    mpi_errno = MPIR_Iallgatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                      recvcounts, displs, recvtype, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLGATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iallreduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_iallreduce(const void *sendbuf, void *recvbuf, int count,
+                                      MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                      MPI_Request * request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLREDUCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLREDUCE);
+
+    /* Forward every argument to MPIR_Iallreduce_impl and return its error code. */
+    mpi_errno = MPIR_Iallreduce_impl(sendbuf, recvbuf, count, datatype, op, comm, request);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLREDUCE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ialltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ialltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLTOALL);
+
+    /* Forward every argument to MPIR_Ialltoall_impl and return its error code. */
+    mpi_errno = MPIR_Ialltoall_impl(sendbuf, sendcount, sendtype,
+                                    recvbuf, recvcount, recvtype, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLTOALL);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ialltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ialltoallv(const void *sendbuf, const int *sendcounts,
+                                      const int *sdispls, MPI_Datatype sendtype,
+                                      void *recvbuf, const int *recvcounts,
+                                      const int *rdispls, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLTOALLV);
+
+    /* Forward every argument to MPIR_Ialltoallv_impl and return its error code. */
+    mpi_errno = MPIR_Ialltoallv_impl(sendbuf, sendcounts, sdispls, sendtype,
+                                     recvbuf, recvcounts, rdispls, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLTOALLV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ialltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ialltoallw(const void *sendbuf, const int *sendcounts,
+                                      const int *sdispls, const MPI_Datatype sendtypes[],
+                                      void *recvbuf, const int *recvcounts,
+                                      const int *rdispls, const MPI_Datatype recvtypes[],
+                                      MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLTOALLW);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLTOALLW);
+
+    /* Forward every argument to MPIR_Ialltoallw_impl and return its error code. */
+    mpi_errno = MPIR_Ialltoallw_impl(sendbuf, sendcounts, sdispls, sendtypes,
+                                     recvbuf, recvcounts, rdispls, recvtypes, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLTOALLW);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iexscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_iexscan(const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                   MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IEXSCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IEXSCAN);
+
+    /* Forward every argument to MPIR_Iexscan_impl and return its error code. */
+    mpi_errno = MPIR_Iexscan_impl(sendbuf, recvbuf, count, datatype, op, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IEXSCAN);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_igather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                   int root, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IGATHER);
+
+    /* Forward every argument to MPIR_Igather_impl and return its error code. */
+    mpi_errno = MPIR_Igather_impl(sendbuf, sendcount, sendtype,
+                                  recvbuf, recvcount, recvtype, root, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IGATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_igatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, const int *recvcounts, const int *displs,
+                                    MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr,
+                                    MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IGATHERV);
+
+    /* Forward every argument to MPIR_Igatherv_impl and return its error code. */
+    mpi_errno = MPIR_Igatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                   recvcounts, displs, recvtype, root, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IGATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ireduce_scatter_block
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ireduce_scatter_block(const void *sendbuf, void *recvbuf,
+                                                 int recvcount, MPI_Datatype datatype,
+                                                 MPI_Op op, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IREDUCE_SCATTER_BLOCK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IREDUCE_SCATTER_BLOCK);
+
+    /* Forward every argument to MPIR_Ireduce_scatter_block_impl and return its error code. */
+    mpi_errno = MPIR_Ireduce_scatter_block_impl(sendbuf, recvbuf, recvcount, datatype,
+                                                op, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IREDUCE_SCATTER_BLOCK);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ireduce_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ireduce_scatter(const void *sendbuf, void *recvbuf,
+                                           const int recvcounts[], MPI_Datatype datatype,
+                                           MPI_Op op, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IREDUCE_SCATTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IREDUCE_SCATTER);
+
+    /* Forward every argument to MPIR_Ireduce_scatter_impl and return its error code. */
+    mpi_errno = MPIR_Ireduce_scatter_impl(sendbuf, recvbuf, recvcounts, datatype,
+                                          op, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IREDUCE_SCATTER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ireduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ireduce(const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, int root,
+                                   MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IREDUCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IREDUCE);
+
+    /* Forward every argument to MPIR_Ireduce_impl and return its error code. */
+    mpi_errno = MPIR_Ireduce_impl(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IREDUCE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_iscan(const void *sendbuf, void *recvbuf, int count,
+                                 MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                 MPI_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ISCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ISCAN);
+
+    /* Forward every argument to MPIR_Iscan_impl and return its error code. */
+    mpi_errno = MPIR_Iscan_impl(sendbuf, recvbuf, count, datatype, op, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ISCAN);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iscatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_iscatter(const void *sendbuf, int sendcount,
+                                    MPI_Datatype sendtype, void *recvbuf,
+                                    int recvcount, MPI_Datatype recvtype,
+                                    int root, MPIR_Comm * comm, MPI_Request * request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ISCATTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ISCATTER);
+
+    /* Forward every argument to MPIR_Iscatter_impl and return its error code. */
+    mpi_errno = MPIR_Iscatter_impl(sendbuf, sendcount, sendtype,
+                                   recvbuf, recvcount, recvtype, root, comm, request);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ISCATTER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iscatterv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_iscatterv(const void *sendbuf, const int *sendcounts,
+                                     const int *displs, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount,
+                                     MPI_Datatype recvtype, int root,
+                                     MPIR_Comm * comm, MPI_Request * request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ISCATTERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ISCATTERV);
+
+    /* Forward every argument to MPIR_Iscatterv_impl and return its error code. */
+    mpi_errno = MPIR_Iscatterv_impl(sendbuf, sendcounts, displs, sendtype, recvbuf,
+                                    recvcount, recvtype, root, comm, request);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ISCATTERV);
+    return mpi_errno;
+}
+
+#endif
diff --git a/src/mpid/ch4/netmod/ofi/ofi_comm.h b/src/mpid/ch4/netmod/ofi/ofi_comm.h
new file mode 100644
index 0000000..9f540c9
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_comm.h
@@ -0,0 +1,65 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_COMM_H_INCLUDED
+#define NETMOD_OFI_COMM_H_INCLUDED
+
+#include "ofi_impl.h"
+#include "mpl_utlist.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_comm_create
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_comm_create(MPIR_Comm * comm)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_COMM_CREATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_COMM_CREATE);
+
+    MPIDI_OFI_map_create(&MPIDI_OFI_COMM(comm).huge_send_counters);
+    MPIDI_OFI_map_create(&MPIDI_OFI_COMM(comm).huge_recv_counters);
+    MPIDI_OFI_index_allocator_create(&MPIDI_OFI_COMM(comm).win_id_allocator, 0);
+    MPIDI_OFI_index_allocator_create(&MPIDI_OFI_COMM(comm).rma_id_allocator, 1);
+
+    mpi_errno = MPIDI_CH4U_init_comm(comm);
+
+    /* Do not handle intercomms */
+    if (comm->comm_kind == MPIR_COMM_KIND__INTERCOMM)
+        goto fn_exit;
+
+    MPIR_Assert(comm->coll_fns != NULL);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_COMM_CREATE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_comm_destroy
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_comm_destroy(MPIR_Comm * comm)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_COMM_DESTROY);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_COMM_DESTROY);
+
+    mpi_errno = MPIDI_CH4U_destroy_comm(comm);
+    MPIDI_OFI_map_destroy(MPIDI_OFI_COMM(comm).huge_send_counters);
+    MPIDI_OFI_map_destroy(MPIDI_OFI_COMM(comm).huge_recv_counters);
+    MPIDI_OFI_index_allocator_destroy(MPIDI_OFI_COMM(comm).win_id_allocator);
+    MPIDI_OFI_index_allocator_destroy(MPIDI_OFI_COMM(comm).rma_id_allocator);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_COMM_DESTROY);
+    return mpi_errno;
+}
+
+
+#endif /* NETMOD_OFI_COMM_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_control.h b/src/mpid/ch4/netmod/ofi/ofi_control.h
new file mode 100644
index 0000000..ff9dab7
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_control.h
@@ -0,0 +1,72 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_AM_OFI_CONTROL_H_INCLUDED
+#define NETMOD_AM_OFI_CONTROL_H_INCLUDED
+
+#include "ofi_am_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_do_control_win
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_OFI_do_control_win(MPIDI_OFI_win_control_t * control,
+                                           int rank, MPIR_Win * win, int use_comm, int use_lock)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_OFI_DO_CONTROL_WIN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_OFI_DO_CONTROL_WIN);
+
+    control->win_id = MPIDI_OFI_WIN(win).win_id;
+    control->origin_rank = win->comm_ptr->rank;
+
+    mpi_errno = MPIDI_OFI_do_inject(rank,
+                                    win->comm_ptr,
+                                    MPIDI_OFI_INTERNAL_HANDLER_CONTROL,
+                                    (void *) control,
+                                    sizeof(*control), NULL, FALSE, use_comm, use_lock);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_OFI_DO_CONTROL_WIN);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_do_control_send
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_OFI_do_control_send(MPIDI_OFI_send_control_t * control,
+                                            char *send_buf,
+                                            size_t msgsize,
+                                            int rank,
+                                            MPIR_Comm * comm_ptr,
+                                            MPIR_Request * ackreq, int need_lock)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_OFI_DO_CONTROL_SEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_OFI_DO_CONTROL_SEND);
+
+    control->origin_rank = comm_ptr->rank;
+    control->send_buf = (uintptr_t) send_buf;
+    control->msgsize = msgsize;
+    control->comm_id = comm_ptr->context_id;
+    control->endpoint_id = MPIDI_OFI_COMM_TO_EP(comm_ptr, comm_ptr->rank);
+    control->ackreq = ackreq;
+
+    mpi_errno = MPIDI_OFI_do_inject(rank, comm_ptr,
+                                    MPIDI_OFI_INTERNAL_HANDLER_CONTROL,
+                                    (void *) control,
+                                    sizeof(*control), NULL, FALSE, TRUE, need_lock);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_OFI_DO_CONTROL_SEND);
+    return mpi_errno;
+}
+
+
+#endif /* NETMOD_AM_OFI_CONTROL_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_datatype.h b/src/mpid/ch4/netmod/ofi/ofi_datatype.h
new file mode 100644
index 0000000..37ddc69
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_datatype.h
@@ -0,0 +1,31 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_DATATYPE_H_INCLUDED
+#define NETMOD_OFI_DATATYPE_H_INCLUDED
+
+#include "ofi_impl.h"
+
+static inline void MPIDI_NM_datatype_destroy(MPIR_Datatype * datatype_p)
+{
+    return;
+}
+
+static inline void MPIDI_NM_datatype_commit(MPIR_Datatype * datatype_p)
+{
+    return;
+}
+
+static inline void MPIDI_NM_datatype_dup(MPIR_Datatype * old_datatype_p,
+                                         MPIR_Datatype * new_datatype_p)
+{
+    return;
+}
+#endif
diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.h b/src/mpid/ch4/netmod/ofi/ofi_events.h
new file mode 100644
index 0000000..3ddb664
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_events.h
@@ -0,0 +1,804 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_EVENTS_H_INCLUDED
+#define NETMOD_OFI_EVENTS_H_INCLUDED
+
+#include "ofi_impl.h"
+#include "ofi_am_impl.h"
+#include "ofi_am_events.h"
+#include "ofi_control.h"
+
+__ALWAYS_INLINE__ int MPIDI_OFI_get_huge_event(struct fi_cq_tagged_entry *wc, MPIR_Request * req);
+
+__ALWAYS_INLINE__ int MPIDI_OFI_cqe_get_source(struct fi_cq_tagged_entry *wc, int do_data)
+{
+    if (do_data)
+        return wc->data;
+    else
+        return MPIDI_OFI_init_get_source(wc->tag);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_peek_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_peek_event(struct fi_cq_tagged_entry *wc, MPIR_Request * rreq)
+{
+    size_t count;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_NETMOD_PEEK_EVENT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_NETMOD_PEEK_EVENT);
+    MPIDI_OFI_REQUEST(rreq, util_id) = MPIDI_OFI_PEEK_FOUND;
+    rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, MPIDI_OFI_ENABLE_DATA);
+    rreq->status.MPI_TAG = MPIDI_OFI_init_get_tag(wc->tag);
+    count = wc->len;
+    rreq->status.MPI_ERROR = MPI_SUCCESS;
+    MPIR_STATUS_SET_COUNT(rreq->status, count);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_NETMOD_PEEK_EVENT);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_peek_empty_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_peek_empty_event(struct fi_cq_tagged_entry *wc, MPIR_Request * rreq)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_NETMOD_PEEK_EMPTY_EVENT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_NETMOD_PEEK_EMPTY_EVENT);
+    MPIDI_OFI_dynamic_process_request_t *ctrl;
+
+    switch (MPIDI_OFI_REQUEST(rreq, event_id)) {
+    case MPIDI_OFI_EVENT_PEEK:
+        MPIDI_OFI_REQUEST(rreq, util_id) = MPIDI_OFI_PEEK_NOT_FOUND;
+        rreq->status.MPI_ERROR = MPI_SUCCESS;
+        break;
+
+    case MPIDI_OFI_EVENT_ACCEPT_PROBE:
+        ctrl = (MPIDI_OFI_dynamic_process_request_t *) rreq;
+        ctrl->done = MPIDI_OFI_PEEK_NOT_FOUND;
+        break;
+
+    default:
+        MPIR_Assert(0);
+        break;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_NETMOD_PEEK_EMPTY_EVENT);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_recv_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Request * rreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPI_Aint last;
+    size_t count;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_RECV_EVENT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_RECV_EVENT);
+
+    rreq->status.MPI_ERROR = MPI_SUCCESS;
+    rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, MPIDI_OFI_ENABLE_DATA);
+    rreq->status.MPI_TAG = MPIDI_OFI_init_get_tag(wc->tag);
+    count = wc->len;    /* byte count reported by the completion entry */
+    MPIR_STATUS_SET_COUNT(rreq->status, count);
+
+#ifdef MPIDI_BUILD_CH4_SHM
+
+    if (MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(rreq)) {
+        int continue_matching = 1;
+
+        MPIDI_CH4R_anysource_matched(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(rreq), MPIDI_CH4R_NETMOD,
+                                     &continue_matching);
+
+        /* It is always possible to cancel a request on shm side w/o an aux thread */
+
+        /* Decouple requests */
+        if (unlikely(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(rreq))) {
+            MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(rreq)) = NULL;
+            MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(rreq) = NULL;
+        }
+
+        if (!continue_matching)
+            goto fn_exit;
+    }
+
+#endif
+
+    if (MPIDI_OFI_REQUEST(rreq, noncontig)) {   /* unpack from the pack buffer, then free it */
+        last = count;
+        MPID_Segment_unpack(&MPIDI_OFI_REQUEST(rreq, noncontig->segment), 0, &last,
+                            MPIDI_OFI_REQUEST(rreq, noncontig->pack_buffer));
+        MPL_free(MPIDI_OFI_REQUEST(rreq, noncontig));
+        if (last != (MPI_Aint) count) {         /* unpack did not end at the received byte count */
+            rreq->status.MPI_ERROR =
+                MPIR_Err_create_code(MPI_SUCCESS,
+                                     MPIR_ERR_RECOVERABLE,
+                                     __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0);
+        }
+    }
+
+    dtype_release_if_not_builtin(MPIDI_OFI_REQUEST(rreq, datatype));
+
+    /* If synchronous, ack and complete when the ack is done */
+    if (unlikely(MPIDI_OFI_is_tag_sync(wc->tag))) {
+        uint64_t ss_bits = MPIDI_OFI_init_sendtag(MPIDI_OFI_REQUEST(rreq, util_id),
+                                                  MPIDI_OFI_REQUEST(rreq, util_comm->rank),
+                                                  rreq->status.MPI_TAG,
+                                                  MPIDI_OFI_SYNC_SEND_ACK, MPIDI_OFI_ENABLE_DATA);
+        MPIR_Comm *c = MPIDI_OFI_REQUEST(rreq, util_comm);
+        int r = rreq->status.MPI_SOURCE;        /* the ack goes back to the matched source */
+        mpi_errno = MPIDI_OFI_send_handler(MPIDI_OFI_EP_TX_TAG(0), NULL, 0, NULL,
+                                           MPIDI_OFI_REQUEST(rreq, util_comm->rank),
+                                           MPIDI_OFI_comm_to_phys(c, r, MPIDI_OFI_API_TAG),
+                                           ss_bits, NULL, MPIDI_OFI_DO_INJECT,
+                                           MPIDI_OFI_ENABLE_DATA, MPIDI_OFI_CALL_NO_LOCK);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+
+    MPIDI_CH4U_request_complete(rreq);
+
+    /* Polling loop will check for truncation */
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_RECV_EVENT);
+    return mpi_errno;
+  fn_fail:      /* NOTE(review): this path skips MPIDI_CH4U_request_complete(rreq) above */
+    rreq->status.MPI_ERROR = mpi_errno;
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_recv_huge_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_recv_huge_event(struct fi_cq_tagged_entry *wc, MPIR_Request * rreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_huge_recv_t *recv;
+    MPIR_Comm *comm_ptr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_RECV_HUGE_EVENT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_RECV_HUGE_EVENT);
+
+    /* Look up the huge-receive record for this source in the communicator's map */
+    comm_ptr = MPIDI_OFI_REQUEST(rreq, util_comm);
+    recv = (MPIDI_OFI_huge_recv_t *)
+        MPIDI_OFI_map_lookup(MPIDI_OFI_COMM(comm_ptr).huge_recv_counters,
+                             MPIDI_OFI_cqe_get_source(wc, MPIDI_OFI_ENABLE_DATA));
+    if (recv == MPIDI_OFI_MAP_NOT_FOUND) {      /* no record yet: create a zeroed one */
+        recv = (MPIDI_OFI_huge_recv_t *) MPL_calloc(1, sizeof(*recv));
+        MPIDI_OFI_map_set(MPIDI_OFI_COMM(comm_ptr).huge_recv_counters,
+                          MPIDI_OFI_cqe_get_source(wc, MPIDI_OFI_ENABLE_DATA), recv);
+    }
+
+    recv->event_id = MPIDI_OFI_EVENT_GET_HUGE;
+    recv->localreq = rreq;
+    recv->done_fn = MPIDI_OFI_recv_event;
+    recv->wc = *wc;     /* saved copy; MPIDI_OFI_get_huge_event hands it to done_fn */
+    mpi_errno = MPIDI_OFI_get_huge_event(NULL, (MPIR_Request *) recv);  /* propagate failures */
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_RECV_HUGE_EVENT);
+    return mpi_errno;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_send_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_send_event(struct fi_cq_tagged_entry *wc, MPIR_Request * sreq)
+{
+    int c;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_SEND_EVENT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_SEND_EVENT);
+
+    MPIR_cc_decr(sreq->cc_ptr, &c);
+
+    if (c == 0) {
+        if (MPIDI_OFI_REQUEST(sreq, noncontig))
+            MPL_free(MPIDI_OFI_REQUEST(sreq, noncontig));
+
+        dtype_release_if_not_builtin(MPIDI_OFI_REQUEST(sreq, datatype));
+        MPIDI_CH4U_request_release(sreq);
+    }   /* c != 0, ssend */
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_SEND_EVENT);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_send_huge_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_send_huge_event(struct fi_cq_tagged_entry *wc, MPIR_Request * sreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int c;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_SEND_EVENT_HUGE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_SEND_EVENT_HUGE);
+
+    MPIR_cc_decr(sreq->cc_ptr, &c);
+
+    if (c == 0) {
+        MPIR_Comm *comm;
+        void *ptr;
+        MPIDI_OFI_huge_counter_t *cntr;
+        comm = MPIDI_OFI_REQUEST(sreq, util_comm);
+        ptr =
+            MPIDI_OFI_map_lookup(MPIDI_OFI_COMM(comm).huge_send_counters,
+                                 MPIDI_OFI_REQUEST(sreq, util_id));
+        MPIR_Assert(ptr != MPIDI_OFI_MAP_NOT_FOUND);
+        cntr = (MPIDI_OFI_huge_counter_t *) ptr;
+        cntr->outstanding--;
+        if (cntr->outstanding == 0) {
+            MPIDI_OFI_send_control_t ctrl;
+            uint64_t key;
+            int key_back;
+            MPIDI_OFI_map_erase(MPIDI_OFI_COMM(comm).huge_send_counters,
+                                MPIDI_OFI_REQUEST(sreq, util_id));
+            key = fi_mr_key(cntr->mr);
+            key_back = (key >> MPIDI_Global.huge_rma_shift);
+            MPIDI_OFI_index_allocator_free(MPIDI_OFI_COMM(comm).rma_id_allocator, key_back);
+            MPIDI_OFI_CALL_NOLOCK(fi_close(&cntr->mr->fid), mr_unreg);
+            MPL_free(ptr);
+            ctrl.type = MPIDI_OFI_CTRL_HUGE_CLEANUP;
+            MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_do_control_send
+                                   (&ctrl, NULL, 0, MPIDI_OFI_REQUEST(sreq, util_id), comm, NULL,
+                                    FALSE /* no lock */));
+        }
+
+        if (MPIDI_OFI_REQUEST(sreq, noncontig))
+            MPL_free(MPIDI_OFI_REQUEST(sreq, noncontig));
+
+        dtype_release_if_not_builtin(MPIDI_OFI_REQUEST(sreq, datatype));
+        MPIDI_CH4U_request_release(sreq);
+    }   /* c != 0, ssend */
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_SEND_EVENT_HUGE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_ssend_ack_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_ssend_ack_event(struct fi_cq_tagged_entry *wc, MPIR_Request * sreq)
+{
+    int mpi_errno;
+    MPIDI_OFI_ssendack_request_t *req = (MPIDI_OFI_ssendack_request_t *) sreq;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_SSEND_ACK_EVENT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_SSEND_ACK_EVENT);
+    mpi_errno = MPIDI_OFI_send_event(NULL, req->signal_req);
+    MPIDI_OFI_ssendack_request_t_tls_free(req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_SSEND_ACK_EVENT);
+    return mpi_errno;
+}
+
+__ALWAYS_INLINE__ uintptr_t MPIDI_OFI_recv_rbase(MPIDI_OFI_huge_recv_t * recv)
+{
+#ifdef USE_OFI_MR_SCALABLE
+    return 0;
+#else
+    return recv->remote_info.send_buf;
+#endif
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_get_huge_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_get_huge_event(struct fi_cq_tagged_entry *wc, MPIR_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_huge_recv_t *recv = (MPIDI_OFI_huge_recv_t *) req;
+    uint64_t remote_key;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_GETHUGE_EVENT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_GETHUGE_EVENT);
+
+    if (recv->localreq && recv->cur_offset != 0) {      /* otherwise there is nothing to do */
+        size_t bytesSent = recv->cur_offset - MPIDI_Global.max_send;
+        size_t bytesLeft = recv->remote_info.msgsize - bytesSent - MPIDI_Global.max_send;
+        size_t bytesToGet =     /* min(bytesLeft, max_send); bytesLeft == msgsize - cur_offset */
+            (bytesLeft <= MPIDI_Global.max_send) ? bytesLeft : MPIDI_Global.max_send;
+
+        if (bytesToGet == 0ULL) {       /* nothing left: run done_fn, then send the HUGEACK */
+            MPIDI_OFI_send_control_t ctrl;
+            recv->wc.len = recv->cur_offset;
+            recv->done_fn(&recv->wc, recv->localreq);
+            ctrl.type = MPIDI_OFI_CTRL_HUGEACK;
+            MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_do_control_send
+                                   (&ctrl, NULL, 0, recv->remote_info.origin_rank, recv->comm_ptr,
+                                    recv->remote_info.ackreq, FALSE));
+            /* "recv" and maps will be freed in MPIDI_OFI_get_huge_cleanup */
+            goto fn_exit;
+        }
+
+        if (MPIDI_OFI_ENABLE_MR_SCALABLE)
+            remote_key = recv->remote_info.rma_key << MPIDI_Global.huge_rma_shift;
+        else
+            remote_key = recv->remote_info.rma_key;
+
+        MPIDI_OFI_conditional_cntr_incr();
+        MPIDI_OFI_CALL_RETRY(fi_read(MPIDI_OFI_EP_TX_RMA(0),    /* endpoint     */
+                                     (void *) ((uintptr_t) recv->wc.buf + recv->cur_offset),    /* local buffer */
+                                     bytesToGet,        /* bytes        */
+                                     NULL,      /* descriptor   */
+                                     MPIDI_OFI_comm_to_phys(recv->comm_ptr, recv->remote_info.origin_rank, MPIDI_OFI_API_MSG),  /* Destination  */
+                                     MPIDI_OFI_recv_rbase(recv) + recv->cur_offset,     /* remote maddr */
+                                     remote_key,        /* Key          */
+                                     (void *) &recv->context), rdma_readfrom,   /* Context */
+                             MPIDI_OFI_CALL_NO_LOCK);
+        recv->cur_offset += bytesToGet;         /* advance past the chunk just requested */
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_GETHUGE_EVENT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_chunk_done_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_chunk_done_event(struct fi_cq_tagged_entry *wc, MPIR_Request * req)
+{
+    int c;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_CHUNK_DONE_EVENT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_CHUNK_DONE_EVENT);
+
+    MPIDI_OFI_chunk_request *creq = (MPIDI_OFI_chunk_request *) req;
+    MPIR_cc_decr(creq->parent->cc_ptr, &c);
+
+    if (c == 0)
+        MPIDI_CH4U_request_release(creq->parent);
+
+    MPL_free(creq);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_CHUNK_DONE_EVENT);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_inject_emu_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_inject_emu_event(struct fi_cq_tagged_entry *wc, MPIR_Request * req)
+{
+    int incomplete;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_INJECT_EMU_EVENT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_INJECT_EMU_EVENT);
+
+    MPIR_cc_decr(req->cc_ptr, &incomplete);
+
+    if (!incomplete) {
+        MPL_free(MPIDI_OFI_REQUEST(req, util.inject_buf));
+        MPIDI_CH4U_request_release(req);
+        OPA_decr_int(&MPIDI_Global.am_inflight_inject_emus);
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_INJECT_EMU_EVENT);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_rma_done_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_rma_done_event(struct fi_cq_tagged_entry *wc, MPIR_Request * in_req)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_OFI_RMA_DONE_EVENT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_OFI_RMA_DONE_EVENT);
+
+    MPIDI_OFI_win_request_t *req = (MPIDI_OFI_win_request_t *) in_req;
+    MPIDI_OFI_win_request_complete(req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_OFI_RMA_DONE_EVENT);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_accept_probe_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_accept_probe_event(struct fi_cq_tagged_entry *wc,
+                                                   MPIR_Request * rreq)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_OFI_ACCEPT_PROBE_EVENT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_OFI_ACCEPT_PROBE_EVENT);
+    MPIDI_OFI_dynamic_process_request_t *ctrl = (MPIDI_OFI_dynamic_process_request_t *) rreq;
+    ctrl->source = MPIDI_OFI_cqe_get_source(wc, MPIDI_OFI_ENABLE_DATA);
+    ctrl->tag = MPIDI_OFI_init_get_tag(wc->tag);
+    ctrl->msglen = wc->len;
+    ctrl->done = MPIDI_OFI_PEEK_FOUND;
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_OFI_ACCEPT_PROBE_EVENT);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_dynproc_done_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_dynproc_done_event(struct fi_cq_tagged_entry *wc,
+                                                   MPIR_Request * rreq)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_OFI_DYNPROC_DONE_EVENT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_OFI_DYNPROC_DONE_EVENT);
+    MPIDI_OFI_dynamic_process_request_t *ctrl = (MPIDI_OFI_dynamic_process_request_t *) rreq;
+    ctrl->done++;
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_OFI_DYNPROC_DONE_EVENT);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_am_send_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_am_send_event(struct fi_cq_tagged_entry *wc, MPIR_Request * sreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_am_header_t *msg_hdr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_HANDLE_SEND_COMPLETION);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_HANDLE_SEND_COMPLETION);
+
+    msg_hdr = &MPIDI_OFI_AMREQUEST_HDR(sreq, msg_hdr);
+    MPIDI_OFI_am_request_complete(sreq);
+
+    switch (msg_hdr->am_type) {
+    case MPIDI_AMTYPE_LMT_ACK:
+    case MPIDI_AMTYPE_LMT_REQ:
+        goto fn_exit;
+
+    default:
+        break;
+    }
+
+    if (MPIDI_OFI_AMREQUEST_HDR(sreq, pack_buffer)) {
+        MPL_free(MPIDI_OFI_AMREQUEST_HDR(sreq, pack_buffer));
+        MPIDI_OFI_AMREQUEST_HDR(sreq, pack_buffer) = NULL;
+    }
+
+    mpi_errno = MPIDI_Global.am_send_cmpl_handlers[msg_hdr->handler_id] (sreq);
+
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_HANDLE_SEND_COMPLETION);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_am_recv_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_am_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Request * rreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_am_header_t *am_hdr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_HANDLE_RECV_COMPLETION);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_HANDLE_RECV_COMPLETION);
+
+    am_hdr = (MPIDI_OFI_am_header_t *) wc->buf;
+
+    switch (am_hdr->am_type) {
+    case MPIDI_AMTYPE_SHORT_HDR:
+        mpi_errno = MPIDI_OFI_handle_short_am_hdr(am_hdr, am_hdr->payload);
+
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+
+        break;
+
+    case MPIDI_AMTYPE_SHORT:
+        mpi_errno = MPIDI_OFI_handle_short_am(am_hdr);
+
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+
+        break;
+
+    case MPIDI_AMTYPE_LMT_REQ:
+        mpi_errno = MPIDI_OFI_handle_long_am(am_hdr);
+
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+
+        break;
+
+    case MPIDI_AMTYPE_LMT_ACK:
+        mpi_errno = MPIDI_OFI_handle_lmt_ack(am_hdr);
+
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+
+        break;
+
+    default:
+        MPIR_Assert(0);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_HANDLE_RECV_COMPLETION);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_am_read_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_am_read_event(struct fi_cq_tagged_entry *wc,
+                                              MPIR_Request * dont_use_me)
+{
+    int mpi_errno = MPI_SUCCESS;
+    void *netmod_context = NULL;
+    MPIR_Request *rreq;
+    MPIDI_OFI_am_request_t *ofi_req;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_HANDLE_READ_COMPLETION);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_HANDLE_READ_COMPLETION);
+
+    ofi_req = container_of(wc->op_context, MPIDI_OFI_am_request_t, context);
+    ofi_req->req_hdr->lmt_cntr--;
+
+    if (ofi_req->req_hdr->lmt_cntr)
+        goto fn_exit;
+
+    rreq = (MPIR_Request *) ofi_req->req_hdr->rreq_ptr;
+    mpi_errno = MPIDI_OFI_dispatch_ack(MPIDI_OFI_AMREQUEST_HDR(rreq, lmt_info).src_rank,
+                                       MPIDI_OFI_AMREQUEST_HDR(rreq, lmt_info).context_id,
+                                       MPIDI_OFI_AMREQUEST_HDR(rreq, lmt_info).sreq_ptr,
+                                       MPIDI_AMTYPE_LMT_ACK, netmod_context);
+
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    MPIDI_OFI_am_request_complete(rreq);
+    ofi_req->req_hdr->cmpl_handler_fn(rreq);
+  fn_exit:
+    MPIDI_CH4R_release_buf((void *) ofi_req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_HANDLE_READ_COMPLETION);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_am_repost_event
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_am_repost_event(struct fi_cq_tagged_entry *wc, MPIR_Request * rreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_REPOST_BUFFER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_REPOST_BUFFER);
+
+    mpi_errno = MPIDI_OFI_repost_buffer(wc->op_context, rreq);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_REPOST_BUFFER);
+    return mpi_errno;
+}
+
+__ALWAYS_INLINE__ int MPIDI_OFI_dispatch_function(struct fi_cq_tagged_entry *wc,
+                                                  MPIR_Request * req, int buffered)
+{
+    int mpi_errno;      /* every branch below assigns it before fn_exit */
+
+    if (likely(MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_SEND)) {
+        mpi_errno = MPIDI_OFI_send_event(wc, req);
+        goto fn_exit;
+    }
+    else if (likely(MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_RECV)) {
+        mpi_errno = MPIDI_OFI_recv_event(wc, req);
+        goto fn_exit;
+    }
+    else if (likely(MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_RMA_DONE)) {
+        mpi_errno = MPIDI_OFI_rma_done_event(wc, req);
+        goto fn_exit;
+    }
+    else if (likely(MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_AM_SEND)) {
+        mpi_errno = MPIDI_OFI_am_send_event(wc, req);
+        goto fn_exit;
+    }
+    else if (likely(MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_AM_RECV)) {
+        mpi_errno = MPIDI_OFI_am_recv_event(wc, req);
+        if (unlikely((wc->flags & FI_MULTI_RECV) && !buffered)) {
+            int repost_errno = MPIDI_OFI_am_repost_event(wc, req);      /* always attempted */
+            mpi_errno = (mpi_errno != MPI_SUCCESS) ? mpi_errno : repost_errno;
+        }
+        goto fn_exit;
+    }
+    else if (likely(MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_AM_READ)) {
+        mpi_errno = MPIDI_OFI_am_read_event(wc, req);
+        goto fn_exit;
+    }
+    else if (unlikely(1)) {     /* all remaining event ids */
+        switch (MPIDI_OFI_REQUEST(req, event_id)) {
+        case MPIDI_OFI_EVENT_AM_MULTI:
+            mpi_errno = MPIDI_OFI_am_repost_event(wc, req);
+            break;
+
+        case MPIDI_OFI_EVENT_PEEK:
+            mpi_errno = MPIDI_OFI_peek_event(wc, req);
+            break;
+
+        case MPIDI_OFI_EVENT_RECV_HUGE:
+            mpi_errno = MPIDI_OFI_recv_huge_event(wc, req);
+            break;
+
+        case MPIDI_OFI_EVENT_SEND_HUGE:
+            mpi_errno = MPIDI_OFI_send_huge_event(wc, req);
+            break;
+
+        case MPIDI_OFI_EVENT_SSEND_ACK:
+            mpi_errno = MPIDI_OFI_ssend_ack_event(wc, req);
+            break;
+
+        case MPIDI_OFI_EVENT_GET_HUGE:
+            mpi_errno = MPIDI_OFI_get_huge_event(wc, req);
+            break;
+
+        case MPIDI_OFI_EVENT_CHUNK_DONE:
+            mpi_errno = MPIDI_OFI_chunk_done_event(wc, req);
+            break;
+
+        case MPIDI_OFI_EVENT_INJECT_EMU:
+            mpi_errno = MPIDI_OFI_inject_emu_event(wc, req);
+            break;
+
+        case MPIDI_OFI_EVENT_DYNPROC_DONE:
+            mpi_errno = MPIDI_OFI_dynproc_done_event(wc, req);
+            break;
+
+        case MPIDI_OFI_EVENT_ACCEPT_PROBE:
+            mpi_errno = MPIDI_OFI_accept_probe_event(wc, req);
+            break;
+
+        case MPIDI_OFI_EVENT_ABORT:
+        default:        /* abort or unrecognized event id: assert */
+            mpi_errno = MPI_SUCCESS;
+            MPIR_Assert(0);
+            break;
+        }
+    }
+
+  fn_exit:
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_get_buffered
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_get_buffered(struct fi_cq_tagged_entry *wc, ssize_t num)
+{
+    int rc = 0;
+
+    if ((MPIDI_Global.cq_buff_head != MPIDI_Global.cq_buff_tail) ||
+        !slist_empty(&MPIDI_Global.cq_buff_list)) {
+        if (MPIDI_Global.cq_buff_head != MPIDI_Global.cq_buff_tail) {
+            wc[0] = MPIDI_Global.cq_buffered[MPIDI_Global.cq_buff_tail].cq_entry;
+            MPIDI_Global.cq_buff_tail = (MPIDI_Global.cq_buff_tail + 1) % MPIDI_OFI_NUM_CQ_BUFFERED;
+        }
+        else {
+            MPIDI_OFI_cq_list_t *MPIDI_OFI_cq_list_entry;
+            struct slist_entry *entry = slist_remove_head(&MPIDI_Global.cq_buff_list);
+            MPIDI_OFI_cq_list_entry = container_of(entry, MPIDI_OFI_cq_list_t, entry);
+            wc[0] = MPIDI_OFI_cq_list_entry->cq_entry;
+            MPL_free((void *) MPIDI_OFI_cq_list_entry);
+        }
+
+        rc = 1;
+    }
+
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_handle_cq_entries
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_handle_cq_entries(struct fi_cq_tagged_entry *wc,
+                                                  ssize_t num, int buffered)
+{
+    int i, mpi_errno = MPI_SUCCESS;
+    MPIR_Request *req;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_HANDLE_CQ_ENTRIES);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_HANDLE_CQ_ENTRIES);
+
+    for (i = 0; i < num; i++) {
+        req = MPIDI_OFI_context_to_request(wc[i].op_context);
+        MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_dispatch_function(&wc[i], req, buffered));
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_HANDLE_CQ_ENTRIES);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_handle_cq_error
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__ALWAYS_INLINE__ int MPIDI_OFI_handle_cq_error(ssize_t ret)
+{
+    int mpi_errno = MPI_SUCCESS;
+    struct fi_cq_err_entry e;
+    MPIR_Request *req;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_HANDLE_CQ_ERROR);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_HANDLE_CQ_ERROR);
+
+    switch (ret) {      /* ret is matched against negated FI_* error codes */
+    case -FI_EAVAIL:
+        fi_cq_readerr(MPIDI_Global.p2p_cq, &e, 0);      /* pull the error entry off the CQ */
+
+        switch (e.err) {        /* NOTE(review): codes without a case below are ignored */
+        case FI_ETRUNC:
+            req = MPIDI_OFI_context_to_request(e.op_context);
+
+            switch (req->kind) {
+            case MPIR_REQUEST_KIND__SEND:
+                mpi_errno = MPIDI_OFI_dispatch_function(NULL, req, 0);
+                break;
+
+            case MPIR_REQUEST_KIND__RECV:
+                mpi_errno = MPIDI_OFI_dispatch_function((struct fi_cq_tagged_entry *) &e, req, 0);
+                req->status.MPI_ERROR = MPI_ERR_TRUNCATE;
+                break;
+
+            default:
+                MPIR_ERR_SETFATALANDJUMP4(mpi_errno, MPI_ERR_OTHER, "**ofid_poll",
+                                          "**ofid_poll %s %d %s %s", __SHORT_FILE__,
+                                          __LINE__, FCNAME, fi_strerror(e.err));
+            }
+
+            break;
+
+        case FI_ECANCELED:
+            req = MPIDI_OFI_context_to_request(e.op_context);
+            MPIR_STATUS_SET_CANCEL_BIT(req->status, TRUE);
+            break;
+
+        case FI_ENOMSG:
+            req = MPIDI_OFI_context_to_request(e.op_context);
+            MPIDI_OFI_peek_empty_event(NULL, req);
+            break;
+        }
+
+        break;
+
+    default:    /* any other failure: ret itself carries the (negated) fabric error code */
+        MPIR_ERR_SETFATALANDJUMP4(mpi_errno, MPI_ERR_OTHER, "**ofid_poll",
+                                  "**ofid_poll %s %d %s %s", __SHORT_FILE__, __LINE__,
+                                  FCNAME, fi_strerror((int) -ret));
+        break;
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_HANDLE_CQ_ERROR);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#endif /* NETMOD_OFI_EVENTS_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h
new file mode 100644
index 0000000..9e6ff5c
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h
@@ -0,0 +1,484 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_IMPL_H_INCLUDED
+#define NETMOD_OFI_IMPL_H_INCLUDED
+
+#include <mpidimpl.h>
+#include "ofi_types.h"
+#include "mpidch4r.h"
+#include "ch4_impl.h"
+#include "ofi_iovec_util.h"
+
+/* Tag the prototypes with always_inline to force the object allocation */
+/* routines to be inlined.  This allows the library, even when compiled */
+/* without ipo/pgo enabled, to inline MPI layer functions.              */
+__ALWAYS_INLINE__ MPIR_Request *MPIR_Request_create(MPIR_Request_kind_t kind);
+__ALWAYS_INLINE__ void *MPIR_Handle_obj_alloc(MPIR_Object_alloc_t *);
+__ALWAYS_INLINE__ void *MPIR_Handle_obj_alloc_unsafe(MPIR_Object_alloc_t *);
+__ALWAYS_INLINE__ void MPIR_Handle_obj_free(MPIR_Object_alloc_t *, void *);
+__ALWAYS_INLINE__ void *MPIR_Handle_get_ptr_indirect(int, MPIR_Object_alloc_t *);
+__ALWAYS_INLINE__ MPIDII_av_entry_t *MPIDIU_comm_rank_to_av(MPIR_Comm * comm, int rank);
+
+/* Accessors for the OFI netmod field embedded in datatype, op and communicator objects. */
+#define MPIDI_OFI_DT(dt)         ((dt)->dev.netmod.ofi)
+#define MPIDI_OFI_OP(op)         ((op)->dev.netmod.ofi)
+#define MPIDI_OFI_COMM(comm)     ((comm)->dev.ch4.netmod.ofi)
+#define MPIDI_OFI_COMM_TO_INDEX(comm,rank) MPIDIU_comm_rank_to_pid(comm, rank, NULL, NULL)
+/* Rank -> fi_addr_t: the index/rank itself with AV tables, otherwise the AV entry's dest. */
+#ifdef MPIDI_OFI_CONFIG_USE_AV_TABLE
+#define MPIDI_OFI_COMM_TO_PHYS(comm,rank) ((fi_addr_t)MPIDI_OFI_COMM_TO_INDEX(comm,rank))
+#define MPIDI_OFI_TO_PHYS(avtid, rank)    ((fi_addr_t)(rank))   /* avtid is not used here */
+#else
+#define MPIDI_OFI_COMM_TO_PHYS(comm,rank)                       \
+    MPIDI_OFI_AV(MPIDIU_comm_rank_to_av((comm), (rank))).dest
+#define MPIDI_OFI_TO_PHYS(avtid, lpid)                                 \
+    MPIDI_OFI_AV(&MPIDIU_get_av((avtid), (lpid))).dest
+#endif
+
+#define MPIDI_OFI_WIN(win)     ((win)->dev.netmod.ofi)
+/*
+ * Helper routines and macros for request completion
+ */
+#define MPIDI_OFI_ssendack_request_t_tls_alloc(req)             \
+    do {                                                                \
+        (req) = (MPIDI_OFI_ssendack_request_t*)                 \
+            MPIR_Request_create(MPIR_REQUEST_KIND__SEND);               \
+        if ((req) == NULL)      /* allocation failure aborts */         \
+            MPID_Abort(NULL, MPI_ERR_NO_SPACE, -1,                      \
+                       "Cannot allocate Ssendack Request");             \
+    } while (0)
+
+/* Release an ssendack request through MPIR_Handle_obj_free() on MPIR_Request_mem. */
+#define MPIDI_OFI_ssendack_request_t_tls_free(req) MPIR_Handle_obj_free(&MPIR_Request_mem, (req))
+
+/* Allocate an ssendack request (aborting on failure), then assert on its handle kind. */
+#define MPIDI_OFI_ssendack_request_t_alloc_and_init(req)        \
+    do {                                                                \
+        MPIDI_OFI_ssendack_request_t_tls_alloc(req);            \
+        MPIR_Assert((req) != NULL);                                     \
+        MPIR_Assert(HANDLE_GET_MPI_KIND((req)->handle) == MPID_SSENDACK_REQUEST); \
+    } while (0)
+
+#define MPIDI_OFI_request_create_null_rreq(rreq_, mpi_errno_, FAIL_) \
+  do {  /* FAIL_ is accepted but never referenced by this macro */      \
+    (rreq_) = MPIR_Request_create(MPIR_REQUEST_KIND__RECV);             \
+    if (NULL == (rreq_)) {                                              \
+      MPIR_ERR_SETANDJUMP(mpi_errno_,MPI_ERR_OTHER,"**nomemreq");       \
+    }                                                                   \
+    else {                /* cc = 0 and a PROC_NULL status */           \
+      MPIR_cc_set(&(rreq_)->cc, 0);                                     \
+      (rreq_)->kind = MPIR_REQUEST_KIND__RECV;                          \
+      MPIR_Status_set_procnull(&(rreq_)->status);                       \
+    }                                                                   \
+  } while (0)
+
+
+#define MPIDI_OFI_PROGRESS()     /* assigns the caller's mpi_errno */ \
+    do {                                                          \
+        mpi_errno = MPIDI_Progress_test();                        \
+        if (MPI_SUCCESS != mpi_errno) { MPIR_ERR_POP(mpi_errno); } \
+    } while (0)
+/* Same as MPIDI_OFI_PROGRESS(), but through MPIDI_OFI_progress_test_no_inline(). */
+#define MPIDI_OFI_PROGRESS_NONINLINE()                            \
+    do {                                                          \
+        mpi_errno = MPIDI_OFI_progress_test_no_inline();          \
+        if (MPI_SUCCESS != mpi_errno) { MPIR_ERR_POP(mpi_errno); } \
+    } while (0)
+/* Keep running MPIDI_OFI_PROGRESS() for as long as cond evaluates true. */
+#define MPIDI_OFI_PROGRESS_WHILE(cond)                 \
+    while (cond) MPIDI_OFI_PROGRESS()
+
+/* Error-reporting helper shared by the OFI call wrappers in this header. */
+#define MPIDI_OFI_ERR  MPIR_ERR_CHKANDJUMP4
+/* Evaluate FUNC between MPID_THREAD_CS_ENTER/EXIT on MPIDI_OFI_THREAD_FI_MUTEX. */
+/* A negative result is reported through MPIDI_OFI_ERR with the message key      */
+/* "**ofid_<STR>" and fi_strerror(-result); the caller's mpi_errno is used.      */
+#define MPIDI_OFI_CALL(FUNC,STR)                            \
+    do {                                                    \
+        ssize_t _ret;                                       \
+        MPID_THREAD_CS_ENTER(POBJ,MPIDI_OFI_THREAD_FI_MUTEX);   \
+        _ret = FUNC;                                        \
+        MPID_THREAD_CS_EXIT(POBJ,MPIDI_OFI_THREAD_FI_MUTEX);    \
+        MPIDI_OFI_ERR(_ret < 0, mpi_errno, MPI_ERR_OTHER,   \
+                      "**ofid_"#STR,                        \
+                      "**ofid_"#STR" %s %d %s %s",          \
+                      __SHORT_FILE__, __LINE__, FCNAME,     \
+                      fi_strerror(-_ret));                  \
+    } while (0)
+
+/* Same checks and error reporting as MPIDI_OFI_CALL, but FUNC is evaluated */
+/* without entering or leaving MPIDI_OFI_THREAD_FI_MUTEX.                   */
+#define MPIDI_OFI_CALL_NOLOCK(FUNC,STR)                     \
+    do {                                                    \
+        ssize_t _ret;                                       \
+        _ret = FUNC;                                        \
+        MPIDI_OFI_ERR(_ret < 0, mpi_errno, MPI_ERR_OTHER,   \
+                      "**ofid_"#STR,                        \
+                      "**ofid_"#STR" %s %d %s %s",          \
+                      __SHORT_FILE__, __LINE__, FCNAME,     \
+                      fi_strerror(-_ret));                  \
+    } while (0)
+
+#define MPIDI_OFI_CALL_LOCK 1       /* LOCK value: the macro takes the FI mutex around FUNC */
+#define MPIDI_OFI_CALL_NO_LOCK 0    /* LOCK value: the macro does not lock around FUNC */
+#define MPIDI_OFI_CALL_RETRY(FUNC,STR,LOCK)   /* repeat FUNC while it returns -FI_EAGAIN */ \
+    do {                                                    \
+    ssize_t _ret;                                           \
+    do {                                                    \
+        if (LOCK == MPIDI_OFI_CALL_LOCK)                    \
+            MPID_THREAD_CS_ENTER(POBJ,MPIDI_OFI_THREAD_FI_MUTEX);   \
+        _ret = FUNC;                                        \
+        if (LOCK == MPIDI_OFI_CALL_LOCK)                    \
+            MPID_THREAD_CS_EXIT(POBJ,MPIDI_OFI_THREAD_FI_MUTEX);    \
+        if (likely(_ret==0)) break;   /* success: leave the retry loop */ \
+        MPIDI_OFI_ERR(_ret!=-FI_EAGAIN,   /* any other result is reported, not retried */ \
+                              mpi_errno,                    \
+                              MPI_ERR_OTHER,                \
+                              "**ofid_"#STR,                \
+                              "**ofid_"#STR" %s %d %s %s",  \
+                              __SHORT_FILE__,               \
+                              __LINE__,                     \
+                              FCNAME,                       \
+                              fi_strerror(-_ret));          \
+        if (LOCK == MPIDI_OFI_CALL_NO_LOCK)   /* NOTE(review): presumably the caller holds the mutex - confirm */ \
+            MPID_THREAD_CS_EXIT(POBJ,MPIDI_OFI_THREAD_FI_MUTEX);     \
+        MPIDI_OFI_PROGRESS_NONINLINE();       /* run progress before the next attempt */ \
+        if (LOCK == MPIDI_OFI_CALL_NO_LOCK)                 \
+            MPID_THREAD_CS_ENTER(POBJ,MPIDI_OFI_THREAD_FI_MUTEX);    \
+    } while (_ret == -FI_EAGAIN);                           \
+    } while (0)
+
+#define MPIDI_OFI_CALL_RETRY2(FUNC1,FUNC2,STR)   /* FUNC1 once, then FUNC2 while -FI_EAGAIN */ \
+    do {                                                    \
+    ssize_t _ret;                                           \
+    MPID_THREAD_CS_ENTER(POBJ,MPIDI_OFI_THREAD_FI_MUTEX);       \
+    FUNC1;                          /* evaluated once, after entering the FI mutex */ \
+    do {                                                    \
+        _ret = FUNC2;               /* evaluated before the mutex is released */ \
+        MPID_THREAD_CS_EXIT(POBJ,MPIDI_OFI_THREAD_FI_MUTEX);    \
+        if (likely(_ret==0)) break;   /* success: leave with the mutex released */ \
+        MPIDI_OFI_ERR(_ret!=-FI_EAGAIN,   /* any other result is reported, not retried */ \
+                              mpi_errno,                    \
+                              MPI_ERR_OTHER,                \
+                              "**ofid_"#STR,                \
+                              "**ofid_"#STR" %s %d %s %s",  \
+                              __SHORT_FILE__,               \
+                              __LINE__,                     \
+                              FCNAME,                       \
+                              fi_strerror(-_ret));          \
+        MPIDI_OFI_PROGRESS_NONINLINE();   /* progress runs after the EXIT above */ \
+        MPID_THREAD_CS_ENTER(POBJ,MPIDI_OFI_THREAD_FI_MUTEX);   \
+    } while (_ret == -FI_EAGAIN);                           \
+    } while (0)
+
+#define MPIDI_OFI_CALL_RETURN(FUNC, _ret)                               \
+        do {   /* _ret = FUNC, evaluated between CS_ENTER and CS_EXIT; no error check */ \
+            MPID_THREAD_CS_ENTER(POBJ,MPIDI_OFI_THREAD_FI_MUTEX);       \
+            (_ret) = FUNC;                                              \
+            MPID_THREAD_CS_EXIT(POBJ,MPIDI_OFI_THREAD_FI_MUTEX);        \
+        } while (0)
+/* pmi_errno = FUNC; any value other than PMI_SUCCESS is reported via MPIDI_OFI_ERR. */
+#define MPIDI_OFI_PMI_CALL_POP(FUNC,STR)                    \
+  do                                                          \
+    {                                                         \
+      pmi_errno  = FUNC;       /* pmi_errno must exist in the calling scope */ \
+      MPIDI_OFI_ERR(pmi_errno!=PMI_SUCCESS,           \
+                            mpi_errno,                        \
+                            MPI_ERR_OTHER,                    \
+                            "**ofid_"#STR,                    \
+                            "**ofid_"#STR" %s %d %s %s",      \
+                            __SHORT_FILE__,                   \
+                            __LINE__,                         \
+                            FCNAME,                           \
+                            #STR);                            \
+    } while (0)
+/* mpi_errno = FUNC; any value other than MPI_SUCCESS is handed to MPIR_ERR_POP. */
+#define MPIDI_OFI_MPI_CALL_POP(FUNC)                               \
+  do                                                                 \
+    {                                                                \
+      mpi_errno = FUNC;                                              \
+      if (unlikely(mpi_errno!=MPI_SUCCESS)) MPIR_ERR_POP(mpi_errno); \
+    } while (0)
+/* str_errno = FUNC; any value other than MPL_STR_SUCCESS is reported via MPIDI_OFI_ERR. */
+#define MPIDI_OFI_STR_CALL(FUNC,STR)                                   \
+  do                                                            \
+    {                                                           \
+      str_errno = FUNC;        /* str_errno must exist in the calling scope */ \
+      MPIDI_OFI_ERR(str_errno!=MPL_STR_SUCCESS,        \
+                            mpi_errno,                          \
+                            MPI_ERR_OTHER,                      \
+                            "**"#STR,                           \
+                            "**"#STR" %s %d %s %s",             \
+                            __SHORT_FILE__,                     \
+                            __LINE__,                           \
+                            FCNAME,                             \
+                            #STR);                              \
+    } while (0)
+
+#define MPIDI_OFI_REQUEST_CREATE(req, kind)   /* create a request and add one reference */ \
+    do {                                                      \
+        (req) = MPIR_Request_create(kind);  \
+        MPIR_Request_add_ref((req));   /* NOTE(review): no NULL check before this use */ \
+    } while (0)
+/* Create a SEND request and set its completion counter (cc) to 0. */
+#define MPIDI_OFI_SEND_REQUEST_CREATE_LW(req)                   \
+    do {                                                                \
+        (req) = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);           \
+        MPIR_cc_set(&(req)->cc, 0);    /* NOTE(review): no NULL check before this use */ \
+    } while (0)
+/* Thin wrapper around MPIDI_OFI_ssendack_request_t_tls_alloc(). */
+#define MPIDI_OFI_SSEND_ACKREQUEST_CREATE(req)            \
+    do {                                                          \
+        MPIDI_OFI_ssendack_request_t_tls_alloc(req);      \
+    } while (0)
+
+#define WINFO(w,rank) MPIDI_CH4U_WINFO(w,rank)
+/* Returns 0 when MPIDI_OFI_ENABLE_MR_SCALABLE is non-zero, else the window's winfo[rank].base. */
+__ALWAYS_INLINE__ uintptr_t MPIDI_OFI_winfo_base(MPIR_Win * w, int rank)
+{
+#if !MPIDI_OFI_ENABLE_MR_SCALABLE
+    return MPIDI_OFI_WIN(w).winfo[rank].base;
+#else
+    return 0;
+#endif
+}
+/* Returns the window-wide mr_key with MR_SCALABLE, else the per-rank winfo[rank].mr_key. */
+__ALWAYS_INLINE__ uint64_t MPIDI_OFI_winfo_mr_key(MPIR_Win * w, int rank)
+{
+#if !MPIDI_OFI_ENABLE_MR_SCALABLE
+    return MPIDI_OFI_WIN(w).winfo[rank].mr_key;
+#else
+    return MPIDI_OFI_WIN(w).mr_key;
+#endif
+}
+
+/*
+ * RMA issue-counter helpers.
+ *
+ * MPIDI_OFI_win_cntr_incr() increments the counter reached through the
+ * window's issued_cntr pointer and MPIDI_OFI_cntr_incr() increments
+ * MPIDI_Global.rma_issued_cntr; both do so in every configuration.  The
+ * *_conditional_* variants perform the same increments, except that their
+ * bodies are empty when MPIDI_OFI_CONFIG_USE_SCALABLE_ENDPOINTS is defined.
+ *
+ * The two parameterless helpers are declared with (void) so that they are
+ * true prototypes in C and stray arguments are diagnosed.
+ */
+/* Per-window increment; empty with scalable endpoints. */
+__ALWAYS_INLINE__ void MPIDI_OFI_win_conditional_cntr_incr(MPIR_Win * win)
+{
+#ifndef MPIDI_OFI_CONFIG_USE_SCALABLE_ENDPOINTS
+    (*MPIDI_OFI_WIN(win).issued_cntr)++;
+#endif
+}
+
+/* Per-window increment; unconditional. */
+__ALWAYS_INLINE__ void MPIDI_OFI_win_cntr_incr(MPIR_Win * win)
+{
+    (*MPIDI_OFI_WIN(win).issued_cntr)++;
+}
+
+/* Global increment; empty with scalable endpoints. */
+__ALWAYS_INLINE__ void MPIDI_OFI_conditional_cntr_incr(void)
+{
+#ifndef MPIDI_OFI_CONFIG_USE_SCALABLE_ENDPOINTS
+    MPIDI_Global.rma_issued_cntr++;
+#endif
+}
+
+/* Global increment; unconditional. */
+__ALWAYS_INLINE__ void MPIDI_OFI_cntr_incr(void)
+{
+    MPIDI_Global.rma_issued_cntr++;
+}
+
+/* Externs: see util.c for the definitions.  Parameterless ones use (void) to be prototypes. */
+extern int MPIDI_OFI_handle_cq_error_util(ssize_t ret);
+extern int MPIDI_OFI_progress_test_no_inline(void);
+extern int MPIDI_OFI_control_handler(void *am_hdr,
+                                     void **data, size_t * data_sz, int *is_contig,
+                                     MPIDI_NM_am_completion_handler_fn * cmpl_handler_fn,
+                                     MPIR_Request ** req);
+extern void MPIDI_OFI_map_create(void **map);
+extern void MPIDI_OFI_map_destroy(void *map);
+extern void MPIDI_OFI_map_set(void *_map, uint64_t id, void *val);
+extern void MPIDI_OFI_map_erase(void *_map, uint64_t id);
+extern void *MPIDI_OFI_map_lookup(void *_map, uint64_t id);
+extern int MPIDI_OFI_control_dispatch(void *buf);
+extern void MPIDI_OFI_index_datatypes(void);
+extern void MPIDI_OFI_index_allocator_create(void **_indexmap, int start);
+extern int MPIDI_OFI_index_allocator_alloc(void *_indexmap);
+extern void MPIDI_OFI_index_allocator_free(void *_indexmap, int index);
+extern void MPIDI_OFI_index_allocator_destroy(void *_indexmap);
+
+/* Common utility functions used by the C and C++ components. */
+/* Allocate an RMA request (zeroed past its header) with a zero-filled noncontig descriptor. */
+__ALWAYS_INLINE__ MPIDI_OFI_win_request_t *MPIDI_OFI_win_request_alloc_and_init(int extra)
+{
+    MPIDI_OFI_win_request_t *req;
+    req = (MPIDI_OFI_win_request_t *) MPIR_Request_create(MPIR_REQUEST_KIND__RMA);
+    MPIR_Assert(req != NULL);   /* do not memset() through a failed allocation */
+    memset((char *) req + MPIDI_REQUEST_HDR_SIZE, 0,
+           sizeof(MPIDI_OFI_win_request_t) - MPIDI_REQUEST_HDR_SIZE);
+    req->noncontig = (MPIDI_OFI_win_noncontig_t *) MPL_calloc(1, extra + sizeof(*req->noncontig));
+    MPIR_Assert(req->noncontig != NULL);        /* `extra` bytes follow the descriptor */
+    return req;
+}
+
+__ALWAYS_INLINE__ void MPIDI_OFI_win_datatype_unmap(MPIDI_OFI_win_datatype_t * dt)
+{
+    if (dt->map == &dt->__map) return;  /* map points at the embedded __map: nothing to free */
+    MPL_free(dt->map);
+}
+
+__ALWAYS_INLINE__ void MPIDI_OFI_win_request_complete(MPIDI_OFI_win_request_t * req)
+{
+    int remaining;
+    MPIR_Assert(HANDLE_GET_MPI_KIND(req->handle) == MPIR_REQUEST);
+    MPIR_Object_release_ref(req, &remaining);
+    MPIR_Assert(remaining >= 0);
+    if (remaining != 0)
+        return;                 /* references remain: keep the request alive */
+    MPIDI_OFI_win_datatype_unmap(&req->noncontig->target_dt);
+    MPIDI_OFI_win_datatype_unmap(&req->noncontig->origin_dt);
+    MPIDI_OFI_win_datatype_unmap(&req->noncontig->result_dt);
+    MPL_free(req->noncontig);
+    MPIR_Handle_obj_free(&MPIR_Request_mem, (req));
+}
+
+__ALWAYS_INLINE__ fi_addr_t MPIDI_OFI_comm_to_phys(MPIR_Comm * comm, int rank, int ep_family)
+{
+#ifndef MPIDI_OFI_CONFIG_USE_SCALABLE_ENDPOINTS
+    return MPIDI_OFI_COMM_TO_PHYS(comm, rank);  /* ep_family is not used in this configuration */
+#else /* scalable endpoints: rx index = ctx_offset of the selected endpoint + ep_family */
+    int ep = MPIDI_OFI_COMM_TO_EP(comm, rank);
+    int base = MPIDI_Global.ctx[ep].ctx_offset;
+    int rx = base + ep_family;
+    return fi_rx_addr(MPIDI_OFI_COMM_TO_PHYS(comm, rank), rx, MPIDI_OFI_MAX_ENDPOINTS_BITS);
+#endif
+}
+
+__ALWAYS_INLINE__ fi_addr_t MPIDI_OFI_to_phys(int rank, int ep_family)
+{
+#ifndef MPIDI_OFI_CONFIG_USE_SCALABLE_ENDPOINTS
+    return MPIDI_OFI_TO_PHYS(0, rank);          /* ep_family is not used in this configuration */
+#else /* scalable endpoints: always based on MPIDI_Global.ctx[0] */
+    int ep = 0;
+    int base = MPIDI_Global.ctx[ep].ctx_offset;
+    int rx = base + ep_family;
+    return fi_rx_addr(MPIDI_OFI_TO_PHYS(0, rank), rx, MPIDI_OFI_MAX_ENDPOINTS_BITS);
+#endif
+}
+
+__ALWAYS_INLINE__ bool MPIDI_OFI_is_tag_sync(uint64_t match_bits)
+{
+    return (match_bits & MPIDI_OFI_SYNC_SEND) != 0;     /* true iff any SYNC_SEND bit is set */
+}
+
+__ALWAYS_INLINE__ uint64_t MPIDI_OFI_init_sendtag(MPIR_Context_id_t contextid,
+                                                  int source, int tag, uint64_t type, int do_data)
+{
+    /* Bits are packed from the top down: contextid, then source (only when
+     * do_data is zero), then the masked tag OR'ed with `type`. */
+    uint64_t bits = contextid;
+
+    if (!do_data)
+        bits = (bits << MPIDI_OFI_SOURCE_SHIFT) | source;
+
+    bits <<= MPIDI_OFI_TAG_SHIFT;
+    bits |= (MPIDI_OFI_TAG_MASK & tag) | type;
+
+    return bits;
+}
+
+/* receive posting */
+__ALWAYS_INLINE__ uint64_t MPIDI_OFI_init_recvtag(uint64_t * mask_bits,
+                                                  MPIR_Context_id_t contextid,
+                                                  int source, int tag, int do_data)
+{
+    /* Builds the match bits for a receive and writes the companion mask to
+     * *mask_bits.  The mask starts as MPIDI_OFI_PROTOCOL_MASK and gains the
+     * source and/or tag mask when the corresponding wildcard is requested. */
+    uint64_t bits = contextid;
+
+    *mask_bits = MPIDI_OFI_PROTOCOL_MASK;
+
+    if (!do_data) {
+        /* a source field is only packed when do_data is zero */
+        bits <<= MPIDI_OFI_SOURCE_SHIFT;
+        if (MPI_ANY_SOURCE == source)
+            *mask_bits |= MPIDI_OFI_SOURCE_MASK;
+        else
+            bits |= source;
+    }
+
+    /* every path makes room for the tag exactly once */
+    bits <<= MPIDI_OFI_TAG_SHIFT;
+
+    if (MPI_ANY_TAG == tag)
+        *mask_bits |= MPIDI_OFI_TAG_MASK;
+    else
+        bits |= (MPIDI_OFI_TAG_MASK & tag);
+
+    return bits;
+}
+
+
+__ALWAYS_INLINE__ int MPIDI_OFI_init_get_tag(uint64_t match_bits)
+{
+    return (int) (MPIDI_OFI_TAG_MASK & match_bits);     /* keep only the tag field */
+}
+
+__ALWAYS_INLINE__ int MPIDI_OFI_init_get_source(uint64_t match_bits)
+{
+    return (int) ((MPIDI_OFI_SOURCE_MASK & match_bits) >> MPIDI_OFI_TAG_SHIFT);  /* source field */
+}
+
+__ALWAYS_INLINE__ MPIR_Request *MPIDI_OFI_context_to_request(void *context)
+{
+    /* context is treated as the address of the request's dev.ch4.netmod member */
+    return (MPIR_Request *) container_of((char *) context, MPIR_Request, dev.ch4.netmod);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_send_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Issue one tagged send; is_inject and do_data pick one of the four libfabric calls below. */
+__ALWAYS_INLINE__ int MPIDI_OFI_send_handler(struct fid_ep *ep, const void *buf, size_t len,
+                                             void *desc, uint64_t dest, fi_addr_t dest_addr,
+                                             uint64_t tag, void *context, int is_inject,
+                                             int do_data, int do_lock)
+{
+    int mpi_errno = MPI_SUCCESS;
+    if (is_inject && do_data) {
+        MPIDI_OFI_CALL_RETRY(fi_tinjectdata(ep, buf, len, dest, dest_addr, tag), tinjectdata,
+                             do_lock);
+    }
+    else if (is_inject) {
+        MPIDI_OFI_CALL_RETRY(fi_tinject(ep, buf, len, dest_addr, tag), tinject, do_lock);
+    }
+    else if (do_data) {
+        MPIDI_OFI_CALL_RETRY(fi_tsenddata(ep, buf, len, desc, dest, dest_addr, tag, context),
+                             tsenddata, do_lock);
+    }
+    else {
+        MPIDI_OFI_CALL_RETRY(fi_tsend(ep, buf, len, desc, dest_addr, tag, context), tsend,
+                             do_lock);
+    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#endif
diff --git a/src/mpid/ch4/netmod/ofi/ofi_init.h b/src/mpid/ch4/netmod/ofi/ofi_init.h
new file mode 100644
index 0000000..09e34d2
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_init.h
@@ -0,0 +1,896 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_INIT_H_INCLUDED
+#define NETMOD_OFI_INIT_H_INCLUDED
+
+#include "ofi_impl.h"
+#include "mpir_cvars.h"
+#include "pmi.h"
+
+static inline int MPIDI_OFI_choose_provider(struct fi_info *prov, struct fi_info **prov_use);
+static inline int MPIDI_OFI_create_endpoint(struct fi_info *prov_use,
+                                            struct fid_domain *domain,
+                                            struct fid_cq *p2p_cq,
+                                            struct fid_cntr *rma_ctr,
+                                            struct fid_av *av,
+                                            struct fid_ep **ep, int index, int do_scalable_ep);
+
+#define MPIDI_OFI_CHOOSE_PROVIDER(prov, prov_use,errstr)                          \
+    do {   /* reports "**ofid_addrinfo" with errstr when prov is NULL */ \
+        struct fi_info *p = (prov);     /* prov is evaluated exactly once */    \
+        MPIR_ERR_CHKANDJUMP4(p==NULL, mpi_errno,MPI_ERR_OTHER,"**ofid_addrinfo", \
+                             "**ofid_addrinfo %s %d %s %s",__SHORT_FILE__, \
+                             __LINE__,FCNAME, errstr);                  \
+        MPIDI_OFI_choose_provider(p, (prov_use));   /* result is not checked */ \
+    } while (0)     /* no trailing ';' so the macro is safe before an else */
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_init_generic
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_OFI_init_generic(int rank,
+                                         int size,
+                                         int appnum,
+                                         int *tag_ub,
+                                         MPIR_Comm * comm_world,
+                                         MPIR_Comm * comm_self,
+                                         int spawned,
+                                         int num_contexts,
+                                         void **netmod_contexts,
+                                         int do_av_table,
+                                         int do_scalable_ep,
+                                         int do_am,
+                                         int do_tagged,
+                                         int do_data, int do_stx_rma, int do_mr_scalable)
+{
+    int mpi_errno = MPI_SUCCESS, pmi_errno, i, fi_version;
+    int thr_err = 0, str_errno, maxlen;
+    char *table = NULL, *provname = NULL;
+    struct fi_info *hints, *prov, *prov_use;
+    struct fi_cq_attr cq_attr;
+    struct fi_cntr_attr cntr_attr;
+    fi_addr_t *mapped_table;
+    struct fi_av_attr av_attr;
+    char valS[MPIDI_KVSAPPSTRLEN], *val;
+    char keyS[MPIDI_KVSAPPSTRLEN];
+    size_t optlen;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_INIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_INIT);
+
+    CH4_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.netmod) ==
+                            offsetof(MPIDI_OFI_chunk_request, context));
+    CH4_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.netmod) ==
+                            offsetof(MPIDI_OFI_huge_recv_t, context));
+    CH4_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.netmod) ==
+                            offsetof(MPIDI_OFI_am_repost_request_t, context));
+    CH4_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.netmod) ==
+                            offsetof(MPIDI_OFI_ssendack_request_t, context));
+    CH4_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.netmod) ==
+                            offsetof(MPIDI_OFI_dynamic_process_request_t, context));
+    CH4_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.netmod) ==
+                            offsetof(MPIDI_OFI_win_request_t, context));
+    CH4_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.ch4u.netmod_am.ofi.context) ==
+                            offsetof(struct MPIR_Request, dev.ch4.netmod.ofi.context));
+    CH4_COMPILE_TIME_ASSERT(sizeof(MPIDI_Devreq_t) >= sizeof(MPIDI_OFI_request_t));
+    CH4_COMPILE_TIME_ASSERT(sizeof(MPIR_Request) >= sizeof(MPIDI_OFI_win_request_t));
+    CH4_COMPILE_TIME_ASSERT(sizeof(MPIDI_Devgpid_t) >= sizeof(MPIDI_OFI_gpid_t));
+    CH4_COMPILE_TIME_ASSERT(sizeof(MPIR_Context_id_t) * 8 >= MPIDI_OFI_AM_CONTEXT_ID_BITS);
+
+    *tag_ub = (1ULL << MPIDI_OFI_TAG_SHIFT) - 1;
+
+    MPID_Thread_mutex_create(&MPIDI_OFI_THREAD_UTIL_MUTEX, &thr_err);
+    MPID_Thread_mutex_create(&MPIDI_OFI_THREAD_PROGRESS_MUTEX, &thr_err);
+    MPID_Thread_mutex_create(&MPIDI_OFI_THREAD_FI_MUTEX, &thr_err);
+    MPID_Thread_mutex_create(&MPIDI_OFI_THREAD_SPAWN_MUTEX, &thr_err);
+
+    /* ------------------------------------------------------------------------ */
+    /* Hints to filter providers                                                */
+    /* See man fi_getinfo for a list                                            */
+    /* of all filters                                                           */
+    /* mode:  Select capabilities that this netmod will support                 */
+    /*        FI_CONTEXT:  This netmod will pass in context into communication  */
+    /*        to optimize storage locality between MPI requests and OFI opaque  */
+    /*        data structures.                                                  */
+    /*        FI_ASYNC_IOV:  MPICH will provide storage for iovecs on           */
+    /*        communication calls, avoiding the OFI provider needing to require */
+    /*        a copy.                                                           */
+    /*        FI_LOCAL_MR unset:  Note that we do not set FI_LOCAL_MR,          */
+    /*        which means this netmod does not support exchange of memory       */
+    /*        regions on communication calls.                                   */
+    /* caps:     Capabilities required from the provider.  The bits specified   */
+    /*           with buffered receive, cancel, and remote complete implements  */
+    /*           MPI semantics.                                                 */
+    /*           Tagged: used to support tag matching, 2-sided                  */
+    /*           RMA|Atomics:  supports MPI 1-sided                             */
+    /*           MSG|MULTI_RECV:  Supports synchronization protocol for 1-sided */
+    /*           FI_DIRECTED_RECV: Support not putting the source in the match  */
+    /*                             bits                                         */
+    /*           We expect to register all memory up front for use with this    */
+    /*           endpoint, so the netmod requires dynamic memory regions        */
+    /* ------------------------------------------------------------------------ */
+
+    /* ------------------------------------------------------------------------ */
+    /* fi_allocinfo: allocate and zero an fi_info structure and all related     */
+    /* substructures                                                            */
+    /* ------------------------------------------------------------------------ */
+    hints = fi_allocinfo();
+    MPIR_Assert(hints != NULL);
+
+    hints->mode = FI_CONTEXT | FI_ASYNC_IOV;    /* We can handle contexts  */
+    hints->caps = 0ULL; /* Tag matching interface  */
+    hints->caps |= FI_RMA;      /* RMA(read/write)         */
+    hints->caps |= FI_ATOMICS;  /* Atomics capabilities    */
+
+    if (do_tagged) {
+        hints->caps |= FI_TAGGED;       /* Tag matching interface  */
+    }
+
+    if (do_am) {
+        hints->caps |= FI_MSG;  /* Message Queue apis      */
+        hints->caps |= FI_MULTI_RECV;   /* Shared receive buffer   */
+    }
+
+    if (do_data) {
+        hints->caps |= FI_DIRECTED_RECV;        /* Match source address    */
+    }
+
+    /* ------------------------------------------------------------------------ */
+    /* FI_VERSION provides binary backward and forward compatibility support    */
+    /* Specify the version of OFI is coded to, the provider will select struct  */
+    /* layouts that are compatible with this version.                           */
+    /* ------------------------------------------------------------------------ */
+    fi_version = FI_VERSION(MPIDI_OFI_MAJOR_VERSION, MPIDI_OFI_MINOR_VERSION);
+
+    /* ------------------------------------------------------------------------ */
+    /* Set object options to be filtered by getinfo                             */
+    /* domain_attr:  domain attribute requirements                              */
+    /* op_flags:     persistent flag settings for an endpoint                   */
+    /* endpoint type:  see FI_EP_RDM                                            */
+    /* Filters applied (for this netmod, we need providers that can support):   */
+    /* THREAD_DOMAIN:  Progress serialization is handled by netmod (locking)    */
+    /* PROGRESS_AUTO:  request providers that make progress without requiring   */
+    /*                 the ADI to dedicate a thread to advance the state        */
+    /* FI_DELIVERY_COMPLETE:  RMA operations are visible in remote memory       */
+    /* FI_COMPLETION:  Selective completions of RMA ops                         */
+    /* FI_EP_RDM:  Reliable datagram                                            */
+    /* ------------------------------------------------------------------------ */
+    hints->addr_format = FI_FORMAT_UNSPEC;
+    hints->domain_attr->threading = FI_THREAD_DOMAIN;
+    hints->domain_attr->control_progress = FI_PROGRESS_MANUAL;
+    hints->domain_attr->data_progress = FI_PROGRESS_MANUAL;
+    hints->domain_attr->resource_mgmt = FI_RM_ENABLED;
+    hints->domain_attr->av_type = do_av_table ? FI_AV_TABLE : FI_AV_MAP;
+    hints->domain_attr->mr_mode = do_mr_scalable ? FI_MR_SCALABLE : FI_MR_BASIC;
+    hints->tx_attr->op_flags = FI_DELIVERY_COMPLETE | FI_COMPLETION;
+    hints->tx_attr->msg_order = FI_ORDER_SAS;
+    hints->tx_attr->comp_order = FI_ORDER_NONE;
+    hints->rx_attr->op_flags = FI_COMPLETION;
+    hints->rx_attr->total_buffered_recv = 0;    /* FI_RM_ENABLED ensures buffering of unexpected messages */
+    hints->ep_attr->type = FI_EP_RDM;
+
+    /* ------------------------------------------------------------------------ */
+    /* fi_getinfo:  returns information about fabric  services for reaching a   */
+    /* remote node or service.  this does not necessarily allocate resources.   */
+    /* Pass NULL for name/service because we want a list of providers supported */
+    /* ------------------------------------------------------------------------ */
+    provname = MPIR_CVAR_OFI_USE_PROVIDER ? (char *) MPL_strdup(MPIR_CVAR_OFI_USE_PROVIDER) : NULL;
+    hints->fabric_attr->prov_name = provname;
+    MPIDI_OFI_CALL(fi_getinfo(fi_version, NULL, NULL, 0ULL, hints, &prov), addrinfo);
+    MPIDI_OFI_CHOOSE_PROVIDER(prov, &prov_use, "No suitable provider provider found");
+
+    MPIDI_Global.prov_use = fi_dupinfo(prov_use);
+    MPIR_Assert(MPIDI_Global.prov_use);
+
+    /* ------------------------------------------------------------------------ */
+    /* Set global attributes based on the provider choice                       */
+    /* ------------------------------------------------------------------------ */
+    MPIDI_Global.max_buffered_send = prov_use->tx_attr->inject_size;
+    MPIDI_Global.max_buffered_write = prov_use->tx_attr->inject_size;
+    MPIDI_Global.max_send = prov_use->ep_attr->max_msg_size;
+    MPIDI_Global.max_write = prov_use->ep_attr->max_msg_size;
+    MPIDI_Global.iov_limit = MIN(prov_use->tx_attr->iov_limit, MPIDI_OFI_IOV_MAX);
+    MPIDI_Global.rma_iov_limit = MIN(prov_use->tx_attr->rma_iov_limit, MPIDI_OFI_IOV_MAX);
+    MPIDI_Global.max_mr_key_size = prov_use->domain_attr->mr_key_size;
+
+    if (MPIDI_Global.max_mr_key_size >= 8) {
+        MPIDI_Global.max_windows_bits = MPIDI_OFI_MAX_WINDOWS_BITS_64;
+        MPIDI_Global.max_huge_rma_bits = MPIDI_OFI_MAX_HUGE_RMA_BITS_64;
+        MPIDI_Global.max_huge_rmas = MPIDI_OFI_MAX_HUGE_RMAS_64;
+        MPIDI_Global.huge_rma_shift = MPIDI_OFI_HUGE_RMA_SHIFT_64;
+        MPIDI_Global.context_shift = MPIDI_OFI_CONTEXT_SHIFT_64;
+    }
+    else if (MPIDI_Global.max_mr_key_size >= 4) {
+        MPIDI_Global.max_windows_bits = MPIDI_OFI_MAX_WINDOWS_BITS_32;
+        MPIDI_Global.max_huge_rma_bits = MPIDI_OFI_MAX_HUGE_RMA_BITS_32;
+        MPIDI_Global.max_huge_rmas = MPIDI_OFI_MAX_HUGE_RMAS_32;
+        MPIDI_Global.huge_rma_shift = MPIDI_OFI_HUGE_RMA_SHIFT_32;
+        MPIDI_Global.context_shift = MPIDI_OFI_CONTEXT_SHIFT_32;
+    }
+    else if (MPIDI_Global.max_mr_key_size >= 2) {
+        MPIDI_Global.max_windows_bits = MPIDI_OFI_MAX_WINDOWS_BITS_16;
+        MPIDI_Global.max_huge_rma_bits = MPIDI_OFI_MAX_HUGE_RMA_BITS_16;
+        MPIDI_Global.max_huge_rmas = MPIDI_OFI_MAX_HUGE_RMAS_16;
+        MPIDI_Global.huge_rma_shift = MPIDI_OFI_HUGE_RMA_SHIFT_16;
+        MPIDI_Global.context_shift = MPIDI_OFI_CONTEXT_SHIFT_16;
+    }
+    else {
+        MPIR_ERR_SETFATALANDJUMP4(mpi_errno,
+                                  MPI_ERR_OTHER,
+                                  "**ofid_rma_init",
+                                  "**ofid_rma_init %s %d %s %s",
+                                  __SHORT_FILE__, __LINE__, FCNAME, "Key space too small");
+    }
+
+    /* ------------------------------------------------------------------------ */
+    /* Open fabric                                                              */
+    /* The getinfo struct returns a fabric attribute struct that can be used to */
+    /* instantiate the virtual or physical network.  This opens a "fabric       */
+    /* provider".   We choose the first available fabric, but getinfo           */
+    /* returns a list.                                                          */
+    /* ------------------------------------------------------------------------ */
+    MPIDI_OFI_CALL(fi_fabric(prov_use->fabric_attr, &MPIDI_Global.fabric, NULL), fabric);
+
+    /* ------------------------------------------------------------------------ */
+    /* Create the access domain, which is the physical or virtual network or    */
+    /* hardware port/collection of ports.  Returns a domain object that can be  */
+    /* used to create endpoints.                                                */
+    /* ------------------------------------------------------------------------ */
+    MPIDI_OFI_CALL(fi_domain(MPIDI_Global.fabric, prov_use, &MPIDI_Global.domain, NULL),
+                   opendomain);
+
+    /* ------------------------------------------------------------------------ */
+    /* Create the objects that will be bound to the endpoint.                   */
+    /* The objects include:                                                     */
+    /*     * dynamic memory-spanning memory region                              */
+    /*     * completion queues for events                                       */
+    /*     * counters for rma operations                                        */
+    /*     * address vector of other endpoint addresses                         */
+    /* ------------------------------------------------------------------------ */
+
+    /* ------------------------------------------------------------------------ */
+    /* Construct:  Completion Queues                                            */
+    /* ------------------------------------------------------------------------ */
+    memset(&cq_attr, 0, sizeof(cq_attr));
+    cq_attr.format = FI_CQ_FORMAT_TAGGED;
+    MPIDI_OFI_CALL(fi_cq_open(MPIDI_Global.domain,      /* In:  Domain Object                */
+                              &cq_attr, /* In:  Configuration object         */
+                              &MPIDI_Global.p2p_cq,     /* Out: CQ Object                    */
+                              NULL), opencq);   /* In:  Context for cq events        */
+
+    /* ------------------------------------------------------------------------ */
+    /* Construct:  Counters                                                     */
+    /* ------------------------------------------------------------------------ */
+    memset(&cntr_attr, 0, sizeof(cntr_attr));
+    cntr_attr.events = FI_CNTR_EVENTS_COMP;
+    MPIDI_OFI_CALL(fi_cntr_open(MPIDI_Global.domain,    /* In:  Domain Object        */
+                                &cntr_attr,     /* In:  Configuration object */
+                                &MPIDI_Global.rma_cmpl_cntr,    /* Out: Counter Object       */
+                                NULL), openct); /* Context: counter events   */
+
+    /* ------------------------------------------------------------------------ */
+    /* Construct:  Address Vector                                               */
+    /* ------------------------------------------------------------------------ */
+
+    memset(&av_attr, 0, sizeof(av_attr));
+
+
+    if (do_av_table) {
+        av_attr.type = FI_AV_TABLE;
+        mapped_table = NULL;
+    }
+    else {
+        av_attr.type = FI_AV_MAP;
+        mapped_table = (fi_addr_t *) MPL_malloc(size * sizeof(fi_addr_t));
+    }
+
+    av_attr.rx_ctx_bits = MPIDI_OFI_MAX_ENDPOINTS_BITS;
+
+    MPIDI_OFI_CALL(fi_av_open(MPIDI_Global.domain,      /* In:  Domain Object         */
+                              &av_attr, /* In:  Configuration object  */
+                              &MPIDI_Global.av, /* Out: AV Object             */
+                              NULL), avopen);   /* Context: AV events         */
+
+    /* ------------------------------------------------------------------------ */
+    /* Construct:  Shared TX Context for RMA                                    */
+    /* ------------------------------------------------------------------------ */
+    if (do_stx_rma) {
+        int ret;
+        struct fi_tx_attr tx_attr;
+        memset(&tx_attr, 0, sizeof(tx_attr));
+        MPIDI_OFI_CALL_RETURN(fi_stx_context(MPIDI_Global.domain,
+                                             &tx_attr,
+                                             &MPIDI_Global.stx_ctx, NULL /* context */), ret);
+        if (ret < 0) {
+            MPL_DBG_MSG(MPIDI_CH4_DBG_GENERAL, VERBOSE,
+                        "Failed to create shared TX context for RMA, "
+                        "falling back to global EP/counter scheme");
+            MPIDI_Global.stx_ctx = NULL;
+        }
+    }
+
+    /* ------------------------------------------------------------------------ */
+    /* Create a transport level communication endpoint.  To use the endpoint,   */
+    /* it must be bound to completion counters or event queues and enabled,     */
+    /* and the resources consumed by it, such as address vectors, counters,     */
+    /* completion queues, etc.                                                  */
+    /* ------------------------------------------------------------------------ */
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_create_endpoint(prov_use,
+                                                     MPIDI_Global.domain,
+                                                     MPIDI_Global.p2p_cq,
+                                                     MPIDI_Global.rma_cmpl_cntr,
+                                                     MPIDI_Global.av,
+                                                     &MPIDI_Global.ep, 0, do_scalable_ep));
+
+    /* ---------------------------------- */
+    /* Get our endpoint name and publish  */
+    /* the socket to the KVS              */
+    /* ---------------------------------- */
+    MPIDI_Global.addrnamelen = FI_NAME_MAX;
+    MPIDI_OFI_CALL(fi_getname((fid_t) MPIDI_Global.ep, MPIDI_Global.addrname,
+                              &MPIDI_Global.addrnamelen), getname);
+    MPIR_Assert(MPIDI_Global.addrnamelen <= FI_NAME_MAX);
+
+    val = valS;
+    str_errno = MPL_STR_SUCCESS;
+    maxlen = MPIDI_KVSAPPSTRLEN;
+    memset(val, 0, maxlen);
+    MPIDI_OFI_STR_CALL(MPL_str_add_binary_arg(&val, &maxlen, "OFI", (char *) &MPIDI_Global.addrname,
+                                              MPIDI_Global.addrnamelen), buscard_len);
+    MPIDI_OFI_PMI_CALL_POP(PMI_KVS_Get_my_name(MPIDI_Global.kvsname, MPIDI_KVSAPPSTRLEN), pmi);
+
+    val = valS;
+    sprintf(keyS, "OFI-%d", rank);
+    MPIDI_OFI_PMI_CALL_POP(PMI_KVS_Put(MPIDI_Global.kvsname, keyS, val), pmi);
+    MPIDI_OFI_PMI_CALL_POP(PMI_KVS_Commit(MPIDI_Global.kvsname), pmi);
+    MPIDI_OFI_PMI_CALL_POP(PMI_Barrier(), pmi);
+
+    /* -------------------------------- */
+    /* Create our address table from    */
+    /* encoded KVS values               */
+    /* -------------------------------- */
+    table = (char *) MPL_malloc(size * MPIDI_Global.addrnamelen);
+    maxlen = MPIDI_KVSAPPSTRLEN;
+
+    for (i = 0; i < size; i++) {
+        sprintf(keyS, "OFI-%d", i);
+        MPIDI_OFI_PMI_CALL_POP(PMI_KVS_Get(MPIDI_Global.kvsname, keyS, valS, MPIDI_KVSAPPSTRLEN),
+                               pmi);
+        MPIDI_OFI_STR_CALL(MPL_str_get_binary_arg
+                           (valS, "OFI", (char *) &table[i * MPIDI_Global.addrnamelen],
+                            MPIDI_Global.addrnamelen, &maxlen), buscard_len);
+    }
+
+    /* -------------------------------- */
+    /* Table is constructed.  Map it    */
+    /* -------------------------------- */
+    MPIDI_OFI_CALL(fi_av_insert(MPIDI_Global.av, table, size, mapped_table, 0ULL, NULL), avmap);
+    if (!do_av_table) { /* AV_MAP */
+        for (i = 0; i < size; i++) {
+            MPIDI_OFI_AV(&MPIDIU_get_av(0, i)).dest = mapped_table[i];
+        }
+        MPL_free(mapped_table);
+    }
+
+    /* -------------------------------- */
+    /* Create the id to object maps     */
+    /* -------------------------------- */
+    MPIDI_OFI_map_create(&MPIDI_Global.win_map);
+
+    /* ---------------------------------- */
+    /* Initialize Active Message          */
+    /* ---------------------------------- */
+    if (do_am) {
+        /* Maximum possible message size for short message send (=eager send)
+         * See MPIDI_OFI_do_send_am for short/long switching logic */
+        MPIR_Assert(MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE <= MPIDI_Global.max_send);
+        MPIDI_Global.am_buf_pool =
+            MPIDI_CH4U_create_buf_pool(MPIDI_OFI_BUF_POOL_NUM, MPIDI_OFI_BUF_POOL_SIZE);
+        mpi_errno = MPIDI_CH4U_init(comm_world, comm_self, num_contexts, netmod_contexts);
+
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+
+        slist_init(&MPIDI_Global.cq_buff_list);
+        MPIDI_Global.cq_buff_head = MPIDI_Global.cq_buff_tail = 0;
+        optlen = MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE;
+
+        MPIDI_OFI_CALL(fi_setopt(&(MPIDI_OFI_EP_RX_MSG(0)->fid),
+                                 FI_OPT_ENDPOINT,
+                                 FI_OPT_MIN_MULTI_RECV, &optlen, sizeof(optlen)), setopt);
+
+        for (i = 0; i < MPIDI_OFI_NUM_AM_BUFFERS; i++) {
+            MPIDI_Global.am_bufs[i] = MPL_malloc(MPIDI_OFI_AM_BUFF_SZ);
+            MPIDI_Global.am_reqs[i].event_id = MPIDI_OFI_EVENT_AM_RECV;
+            MPIDI_Global.am_reqs[i].index = i;
+            MPIR_Assert(MPIDI_Global.am_bufs[i]);
+            MPIDI_Global.am_iov[i].iov_base = MPIDI_Global.am_bufs[i];
+            MPIDI_Global.am_iov[i].iov_len = MPIDI_OFI_AM_BUFF_SZ;
+            MPIDI_Global.am_msg[i].msg_iov = &MPIDI_Global.am_iov[i];
+            MPIDI_Global.am_msg[i].desc = NULL;
+            MPIDI_Global.am_msg[i].addr = FI_ADDR_UNSPEC;
+            MPIDI_Global.am_msg[i].context = &MPIDI_Global.am_reqs[i].context;
+            MPIDI_Global.am_msg[i].iov_count = 1;
+            MPIDI_OFI_CALL_RETRY(fi_recvmsg(MPIDI_OFI_EP_RX_MSG(0),
+                                            &MPIDI_Global.am_msg[i],
+                                            FI_MULTI_RECV | FI_COMPLETION), prepost,
+                                 MPIDI_OFI_CALL_LOCK);
+        }
+
+        /* Grow the header handlers down */
+        MPIDI_Global.am_handlers[MPIDI_OFI_INTERNAL_HANDLER_CONTROL] = MPIDI_OFI_control_handler;
+        MPIDI_Global.am_send_cmpl_handlers[MPIDI_OFI_INTERNAL_HANDLER_CONTROL] = NULL;
+    }
+    OPA_store_int(&MPIDI_Global.am_inflight_inject_emus, 0);
+    OPA_store_int(&MPIDI_Global.am_inflight_rma_send_mrs, 0);
+
+    /* max_inject_size is temporarily set to 1 in order to avoid deadlock in
+     * shm initialization, since PMI_Barrier does not call progress and flush its injects */
+    MPIDI_Global.max_buffered_send = 1;
+    MPIDI_Global.max_buffered_write = 1;
+
+    MPIDI_Global.max_buffered_send = prov_use->tx_attr->inject_size;
+    MPIDI_Global.max_buffered_write = prov_use->tx_attr->inject_size;
+
+    MPIR_Datatype_init_names();
+    MPIDI_OFI_index_datatypes();
+
+    /* -------------------------------- */
+    /* Initialize Dynamic Tasking       */
+    /* -------------------------------- */
+    if (spawned) {
+        char parent_port[MPIDI_MAX_KVS_VALUE_LEN];
+        MPIDI_OFI_PMI_CALL_POP(PMI_KVS_Get(MPIDI_Global.kvsname,
+                                           MPIDI_PARENT_PORT_KVSKEY,
+                                           parent_port, MPIDI_MAX_KVS_VALUE_LEN), pmi);
+        MPIDI_OFI_MPI_CALL_POP(MPIDI_Comm_connect
+                               (parent_port, NULL, 0, comm_world, &MPIR_Process.comm_parent));
+        MPIR_Assert(MPIR_Process.comm_parent != NULL);
+        MPL_strncpy(MPIR_Process.comm_parent->name, "MPI_COMM_PARENT", MPI_MAX_OBJECT_NAME);
+    }
+
+  fn_exit:
+
+    /* -------------------------------- */
+    /* Free temporary resources         */
+    /* -------------------------------- */
+    if (provname) {
+        MPL_free(provname);
+        hints->fabric_attr->prov_name = NULL;
+    }
+
+    if (prov)
+        fi_freeinfo(prov);
+
+    fi_freeinfo(hints);
+
+    if (table)
+        MPL_free(table);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_INIT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod initialization entry point.
+ *
+ * Passes every caller-supplied argument through to
+ * MPIDI_OFI_init_generic() unchanged and appends the MPIDI_OFI_ENABLE_*
+ * feature selectors (AV table, scalable endpoints, active messages,
+ * tagged, data, shared-TX RMA, scalable MR), in that order.
+ *
+ * Returns whatever MPIDI_OFI_init_generic() returns.
+ */
+static inline int MPIDI_NM_init(int rank,
+                                int size,
+                                int appnum,
+                                int *tag_ub,
+                                MPIR_Comm * comm_world,
+                                MPIR_Comm * comm_self,
+                                int spawned, int num_contexts, void **netmod_contexts)
+{
+    return MPIDI_OFI_init_generic(rank, size, appnum, tag_ub,
+                                  comm_world, comm_self,
+                                  spawned, num_contexts, netmod_contexts,
+                                  MPIDI_OFI_ENABLE_AV_TABLE,
+                                  MPIDI_OFI_ENABLE_SCALABLE_ENDPOINTS,
+                                  MPIDI_OFI_ENABLE_AM,
+                                  MPIDI_OFI_ENABLE_TAGGED,
+                                  MPIDI_OFI_ENABLE_DATA,
+                                  MPIDI_OFI_ENABLE_STX_RMA,
+                                  MPIDI_OFI_ENABLE_MR_SCALABLE);
+}
+
+
+
+
+static inline int MPIDI_OFI_finalize_generic(int do_scalable_ep, int do_am, int do_stx_rma)
+{
+    int thr_err = 0, mpi_errno = MPI_SUCCESS;
+    int i = 0;
+    int barrier[2] = { 0 };
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    MPIR_Comm *comm;
+
+    /* Progress until we drain all inflight RMA send long buffers */
+    while (OPA_load_int(&MPIDI_Global.am_inflight_rma_send_mrs) > 0)
+        MPIDI_OFI_PROGRESS();
+
+    /* Barrier over allreduce, but force non-immediate send */
+    MPIDI_Global.max_buffered_send = 0;
+    MPIDI_OFI_MPI_CALL_POP(MPIR_Allreduce_impl(&barrier[0], &barrier[1], 1, MPI_INT,
+                                               MPI_SUM, MPIR_Process.comm_world, &errflag));
+
+    /* Progress until we drain all inflight injection emulation requests */
+    while (OPA_load_int(&MPIDI_Global.am_inflight_inject_emus) > 0)
+        MPIDI_OFI_PROGRESS();
+    MPIR_Assert(OPA_load_int(&MPIDI_Global.am_inflight_inject_emus) == 0);
+
+    if (do_scalable_ep) {
+        MPIDI_OFI_CALL(fi_close((fid_t) MPIDI_OFI_EP_TX_TAG(0)), epclose);
+        MPIDI_OFI_CALL(fi_close((fid_t) MPIDI_OFI_EP_TX_RMA(0)), epclose);
+        MPIDI_OFI_CALL(fi_close((fid_t) MPIDI_OFI_EP_TX_MSG(0)), epclose);
+        MPIDI_OFI_CALL(fi_close((fid_t) MPIDI_OFI_EP_TX_CTR(0)), epclose);
+
+        MPIDI_OFI_CALL(fi_close((fid_t) MPIDI_OFI_EP_RX_TAG(0)), epclose);
+        MPIDI_OFI_CALL(fi_close((fid_t) MPIDI_OFI_EP_RX_RMA(0)), epclose);
+        MPIDI_OFI_CALL(fi_close((fid_t) MPIDI_OFI_EP_RX_MSG(0)), epclose);
+        MPIDI_OFI_CALL(fi_close((fid_t) MPIDI_OFI_EP_RX_CTR(0)), epclose);
+    }
+
+    if (do_stx_rma && MPIDI_Global.stx_ctx != NULL)
+        MPIDI_OFI_CALL(fi_close(&MPIDI_Global.stx_ctx->fid), stx_ctx_close);
+    MPIDI_OFI_CALL(fi_close(&MPIDI_Global.ep->fid), epclose);
+    MPIDI_OFI_CALL(fi_close(&MPIDI_Global.av->fid), avclose);
+    MPIDI_OFI_CALL(fi_close(&MPIDI_Global.p2p_cq->fid), cqclose);
+    MPIDI_OFI_CALL(fi_close(&MPIDI_Global.rma_cmpl_cntr->fid), cqclose);
+    MPIDI_OFI_CALL(fi_close(&MPIDI_Global.domain->fid), domainclose);
+
+    fi_freeinfo(MPIDI_Global.prov_use);
+
+    /* --------------------------------------- */
+    /* Free comm world addr table              */
+    /* --------------------------------------- */
+    comm = MPIR_Process.comm_world;
+    MPIR_Comm_release_always(comm);
+
+    comm = MPIR_Process.comm_self;
+    MPIR_Comm_release_always(comm);
+
+    MPIDI_CH4U_finalize();
+
+    MPIDI_OFI_map_destroy(MPIDI_Global.win_map);
+
+    if (do_am) {
+        for (i = 0; i < MPIDI_OFI_NUM_AM_BUFFERS; i++)
+            MPL_free(MPIDI_Global.am_bufs[i]);
+
+        MPIDI_CH4R_destroy_buf_pool(MPIDI_Global.am_buf_pool);
+
+        MPIR_Assert(MPIDI_Global.cq_buff_head == MPIDI_Global.cq_buff_tail);
+        MPIR_Assert(slist_empty(&MPIDI_Global.cq_buff_list));
+    }
+
+    PMI_Finalize();
+
+    MPID_Thread_mutex_destroy(&MPIDI_OFI_THREAD_UTIL_MUTEX, &thr_err);
+    MPID_Thread_mutex_destroy(&MPIDI_OFI_THREAD_PROGRESS_MUTEX, &thr_err);
+    MPID_Thread_mutex_destroy(&MPIDI_OFI_THREAD_FI_MUTEX, &thr_err);
+    MPID_Thread_mutex_destroy(&MPIDI_OFI_THREAD_SPAWN_MUTEX, &thr_err);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/* Netmod finalize entry point: runs MPIDI_OFI_finalize_generic() with the
+ * MPIDI_OFI_ENABLE_* selectors for scalable endpoints, active messages and
+ * shared-TX RMA, and returns its result. */
+static inline int MPIDI_NM_finalize(void)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_OFI_finalize_generic(MPIDI_OFI_ENABLE_SCALABLE_ENDPOINTS,
+                                           MPIDI_OFI_ENABLE_AM,
+                                           MPIDI_OFI_ENABLE_STX_RMA);
+    return mpi_errno;
+}
+
+/* Allocate `size` bytes with MPL_malloc() and return the resulting pointer.
+ * `info_ptr` is accepted for interface compatibility but is not consulted. */
+static inline void *MPIDI_NM_alloc_mem(size_t size, MPIR_Info * info_ptr)
+{
+    return MPL_malloc(size);
+}
+
+/* Release `ptr` with MPL_free().  Always returns MPI_SUCCESS. */
+static inline int MPIDI_NM_free_mem(void *ptr)
+{
+    MPL_free(ptr);
+    return MPI_SUCCESS;
+}
+
+/*
+ * Translate rank `idx` of `comm_ptr` into an lpid and store it in *lpid_ptr.
+ *
+ * The local-group lookup (MPIDIU_comm_rank_to_pid_local) is used only when
+ * the communicator is not an intracommunicator and `is_remote` is false;
+ * every other combination goes through MPIDIU_comm_rank_to_pid.  The lpid
+ * is then built from the resulting (avtid, lpid) pair with
+ * MPIDIU_LPID_CREATE.
+ *
+ * Always returns MPI_SUCCESS.
+ */
+static inline int MPIDI_NM_comm_get_lpid(MPIR_Comm * comm_ptr,
+                                         int idx, int *lpid_ptr, MPL_bool is_remote)
+{
+    int lpid = 0;
+    int avtid = 0;
+
+    if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM || is_remote) {
+        MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid);
+    }
+    else {
+        MPIDIU_comm_rank_to_pid_local(comm_ptr, idx, &lpid, &avtid);
+    }
+
+    *lpid_ptr = MPIDIU_LPID_CREATE(avtid, lpid);
+    return MPI_SUCCESS;
+}
+
+/*
+ * Fill the OFI address stored in `gpid` for `rank` of `comm_ptr`.
+ *
+ * Looks the rank's physical address up in the global address vector with
+ * fi_av_lookup() and writes the raw address into MPIDI_OFI_GPID(gpid).addr.
+ * Asserts that `rank` is below comm_ptr->local_size and that the length
+ * reported by the lookup fits into the addr field.
+ *
+ * Returns mpi_errno (MPI_SUCCESS unless MPIDI_OFI_CALL reported a failure).
+ */
+static inline int MPIDI_NM_gpid_get(MPIR_Comm * comm_ptr, int rank, MPIR_Gpid * gpid)
+{
+    int mpi_errno = MPI_SUCCESS;
+    /* in: capacity of the destination field; out: length reported by the lookup */
+    size_t addr_len = sizeof(MPIDI_OFI_GPID(gpid).addr);
+
+    MPIR_Assert(rank < comm_ptr->local_size);
+
+    MPIDI_OFI_CALL(fi_av_lookup(MPIDI_Global.av,
+                                MPIDI_OFI_COMM_TO_PHYS(comm_ptr, rank),
+                                &MPIDI_OFI_GPID(gpid).addr,
+                                &addr_len), avlookup);
+    MPIR_Assert(addr_len <= sizeof(MPIDI_OFI_GPID(gpid).addr));
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/* Delegate the node-id query for `rank` of `comm` to MPIDI_CH4U_get_node_id(),
+ * which receives `id_p` as its output argument.  Always returns MPI_SUCCESS. */
+static inline int MPIDI_NM_get_node_id(MPIR_Comm * comm, int rank, MPID_Node_id_t * id_p)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIDI_CH4U_get_node_id(comm, rank, id_p);
+
+    return mpi_errno;
+}
+
+/* Delegate the maximum-node-id query for `comm` to
+ * MPIDI_CH4U_get_max_node_id(), which receives `max_id_p` as its output
+ * argument.  Always returns MPI_SUCCESS. */
+static inline int MPIDI_NM_get_max_node_id(MPIR_Comm * comm, MPID_Node_id_t * max_id_p)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIDI_CH4U_get_max_node_id(comm, max_id_p);
+
+    return mpi_errno;
+}
+
+/*
+ * Collect the gpid of every local rank of `comm_ptr` into local_gpids[],
+ * calling MPIDI_GPID_Get() once per rank.
+ *
+ * The loop bound is comm_ptr->local_size; the `local_size` argument and
+ * `singleAVT` are not used by this implementation, and any value produced
+ * by MPIDI_GPID_Get() is ignored.
+ *
+ * Always returns 0.
+ */
+static inline int MPIDI_NM_getallincomm(MPIR_Comm * comm_ptr,
+                                        int local_size, MPIR_Gpid local_gpids[], int *singleAVT)
+{
+    int rank = 0;
+
+    while (rank < comm_ptr->local_size) {
+        MPIDI_GPID_Get(comm_ptr, rank, &local_gpids[rank]);
+        rank++;
+    }
+
+    return 0;
+}
+
+static inline int MPIDI_NM_gpid_tolpidarray_generic(int size,
+                                                    MPIR_Gpid gpid[], int lpid[], int use_av_table)
+{
+    int i, mpi_errno = MPI_SUCCESS;
+    int *new_avt_procs;
+    int n_new_procs = 0;
+    int max_n_avts;
+    new_avt_procs = (int *) MPL_malloc(size * sizeof(int));
+    max_n_avts = MPIDIU_get_max_n_avts();
+
+    for (i = 0; i < size; i++) {
+        int j, k;
+        char tbladdr[FI_NAME_MAX];
+        int found = 0;
+
+        for (k = 0; k < max_n_avts; k++) {
+            if (MPIDIU_get_av_table(k) == NULL) {
+                continue;
+            }
+            for (j = 0; j < MPIDIU_get_av_table(k)->size; j++) {
+                size_t sz = sizeof(MPIDI_OFI_GPID(&gpid[i]).addr);
+                MPIDI_OFI_CALL(fi_av_lookup
+                               (MPIDI_Global.av, MPIDI_OFI_TO_PHYS(k, j), &tbladdr, &sz), avlookup);
+                MPIR_Assert(sz <= sizeof(MPIDI_OFI_GPID(&gpid[i]).addr));
+
+                if (!memcmp(tbladdr, MPIDI_OFI_GPID(&gpid[i]).addr, sz)) {
+                    lpid[i] = MPIDIU_LPID_CREATE(k, j);
+                    found = 1;
+                    break;
+                }
+            }
+        }
+
+        if (!found) {
+            new_avt_procs[n_new_procs] = i;
+            n_new_procs++;
+        }
+    }
+
+    /* create new av_table, insert processes */
+    if (n_new_procs > 0) {
+        int avtid;
+        MPIDIU_new_avt(n_new_procs, &avtid);
+
+        for (i = 0; i < n_new_procs; i++) {
+            if (use_av_table) { /* logical addressing */
+                MPIDI_OFI_CALL(fi_av_insert
+                               (MPIDI_Global.av, &MPIDI_OFI_GPID(&gpid[new_avt_procs[i]]).addr, 1,
+                                NULL, 0ULL, NULL), avmap);
+                /* FIXME: get logical address */
+            }
+            else {
+                MPIDI_OFI_CALL(fi_av_insert
+                               (MPIDI_Global.av, &MPIDI_OFI_GPID(&gpid[new_avt_procs[i]]).addr, 1,
+                                (fi_addr_t *) & MPIDI_OFI_AV(&MPIDIU_get_av(avtid, i)).dest, 0ULL,
+                                NULL), avmap);
+            }
+            /* highest bit is marked as 1 to indicate this is a new process */
+            lpid[i] = MPIDIU_LPID_CREATE(avtid, i);
+            MPIDIU_LPID_SET_NEW_AVT_MARK(lpid[i]);
+        }
+    }
+
+
+  fn_exit:
+    MPL_free(new_avt_procs);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/* Convert `size` gpids into lpids by calling
+ * MPIDI_NM_gpid_tolpidarray_generic() with MPIDI_OFI_ENABLE_AV_TABLE as the
+ * addressing-mode selector; returns its result. */
+static inline int MPIDI_NM_gpid_tolpidarray(int size, MPIR_Gpid gpid[], int lpid[])
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_gpid_tolpidarray_generic(size, gpid, lpid, MPIDI_OFI_ENABLE_AV_TABLE);
+    return mpi_errno;
+}
+
+/* Netmod hook for intercommunicator creation from an lpid list.
+ * This implementation performs no work: none of the arguments is read and
+ * the function always returns 0. */
+static inline int MPIDI_NM_create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr,
+                                                       int size, const int lpids[])
+{
+    /* intentionally empty */
+    return 0;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_create_endpoint
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Create, bind and enable the communication endpoint(s) on `domain`.
+ *
+ * Parameters:
+ *   prov_use       - provider info used to open the endpoint; its tx_attr and
+ *                    rx_attr are the templates for the per-context attributes
+ *   domain         - domain on which the endpoint is opened
+ *   p2p_cq         - completion queue bound to the send/receive paths
+ *   rma_ctr        - counter bound to RMA read/write completions
+ *   av             - address vector bound to the endpoint
+ *   ep             - out: the endpoint that was opened
+ *   index          - base context index used by the scalable-endpoint path
+ *   do_scalable_ep - nonzero: open a scalable endpoint with four TX and four
+ *                    RX contexts; zero: open one regular endpoint
+ *
+ * Returns mpi_errno, which is initialised to MPI_SUCCESS; the MPIDI_OFI_CALL
+ * macro presumably updates it and jumps to fn_fail when a call fails.
+ */
+static inline int MPIDI_OFI_create_endpoint(struct fi_info *prov_use,
+                                            struct fid_domain *domain,
+                                            struct fid_cq *p2p_cq,
+                                            struct fid_cntr *rma_ctr,
+                                            struct fid_av *av,
+                                            struct fid_ep **ep, int index, int do_scalable_ep)
+{
+    int mpi_errno = MPI_SUCCESS;
+    struct fi_tx_attr tx_attr;
+    struct fi_rx_attr rx_attr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_CREATE_ENDPOINT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_CREATE_ENDPOINT);
+
+    if (do_scalable_ep) {
+        /* Open the scalable endpoint and attach the address vector to it */
+        MPIDI_OFI_CALL(fi_scalable_ep(domain, prov_use, ep, NULL), ep);
+        MPIDI_OFI_CALL(fi_scalable_ep_bind(*ep, &av->fid, 0), bind);
+
+        /* TX context `index`: tagged sends, completions reported to p2p_cq */
+        tx_attr = *prov_use->tx_attr;
+        tx_attr.caps = FI_TAGGED;
+        tx_attr.caps |= FI_DELIVERY_COMPLETE;
+        tx_attr.op_flags = FI_DELIVERY_COMPLETE;
+        MPIDI_OFI_CALL(fi_tx_context(*ep, index, &tx_attr, &MPIDI_OFI_EP_TX_TAG(index), NULL), ep);
+        MPIDI_OFI_CALL(fi_ep_bind(MPIDI_OFI_EP_TX_TAG(index), &p2p_cq->fid, FI_SEND), bind);
+
+        /* TX context `index + 1`: RMA and atomics, completions reported to p2p_cq */
+        tx_attr = *prov_use->tx_attr;
+        tx_attr.caps = FI_RMA;
+        tx_attr.caps |= FI_ATOMICS;
+        tx_attr.caps |= FI_DELIVERY_COMPLETE;
+        tx_attr.op_flags = FI_DELIVERY_COMPLETE;
+        MPIDI_OFI_CALL(fi_tx_context(*ep, index + 1, &tx_attr, &MPIDI_OFI_EP_TX_RMA(index), NULL),
+                       ep);
+        MPIDI_OFI_CALL(fi_ep_bind(MPIDI_OFI_EP_TX_RMA(index), &p2p_cq->fid, FI_SEND), bind);
+
+        /* TX context `index + 2`: untagged messages, no default op flags */
+        tx_attr = *prov_use->tx_attr;
+        tx_attr.caps = FI_MSG;
+        tx_attr.op_flags = 0;
+        MPIDI_OFI_CALL(fi_tx_context(*ep, index + 2, &tx_attr, &MPIDI_OFI_EP_TX_MSG(index), NULL),
+                       ep);
+        MPIDI_OFI_CALL(fi_ep_bind(MPIDI_OFI_EP_TX_MSG(index), &p2p_cq->fid, FI_SEND), bind);
+
+        /* TX context `index + 3`: RMA and atomics again, but bound to the
+         * rma_ctr counter (reads and writes) instead of the completion queue */
+        tx_attr = *prov_use->tx_attr;
+        tx_attr.caps = FI_RMA;
+        tx_attr.caps |= FI_ATOMICS;
+        tx_attr.caps |= FI_DELIVERY_COMPLETE;
+        tx_attr.op_flags = FI_DELIVERY_COMPLETE;
+        MPIDI_OFI_CALL(fi_tx_context(*ep, index + 3, &tx_attr, &MPIDI_OFI_EP_TX_CTR(index), NULL),
+                       ep);
+        MPIDI_OFI_CALL(fi_ep_bind(MPIDI_OFI_EP_TX_CTR(index), &rma_ctr->fid, FI_WRITE | FI_READ),
+                       bind);
+
+        /* RX context `index`: tagged receives, completions reported to p2p_cq */
+        rx_attr = *prov_use->rx_attr;
+        rx_attr.caps = FI_TAGGED;
+        rx_attr.caps |= FI_DELIVERY_COMPLETE;
+        rx_attr.op_flags = 0;
+        MPIDI_OFI_CALL(fi_rx_context(*ep, index, &rx_attr, &MPIDI_OFI_EP_RX_TAG(index), NULL), ep);
+        MPIDI_OFI_CALL(fi_ep_bind(MPIDI_OFI_EP_RX_TAG(index), &p2p_cq->fid, FI_RECV), bind);
+
+        /* RX context `index + 1`: target side of RMA and atomics */
+        rx_attr = *prov_use->rx_attr;
+        rx_attr.caps = FI_RMA;
+        rx_attr.caps |= FI_ATOMICS;
+        rx_attr.op_flags = 0;
+        MPIDI_OFI_CALL(fi_rx_context(*ep, index + 1, &rx_attr, &MPIDI_OFI_EP_RX_RMA(index), NULL),
+                       ep);
+
+        /* Note:  This bind should cause the "passive target" rx context to never generate an event
+         * We need this bind for manual progress to ensure that progress is made on the
+         * rx_ctr or rma operations during completion queue reads */
+        if (prov_use->domain_attr->data_progress == FI_PROGRESS_MANUAL)
+            MPIDI_OFI_CALL(fi_ep_bind(MPIDI_OFI_EP_RX_RMA(index), &p2p_cq->fid,
+                                      FI_SEND | FI_RECV | FI_SELECTIVE_COMPLETION), bind);
+
+        /* RX context `index + 2`: untagged messages posted as multi-receive buffers */
+        rx_attr = *prov_use->rx_attr;
+        rx_attr.caps = FI_MSG;
+        rx_attr.caps |= FI_MULTI_RECV;
+        rx_attr.op_flags = FI_MULTI_RECV;
+        MPIDI_OFI_CALL(fi_rx_context(*ep, index + 2, &rx_attr, &MPIDI_OFI_EP_RX_MSG(index), NULL),
+                       ep);
+        MPIDI_OFI_CALL(fi_ep_bind(MPIDI_OFI_EP_RX_MSG(index), &p2p_cq->fid, FI_RECV), bind);
+
+        /* RX context `index + 3`: RMA and atomics counterpart of the counter TX context */
+        rx_attr = *prov_use->rx_attr;
+        rx_attr.caps = FI_RMA;
+        rx_attr.caps |= FI_ATOMICS;
+        rx_attr.op_flags = 0;
+        MPIDI_OFI_CALL(fi_rx_context(*ep, index + 3, &rx_attr, &MPIDI_OFI_EP_RX_CTR(index), NULL),
+                       ep);
+
+        /* Same manual-progress bind as for the RX RMA context above */
+        if (prov_use->domain_attr->data_progress == FI_PROGRESS_MANUAL)
+            MPIDI_OFI_CALL(fi_ep_bind(MPIDI_OFI_EP_RX_CTR(index), &p2p_cq->fid,
+                                      FI_SEND | FI_RECV | FI_SELECTIVE_COMPLETION), bind);
+
+        /* All contexts are created and bound; enable the four TX contexts ... */
+        MPIDI_OFI_CALL(fi_enable(MPIDI_OFI_EP_TX_TAG(index)), ep_enable);
+        MPIDI_OFI_CALL(fi_enable(MPIDI_OFI_EP_TX_RMA(index)), ep_enable);
+        MPIDI_OFI_CALL(fi_enable(MPIDI_OFI_EP_TX_MSG(index)), ep_enable);
+        MPIDI_OFI_CALL(fi_enable(MPIDI_OFI_EP_TX_CTR(index)), ep_enable);
+
+        /* ... and then the four RX contexts */
+        MPIDI_OFI_CALL(fi_enable(MPIDI_OFI_EP_RX_TAG(index)), ep_enable);
+        MPIDI_OFI_CALL(fi_enable(MPIDI_OFI_EP_RX_RMA(index)), ep_enable);
+        MPIDI_OFI_CALL(fi_enable(MPIDI_OFI_EP_RX_MSG(index)), ep_enable);
+        MPIDI_OFI_CALL(fi_enable(MPIDI_OFI_EP_RX_CTR(index)), ep_enable);
+    }
+    else {
+        /* ---------------------------------------------------------- */
+        /* Bind the CQs, counters,  and AV to the endpoint object     */
+        /* ---------------------------------------------------------- */
+        /* "Normal" (non-scalable) endpoint: one endpoint carries all traffic */
+        MPIDI_OFI_CALL(fi_endpoint(domain, prov_use, ep, NULL), ep);
+        MPIDI_OFI_CALL(fi_ep_bind(*ep, &p2p_cq->fid, FI_SEND | FI_RECV | FI_SELECTIVE_COMPLETION),
+                       bind);
+        MPIDI_OFI_CALL(fi_ep_bind(*ep, &rma_ctr->fid, FI_READ | FI_WRITE), bind);
+        MPIDI_OFI_CALL(fi_ep_bind(*ep, &av->fid, 0), bind);
+        MPIDI_OFI_CALL(fi_enable(*ep), ep_enable);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_CREATE_ENDPOINT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/*
+ * Select the provider to use from the list `prov`.
+ *
+ * The head of the list is always chosen and stored in *prov_use.  When
+ * MPIR_CVAR_OFI_DUMP_PROVIDERS is set, every entry of the list is also
+ * printed to stdout through fi_tostr().
+ *
+ * Parameters:
+ *   prov     - head of the provider list (may be NULL, in which case
+ *              *prov_use is set to NULL and nothing but the header line is
+ *              dumped)
+ *   prov_use - out: the selected provider
+ *
+ * Always returns 0.
+ */
+static inline int MPIDI_OFI_choose_provider(struct fi_info *prov, struct fi_info **prov_use)
+{
+    struct fi_info *p;
+
+    *prov_use = prov;
+
+    if (MPIR_CVAR_OFI_DUMP_PROVIDERS) {
+        /* %p requires a pointer to void; pass the cast explicitly */
+        fprintf(stdout, "Dumping Providers(first=%p):\n", (void *) prov);
+
+        for (p = prov; p != NULL; p = p->next)
+            fprintf(stdout, "%s", fi_tostr(p, FI_TYPE_INFO));
+    }
+
+    return 0;
+}
+
+#endif /* NETMOD_OFI_INIT_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_iovec_util.h b/src/mpid/ch4/netmod/ofi/ofi_iovec_util.h
new file mode 100644
index 0000000..36c039f
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_iovec_util.h
@@ -0,0 +1,391 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_IOVEC_UTIL_H_INCLUDED
+#define NETMOD_OFI_IOVEC_UTIL_H_INCLUDED
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif /* HAVE_STDINT_H */
+#ifdef HAVE_SYS_UIO_H
+#include <sys/uio.h>
+#endif /* HAVE_SYS_UIO_H */
+#include <assert.h>
+#include "ofi_types.h"
+
+#define MPIDI_OFI_IOV_DONE     0        /* every list is exhausted; same value as SUCCESS */
+#define MPIDI_OFI_IOV_SUCCESS  0        /* returned by the init_iovec_state functions */
+#define MPIDI_OFI_IOV_EAGAIN   1        /* a segment was produced; more work may remain */
+#define MPIDI_OFI_IOV_ERROR   -1        /* empty input, or lists ran out at different times */
+/*
+ * MPIDI_OFI_INIT_IOV_STATE(var) - start iteration over one iovec list.
+ *
+ * 'var' is token-pasted, so the expanding scope must provide 'iov_state',
+ * 'var' (the base address), 'var_count' and 'var_iov'.  The macro stores
+ * those in iov_state, resets the index to 0 and loads the current address
+ * (iov_base + base address) and size (iov_len) from entry 0, which is read
+ * unconditionally.  Zero-length entries are then skipped; if all entries are
+ * empty the loop ends with var_idx == var_count and var_size == 0.
+ */
+#define MPIDI_OFI_INIT_IOV_STATE(var)                                             \
+  do {                                                                  \
+    iov_state->var## _base_addr = var;                                  \
+    iov_state->var## _count     = var## _count;                         \
+    iov_state->var## _iov       = var## _iov;                           \
+    iov_state->var## _idx       = 0;                                    \
+    iov_state->var## _addr = (uintptr_t)iov_state->var## _iov[iov_state->var## _idx].iov_base + iov_state->var## _base_addr; \
+    iov_state->var## _size = (uintptr_t)iov_state->var## _iov[iov_state->var## _idx].iov_len; \
+    while (iov_state->var## _size == 0) {                                \
+      iov_state->var## _idx++;                                          \
+      if (iov_state->var## _idx < iov_state->var## _count) {             \
+        iov_state->var## _addr = (uintptr_t)iov_state->var## _iov[iov_state->var## _idx].iov_base + iov_state->var## _base_addr; \
+        iov_state->var## _size = (uintptr_t)iov_state->var## _iov[iov_state->var## _idx].iov_len; \
+      } else {                                                          \
+        break;                                                          \
+      }                                                                 \
+    }                                                                   \
+  } while (0)
+/*
+ * MPIDI_OFI_NEXT_IOV_STATE(var) - consume 'buf_size' bytes from one list.
+ *
+ * Needs 'iov_state', 'buf_size' and the output pointer 'var_addr_next' in the
+ * expanding scope.  The current address is stored through var_addr_next,
+ * then the address is advanced and the remaining size reduced by buf_size.
+ * Once the current entry is used up the macro moves to the next non-empty
+ * entry, or stops with var_size == 0 when the list has no more entries.
+ */
+#define MPIDI_OFI_NEXT_IOV_STATE(var)                                             \
+  do {                                                                  \
+    *var## _addr_next       = iov_state->var## _addr;                   \
+    iov_state->var## _addr += buf_size;                                 \
+    iov_state->var## _size -= buf_size;                                 \
+    while (iov_state->var## _size == 0) {                                \
+      iov_state->var## _idx++;                                          \
+      if (iov_state->var## _idx < iov_state->var## _count) {             \
+        iov_state->var## _addr = (uintptr_t)iov_state->var## _iov[iov_state->var## _idx].iov_base + iov_state->var## _base_addr; \
+        iov_state->var## _size = (uintptr_t)iov_state->var## _iov[iov_state->var## _idx].iov_len; \
+      } else {                                                          \
+        break;                                                          \
+      }                                                                 \
+    }                                                                   \
+  } while (0)
+/*
+ * MPIDI_OFI_INIT_IOV(var) - write the first output entry of list 'var'.
+ *
+ * Uses 'last_len', 'var_last_addr', 'var_iov' and 'var_iovs_nout' from the
+ * expanding scope: entry 0 receives length last_len and base var_last_addr,
+ * and *var_iovs_nout is set to 1.  The entry is written through a cast to
+ * struct iovec (the merge functions also pass a struct fi_rma_iov array).
+ */
+#define MPIDI_OFI_INIT_IOV(var)                                                   \
+  do {                                                                  \
+    ((struct iovec*)(&var## _iov[0]))->iov_len  = last_len;                  \
+    ((struct iovec*)(&var## _iov[0]))->iov_base = (void*)var## _last_addr;   \
+    *var## _iovs_nout = 1;                                              \
+  } while (0)
+/*
+ * MPIDI_OFI_UPDATE_IOV(var) - append a new output entry to list 'var'.
+ *
+ * Increments 'var_idx' and *var_iovs_nout, then stores ('var_addr', 'len') in
+ * the entry at the new index.  No capacity check is made here; the merge
+ * functions compare *var_iovs_nout with var_max_iovs before expanding it.
+ */
+#define MPIDI_OFI_UPDATE_IOV(var)                                                 \
+  do {                                                                  \
+  var## _idx++;                                                         \
+  (*var## _iovs_nout)++;                                                \
+  ((struct iovec*)(&var## _iov[var## _idx]))->iov_base = (void *)var## _addr; \
+  ((struct iovec*)(&var## _iov[var## _idx]))->iov_len  = len;                \
+  } while (0)
+/*
+ * MPIDI_OFI_UPDATE_IOV_STATE1(var1,var2) - merge step for two lists.
+ *
+ * Grows the current entry of var1 by 'len', appends a new (var2_addr, len)
+ * entry to var2, and then advances iov_state with
+ * MPIDI_OFI_next_iovec_state (always into origin_addr / target_addr / len).
+ *
+ * Contains a hidden 'return MPIDI_OFI_IOV_EAGAIN' from the enclosing function
+ * when var2's output array is already full.
+ * NOTE(review): that early return leaves iov_state->buf_limit_left as is,
+ * while the other EAGAIN returns in MPIDI_OFI_merge_iov_list reset it to
+ * buf_limit first - confirm this is intended.
+ */
+#define MPIDI_OFI_UPDATE_IOV_STATE1(var1,var2)                                    \
+  do {                                                                  \
+    if (*var2## _iovs_nout>=var2## _max_iovs) return MPIDI_OFI_IOV_EAGAIN;   \
+    ((struct iovec*)(&var1## _iov[var1## _idx]))->iov_len += len;            \
+    var2## _idx++;                                                      \
+    (*var2## _iovs_nout)++;                                             \
+    ((struct iovec*)(&var2## _iov[var2## _idx]))->iov_base = (void *)var2## _addr; \
+    ((struct iovec*)(&var2## _iov[var2## _idx]))->iov_len  = len;            \
+    MPIDI_OFI_next_iovec_state(iov_state,&origin_addr, &target_addr, &len); \
+  } while (0)
+/*
+ * MPIDI_OFI_UPDATE_IOV_STATE2(var1,var2,var3) - merge step for three lists.
+ *
+ * Grows the current entry of var1 by 'len', appends a new entry to both var2
+ * and var3, and then advances iov_state with MPIDI_OFI_next_iovec_state2
+ * (always into origin_addr / result_addr / target_addr / len).
+ *
+ * Contains hidden 'return MPIDI_OFI_IOV_EAGAIN' statements from the enclosing
+ * function when var2's or var3's output array is already full; both checks
+ * run before anything is modified.
+ * NOTE(review): these early returns do not reset iov_state->buf_limit_left,
+ * unlike the explicit EAGAIN returns in MPIDI_OFI_merge_iov_list2 - confirm.
+ */
+#define MPIDI_OFI_UPDATE_IOV_STATE2(var1,var2,var3)                               \
+  do {                                                                  \
+    if (*var2## _iovs_nout>=var2## _max_iovs) return MPIDI_OFI_IOV_EAGAIN;   \
+    if (*var3## _iovs_nout>=var3## _max_iovs) return MPIDI_OFI_IOV_EAGAIN;   \
+    ((struct iovec*)(&var1## _iov[var1## _idx]))->iov_len += len;            \
+    var2## _idx++;                                                      \
+    (*var2## _iovs_nout)++;                                             \
+    ((struct iovec*)(&var2## _iov[var2## _idx]))->iov_base = (void *)var2## _addr; \
+    ((struct iovec*)(&var2## _iov[var2## _idx]))->iov_len  = len;            \
+    var3## _idx++;                                                      \
+    (*var3## _iovs_nout)++;                                             \
+    ((struct iovec*)(&var3## _iov[var3## _idx]))->iov_base = (void *)var3## _addr; \
+    ((struct iovec*)(&var3## _iov[var3## _idx]))->iov_len  = len;            \
+    MPIDI_OFI_next_iovec_state2(iov_state,&origin_addr, &result_addr, &target_addr,&len); \
+  } while (0)
+
+/*
+ * Prepare iov_state for walking an origin and a target iovec list together.
+ *
+ * origin/target             - base addresses added to every iov_base
+ * origin_count/target_count - number of entries in each list
+ * buf_limit                 - byte budget stored in buf_limit/buf_limit_left
+ * origin_iov/target_iov     - the lists themselves
+ *
+ * Returns MPIDI_OFI_IOV_ERROR if either list is empty, otherwise
+ * MPIDI_OFI_IOV_SUCCESS.
+ */
+static inline
+    int MPIDI_OFI_init_iovec_state(MPIDI_OFI_iovec_state_t * iov_state,
+                                   uintptr_t origin,
+                                   uintptr_t target,
+                                   size_t origin_count,
+                                   size_t target_count,
+                                   size_t buf_limit,
+                                   struct iovec *origin_iov, struct iovec *target_iov)
+{
+    /* The budget is recorded even when the input is rejected below. */
+    iov_state->buf_limit_left = buf_limit;
+    iov_state->buf_limit = buf_limit;
+
+    /* Entry 0 of each list is read unconditionally, so both must be non-empty. */
+    if (origin_count == 0 || target_count == 0)
+        return MPIDI_OFI_IOV_ERROR;
+
+    MPIDI_OFI_INIT_IOV_STATE(target);
+    MPIDI_OFI_INIT_IOV_STATE(origin);
+
+    return MPIDI_OFI_IOV_SUCCESS;
+}
+
+/*
+ * Three-list variant of MPIDI_OFI_init_iovec_state: prepares iov_state for
+ * walking origin, result and target lists together.
+ *
+ * Returns MPIDI_OFI_IOV_ERROR if any list is empty, otherwise
+ * MPIDI_OFI_IOV_SUCCESS.
+ */
+static inline
+    int MPIDI_OFI_init_iovec_state2(MPIDI_OFI_iovec_state_t * iov_state,
+                                    uintptr_t origin,
+                                    uintptr_t result,
+                                    uintptr_t target,
+                                    size_t origin_count,
+                                    size_t result_count,
+                                    size_t target_count,
+                                    size_t buf_limit,
+                                    struct iovec *origin_iov,
+                                    struct iovec *result_iov, struct iovec *target_iov)
+{
+    /* The budget is recorded even when the input is rejected below. */
+    iov_state->buf_limit_left = buf_limit;
+    iov_state->buf_limit = buf_limit;
+
+    /* Entry 0 of each list is read unconditionally, so all must be non-empty. */
+    if (origin_count == 0 || target_count == 0 || result_count == 0)
+        return MPIDI_OFI_IOV_ERROR;
+
+    MPIDI_OFI_INIT_IOV_STATE(target);
+    MPIDI_OFI_INIT_IOV_STATE(origin);
+    MPIDI_OFI_INIT_IOV_STATE(result);
+
+    return MPIDI_OFI_IOV_SUCCESS;
+}
+
+
+/*
+ * Report the next origin/target segment without consuming it.
+ *
+ * On MPIDI_OFI_IOV_EAGAIN the outputs hold the current addresses and the
+ * segment length (smallest of both remaining sizes and buf_limit_left).
+ * Returns MPIDI_OFI_IOV_DONE when both lists are exhausted and
+ * MPIDI_OFI_IOV_ERROR when only one of them is; outputs are untouched then.
+ */
+static inline
+    int MPIDI_OFI_peek_iovec_state(MPIDI_OFI_iovec_state_t * iov_state,
+                                   uintptr_t * next_origin_addr,
+                                   uintptr_t * next_target_addr, size_t * buf_len)
+{
+    const int origin_pending = (iov_state->origin_size != 0);
+    const int target_pending = (iov_state->target_size != 0);
+
+    if (!origin_pending && !target_pending)
+        return MPIDI_OFI_IOV_DONE;
+
+    /* exactly one side still has data: the lists do not match up */
+    if (!origin_pending || !target_pending)
+        return MPIDI_OFI_IOV_ERROR;
+
+    *next_origin_addr = iov_state->origin_addr;
+    *next_target_addr = iov_state->target_addr;
+    *buf_len =
+        MPL_MIN(MPL_MIN(iov_state->target_size, iov_state->origin_size),
+                iov_state->buf_limit_left);
+
+    return MPIDI_OFI_IOV_EAGAIN;
+}
+
+/*
+ * Three-list variant of MPIDI_OFI_peek_iovec_state.
+ *
+ * On MPIDI_OFI_IOV_EAGAIN the outputs hold the current origin/result/target
+ * addresses and the segment length (smallest of the three remaining sizes
+ * and buf_limit_left).  Returns MPIDI_OFI_IOV_DONE when all lists are
+ * exhausted and MPIDI_OFI_IOV_ERROR when only some of them are.
+ */
+static inline
+    int MPIDI_OFI_peek_iovec_state2(MPIDI_OFI_iovec_state_t * iov_state,
+                                    uintptr_t * next_origin_addr,
+                                    uintptr_t * next_result_addr,
+                                    uintptr_t * next_target_addr, size_t * buf_len)
+{
+    /* number of lists that still have bytes left in their current entry */
+    const int pending = (iov_state->origin_size != 0) +
+        (iov_state->target_size != 0) + (iov_state->result_size != 0);
+
+    if (pending == 0)
+        return MPIDI_OFI_IOV_DONE;
+
+    if (pending != 3)
+        return MPIDI_OFI_IOV_ERROR;
+
+    *next_origin_addr = iov_state->origin_addr;
+    *next_result_addr = iov_state->result_addr;
+    *next_target_addr = iov_state->target_addr;
+    *buf_len = MPL_MIN(MPL_MIN(MPL_MIN(iov_state->target_size, iov_state->origin_size),
+                               iov_state->result_size), iov_state->buf_limit_left);
+
+    return MPIDI_OFI_IOV_EAGAIN;
+}
+
+
+/*
+ * Consume the next origin/target segment.
+ *
+ * On MPIDI_OFI_IOV_EAGAIN the segment's start addresses are stored through
+ * origin_addr_next / target_addr_next, its length (smallest of both
+ * remaining sizes and buf_limit_left) through buf_len, and both lists are
+ * advanced by that length.  Returns MPIDI_OFI_IOV_DONE when both lists are
+ * exhausted and MPIDI_OFI_IOV_ERROR when only one of them is.
+ */
+static inline
+    int MPIDI_OFI_next_iovec_state(MPIDI_OFI_iovec_state_t * iov_state,
+                                   uintptr_t * origin_addr_next,
+                                   uintptr_t * target_addr_next, size_t * buf_len)
+{
+    uintptr_t buf_size;         /* name is required by MPIDI_OFI_NEXT_IOV_STATE */
+
+    if (iov_state->origin_size == 0 || iov_state->target_size == 0) {
+        /* one side is exhausted: acceptable only if the other one is too */
+        if (iov_state->origin_size == 0 && iov_state->target_size == 0)
+            return MPIDI_OFI_IOV_DONE;
+
+        return MPIDI_OFI_IOV_ERROR;
+    }
+
+    buf_size =
+        MPL_MIN(MPL_MIN(iov_state->target_size, iov_state->origin_size),
+                iov_state->buf_limit_left);
+    *buf_len = buf_size;
+
+    MPIDI_OFI_NEXT_IOV_STATE(target);
+    MPIDI_OFI_NEXT_IOV_STATE(origin);
+
+    return MPIDI_OFI_IOV_EAGAIN;
+}
+
+/*
+ * Three-list variant of MPIDI_OFI_next_iovec_state: consumes the next
+ * origin/result/target segment.
+ *
+ * On MPIDI_OFI_IOV_EAGAIN the start addresses and the segment length are
+ * stored through the output pointers and all three lists are advanced.
+ * Returns MPIDI_OFI_IOV_DONE when all lists are exhausted and
+ * MPIDI_OFI_IOV_ERROR when only some of them are.
+ */
+static inline
+    int MPIDI_OFI_next_iovec_state2(MPIDI_OFI_iovec_state_t * iov_state,
+                                    uintptr_t * origin_addr_next,
+                                    uintptr_t * result_addr_next,
+                                    uintptr_t * target_addr_next, size_t * buf_len)
+{
+    uintptr_t buf_size;         /* name is required by MPIDI_OFI_NEXT_IOV_STATE */
+
+    if (iov_state->origin_size == 0 || iov_state->target_size == 0 ||
+        iov_state->result_size == 0) {
+        /* some list is exhausted: acceptable only if all of them are */
+        if (iov_state->origin_size == 0 && iov_state->target_size == 0 &&
+            iov_state->result_size == 0)
+            return MPIDI_OFI_IOV_DONE;
+
+        return MPIDI_OFI_IOV_ERROR;
+    }
+
+    buf_size =
+        MPL_MIN(MPL_MIN(MPL_MIN(iov_state->target_size, iov_state->origin_size),
+                        iov_state->result_size), iov_state->buf_limit_left);
+    *buf_len = buf_size;
+
+    MPIDI_OFI_NEXT_IOV_STATE(target);
+    MPIDI_OFI_NEXT_IOV_STATE(origin);
+    MPIDI_OFI_NEXT_IOV_STATE(result);
+
+    return MPIDI_OFI_IOV_EAGAIN;
+}
+/*
+ * MPIDI_OFI_merge_iov_list - emit the origin/target iovec arrays for one batch.
+ *
+ * Walks the two lists tracked by iov_state in lock step and fills
+ * origin_iov[] and target_iov[]; the number of entries written is stored in
+ * *origin_iovs_nout / *target_iovs_nout and is bounded by origin_max_iovs /
+ * target_max_iovs.  A segment that starts exactly where the previous segment
+ * of a side ended is folded into that side's current entry instead of
+ * opening a new one.  At most iov_state->buf_limit bytes are emitted per
+ * call (tracked in buf_limit_left).
+ *
+ * Returns MPIDI_OFI_IOV_DONE only when the very first
+ * MPIDI_OFI_next_iovec_state call reports that nothing is left (entry 0 is
+ * still written, with length 0, and both counts are 1).  Every other return
+ * is MPIDI_OFI_IOV_EAGAIN.  Inconsistent list state trips assert().
+ *
+ * MPIDI_OFI_UPDATE_IOV_STATE1 below contains a hidden
+ * 'return MPIDI_OFI_IOV_EAGAIN'.
+ */
+static inline
+    int MPIDI_OFI_merge_iov_list(MPIDI_OFI_iovec_state_t * iov_state,
+                                 struct iovec *origin_iov,
+                                 size_t origin_max_iovs,
+                                 struct fi_rma_iov *target_iov,
+                                 size_t target_max_iovs,
+                                 size_t * origin_iovs_nout, size_t * target_iovs_nout)
+{
+    int rc;
+    uintptr_t origin_addr = (uintptr_t) NULL, target_addr = (uintptr_t) NULL;
+    uintptr_t origin_last_addr = 0, target_last_addr = 0;
+    int origin_idx = 0, target_idx = 0;
+    size_t len = 0, last_len = 0;
+
+    CH4_COMPILE_TIME_ASSERT(offsetof(struct iovec, iov_base) == offsetof(struct fi_rma_iov, addr));
+    CH4_COMPILE_TIME_ASSERT(offsetof(struct iovec, iov_len) == offsetof(struct fi_rma_iov, len));      /* the macros write target_iov through struct iovec casts */
+
+    rc = MPIDI_OFI_next_iovec_state(iov_state, &origin_last_addr, &target_last_addr, &last_len);       /* consume the first segment */
+    assert(rc != MPIDI_OFI_IOV_ERROR);
+    MPIDI_OFI_INIT_IOV(target);                 /* entry 0 <- first segment, count = 1 */
+    MPIDI_OFI_INIT_IOV(origin);
+    iov_state->buf_limit_left -= last_len;
+
+    while (rc > 0) {
+        rc = MPIDI_OFI_peek_iovec_state(iov_state, &origin_addr, &target_addr, &len);  /* look ahead without consuming */
+        assert(rc != MPIDI_OFI_IOV_ERROR);
+
+        if (rc == MPIDI_OFI_IOV_DONE) {         /* lists exhausted: this batch is complete */
+            iov_state->buf_limit_left = iov_state->buf_limit;
+            return MPIDI_OFI_IOV_EAGAIN;
+        }
+
+        if (target_last_addr + last_len == target_addr) {       /* target contiguous: grow it, new origin entry */
+            MPIDI_OFI_UPDATE_IOV_STATE1(target, origin);
+        }
+        else if (origin_last_addr + last_len == origin_addr) {  /* origin contiguous: grow it, new target entry */
+            MPIDI_OFI_UPDATE_IOV_STATE1(origin, target);
+        }
+        else {                                  /* neither side contiguous: new entry on both sides */
+            if ((*origin_iovs_nout >= origin_max_iovs) || (*target_iovs_nout >= target_max_iovs)) {
+                iov_state->buf_limit_left = iov_state->buf_limit;
+                return MPIDI_OFI_IOV_EAGAIN;
+            }
+
+            MPIDI_OFI_UPDATE_IOV(target);
+            MPIDI_OFI_UPDATE_IOV(origin);
+            MPIDI_OFI_next_iovec_state(iov_state, &origin_addr, &target_addr, &len);   /* return value ignored; peek succeeded above */
+        }
+
+        origin_last_addr = origin_addr;
+        target_last_addr = target_addr;
+        last_len = len;
+        iov_state->buf_limit_left -= len;
+        if (iov_state->buf_limit_left == 0) {   /* byte budget used up: stop and refill it */
+            iov_state->buf_limit_left = iov_state->buf_limit;
+            return MPIDI_OFI_IOV_EAGAIN;
+        }
+    }
+
+    if (rc == MPIDI_OFI_IOV_DONE)
+        return MPIDI_OFI_IOV_DONE;
+    else {                                      /* rc < 0 can only get here when assert() is compiled out */
+        iov_state->buf_limit_left = iov_state->buf_limit;
+        return MPIDI_OFI_IOV_EAGAIN;
+    }
+}
+/*
+ * MPIDI_OFI_merge_iov_list2 - three-list variant of MPIDI_OFI_merge_iov_list.
+ *
+ * Walks the origin, result and target lists tracked by iov_state in lock
+ * step and fills origin_iov[], result_iov[] and target_iov[]; the entry
+ * counts are stored in the *_iovs_nout outputs and bounded by the matching
+ * *_max_iovs.  A segment that starts exactly where the previous segment of a
+ * side ended is folded into that side's current entry (only the first
+ * contiguous side found, in the order target, origin, result); the other two
+ * sides get a new entry.  At most iov_state->buf_limit bytes are emitted per
+ * call.
+ *
+ * Returns MPIDI_OFI_IOV_DONE only when the very first
+ * MPIDI_OFI_next_iovec_state2 call reports that nothing is left; every other
+ * return is MPIDI_OFI_IOV_EAGAIN.  Inconsistent list state trips assert().
+ *
+ * MPIDI_OFI_UPDATE_IOV_STATE2 below contains hidden
+ * 'return MPIDI_OFI_IOV_EAGAIN' statements.  target_iov is written through
+ * struct iovec casts; the layout match is asserted at compile time in
+ * MPIDI_OFI_merge_iov_list, not here.
+ */
+static inline
+    int MPIDI_OFI_merge_iov_list2(MPIDI_OFI_iovec_state_t * iov_state,
+                                  struct iovec *origin_iov,
+                                  size_t origin_max_iovs,
+                                  struct iovec *result_iov,
+                                  size_t result_max_iovs,
+                                  struct fi_rma_iov *target_iov,
+                                  size_t target_max_iovs,
+                                  size_t * origin_iovs_nout,
+                                  size_t * result_iovs_nout, size_t * target_iovs_nout)
+{
+    int rc;
+    uintptr_t origin_addr = (uintptr_t) NULL, result_addr = (uintptr_t) NULL, target_addr =
+        (uintptr_t) NULL;
+    uintptr_t origin_last_addr = 0, result_last_addr = 0, target_last_addr = 0;
+    int origin_idx = 0, result_idx = 0, target_idx = 0;
+    size_t len = 0, last_len = 0;
+
+    rc = MPIDI_OFI_next_iovec_state2(iov_state, &origin_last_addr, &result_last_addr,
+                                     &target_last_addr, &last_len);     /* consume the first segment */
+    assert(rc != MPIDI_OFI_IOV_ERROR);
+    MPIDI_OFI_INIT_IOV(target);                 /* entry 0 <- first segment, count = 1 */
+    MPIDI_OFI_INIT_IOV(origin);
+    MPIDI_OFI_INIT_IOV(result);
+    iov_state->buf_limit_left -= last_len;
+
+    while (rc > 0) {
+        rc = MPIDI_OFI_peek_iovec_state2(iov_state, &origin_addr, &result_addr, &target_addr, &len);   /* look ahead without consuming */
+        assert(rc != MPIDI_OFI_IOV_ERROR);
+
+        if (rc == MPIDI_OFI_IOV_DONE) {         /* lists exhausted: this batch is complete */
+            iov_state->buf_limit_left = iov_state->buf_limit;
+            return MPIDI_OFI_IOV_EAGAIN;
+        }
+
+        if (target_last_addr + last_len == target_addr) {       /* target contiguous */
+            MPIDI_OFI_UPDATE_IOV_STATE2(target, origin, result);
+        }
+        else if (origin_last_addr + last_len == origin_addr) {  /* origin contiguous */
+            MPIDI_OFI_UPDATE_IOV_STATE2(origin, target, result);
+        }
+        else if (result_last_addr + last_len == result_addr) {  /* result contiguous */
+            MPIDI_OFI_UPDATE_IOV_STATE2(result, target, origin);
+        }
+        else {                                  /* no side contiguous: new entry on all three */
+            if ((*origin_iovs_nout >= origin_max_iovs) || (*target_iovs_nout >= target_max_iovs) ||
+                (*result_iovs_nout >= result_max_iovs)) {
+                iov_state->buf_limit_left = iov_state->buf_limit;
+                return MPIDI_OFI_IOV_EAGAIN;
+            }
+
+            MPIDI_OFI_UPDATE_IOV(target);
+            MPIDI_OFI_UPDATE_IOV(origin);
+            MPIDI_OFI_UPDATE_IOV(result);
+            MPIDI_OFI_next_iovec_state2(iov_state, &origin_addr, &result_addr, &target_addr, &len);    /* return value ignored; peek succeeded above */
+        }
+
+        origin_last_addr = origin_addr;
+        result_last_addr = result_addr;
+        target_last_addr = target_addr;
+        last_len = len;
+        iov_state->buf_limit_left -= len;
+        if (iov_state->buf_limit_left == 0) {   /* byte budget used up: stop and refill it */
+            iov_state->buf_limit_left = iov_state->buf_limit;
+            return MPIDI_OFI_IOV_EAGAIN;
+        }
+
+    }
+
+    if (rc == MPIDI_OFI_IOV_DONE)
+        return MPIDI_OFI_IOV_DONE;
+    else {                                      /* rc < 0 can only get here when assert() is compiled out */
+        iov_state->buf_limit_left = iov_state->buf_limit;
+        return MPIDI_OFI_IOV_EAGAIN;
+    }
+}
+#endif /* NETMOD_OFI_IOVEC_UTIL_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_op.h b/src/mpid/ch4/netmod/ofi/ofi_op.h
new file mode 100644
index 0000000..7666d25
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_op.h
@@ -0,0 +1,27 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_OP_H_INCLUDED
+#define NETMOD_OFI_OP_H_INCLUDED
+
+#include "ofi_impl.h"
+
+/* Netmod hook for destroying an MPIR_Op: this netmod has nothing to do. */
+static inline void MPIDI_NM_op_destroy(MPIR_Op * op_p)
+{
+    (void) op_p;                /* intentionally unused */
+}
+
+/* Netmod hook for committing an MPIR_Op: this netmod has nothing to do. */
+static inline void MPIDI_NM_op_commit(MPIR_Op * op_p)
+{
+    (void) op_p;                /* intentionally unused */
+}
+
+
+#endif
diff --git a/src/mpid/ch4/netmod/ofi/ofi_pre.h b/src/mpid/ch4/netmod/ofi/ofi_pre.h
new file mode 100644
index 0000000..5e2e966
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_pre.h
@@ -0,0 +1,190 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#ifndef NETMOD_OFI_PRE_H_INCLUDED
+#define NETMOD_OFI_PRE_H_INCLUDED
+
+#include <mpi.h>
+#include <rdma/fabric.h>
+#include <rdma/fi_endpoint.h>
+#include <rdma/fi_domain.h>
+#include <rdma/fi_tagged.h>
+#include <rdma/fi_rma.h>
+#include <rdma/fi_atomic.h>
+#include <rdma/fi_cm.h>
+#include <rdma/fi_errno.h>
+
+/* Defines */
+
+#define MPIDI_OFI_MAX_AM_HDR_SIZE    128        /* size in bytes of am_hdr_buf in MPIDI_OFI_am_request_header_t */
+#define MPIDI_OFI_AM_HANDLER_ID_BITS   8        /* bit-field width of MPIDI_OFI_am_header_t.handler_id */
+#define MPIDI_OFI_AM_TYPE_BITS         8        /* bit-field width of MPIDI_OFI_am_header_t.am_type */
+#define MPIDI_OFI_AM_HDR_SZ_BITS       8        /* bit-field width of MPIDI_OFI_am_header_t.am_hdr_sz */
+#define MPIDI_OFI_AM_DATA_SZ_BITS     48        /* bit-field width of MPIDI_OFI_am_header_t.data_sz */
+#define MPIDI_OFI_AM_CONTEXT_ID_BITS  16        /* not referenced elsewhere in this header */
+#define MPIDI_OFI_AM_RANK_BITS        32        /* not referenced elsewhere in this header */
+#define MPIDI_OFI_AM_MSG_HEADER_SIZE (sizeof(MPIDI_OFI_am_header_t))
+
+#ifdef MPIDI_OFI_CONFIG_USE_SCALABLE_ENDPOINTS
+#define MPIDI_OFI_MAX_ENDPOINTS      256        /* equals 1 << MPIDI_OFI_MAX_ENDPOINTS_BITS */
+#define MPIDI_OFI_MAX_ENDPOINTS_BITS   8        /* bit-field width of MPIDI_OFI_addr_t.ep_idx */
+#else
+#define MPIDI_OFI_MAX_ENDPOINTS        1
+#define MPIDI_OFI_MAX_ENDPOINTS_BITS   0        /* MPIDI_OFI_addr_t has no ep_idx member in this configuration */
+#endif
+
+/* Typedefs */
+
+struct MPIR_Comm;
+struct MPIR_Request;
+
+typedef struct {                /* OFI netmod communicator data - presumably one per communicator, confirm */
+    void *huge_send_counters;   /* untyped here; the pointee types are not visible in this header */
+    void *huge_recv_counters;
+    void *win_id_allocator;
+    void *rma_id_allocator;
+} MPIDI_OFI_comm_t;
+enum {                          /* active-message kinds, numbered 0..3 */
+    MPIDI_AMTYPE_SHORT_HDR = 0,
+    MPIDI_AMTYPE_SHORT,
+    MPIDI_AMTYPE_LMT_REQ,       /* presumably carried with MPIDI_OFI_lmt_msg_t - confirm */
+    MPIDI_AMTYPE_LMT_ACK        /* presumably carried with MPIDI_OFI_ack_msg_t - confirm */
+};
+
+typedef struct {                /* payload of MPIDI_OFI_lmt_msg_t */
+    /* context id and src rank so the target side can
+     * issue RDMA read operation */
+    MPIR_Context_id_t context_id;
+    int src_rank;
+
+    uint64_t src_offset;
+    uint64_t sreq_ptr;          /* presumably the sender's request pointer stored as an integer - confirm */
+    uint64_t am_hdr_src;
+    uint64_t rma_key;           /* presumably the key used for the RDMA read mentioned above - confirm */
+} MPIDI_OFI_lmt_msg_payload_t;
+
+typedef struct {                /* payload of MPIDI_OFI_ack_msg_t */
+    uint64_t sreq_ptr;          /* same name and type as MPIDI_OFI_lmt_msg_payload_t.sreq_ptr */
+} MPIDI_OFI_ack_msg_payload_t;
+
+typedef struct MPIDI_OFI_am_header_t { /* header placed in front of every active message */
+    uint64_t handler_id:MPIDI_OFI_AM_HANDLER_ID_BITS;
+    uint64_t am_type:MPIDI_OFI_AM_TYPE_BITS;
+    uint64_t am_hdr_sz:MPIDI_OFI_AM_HDR_SZ_BITS;
+    uint64_t data_sz:MPIDI_OFI_AM_DATA_SZ_BITS;        /* NOTE(review): widths total 8+8+8+48 = 72 bits, more than one uint64_t */
+    uint64_t payload[0];        /* zero-length array: marks where data following the header starts */
+} MPIDI_OFI_am_header_t;
+
+typedef struct {                /* AM header followed directly by the ack payload */
+    MPIDI_OFI_am_header_t hdr;
+    MPIDI_OFI_ack_msg_payload_t pyld;
+} MPIDI_OFI_ack_msg_t;
+
+typedef struct {                /* AM header followed directly by the LMT payload */
+    MPIDI_OFI_am_header_t hdr;
+    MPIDI_OFI_lmt_msg_payload_t pyld;
+} MPIDI_OFI_lmt_msg_t;
+
+typedef struct {                /* AM state referenced from MPIDI_OFI_am_request_t.req_hdr */
+    MPIDI_OFI_lmt_msg_payload_t lmt_info;
+    uint64_t lmt_cntr;
+    struct fid_mr *lmt_mr;
+    void *pack_buffer;
+    void *rreq_ptr;
+    void *am_hdr;               /* presumably points at am_hdr_buf or an external header - confirm */
+    int (*cmpl_handler_fn) (struct MPIR_Request * req);        /* callback taking the request, returns int */
+    uint16_t am_hdr_sz;
+    uint8_t pad[6];             /* 2 + 6 bytes; presumably keeps msg_hdr 8-byte aligned - confirm */
+    MPIDI_OFI_am_header_t msg_hdr;
+    uint8_t am_hdr_buf[MPIDI_OFI_MAX_AM_HDR_SIZE];     /* inline storage of MPIDI_OFI_MAX_AM_HDR_SIZE bytes */
+} MPIDI_OFI_am_request_header_t;
+
+typedef struct {                /* starts with the same two fields as MPIDI_OFI_request_t */
+    struct fi_context context;  /* fixed field, do not move */
+    int event_id;               /* fixed field, do not move */
+    MPIDI_OFI_am_request_header_t *req_hdr;    /* the AM header state is held out of line */
+} MPIDI_OFI_am_request_t;
+
+
+typedef struct MPIDI_OFI_noncontig_t { /* referenced from MPIDI_OFI_request_t.noncontig */
+    struct MPIDU_Segment segment;
+    char pack_buffer[0];        /* zero-length array: buffer bytes follow the struct (allocated by the user of this type) */
+} MPIDI_OFI_noncontig_t;
+
+typedef struct {                /* starts with the same two fields as MPIDI_OFI_am_request_t */
+    struct fi_context context;  /* fixed field, do not move */
+    int event_id;               /* fixed field, do not move */
+    int util_id;                /* MPIDI_OFI_do_iprobe stores the MPIDI_OFI_PEEK_* state here */
+    struct MPIR_Comm *util_comm;
+    MPI_Datatype datatype;
+    MPIDI_OFI_noncontig_t *noncontig;
+    /* persistent send fields */
+    union {                     /* the three members share storage; only one is valid at a time */
+        struct {
+            int type;
+            int rank;
+            int tag;
+            int count;
+            void *buf;
+        } persist;
+        struct iovec iov;
+        void *inject_buf;       /* Internal buffer for inject emulation */
+    } util;
+} MPIDI_OFI_request_t;
+
+typedef struct {                /* OFI netmod datatype data - presumably one per datatype, confirm */
+    int index;
+} MPIDI_OFI_dt_t;
+
+typedef struct {                /* OFI netmod op data; only a placeholder member */
+    int dummy;
+} MPIDI_OFI_op_t;
+
+struct MPIDI_OFI_win_request;
+
+/* Stores per-rank information for RMA */
+typedef struct {                /* referenced from MPIDI_OFI_win_t.winfo */
+    int32_t disp_unit;
+#ifndef USE_OFI_MR_SCALABLE     /* the two members below exist only without USE_OFI_MR_SCALABLE */
+    /* For MR_BASIC mode we need to store an MR key and a base address of the target window */
+    uint64_t mr_key;
+    uintptr_t base;
+#endif
+} MPIDI_OFI_win_targetinfo_t;
+
+typedef struct {                /* OFI netmod window data - presumably one per window, confirm */
+    struct fid_mr *mr;
+    uint64_t mr_key;
+    struct fid_ep *ep;          /* EP with counter & completion */
+    struct fid_ep *ep_nocmpl;   /* EP with counter only (no completion) */
+    uint64_t *issued_cntr;      /* presumably points at issued_cntr_v or at a shared counter - confirm */
+    uint64_t issued_cntr_v;     /* main body of an issued counter,
+                                 * if we are to use per-window counter */
+    struct fid_cntr *cmpl_cntr;
+    uint64_t win_id;
+    struct MPIDI_OFI_win_request *syncQ;       /* only forward-declared in this header */
+    MPIDI_OFI_win_targetinfo_t *winfo;  /* presumably an array indexed by rank - confirm */
+} MPIDI_OFI_win_t;
+
+typedef struct {                /* fixed-size raw address blob */
+    char addr[62];              /* NOTE(review): where the 62-byte size comes from is not visible here */
+} MPIDI_OFI_gpid_t;
+
+typedef struct {                /* NOTE(review): has no members when AV_TABLE is defined and SCALABLE_ENDPOINTS is not */
+#ifndef MPIDI_OFI_CONFIG_USE_AV_TABLE
+    fi_addr_t dest;             /* present only without MPIDI_OFI_CONFIG_USE_AV_TABLE */
+#endif
+#ifdef MPIDI_OFI_CONFIG_USE_SCALABLE_ENDPOINTS
+    unsigned ep_idx:MPIDI_OFI_MAX_ENDPOINTS_BITS;      /* 8-bit field in this configuration */
+#endif
+} MPIDI_OFI_addr_t;
+
+#endif
diff --git a/src/mpid/ch4/netmod/ofi/ofi_probe.h b/src/mpid/ch4/netmod/ofi/ofi_probe.h
new file mode 100644
index 0000000..b2eaff4
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_probe.h
@@ -0,0 +1,168 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_PROBE_H_INCLUDED
+#define NETMOD_OFI_PROBE_H_INCLUDED
+
+#include "ofi_impl.h"
+/*
+ * MPIDI_OFI_do_iprobe - non-blocking probe shared by MPIDI_NM_iprobe and
+ * MPIDI_NM_improbe.
+ *
+ * Posts a zero-length fi_trecvmsg with FI_PEEK (plus the caller's
+ * peek_flags) on MPIDI_OFI_EP_RX_TAG(0) and progresses until the request's
+ * util_id is no longer MPIDI_OFI_PEEK_START.
+ *
+ * source, tag, comm, context_offset - matching criteria; MPI_ANY_SOURCE is
+ *              mapped to FI_ADDR_UNSPEC
+ * flag       - set to 1 when a matching message was found, 0 otherwise
+ * status     - filled through MPIR_Request_extract_status on a match
+ * message    - if non-NULL, a request of kind MPIR_REQUEST_KIND__MPROBE is
+ *              created and returned through it on a match (and freed again
+ *              when nothing matched); if NULL a stack request is used
+ * peek_flags - extra flags OR-ed into the fi_trecvmsg flags
+ *
+ * Returns mpi_errno, which starts as MPI_SUCCESS; presumably only the
+ * MPIDI_OFI_* macros change it before jumping to fn_fail - confirm.
+ *
+ * NOTE(review): fn_fail neither writes *flag nor frees a request created for
+ * 'message'; if MPIDI_OFI_CALL can fail after the request was created, that
+ * request would leak - confirm against the macro definitions.
+ */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_do_iprobe
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_OFI_do_iprobe(int source,
+                                      int tag,
+                                      MPIR_Comm * comm,
+                                      int context_offset,
+                                      int *flag,
+                                      MPI_Status * status,
+                                      MPIR_Request ** message, uint64_t peek_flags)
+{
+    int mpi_errno = MPI_SUCCESS;
+    fi_addr_t remote_proc;
+    uint64_t match_bits, mask_bits;
+    MPIR_Request r, *rreq;      /* don't need to init request, output only */
+    struct fi_msg_tagged msg;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_NETMOD_DO_PROBE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_NETMOD_DO_PROBE);
+
+    if (unlikely(MPI_ANY_SOURCE == source))
+        remote_proc = FI_ADDR_UNSPEC;
+    else
+        remote_proc = MPIDI_OFI_comm_to_phys(comm, source, MPIDI_OFI_API_TAG);
+
+    if (message)
+        MPIDI_OFI_REQUEST_CREATE(rreq, MPIR_REQUEST_KIND__MPROBE);
+    else
+        rreq = &r;              /* plain iprobe: stack request, never handed to the caller */
+
+    match_bits =
+        MPIDI_OFI_init_recvtag(&mask_bits, comm->context_id + context_offset, source, tag,
+                               MPIDI_OFI_ENABLE_DATA);
+
+    MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_PEEK;
+    MPIDI_OFI_REQUEST(rreq, util_id) = MPIDI_OFI_PEEK_START;
+
+    msg.msg_iov = NULL;         /* no receive buffer: only the peek result is needed */
+    msg.desc = NULL;
+    msg.iov_count = 0;
+    msg.addr = remote_proc;
+    msg.tag = match_bits;
+    msg.ignore = mask_bits;
+    msg.context = (void *) &(MPIDI_OFI_REQUEST(rreq, context));
+    msg.data = 0;
+
+    MPIDI_OFI_CALL(fi_trecvmsg
+                   (MPIDI_OFI_EP_RX_TAG(0), &msg,
+                    peek_flags | FI_PEEK | FI_COMPLETION | MPIDI_OFI_ENABLE_DATA), trecvmsg);
+    MPIDI_OFI_PROGRESS_WHILE(MPIDI_OFI_REQUEST(rreq, util_id) == MPIDI_OFI_PEEK_START);        /* wait until util_id leaves PEEK_START */
+
+    switch (MPIDI_OFI_REQUEST(rreq, util_id)) {
+    case MPIDI_OFI_PEEK_NOT_FOUND:
+        *flag = 0;
+
+        if (message)
+            MPIR_Handle_obj_free(&MPIR_Request_mem, rreq);     /* the request is not returned to the caller */
+
+        goto fn_exit;
+        break;                  /* not reached */
+
+    case MPIDI_OFI_PEEK_FOUND:
+        MPIR_Request_extract_status(rreq, status);
+        *flag = 1;
+
+        if (message)
+            *message = rreq;    /* the caller now owns the request */
+
+        break;
+
+    default:
+        MPIR_Assert(0);         /* any other util_id value is unexpected */
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_NETMOD_DO_PROBE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+/*
+ * Blocking probe: calls MPIDI_Iprobe, followed by MPIDI_OFI_PROGRESS(), until
+ * a matching message is reported.  'status' is filled by MPIDI_Iprobe.
+ * Returns MPI_SUCCESS, or the first error reported by MPIDI_Iprobe.
+ */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_probe
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_probe(int source,
+                                 int tag, MPIR_Comm * comm, int context_offset, MPI_Status * status)
+{
+    int flag = 0;
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_NETMOD_PROBE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_NETMOD_PROBE);
+
+    /* flag starts at 0, so at least one probe attempt is always made */
+    do {
+        mpi_errno = MPIDI_Iprobe(source, tag, comm, context_offset, &flag, status);
+
+        if (mpi_errno) {
+            MPIR_ERR_POP(mpi_errno);
+        }
+
+        MPIDI_OFI_PROGRESS();
+    } while (!flag);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_NETMOD_PROBE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/*
+ * MPIDI_NM_improbe - non-blocking matched probe.
+ *
+ * Runs MPIDI_OFI_do_iprobe with FI_CLAIM | FI_COMPLETION.
+ *
+ * source, tag, comm, context_offset - matching criteria
+ * flag    - set to 1 when a matching message was found, 0 otherwise
+ * message - receives the request for a matched message; on a match the
+ *           request's comm is set to 'comm' and a reference on 'comm' is
+ *           added
+ * status  - filled on a match
+ *
+ * Returns the error code of MPIDI_OFI_do_iprobe.  *flag and *message are
+ * only inspected when that call succeeded, because its failure path writes
+ * neither of them.
+ */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_improbe
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_improbe(int source,
+                                   int tag,
+                                   MPIR_Comm * comm,
+                                   int context_offset,
+                                   int *flag, MPIR_Request ** message, MPI_Status * status)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_NETMOD_IMPROBE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_NETMOD_IMPROBE);
+
+    /* Set flags for mprobe peek, when ready */
+    mpi_errno = MPIDI_OFI_do_iprobe(source, tag, comm, context_offset,
+                                    flag, status, message, FI_CLAIM | FI_COMPLETION);
+
+    /* On failure *flag and *message may still hold whatever the caller passed in. */
+    if (mpi_errno == MPI_SUCCESS && *flag && *message) {
+        (*message)->comm = comm;
+        MPIR_Object_add_ref(comm);
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_NETMOD_IMPROBE);
+    return mpi_errno;
+}
+
+/*
+ * Non-blocking probe: forwards to MPIDI_OFI_do_iprobe with no message handle
+ * and no additional peek flags, and returns its error code.
+ */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iprobe
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_iprobe(int source,
+                                  int tag,
+                                  MPIR_Comm * comm,
+                                  int context_offset, int *flag, MPI_Status * status)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_NETMOD_IPROBE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_NETMOD_IPROBE);
+
+    rc = MPIDI_OFI_do_iprobe(source, tag, comm, context_offset, flag, status,
+                             NULL /* message */ , 0ULL /* peek_flags */);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_NETMOD_IPROBE);
+    return rc;
+}
+
+#endif /* NETMOD_OFI_PROBE_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_proc.h b/src/mpid/ch4/netmod/ofi/ofi_proc.h
new file mode 100644
index 0000000..3f20c0f
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_proc.h
@@ -0,0 +1,29 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_PROC_H_INCLUDED
+#define NETMOD_OFI_PROC_H_INCLUDED
+
+#include "ofi_impl.h"
+
+/* Netmod entry point: returns the result of MPIDI_CH4U_rank_is_local(rank, comm). */
+static inline int MPIDI_NM_rank_is_local(int rank, MPIR_Comm * comm)
+{
+    int is_local;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPIDI_NETMOD_RANK_IS_LOCAL);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPIDI_NETMOD_RANK_IS_LOCAL);
+    is_local = MPIDI_CH4U_rank_is_local(rank, comm);
+    MPIR_FUNC_VERBOSE_EXIT(MPIDI_NETMOD_RANK_IS_LOCAL);
+
+    return is_local;
+}
+
+#endif /*NETMOD_OFI_PROC_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_progress.h b/src/mpid/ch4/netmod/ofi/ofi_progress.h
new file mode 100644
index 0000000..e5da34b
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_progress.h
@@ -0,0 +1,63 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_PROGRESS_H_INCLUDED
+#define NETMOD_OFI_PROGRESS_H_INCLUDED
+
+#include "ofi_impl.h"
+#include "ofi_events.h"
+#include "ofi_am_events.h"
+
+/* Make one pass of progress on the point-to-point completion queue.
+ *
+ * Under the FI mutex, either handles a single entry reported by
+ * MPIDI_OFI_get_buffered() or reads up to MPIDI_OFI_NUM_CQ_ENTRIES entries
+ * from MPIDI_Global.p2p_cq and hands them to MPIDI_OFI_handle_cq_entries().
+ * A read that returns -FI_EAGAIN is reported as MPI_SUCCESS; any other
+ * non-positive return is passed to MPIDI_OFI_handle_cq_error().
+ *
+ * The four parameters are not referenced by this body.
+ *
+ * Returns the MPI error code of whichever handler ran, or MPI_SUCCESS. */
+__ALWAYS_INLINE__
+    int MPIDI_OFI_progress_generic(void *netmod_context, int blocking, int do_am, int do_tagged)
+{
+    int mpi_errno = MPI_SUCCESS;
+    ssize_t nread;
+    struct fi_cq_tagged_entry wc[MPIDI_OFI_NUM_CQ_ENTRIES];
+
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_OFI_THREAD_FI_MUTEX);
+
+    if (unlikely(MPIDI_OFI_get_buffered(wc, 1))) {
+        /* One previously buffered entry was copied into wc[0]. */
+        mpi_errno = MPIDI_OFI_handle_cq_entries(wc, 1, 1);
+    }
+    else {
+        nread = fi_cq_read(MPIDI_Global.p2p_cq, (void *) wc, MPIDI_OFI_NUM_CQ_ENTRIES);
+
+        if (likely(nread > 0)) {
+            mpi_errno = MPIDI_OFI_handle_cq_entries(wc, nread, 0);
+        }
+        else if (nread != -FI_EAGAIN) {
+            mpi_errno = MPIDI_OFI_handle_cq_error(nread);
+        }
+        /* nread == -FI_EAGAIN: nothing to report, mpi_errno stays MPI_SUCCESS. */
+    }
+
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_FI_MUTEX);
+
+    /* Release and immediately re-acquire the global mutex
+     * (presumably a yield point for other threads -- confirm). */
+    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
+    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_progress
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Netmod progress hook: runs MPIDI_OFI_progress_generic() with the
+ * compile-time MPIDI_OFI_ENABLE_AM / MPIDI_OFI_ENABLE_TAGGED settings and
+ * returns its error code. */
+static inline int MPIDI_NM_progress(void *netmod_context, int blocking)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_PROGRESS);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_PROGRESS);
+
+    mpi_errno = MPIDI_OFI_progress_generic(netmod_context, blocking,
+                                           MPIDI_OFI_ENABLE_AM, MPIDI_OFI_ENABLE_TAGGED);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_PROGRESS);
+    return mpi_errno;
+}
+
+#endif /* NETMOD_OFI_PROGRESS_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_recv.h b/src/mpid/ch4/netmod/ofi/ofi_recv.h
new file mode 100644
index 0000000..ecc93b5
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_recv.h
@@ -0,0 +1,280 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_RECV_H_INCLUDED
+#define NETMOD_OFI_RECV_H_INCLUDED
+
+#include "ofi_impl.h"
+
+#define MPIDI_OFI_ON_HEAP      0
+#define MPIDI_OFI_USE_EXISTING 1
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_do_irecv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Common implementation for posting a tagged receive on MPIDI_OFI_EP_RX_TAG(0).
+ *
+ * buf, count, datatype  - user receive buffer description
+ * rank, tag             - matching source rank and tag
+ * comm, context_offset  - the context id used is comm->recvcontext_id + context_offset
+ * request               - in/out: see `mode`
+ * mode                  - MPIDI_OFI_ON_HEAP: a new receive request is created and
+ *                         stored in *request;
+ *                         MPIDI_OFI_USE_EXISTING: *request is reused and its kind
+ *                         is set to MPIR_REQUEST_KIND__RECV
+ * flags                 - 0: post with fi_trecv();
+ *                         non-zero: post with fi_trecvmsg() using these flags
+ *
+ * For a non-contiguous datatype a staging area of data_sz + sizeof(MPID_Segment)
+ * bytes is allocated and the receive is posted into its pack_buffer.
+ * If the message is larger than MPIDI_Global.max_send, the request is marked
+ * MPIDI_OFI_EVENT_RECV_HUGE and only max_send bytes are posted here.
+ *
+ * Returns MPI_SUCCESS or an MPI error code. On the error path the request
+ * already stored in *request and the datatype reference taken here are not
+ * released by this function. */
+__ALWAYS_INLINE__ int MPIDI_OFI_do_irecv(void *buf,
+                                         int count,
+                                         MPI_Datatype datatype,
+                                         int rank,
+                                         int tag,
+                                         MPIR_Comm * comm,
+                                         int context_offset,
+                                         MPIR_Request ** request, int mode, uint64_t flags)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *rreq = NULL;
+    uint64_t match_bits, mask_bits;
+    MPIR_Context_id_t context_id = comm->recvcontext_id + context_offset;
+    size_t data_sz;
+    int dt_contig;
+    MPI_Aint dt_true_lb;
+    MPIR_Datatype *dt_ptr;
+    struct fi_msg_tagged msg;
+    char *recv_buf;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_DO_IRECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_DO_IRECV);
+
+    if (mode == MPIDI_OFI_ON_HEAP)      /* Branch should compile out */
+        MPIDI_OFI_REQUEST_CREATE(rreq, MPIR_REQUEST_KIND__RECV);
+    else if (mode == MPIDI_OFI_USE_EXISTING) {
+        rreq = *request;
+        rreq->kind = MPIR_REQUEST_KIND__RECV;
+    }
+
+    *request = rreq;
+
+    match_bits = MPIDI_OFI_init_recvtag(&mask_bits, context_id, rank, tag, MPIDI_OFI_ENABLE_DATA);
+
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
+    MPIDI_OFI_REQUEST(rreq, datatype) = datatype;
+    dtype_add_ref_if_not_builtin(datatype);
+
+    /* Contiguous case: receive directly into the user buffer. */
+    recv_buf = (char *) buf + dt_true_lb;
+
+    if (!dt_contig) {
+        MPIDI_OFI_REQUEST(rreq, noncontig) =
+            (MPIDI_OFI_noncontig_t *) MPL_malloc(data_sz + sizeof(MPID_Segment));
+        /* Test the pointer returned by MPL_malloc itself. The previous check
+         * looked at the pack_buffer member reached through that pointer, which
+         * is not a valid way to detect a failed allocation. */
+        MPIR_ERR_CHKANDJUMP1(MPIDI_OFI_REQUEST(rreq, noncontig) == NULL, mpi_errno,
+                             MPI_ERR_OTHER, "**nomem", "**nomem %s", "Recv Pack Buffer alloc");
+        recv_buf = MPIDI_OFI_REQUEST(rreq, noncontig->pack_buffer);
+        MPID_Segment_init(buf, count, datatype, &MPIDI_OFI_REQUEST(rreq, noncontig->segment), 0);
+    }
+    else
+        MPIDI_OFI_REQUEST(rreq, noncontig) = NULL;
+
+    MPIDI_OFI_REQUEST(rreq, util_comm) = comm;
+    MPIDI_OFI_REQUEST(rreq, util_id) = context_id;
+
+    if (unlikely(data_sz > MPIDI_Global.max_send)) {
+        /* Only the first max_send bytes are posted by this call. */
+        MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV_HUGE;
+        data_sz = MPIDI_Global.max_send;
+    }
+    else
+        MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV;
+
+    if (!flags) /* Branch should compile out */
+        MPIDI_OFI_CALL_RETRY(fi_trecv(MPIDI_OFI_EP_RX_TAG(0),
+                                      recv_buf,
+                                      data_sz,
+                                      NULL,
+                                      (MPI_ANY_SOURCE ==
+                                       rank) ? FI_ADDR_UNSPEC : MPIDI_OFI_comm_to_phys(comm, rank,
+                                                                                       MPIDI_OFI_API_TAG),
+                                      match_bits, mask_bits,
+                                      (void *) &(MPIDI_OFI_REQUEST(rreq, context))), trecv,
+                             MPIDI_OFI_CALL_LOCK);
+    else {
+        /* Message form of the receive; note that the source address is left
+         * unspecified on this path regardless of `rank`. */
+        MPIDI_OFI_REQUEST(rreq, util.iov).iov_base = recv_buf;
+        MPIDI_OFI_REQUEST(rreq, util.iov).iov_len = data_sz;
+
+        msg.msg_iov = &MPIDI_OFI_REQUEST(rreq, util.iov);
+        msg.desc = NULL;
+        msg.iov_count = 1;
+        msg.tag = match_bits;
+        msg.ignore = mask_bits;
+        msg.context = (void *) &(MPIDI_OFI_REQUEST(rreq, context));
+        msg.data = 0;
+        msg.addr = FI_ADDR_UNSPEC;
+
+        MPIDI_OFI_CALL_RETRY(fi_trecvmsg(MPIDI_OFI_EP_RX_TAG(0), &msg, flags), trecv,
+                             MPIDI_OFI_CALL_LOCK);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_DO_IRECV);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_recv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Netmod receive hook.
+ *
+ * Posts the receive through MPIDI_OFI_do_irecv() with a freshly created
+ * request (MPIDI_OFI_ON_HEAP) and no extra flags; the request is handed back
+ * through *request. `status` is not referenced by this body. */
+__ALWAYS_INLINE__ int MPIDI_NM_recv(void *buf,
+                                    int count,
+                                    MPI_Datatype datatype,
+                                    int rank,
+                                    int tag,
+                                    MPIR_Comm * comm,
+                                    int context_offset,
+                                    MPI_Status * status, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_RECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_RECV);
+
+    mpi_errno = MPIDI_OFI_do_irecv(buf, count, datatype,
+                                   rank, tag, comm, context_offset,
+                                   request, MPIDI_OFI_ON_HEAP, 0ULL);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_RECV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_recv_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Create a persistent receive request.
+ *
+ * Nothing is posted to the network here: the receive parameters are only
+ * recorded in a new MPIR_REQUEST_KIND__PREQUEST_RECV request, which is
+ * returned through *request. A reference is taken on `comm`, and on
+ * `datatype` when it is not a builtin handle. Always returns MPI_SUCCESS. */
+__ALWAYS_INLINE__ int MPIDI_NM_recv_init(void *buf,
+                                         int count,
+                                         MPI_Datatype datatype,
+                                         int rank,
+                                         int tag,
+                                         MPIR_Comm * comm,
+                                         int context_offset, MPIR_Request ** request)
+{
+    MPIR_Request *rreq;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_RECV_INIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_RECV_INIT);
+
+    MPIDI_OFI_REQUEST_CREATE(rreq, MPIR_REQUEST_KIND__PREQUEST_RECV);
+    *request = rreq;
+
+    /* The request keeps the communicator alive. */
+    rreq->comm = comm;
+    MPIR_Comm_add_ref(comm);
+
+    /* Remember the receive arguments on the request. */
+    MPIDI_OFI_REQUEST(rreq, util.persist.buf) = (void *) buf;
+    MPIDI_OFI_REQUEST(rreq, util.persist.count) = count;
+    MPIDI_OFI_REQUEST(rreq, datatype) = datatype;
+    MPIDI_OFI_REQUEST(rreq, util.persist.rank) = rank;
+    MPIDI_OFI_REQUEST(rreq, util.persist.tag) = tag;
+    MPIDI_OFI_REQUEST(rreq, util_comm) = comm;
+    MPIDI_OFI_REQUEST(rreq, util_id) = comm->context_id + context_offset;
+
+    /* No underlying operation has been started yet. */
+    rreq->u.persist.real_request = NULL;
+
+    MPIDI_CH4U_request_complete(rreq);
+
+    MPIDI_OFI_REQUEST(rreq, util.persist.type) = MPIDI_PTYPE_RECV;
+
+    if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
+        MPIR_Datatype *dt_ptr;
+
+        MPID_Datatype_get_ptr(datatype, dt_ptr);
+        MPID_Datatype_add_ref(dt_ptr);
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_RECV_INIT);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_imrecv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Matched receive.
+ *
+ * When `message` is NULL a null receive request is created and returned.
+ * Otherwise `message` (which must be an MPROBE request) is itself reused as
+ * the receive request: the receive is posted via MPIDI_OFI_do_irecv() using
+ * the source and tag recorded in message->status, context offset 0, and the
+ * FI_CLAIM | FI_COMPLETION flags. The request is returned through *rreqp. */
+__ALWAYS_INLINE__ int MPIDI_NM_imrecv(void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      MPIR_Request * message, MPIR_Request ** rreqp)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *rreq;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_IMRECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_IMRECV);
+
+    if (message == NULL) {
+        MPIDI_OFI_request_create_null_rreq(rreq, mpi_errno, fn_fail);
+        *rreqp = rreq;
+        goto fn_exit;
+    }
+
+    /* message is known to be non-NULL from here on. */
+    MPIR_Assert(message->kind == MPIR_REQUEST_KIND__MPROBE);
+
+    rreq = message;
+    *rreqp = rreq;
+
+    mpi_errno = MPIDI_OFI_do_irecv(buf, count, datatype,
+                                   message->status.MPI_SOURCE, message->status.MPI_TAG,
+                                   rreq->comm, 0, &rreq,
+                                   MPIDI_OFI_USE_EXISTING, FI_CLAIM | FI_COMPLETION);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_IMRECV);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_irecv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Netmod non-blocking receive hook: posts the receive through
+ * MPIDI_OFI_do_irecv() with a newly created request (MPIDI_OFI_ON_HEAP) and no
+ * extra flags, returning the request through *request. */
+__ALWAYS_INLINE__ int MPIDI_NM_irecv(void *buf,
+                                     int count,
+                                     MPI_Datatype datatype,
+                                     int rank,
+                                     int tag,
+                                     MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_IRECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_IRECV);
+
+    mpi_errno = MPIDI_OFI_do_irecv(buf, count, datatype,
+                                   rank, tag, comm, context_offset,
+                                   request, MPIDI_OFI_ON_HEAP, 0ULL);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_IRECV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_cancel_recv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Try to cancel a posted receive.
+ *
+ * Issues fi_cancel() on the request's context under the FI mutex. If that
+ * call returns 0, progress is driven until the request either carries the
+ * cancel bit or is complete; when the cancel bit is set the status count is
+ * zeroed and the request is completed. A non-zero fi_cancel() return is not
+ * reported: the function then returns MPI_SUCCESS without further action.
+ *
+ * Returns MPI_SUCCESS, or the first error reported by MPIDI_NM_progress(). */
+__ALWAYS_INLINE__ int MPIDI_NM_cancel_recv(MPIR_Request * rreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    ssize_t ret;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_CANCEL_RECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_CANCEL_RECV);
+
+#ifndef MPIDI_BUILD_CH4_SHM
+    MPIDI_OFI_PROGRESS();
+#endif /* MPIDI_BUILD_CH4_SHM */
+
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_OFI_THREAD_FI_MUTEX);
+    ret = fi_cancel((fid_t) MPIDI_OFI_EP_RX_TAG(0), &(MPIDI_OFI_REQUEST(rreq, context)));
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_FI_MUTEX);
+
+    if (ret != 0)
+        goto fn_exit;
+
+    /* Wait until the request is either marked cancelled or has completed. */
+    while (!MPIR_STATUS_GET_CANCEL_BIT(rreq->status) && !MPIR_cc_is_complete(&rreq->cc)) {
+        mpi_errno = MPIDI_NM_progress(&MPIDI_OFI_REQUEST(rreq, context), 0);
+        if (mpi_errno != MPI_SUCCESS)
+            goto fn_exit;
+    }
+
+    if (MPIR_STATUS_GET_CANCEL_BIT(rreq->status)) {
+        MPIR_STATUS_SET_CANCEL_BIT(rreq->status, TRUE);
+        MPIR_STATUS_SET_COUNT(rreq->status, 0);
+        MPIDI_CH4U_request_complete(rreq);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_CANCEL_RECV);
+    return mpi_errno;
+}
+
+#endif /* NETMOD_OFI_RECV_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_rma.h b/src/mpid/ch4/netmod/ofi/ofi_rma.h
new file mode 100644
index 0000000..fda3960
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_rma.h
@@ -0,0 +1,1298 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_RMA_H_INCLUDED
+#define NETMOD_OFI_RMA_H_INCLUDED
+
+#include "ofi_impl.h"
+
+#define MPIDI_OFI_QUERY_ATOMIC_COUNT         0
+#define MPIDI_OFI_QUERY_FETCH_ATOMIC_COUNT   1
+#define MPIDI_OFI_QUERY_COMPARE_ATOMIC_COUNT 2
+
+/* Prepare completion tracking for one chunk of an RMA operation.
+ *
+ * win    - window the operation belongs to
+ * sigreq - MPIR_Request ** of the signalling request, or NULL
+ *
+ * With a signalling request: its completion counter is incremented, a chunk
+ * request pointing back at it is allocated, and the chunk's context is stored
+ * in `msg.context`. Without one, only the window counter is incremented.
+ *
+ * NOTE: expands to code that assigns to a variable named `msg`, which must be
+ * in scope at the point of use. */
+#define MPIDI_OFI_INIT_CHUNK_CONTEXT(win,sigreq)                        \
+    do {                                                                \
+    if (sigreq) {                                                       \
+        int tmp;                                                        \
+        MPIDI_OFI_chunk_request *creq;                                  \
+        MPIR_cc_incr((*(sigreq))->cc_ptr, &tmp);                        \
+        creq=(MPIDI_OFI_chunk_request*)MPL_malloc(sizeof(*creq));       \
+        MPIR_Assert(creq != NULL);                                      \
+        creq->event_id = MPIDI_OFI_EVENT_CHUNK_DONE;                    \
+        creq->parent   = *(sigreq);                                     \
+        msg.context    = &creq->context;                                \
+        MPIDI_OFI_win_conditional_cntr_incr(win);                       \
+    }                                                                   \
+    else MPIDI_OFI_win_cntr_incr(win);                                  \
+    } while (0)
+
+/* Select the endpoint and completion flags for an RMA operation.
+ *
+ * win    - window the operation targets
+ * sigreq - MPIR_Request ** to receive a new signalling request, or NULL
+ * flags  - out: pointer that receives FI_COMPLETION or 0
+ * ep     - out: pointer that receives the endpoint to issue the operation on
+ *
+ * When sigreq is non-NULL, a new MPIR_REQUEST_KIND__RMA request is created in
+ * *sigreq with its completion counter set to 0, *flags becomes FI_COMPLETION
+ * and *ep is the window's `ep` endpoint. Otherwise *ep is the window's
+ * `ep_nocmpl` endpoint and *flags is 0. */
+#define MPIDI_OFI_INIT_SIGNAL_REQUEST(win,sigreq,flags,ep)              \
+    do {                                                                \
+        if (sigreq)                                                      \
+        {                                                               \
+            MPIDI_OFI_REQUEST_CREATE((*(sigreq)), MPIR_REQUEST_KIND__RMA); \
+            MPIR_cc_set((*(sigreq))->cc_ptr, 0);                        \
+            *(flags)                    = FI_COMPLETION;                \
+            *(ep)                       = MPIDI_OFI_WIN(win).ep;        \
+        }                                                               \
+        else {                                                          \
+            *(ep) = MPIDI_OFI_WIN(win).ep_nocmpl;                       \
+            *(flags)                    = 0ULL;                         \
+        }                                                               \
+    } while (0)
+
+/* Store in `b` the basic type of datatype `a`: `a` itself when it is a
+ * predefined datatype, otherwise the `basic_type` field of its datatype
+ * object.
+ *
+ * NOTE(review): the expansion declares a local named `dt_ptr`, and `a` and `b`
+ * are used unparenthesized; arguments that themselves mention `dt_ptr` or
+ * contain operators would presumably misbehave -- confirm at call sites. */
+#define MPIDI_OFI_GET_BASIC_TYPE(a,b)   \
+    do {                                        \
+        if (MPIR_DATATYPE_IS_PREDEFINED(a))     \
+            b = a;                              \
+        else {                                  \
+            MPIR_Datatype *dt_ptr;              \
+            MPID_Datatype_get_ptr(a,dt_ptr);    \
+            b = dt_ptr->basic_type;             \
+        }                                       \
+    } while (0)
+
+/* Displacement unit to use for `rank` on `win`: the per-rank value from the
+ * window's winfo table when that table exists, otherwise the window's own
+ * disp_unit. */
+static inline uint32_t MPIDI_OFI_winfo_disp_unit(MPIR_Win * win, int rank)
+{
+    if (!MPIDI_OFI_WIN(win).winfo) {
+        /* No per-rank table: fall back to the window-wide value. */
+        return win->disp_unit;
+    }
+
+    return MPIDI_OFI_WIN(win).winfo[rank].disp_unit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_query_datatype
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Look up the libfabric datatype/op pair for an MPI datatype and op.
+ *
+ * dt      - MPI datatype to translate
+ * fi_dt   - out: libfabric datatype from MPIDI_Global.win_op_table
+ * op      - MPI op to translate (MPI_OP_NULL is accepted)
+ * fi_op   - out: libfabric op from the same table entry
+ * count   - in/out: on entry one of the MPIDI_OFI_QUERY_*_COUNT selectors;
+ *           on exit the matching maximum element count from the table entry.
+ *           Any other entry value is left unchanged.
+ * dtsize  - out: datatype size recorded in the table entry
+ *
+ * Returns MPI_SUCCESS, or -1 when the table marks the datatype or the op
+ * with -1. */
+static inline int MPIDI_OFI_query_datatype(MPI_Datatype dt,
+                                           enum fi_datatype *fi_dt,
+                                           MPI_Op op,
+                                           enum fi_op *fi_op, size_t * count, size_t * dtsize)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_QUERY_DT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_QUERY_DT);
+    MPIR_Datatype *dt_ptr;
+    int op_index, dt_index, rc;
+    /* Capture the selector before *count is overwritten: the table value
+     * written below may itself be 0, 1 or 2 and must not be mistaken for
+     * another selector. */
+    const size_t query = *count;
+
+    MPID_Datatype_get_ptr(dt, dt_ptr);
+
+    /* OP_NULL is the oddball                          */
+    /* todo...change configure to table this correctly */
+    dt_index = MPIDI_OFI_DATATYPE(dt_ptr).index;
+
+    if (op == MPI_OP_NULL)
+        op_index = 14;  /* fixed table column used for MPI_OP_NULL */
+    else
+        op_index = (0x000000FFU & op) - 1;      /* low byte of the handle, zero-based */
+
+    *fi_dt = (enum fi_datatype) MPIDI_Global.win_op_table[dt_index][op_index].dt;
+    *fi_op = (enum fi_op) MPIDI_Global.win_op_table[dt_index][op_index].op;
+    *dtsize = MPIDI_Global.win_op_table[dt_index][op_index].dtsize;
+
+    if (query == MPIDI_OFI_QUERY_ATOMIC_COUNT)
+        *count = MPIDI_Global.win_op_table[dt_index][op_index].max_atomic_count;
+    else if (query == MPIDI_OFI_QUERY_FETCH_ATOMIC_COUNT)
+        *count = MPIDI_Global.win_op_table[dt_index][op_index].max_fetch_atomic_count;
+    else if (query == MPIDI_OFI_QUERY_COMPARE_ATOMIC_COUNT)
+        *count = MPIDI_Global.win_op_table[dt_index][op_index].max_compare_atomic_count;
+
+    if (((int) *fi_dt) == -1 || ((int) *fi_op) == -1)
+        rc = -1;
+    else
+        rc = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_QUERY_DT);
+    return rc;
+}
+
+
+/* Fill the basic fields of a window datatype descriptor.
+ *
+ * count, datatype - element count and MPI datatype being described
+ * dt              - out: descriptor to fill
+ *
+ * For MPI_DATATYPE_NULL the whole descriptor is zeroed. Otherwise the
+ * contig, size, pointer and true_lb fields are filled by
+ * MPIDI_Datatype_get_info(). */
+static inline void MPIDI_OFI_win_datatype_basic(int count,
+                                                MPI_Datatype datatype,
+                                                MPIDI_OFI_win_datatype_t * dt)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_DATATYPE_BASIC);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_DATATYPE_BASIC);
+
+    /* dt->count and dt->type are assigned inside the macro arguments below.
+     * NOTE(review): if MPIDI_Datatype_get_info evaluates an argument more than
+     * once or not at all on some path, so does the assignment -- confirm
+     * against the macro definition. */
+    if (datatype != MPI_DATATYPE_NULL)
+        MPIDI_Datatype_get_info(dt->count = count,
+                                dt->type = datatype,
+                                dt->contig, dt->size, dt->pointer, dt->true_lb);
+    else
+        memset(dt, 0, sizeof(*dt));
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_DATATYPE_BASIC);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_win_datatype_map
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Build the list of contiguous pieces (dt->map / dt->num_contig) for a
+ * descriptor previously filled by MPIDI_OFI_win_datatype_basic().
+ *
+ * A contiguous datatype gets a single entry stored in the descriptor's
+ * embedded __map slot, with true_lb as its offset and size as its length.
+ * A non-contiguous datatype gets a heap-allocated vector sized for
+ * max_contig_blocks * count + 1 entries, filled by
+ * MPIDU_Segment_pack_vector() on a segment based at address NULL. */
+static inline void MPIDI_OFI_win_datatype_map(MPIDI_OFI_win_datatype_t * dt)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_DATATYPE_MAP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_DATATYPE_MAP);
+
+    if (!dt->contig) {
+        unsigned capacity = dt->pointer->max_contig_blocks * dt->count + 1;
+
+        dt->num_contig = capacity;
+        dt->map = (DLOOP_VECTOR *) MPL_malloc(capacity * sizeof(DLOOP_VECTOR));
+        MPIR_Assert(dt->map != NULL);
+
+        MPID_Segment seg;
+        DLOOP_Offset last = dt->pointer->size * dt->count;
+
+        /* num_contig is in/out: capacity on entry, entries used on return. */
+        MPIDU_Segment_init(NULL, dt->count, dt->type, &seg, 0);
+        MPIDU_Segment_pack_vector(&seg, 0, &last, dt->map, &dt->num_contig);
+        MPIR_Assert((unsigned) dt->num_contig <= capacity);
+    }
+    else {
+        dt->map = &dt->__map;
+        dt->num_contig = 1;
+        dt->map[0].DLOOP_VECTOR_BUF = (void *) (size_t) dt->true_lb;
+        dt->map[0].DLOOP_VECTOR_LEN = dt->size;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_DATATYPE_MAP);
+}
+
+/* Allocate and initialise the window request used by put/get.
+ *
+ * The request's iov_store holds two arrays back to back:
+ *   originv: iov_limit entries of struct iovec, at offset 0
+ *   targetv: struct fi_rma_iov entries, at offset iov_limit * sizeof(struct iovec)
+ * The target array is sized for the larger of iov_limit and rma_iov_limit,
+ * because the put path fills it with up to rma_iov_limit entries.
+ *
+ * win, target_rank            - window and target of the operation
+ * origin_count/_datatype      - origin buffer description
+ * target_count/_datatype      - target buffer description
+ * winreq                      - out: the new request
+ * flags, ep, sigreq           - see MPIDI_OFI_INIT_SIGNAL_REQUEST
+ *
+ * Returns MPI_SUCCESS, or an MPI_ERR_SIZE-class error when origin and target
+ * sizes differ. On that error path the request already stored in *winreq is
+ * not released here. */
+__ALWAYS_INLINE__ int MPIDI_OFI_allocate_win_request_put_get(MPIR_Win * win,
+                                                             int origin_count,
+                                                             int target_count,
+                                                             int target_rank,
+                                                             MPI_Datatype origin_datatype,
+                                                             MPI_Datatype target_datatype,
+                                                             MPIDI_OFI_win_request_t ** winreq,
+                                                             uint64_t * flags,
+                                                             struct fid_ep **ep,
+                                                             MPIR_Request ** sigreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    size_t o_size, t_size, t_slots;
+    MPIDI_OFI_win_request_t *req;
+
+    o_size = sizeof(struct iovec);
+    t_size = sizeof(struct fi_rma_iov);
+    /* Never smaller than the original iov_limit-based sizing. */
+    t_slots = (MPIDI_Global.rma_iov_limit > MPIDI_Global.iov_limit) ?
+        MPIDI_Global.rma_iov_limit : MPIDI_Global.iov_limit;
+    req = MPIDI_OFI_win_request_alloc_and_init(MPIDI_Global.iov_limit * o_size +
+                                               t_slots * t_size);
+    *winreq = req;
+
+    req->noncontig->buf.iov.put_get.originv = (struct iovec *) &req->noncontig->buf.iov_store[0];
+    req->noncontig->buf.iov.put_get.targetv =
+        (struct fi_rma_iov *) &req->noncontig->buf.iov_store[o_size * MPIDI_Global.iov_limit];
+    MPIDI_OFI_INIT_SIGNAL_REQUEST(win, sigreq, flags, ep);
+    MPIDI_OFI_win_datatype_basic(origin_count, origin_datatype, &req->noncontig->origin_dt);
+    MPIDI_OFI_win_datatype_basic(target_count, target_datatype, &req->noncontig->target_dt);
+    MPIR_ERR_CHKANDJUMP((req->noncontig->origin_dt.size != req->noncontig->target_dt.size),
+                        mpi_errno, MPI_ERR_SIZE, "**rmasize");
+
+    MPIDI_OFI_win_datatype_map(&req->noncontig->origin_dt);
+    MPIDI_OFI_win_datatype_map(&req->noncontig->target_dt);
+    req->target_rank = target_rank;
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/* Allocate and initialise the window request used by accumulate.
+ *
+ * The request's iov_store holds two arrays back to back:
+ *   originv: iov_limit entries of struct fi_ioc, at offset 0
+ *   targetv: struct fi_rma_ioc entries, at offset iov_limit * sizeof(struct fi_ioc)
+ * The target array is sized for the larger of iov_limit and rma_iov_limit so
+ * that it is large enough whichever limit the issuing code applies to it.
+ *
+ * win, target_rank            - window and target of the operation
+ * origin_count/_datatype      - origin buffer description
+ * target_count/_datatype      - target buffer description
+ * winreq                      - out: the new request
+ * flags, ep, sigreq           - see MPIDI_OFI_INIT_SIGNAL_REQUEST
+ *
+ * Returns MPI_SUCCESS, or an MPI_ERR_SIZE-class error when origin and target
+ * sizes differ. On that error path the request already stored in *winreq is
+ * not released here. */
+__ALWAYS_INLINE__ int MPIDI_OFI_allocate_win_request_accumulate(MPIR_Win * win,
+                                                                int origin_count,
+                                                                int target_count,
+                                                                int target_rank,
+                                                                MPI_Datatype origin_datatype,
+                                                                MPI_Datatype target_datatype,
+                                                                MPIDI_OFI_win_request_t ** winreq,
+                                                                uint64_t * flags,
+                                                                struct fid_ep **ep,
+                                                                MPIR_Request ** sigreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    size_t o_size, t_size, t_slots;
+    MPIDI_OFI_win_request_t *req;
+
+    o_size = sizeof(struct fi_ioc);
+    t_size = sizeof(struct fi_rma_ioc);
+    /* Never smaller than the original iov_limit-based sizing. */
+    t_slots = (MPIDI_Global.rma_iov_limit > MPIDI_Global.iov_limit) ?
+        MPIDI_Global.rma_iov_limit : MPIDI_Global.iov_limit;
+    req = MPIDI_OFI_win_request_alloc_and_init(MPIDI_Global.iov_limit * o_size +
+                                               t_slots * t_size);
+    *winreq = req;
+
+    req->noncontig->buf.iov.accumulate.originv =
+        (struct fi_ioc *) &req->noncontig->buf.iov_store[0];
+    req->noncontig->buf.iov.accumulate.targetv =
+        (struct fi_rma_ioc *) &req->noncontig->buf.iov_store[o_size * MPIDI_Global.iov_limit];
+    MPIDI_OFI_INIT_SIGNAL_REQUEST(win, sigreq, flags, ep);
+    MPIDI_OFI_win_datatype_basic(origin_count, origin_datatype, &req->noncontig->origin_dt);
+    MPIDI_OFI_win_datatype_basic(target_count, target_datatype, &req->noncontig->target_dt);
+    MPIR_ERR_CHKANDJUMP((req->noncontig->origin_dt.size != req->noncontig->target_dt.size),
+                        mpi_errno, MPI_ERR_SIZE, "**rmasize");
+
+    MPIDI_OFI_win_datatype_map(&req->noncontig->origin_dt);
+    MPIDI_OFI_win_datatype_map(&req->noncontig->target_dt);
+    req->target_rank = target_rank;
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/* Allocate and initialise the window request used by get_accumulate.
+ *
+ * The request's iov_store holds three arrays back to back:
+ *   originv: iov_limit entries of struct fi_ioc, at offset 0
+ *   targetv: struct fi_rma_ioc entries, at offset iov_limit * sizeof(struct fi_ioc)
+ *   resultv: iov_limit entries of struct fi_ioc, placed rma_iov_limit target
+ *            entries after targetv
+ * Because resultv is positioned using rma_iov_limit, the allocation must
+ * budget the target region with at least rma_iov_limit entries; the larger of
+ * the two limits is used so the total never drops below the previous size.
+ *
+ * win, target_rank, op        - window, target and reduction op
+ * origin/target/result count and datatype - buffer descriptions
+ * winreq                      - out: the new request
+ * flags, ep, sigreq           - see MPIDI_OFI_INIT_SIGNAL_REQUEST
+ *
+ * The origin size is only checked, and the origin map only built, when op is
+ * not MPI_NO_OP. Returns MPI_SUCCESS or an MPI_ERR_SIZE-class error; on the
+ * error path the request already stored in *winreq is not released here. */
+__ALWAYS_INLINE__ int MPIDI_OFI_allocate_win_request_get_accumulate(MPIR_Win * win,
+                                                                    int origin_count,
+                                                                    int target_count,
+                                                                    int result_count,
+                                                                    int target_rank,
+                                                                    MPI_Op op,
+                                                                    MPI_Datatype origin_datatype,
+                                                                    MPI_Datatype target_datatype,
+                                                                    MPI_Datatype result_datatype,
+                                                                    MPIDI_OFI_win_request_t **
+                                                                    winreq, uint64_t * flags,
+                                                                    struct fid_ep **ep,
+                                                                    MPIR_Request ** sigreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    size_t o_size, t_size, r_size, t_slots;
+    MPIDI_OFI_win_request_t *req;
+
+    o_size = sizeof(struct fi_ioc);
+    t_size = sizeof(struct fi_rma_ioc);
+    r_size = sizeof(struct fi_ioc);
+    /* Never smaller than the original iov_limit-based sizing. */
+    t_slots = (MPIDI_Global.rma_iov_limit > MPIDI_Global.iov_limit) ?
+        MPIDI_Global.rma_iov_limit : MPIDI_Global.iov_limit;
+    req = MPIDI_OFI_win_request_alloc_and_init(MPIDI_Global.iov_limit * (o_size + r_size) +
+                                               t_slots * t_size);
+    *winreq = req;
+
+    req->noncontig->buf.iov.get_accumulate.originv =
+        (struct fi_ioc *) &req->noncontig->buf.iov_store[0];
+    req->noncontig->buf.iov.get_accumulate.targetv =
+        (struct fi_rma_ioc *) &req->noncontig->buf.iov_store[o_size * MPIDI_Global.iov_limit];
+    req->noncontig->buf.iov.get_accumulate.resultv =
+        (struct fi_ioc *) &req->noncontig->buf.iov_store[o_size * MPIDI_Global.iov_limit +
+                                                         t_size * MPIDI_Global.rma_iov_limit];
+    MPIDI_OFI_INIT_SIGNAL_REQUEST(win, sigreq, flags, ep);
+    MPIDI_OFI_win_datatype_basic(origin_count, origin_datatype, &req->noncontig->origin_dt);
+    MPIDI_OFI_win_datatype_basic(target_count, target_datatype, &req->noncontig->target_dt);
+    MPIDI_OFI_win_datatype_basic(result_count, result_datatype, &req->noncontig->result_dt);
+
+    MPIR_ERR_CHKANDJUMP((req->noncontig->origin_dt.size != req->noncontig->target_dt.size &&
+                         op != MPI_NO_OP), mpi_errno, MPI_ERR_SIZE, "**rmasize");
+    MPIR_ERR_CHKANDJUMP((req->noncontig->result_dt.size != req->noncontig->target_dt.size),
+                        mpi_errno, MPI_ERR_SIZE, "**rmasize");
+
+    /* With MPI_NO_OP the origin buffer is not used, so it gets no map. */
+    if (op != MPI_NO_OP)
+        MPIDI_OFI_win_datatype_map(&req->noncontig->origin_dt);
+
+    MPIDI_OFI_win_datatype_map(&req->noncontig->target_dt);
+    MPIDI_OFI_win_datatype_map(&req->noncontig->result_dt);
+
+    req->target_rank = target_rank;
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_do_put
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* General (possibly non-contiguous) put.
+ *
+ * origin_addr/count/datatype - local source description
+ * target_rank, target_disp   - target process and displacement; the byte
+ *                              offset is target_disp times the target's
+ *                              displacement unit
+ * target_count/datatype      - remote destination description
+ * win                        - window the put targets
+ * sigreq                     - NULL, or where to return a request that tracks
+ *                              completion of the issued chunks
+ *
+ * Allocates a window request, links it at the head of the window's syncQ,
+ * then repeatedly merges the origin and target contiguous-piece lists into
+ * iovec batches and issues one fi_writemsg() per batch until the merge no
+ * longer returns MPIDI_OFI_IOV_EAGAIN.
+ *
+ * Returns MPI_SUCCESS or an MPI error code. */
+static inline int MPIDI_OFI_do_put(const void *origin_addr,
+                                   int origin_count,
+                                   MPI_Datatype origin_datatype,
+                                   int target_rank,
+                                   MPI_Aint target_disp,
+                                   int target_count,
+                                   MPI_Datatype target_datatype,
+                                   MPIR_Win * win, MPIR_Request ** sigreq)
+{
+    int rc, mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_win_request_t *req;
+    size_t offset, omax, tmax, tout, oout;
+    uint64_t flags;
+    struct fid_ep *ep;
+    struct fi_msg_rma msg;
+    unsigned i;
+    struct iovec *originv;
+    struct fi_rma_iov *targetv;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_DO_PUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_DO_PUT);
+
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_allocate_win_request_put_get(win,
+                                                                  origin_count,
+                                                                  target_count,
+                                                                  target_rank,
+                                                                  origin_datatype,
+                                                                  target_datatype,
+                                                                  &req, &flags, &ep, sigreq));
+
+    offset = target_disp * MPIDI_OFI_winfo_disp_unit(win, target_rank);
+
+    req->event_id = MPIDI_OFI_EVENT_ABORT;
+    msg.desc = NULL;
+    msg.addr = MPIDI_OFI_comm_to_phys(win->comm_ptr, req->target_rank, MPIDI_OFI_API_CTR);
+    msg.context = NULL;
+    msg.data = 0;
+    /* Push the request onto the head of the window's syncQ list. */
+    req->next = MPIDI_OFI_WIN(win).syncQ;
+    MPIDI_OFI_WIN(win).syncQ = req;
+    MPIDI_OFI_init_iovec_state(&req->noncontig->iovs,
+                               (uintptr_t) origin_addr,
+                               (uintptr_t) MPIDI_OFI_winfo_base(win, req->target_rank) + offset,
+                               req->noncontig->origin_dt.num_contig,
+                               req->noncontig->target_dt.num_contig,
+                               INT64_MAX,
+                               req->noncontig->origin_dt.map, req->noncontig->target_dt.map);
+    rc = MPIDI_OFI_IOV_EAGAIN;
+
+    while (rc == MPIDI_OFI_IOV_EAGAIN) {
+        originv = req->noncontig->buf.iov.put_get.originv;
+        targetv = req->noncontig->buf.iov.put_get.targetv;
+        /* Per-batch capacity: origin limited by iov_limit, target by rma_iov_limit. */
+        omax = MPIDI_Global.iov_limit;
+        tmax = MPIDI_Global.rma_iov_limit;
+        rc = MPIDI_OFI_merge_iov_list(&req->noncontig->iovs, originv,
+                                      omax, targetv, tmax, &oout, &tout);
+
+        /* DONE leaves the loop without issuing another write. */
+        if (rc == MPIDI_OFI_IOV_DONE)
+            break;
+
+        /* Every target entry of this batch uses the target rank's memory key. */
+        for (i = 0; i < tout; i++)
+            targetv[i].key = MPIDI_OFI_winfo_mr_key(win, target_rank);
+        MPIR_Assert(rc != MPIDI_OFI_IOV_ERROR);
+        msg.msg_iov = originv;
+        msg.iov_count = oout;
+        msg.rma_iov = targetv;
+        msg.rma_iov_count = tout;
+        /* MPIDI_OFI_INIT_CHUNK_CONTEXT refers to the local `msg` by name. */
+        MPIDI_OFI_CALL_RETRY2(MPIDI_OFI_INIT_CHUNK_CONTEXT(win, sigreq),
+                              fi_writemsg(ep, &msg, flags), rdma_write);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_DO_PUT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_put
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_put - netmod entry point for a put with no user-visible request.
+ *
+ * After the epoch checks and the origin/target byte-count comparison the call
+ * is served in one of four ways:
+ *   - zero bytes or an MPI_PROC_NULL target: nothing is transferred;
+ *   - the target is the calling rank: MPIR_Localcopy into win->base;
+ *   - both datatypes are contiguous and the payload is no larger than
+ *     MPIDI_Global.max_buffered_write: one fi_inject_write on the window's
+ *     ep_nocmpl endpoint;
+ *   - everything else: MPIDI_OFI_do_put with a NULL signal request.
+ *
+ * Returns MPI_SUCCESS or an MPI error code; a byte-count mismatch between
+ * origin and target is reported as MPI_ERR_SIZE.
+ */
+static inline int MPIDI_NM_put(const void *origin_addr,
+                               int origin_count,
+                               MPI_Datatype origin_datatype,
+                               int target_rank,
+                               MPI_Aint target_disp,
+                               int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int origin_contig, target_contig;
+    size_t origin_bytes, target_bytes;
+    MPI_Aint origin_true_lb, target_true_lb;
+    size_t offset;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_PUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_PUT);
+
+    MPIDI_CH4U_EPOCH_CHECK_SYNC(win, mpi_errno, goto fn_fail);
+    MPIDI_CH4U_EPOCH_START_CHECK(win, mpi_errno, goto fn_fail);
+
+    MPIDI_Datatype_check_contig_size_lb(target_datatype, target_count,
+                                        target_contig, target_bytes, target_true_lb);
+    MPIDI_Datatype_check_contig_size_lb(origin_datatype, origin_count,
+                                        origin_contig, origin_bytes, origin_true_lb);
+
+    MPIR_ERR_CHKANDJUMP((origin_bytes != target_bytes), mpi_errno, MPI_ERR_SIZE, "**rmasize");
+
+    /* Nothing to move. */
+    if (unlikely((origin_bytes == 0) || (target_rank == MPI_PROC_NULL)))
+        goto fn_exit;
+
+    /* Put to self: a plain memory copy into the local window. */
+    if (target_rank == win->comm_ptr->rank) {
+        offset = target_disp * MPIDI_OFI_winfo_disp_unit(win, target_rank);
+        mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype,
+                                   (char *) win->base + offset, target_count, target_datatype);
+        goto fn_exit;
+    }
+
+    /* Non-contiguous or too large for an inject: take the general path. */
+    if (!origin_contig || !target_contig || origin_bytes > MPIDI_Global.max_buffered_write) {
+        mpi_errno = MPIDI_OFI_do_put(origin_addr, origin_count, origin_datatype,
+                                     target_rank, target_disp,
+                                     target_count, target_datatype, win, NULL);
+        goto fn_exit;
+    }
+
+    /* Small contiguous payload: single injected write. */
+    MPIDI_OFI_CALL_RETRY2(MPIDI_OFI_win_cntr_incr(win),
+                          fi_inject_write(MPIDI_OFI_WIN(win).ep_nocmpl,
+                                          (char *) origin_addr + origin_true_lb,
+                                          target_bytes,
+                                          MPIDI_OFI_comm_to_phys(win->comm_ptr, target_rank,
+                                                                 MPIDI_OFI_API_CTR),
+                                          (uint64_t) (char *) MPIDI_OFI_winfo_base(win, target_rank)
+                                          + target_disp * MPIDI_OFI_winfo_disp_unit(win, target_rank)
+                                          + target_true_lb,
+                                          MPIDI_OFI_winfo_mr_key(win, target_rank)),
+                          rdma_inject_write);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_PUT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_do_get
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_OFI_do_get - general (possibly non-contiguous) get.
+ *
+ * Allocates a window request, links it onto the window's syncQ list and then
+ * posts one fi_readmsg per batch of origin/target segments produced by
+ * MPIDI_OFI_merge_iov_list, until that routine reports MPIDI_OFI_IOV_DONE.
+ *
+ * origin_addr/origin_count/origin_datatype  local destination description
+ * target_rank/target_disp                   target process and displacement
+ *                                           (scaled by the target's disp unit)
+ * target_count/target_datatype              remote source description
+ * win                                       window being accessed
+ * sigreq                                    forwarded to the request allocator
+ *                                           and MPIDI_OFI_INIT_CHUNK_CONTEXT;
+ *                                           MPIDI_NM_get passes NULL,
+ *                                           MPIDI_NM_rget passes a request slot
+ *
+ * Callers in this file filter out zero-byte, MPI_PROC_NULL and self targets
+ * before calling.  Returns MPI_SUCCESS or an MPI error code.
+ */
+static inline int MPIDI_OFI_do_get(void *origin_addr,
+                                   int origin_count,
+                                   MPI_Datatype origin_datatype,
+                                   int target_rank,
+                                   MPI_Aint target_disp,
+                                   int target_count,
+                                   MPI_Datatype target_datatype,
+                                   MPIR_Win * win, MPIR_Request ** sigreq)
+{
+    int rc, mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_win_request_t *req;
+    size_t offset, omax, tmax, tout, oout;
+    uint64_t flags;
+    struct fid_ep *ep;
+    struct fi_msg_rma msg;
+    struct iovec *originv;
+    struct fi_rma_iov *targetv;
+    unsigned i;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_DO_GET);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_DO_GET);
+
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_allocate_win_request_put_get(win, origin_count, target_count,
+                                                                  target_rank,
+                                                                  origin_datatype, target_datatype,
+                                                                  &req, &flags, &ep, sigreq));
+
+    offset = target_disp * MPIDI_OFI_winfo_disp_unit(win, target_rank);
+    req->event_id = MPIDI_OFI_EVENT_ABORT;
+
+    /* Fields of the message descriptor that are the same for every chunk. */
+    msg.desc = NULL;
+    msg.addr = MPIDI_OFI_comm_to_phys(win->comm_ptr, req->target_rank, MPIDI_OFI_API_CTR);
+    msg.context = NULL;
+    msg.data = 0;
+
+    /* Push the request onto the head of the window's syncQ list. */
+    req->next = MPIDI_OFI_WIN(win).syncQ;
+    MPIDI_OFI_WIN(win).syncQ = req;
+
+    MPIDI_OFI_init_iovec_state(&req->noncontig->iovs,
+                               (uintptr_t) origin_addr,
+                               (uintptr_t) MPIDI_OFI_winfo_base(win, req->target_rank) + offset,
+                               req->noncontig->origin_dt.num_contig,
+                               req->noncontig->target_dt.num_contig,
+                               INT64_MAX,
+                               req->noncontig->origin_dt.map, req->noncontig->target_dt.map);
+    rc = MPIDI_OFI_IOV_EAGAIN;
+
+    while (rc == MPIDI_OFI_IOV_EAGAIN) {
+        originv = req->noncontig->buf.iov.put_get.originv;
+        targetv = req->noncontig->buf.iov.put_get.targetv;
+        /* Per-message limits on the number of local and remote segments. */
+        omax = MPIDI_Global.iov_limit;
+        tmax = MPIDI_Global.rma_iov_limit;
+        rc = MPIDI_OFI_merge_iov_list(&req->noncontig->iovs, originv,
+                                      omax, targetv, tmax, &oout, &tout);
+
+        if (rc == MPIDI_OFI_IOV_DONE)
+            break;
+
+        MPIR_Assert(rc != MPIDI_OFI_IOV_ERROR);
+
+        /* Every remote segment is addressed through the target's MR key. */
+        for (i = 0; i < tout; i++)
+            targetv[i].key = MPIDI_OFI_winfo_mr_key(win, target_rank);
+
+        msg.msg_iov = originv;
+        msg.iov_count = oout;
+        msg.rma_iov = targetv;
+        msg.rma_iov_count = tout;
+        /* Tag failures as a read; "rdma_write" belongs to the put path. */
+        MPIDI_OFI_CALL_RETRY2(MPIDI_OFI_INIT_CHUNK_CONTEXT(win, sigreq),
+                              fi_readmsg(ep, &msg, flags), rdma_readfrom);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_DO_GET);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_get
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_get - netmod entry point for a get with no user-visible request.
+ *
+ * After the epoch checks the call is served in one of four ways:
+ *   - zero origin bytes or an MPI_PROC_NULL target: nothing is transferred;
+ *   - the target is the calling rank: MPIR_Localcopy out of win->base;
+ *   - both datatypes are contiguous: one fi_readmsg with a single local and a
+ *     single remote segment on the window's ep_nocmpl endpoint;
+ *   - everything else: MPIDI_OFI_do_get with a NULL signal request.
+ *
+ * Returns MPI_SUCCESS or an MPI error code; in the contiguous path a size
+ * mismatch between origin and target is reported as MPI_ERR_SIZE.
+ */
+static inline int MPIDI_NM_get(void *origin_addr,
+                               int origin_count,
+                               MPI_Datatype origin_datatype,
+                               int target_rank,
+                               MPI_Aint target_disp,
+                               int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    int origin_contig, target_contig, mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_win_datatype_t origin_dt, target_dt;
+    size_t origin_bytes;
+    size_t offset;
+    struct fi_rma_iov riov;
+    struct iovec iov;
+    struct fi_msg_rma msg;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_GET);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_GET);
+
+    MPIDI_CH4U_EPOCH_CHECK_SYNC(win, mpi_errno, goto fn_fail);
+    MPIDI_CH4U_EPOCH_START_CHECK(win, mpi_errno, goto fn_fail);
+
+    MPIDI_Datatype_check_contig_size(origin_datatype, origin_count, origin_contig, origin_bytes);
+
+    if (unlikely((origin_bytes == 0) || (target_rank == MPI_PROC_NULL)))
+        goto fn_exit;
+
+    if (target_rank == win->comm_ptr->rank) {
+        offset = target_disp * MPIDI_OFI_winfo_disp_unit(win, target_rank);
+        mpi_errno = MPIR_Localcopy((char *) win->base + offset,
+                                   target_count,
+                                   target_datatype, origin_addr, origin_count, origin_datatype);
+        /* The local copy completes the operation; do not also issue a
+         * network read (which would in addition discard mpi_errno). */
+        goto fn_exit;
+    }
+
+    MPIDI_Datatype_check_contig(origin_datatype, origin_contig);
+    MPIDI_Datatype_check_contig(target_datatype, target_contig);
+
+    if (origin_contig && target_contig) {
+        offset = target_disp * MPIDI_OFI_winfo_disp_unit(win, target_rank);
+        MPIDI_OFI_win_datatype_basic(origin_count, origin_datatype, &origin_dt);
+        MPIDI_OFI_win_datatype_basic(target_count, target_datatype, &target_dt);
+        MPIR_ERR_CHKANDJUMP((origin_dt.size != target_dt.size),
+                            mpi_errno, MPI_ERR_SIZE, "**rmasize");
+
+        /* One local segment, one remote segment. */
+        msg.desc = NULL;
+        msg.msg_iov = &iov;
+        msg.iov_count = 1;
+        msg.addr = MPIDI_OFI_comm_to_phys(win->comm_ptr, target_rank, MPIDI_OFI_API_CTR);
+        msg.rma_iov = &riov;
+        msg.rma_iov_count = 1;
+        msg.context = NULL;
+        msg.data = 0;
+        iov.iov_base = (char *) origin_addr + origin_dt.true_lb;
+        /* Sizes were just checked to be equal, so either one may be used. */
+        iov.iov_len = target_dt.size;
+        riov.addr =
+            (uint64_t) ((char *) MPIDI_OFI_winfo_base(win, target_rank) + offset +
+                        target_dt.true_lb);
+        riov.len = target_dt.size;
+        riov.key = MPIDI_OFI_winfo_mr_key(win, target_rank);
+        /* Tag failures as a read; "rdma_write" belongs to the put path. */
+        MPIDI_OFI_CALL_RETRY2(MPIDI_OFI_win_cntr_incr(win),
+                              fi_readmsg(MPIDI_OFI_WIN(win).ep_nocmpl, &msg, 0), rdma_readfrom);
+    }
+    else {
+        mpi_errno = MPIDI_OFI_do_get(origin_addr,
+                                     origin_count,
+                                     origin_datatype,
+                                     target_rank,
+                                     target_disp, target_count, target_datatype, win, NULL);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_GET);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_rput
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_rput - netmod entry point for a request-based put.
+ *
+ * Zero-byte transfers and MPI_PROC_NULL targets create a request and complete
+ * it immediately.  A put to the calling rank does the same around an
+ * MPIR_Localcopy into win->base.  All other cases are forwarded to
+ * MPIDI_OFI_do_put, which receives the address of the request slot.
+ *
+ * On return *request holds the request, or NULL if MPIDI_OFI_do_put failed
+ * without storing one.  Returns MPI_SUCCESS or an MPI error code.
+ */
+static inline int MPIDI_NM_rput(const void *origin_addr,
+                                int origin_count,
+                                MPI_Datatype origin_datatype,
+                                int target_rank,
+                                MPI_Aint target_disp,
+                                int target_count,
+                                MPI_Datatype target_datatype,
+                                MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    size_t origin_bytes;
+    size_t offset;
+    /* Start from NULL so *request is never assigned an indeterminate value
+     * when the general path fails before producing a request. */
+    MPIR_Request *rreq = NULL;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_RPUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_RPUT);
+
+    MPIDI_Datatype_check_size(origin_datatype, origin_count, origin_bytes);
+
+    if (unlikely((origin_bytes == 0) || (target_rank == MPI_PROC_NULL))) {
+        /* Nothing to transfer: hand back an already-completed request. */
+        mpi_errno = MPI_SUCCESS;
+        rreq = MPIR_Request_create(MPIR_REQUEST_KIND__RMA);
+        MPIR_Request_add_ref(rreq);
+        MPIDI_CH4U_request_complete(rreq);
+        goto fn_exit;
+    }
+
+    if (target_rank == win->comm_ptr->rank) {
+        /* Put to self: copy locally, then complete the request. */
+        rreq = MPIR_Request_create(MPIR_REQUEST_KIND__RMA);
+        MPIR_Request_add_ref(rreq);
+        offset = target_disp * MPIDI_OFI_winfo_disp_unit(win, target_rank);
+        mpi_errno = MPIR_Localcopy(origin_addr,
+                                   origin_count,
+                                   origin_datatype,
+                                   (char *) win->base + offset, target_count, target_datatype);
+        MPIDI_CH4U_request_complete(rreq);
+        goto fn_exit;
+    }
+
+    mpi_errno = MPIDI_OFI_do_put((void *) origin_addr,
+                                 origin_count,
+                                 origin_datatype,
+                                 target_rank,
+                                 target_disp, target_count, target_datatype, win, &rreq);
+
+  fn_exit:
+    *request = rreq;
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_RPUT);
+    return mpi_errno;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_compare_and_swap
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_compare_and_swap - netmod entry point for compare-and-swap.
+ *
+ * Operates on exactly one element of `datatype`: a single
+ * fi_compare_atomicmsg is posted on the window's ep_nocmpl endpoint with
+ * origin_addr as the operand, compare_addr as the comparand and result_addr
+ * as the fetch buffer.  A zero-size datatype or an MPI_PROC_NULL target is a
+ * no-op.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ */
+static inline int MPIDI_NM_compare_and_swap(const void *origin_addr,
+                                            const void *compare_addr,
+                                            void *result_addr,
+                                            MPI_Datatype datatype,
+                                            int target_rank, MPI_Aint target_disp, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    enum fi_op fi_op;
+    enum fi_datatype fi_dt;
+    MPIDI_OFI_win_datatype_t origin_dt, target_dt, result_dt;
+    size_t offset, max_size, dt_size;
+    void *buffer, *tbuffer, *rbuffer;
+    struct fi_ioc originv, resultv, comparev;
+    struct fi_rma_ioc targetv;
+    struct fi_msg_atomic msg;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_COMPARE_AND_SWAP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_COMPARE_AND_SWAP);
+
+    MPIDI_CH4U_EPOCH_CHECK_SYNC(win, mpi_errno, goto fn_fail);
+
+    MPIDI_OFI_win_datatype_basic(1, datatype, &origin_dt);
+    MPIDI_OFI_win_datatype_basic(1, datatype, &result_dt);
+    MPIDI_OFI_win_datatype_basic(1, datatype, &target_dt);
+
+    if ((origin_dt.size == 0) || (target_rank == MPI_PROC_NULL))
+        goto fn_exit;
+
+    /* Per-target window information is only looked up once the target is
+     * known not to be MPI_PROC_NULL, as in MPIDI_NM_put and MPIDI_NM_get. */
+    offset = target_disp * MPIDI_OFI_winfo_disp_unit(win, target_rank);
+
+    buffer = (char *) origin_addr + origin_dt.true_lb;
+    rbuffer = (char *) result_addr + result_dt.true_lb;
+    tbuffer = (char *) MPIDI_OFI_winfo_base(win, target_rank) + offset;
+
+    MPIDI_CH4U_EPOCH_START_CHECK(win, mpi_errno, goto fn_fail);
+
+    /* Translate the MPI datatype into the libfabric datatype/op pair. */
+    max_size = MPIDI_OFI_QUERY_COMPARE_ATOMIC_COUNT;
+    MPIDI_OFI_query_datatype(datatype, &fi_dt, MPI_OP_NULL, &fi_op, &max_size, &dt_size);
+
+    /* Each vector describes a single element. */
+    originv.addr = (void *) buffer;
+    originv.count = 1;
+    resultv.addr = (void *) rbuffer;
+    resultv.count = 1;
+    comparev.addr = (void *) compare_addr;
+    comparev.count = 1;
+    targetv.addr = (uint64_t) tbuffer;
+    targetv.count = 1;
+    targetv.key = MPIDI_OFI_winfo_mr_key(win, target_rank);
+
+    msg.msg_iov = &originv;
+    msg.desc = NULL;
+    msg.iov_count = 1;
+    msg.addr = MPIDI_OFI_comm_to_phys(win->comm_ptr, target_rank, MPIDI_OFI_API_CTR);
+    msg.rma_iov = &targetv;
+    msg.rma_iov_count = 1;
+    msg.datatype = fi_dt;
+    msg.op = fi_op;
+    msg.context = NULL;
+    msg.data = 0;
+    MPIDI_OFI_CALL_RETRY2(MPIDI_OFI_win_cntr_incr(win),
+                          fi_compare_atomicmsg(MPIDI_OFI_WIN(win).ep_nocmpl, &msg,
+                                               &comparev, NULL, 1, &resultv, NULL, 1, 0), atomicto);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_COMPARE_AND_SWAP);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_do_accumulate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_OFI_do_accumulate - accumulate implemented with libfabric atomics.
+ *
+ * Allocates a window request and, when the target's basic datatype and the
+ * operation can be expressed as a libfabric atomic, posts one fi_atomicmsg
+ * per batch of origin/target segments produced by MPIDI_OFI_merge_iov_list.
+ * Otherwise the request is completed and the operation is delegated to
+ * MPIDI_CH4U_accumulate (the active-message implementation).
+ *
+ * A zero-size origin or an MPI_PROC_NULL target completes the request and
+ * returns MPI_SUCCESS without communicating.
+ *
+ * sigreq is forwarded to the request allocator and to
+ * MPIDI_OFI_INIT_CHUNK_CONTEXT; MPIDI_NM_raccumulate passes a request slot.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ */
+static inline int MPIDI_OFI_do_accumulate(const void *origin_addr,
+                                          int origin_count,
+                                          MPI_Datatype origin_datatype,
+                                          int target_rank,
+                                          MPI_Aint target_disp,
+                                          int target_count,
+                                          MPI_Datatype target_datatype,
+                                          MPI_Op op, MPIR_Win * win, MPIR_Request ** sigreq)
+{
+    int rc, acccheck = 0, mpi_errno = MPI_SUCCESS;
+    uint64_t flags;
+    MPIDI_OFI_win_request_t *req;
+    size_t offset, max_size, dt_size, omax, tmax, tout, oout;
+    struct fid_ep *ep;
+    MPI_Datatype basic_type;
+    enum fi_op fi_op;
+    enum fi_datatype fi_dt;
+    struct fi_msg_atomic msg;
+    struct fi_ioc *originv;
+    struct fi_rma_ioc *targetv;
+    unsigned i;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_DO_ACCUMULATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_DO_ACCUMULATE);
+
+    MPIDI_CH4U_EPOCH_CHECK_SYNC(win, mpi_errno, goto fn_fail);
+
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_allocate_win_request_accumulate(win,
+                                                                     origin_count,
+                                                                     target_count,
+                                                                     target_rank,
+                                                                     origin_datatype,
+                                                                     target_datatype,
+                                                                     &req, &flags, &ep, sigreq));
+
+    if ((req->noncontig->origin_dt.size == 0) || (target_rank == MPI_PROC_NULL)) {
+        MPIDI_OFI_win_request_complete(req);
+
+        if (sigreq)
+            MPIDI_CH4U_request_release(*sigreq);
+
+        /* mpi_errno is still MPI_SUCCESS here; leave through fn_exit so the
+         * function-exit trace macro runs on this path too. */
+        goto fn_exit;
+    }
+
+    offset = target_disp * MPIDI_OFI_winfo_disp_unit(win, target_rank);
+
+    MPIDI_CH4U_EPOCH_START_CHECK(win, mpi_errno, goto fn_fail);
+
+    MPIDI_OFI_GET_BASIC_TYPE(target_datatype, basic_type);
+
+    /* Pair types are substituted by a single type of the same width;
+     * acccheck records that the substitution happened. */
+    switch (basic_type) {
+        /* 8 byte types */
+    case MPI_FLOAT_INT:
+    case MPI_2INT:
+    case MPI_LONG_INT:
+#ifdef HAVE_FORTRAN_BINDING
+    case MPI_2REAL:
+    case MPI_2INTEGER:
+#endif
+        {
+            basic_type = MPI_LONG_LONG;
+            acccheck = 1;
+            break;
+        }
+
+        /* 16-byte types */
+#ifdef HAVE_FORTRAN_BINDING
+
+    case MPI_2DOUBLE_PRECISION:
+#endif
+#ifdef MPICH_DEFINE_2COMPLEX
+    case MPI_2COMPLEX:
+#endif
+        {
+            basic_type = MPI_DOUBLE_COMPLEX;
+            acccheck = 1;
+            break;
+        }
+
+        /* Types with pads or too large to handle */
+    case MPI_DATATYPE_NULL:
+    case MPI_SHORT_INT:
+    case MPI_DOUBLE_INT:
+    case MPI_LONG_DOUBLE_INT:
+#ifdef MPICH_DEFINE_2COMPLEX
+    case MPI_2DOUBLE_COMPLEX:
+#endif
+        goto am_fallback;
+
+    default:
+        /* Any other basic type is used as is. */
+        break;
+    }
+
+    /* A substituted type is only accepted for MPI_REPLACE. */
+    if (acccheck && op != MPI_REPLACE)
+        goto am_fallback;
+
+    max_size = MPIDI_OFI_QUERY_ATOMIC_COUNT;
+
+    MPIDI_OFI_query_datatype(basic_type, &fi_dt, op, &fi_op, &max_size, &dt_size);
+    /* A zero count is treated as "no native atomic for this type/op". */
+    if (max_size == 0)
+        goto am_fallback;
+
+    req->event_id = MPIDI_OFI_EVENT_ABORT;
+    /* Push the request onto the head of the window's syncQ list. */
+    req->next = MPIDI_OFI_WIN(win).syncQ;
+    MPIDI_OFI_WIN(win).syncQ = req;
+    /* Element count -> bytes for the segment iterator. */
+    max_size = max_size * dt_size;
+
+    MPIDI_OFI_init_iovec_state(&req->noncontig->iovs,
+                               (uintptr_t) origin_addr,
+                               (uintptr_t) MPIDI_OFI_winfo_base(win, req->target_rank) + offset,
+                               req->noncontig->origin_dt.num_contig,
+                               req->noncontig->target_dt.num_contig,
+                               max_size,
+                               req->noncontig->origin_dt.map, req->noncontig->target_dt.map);
+
+    /* Fields of the message descriptor that are the same for every chunk. */
+    msg.desc = NULL;
+    msg.addr = MPIDI_OFI_comm_to_phys(win->comm_ptr, req->target_rank, MPIDI_OFI_API_CTR);
+    msg.context = NULL;
+    msg.data = 0;
+    msg.datatype = fi_dt;
+    msg.op = fi_op;
+    rc = MPIDI_OFI_IOV_EAGAIN;
+
+    while (rc == MPIDI_OFI_IOV_EAGAIN) {
+        originv = req->noncontig->buf.iov.accumulate.originv;
+        targetv = req->noncontig->buf.iov.accumulate.targetv;
+        omax = MPIDI_Global.iov_limit;
+        tmax = MPIDI_Global.rma_iov_limit;
+        /* The fi_ioc/fi_rma_ioc arrays are filled through the iovec-typed
+         * merge routine, hence the casts. */
+        rc = MPIDI_OFI_merge_iov_list(&req->noncontig->iovs, (struct iovec *) originv, omax,
+                                      (struct fi_rma_iov *) targetv, tmax, &oout, &tout);
+
+        if (rc == MPIDI_OFI_IOV_DONE)
+            break;
+
+        MPIR_Assert(rc != MPIDI_OFI_IOV_ERROR);
+
+        for (i = 0; i < tout; i++)
+            targetv[i].key = MPIDI_OFI_winfo_mr_key(win, target_rank);
+
+        /* Scale the merged lengths down by the element size to obtain the
+         * element counts carried by fi_ioc/fi_rma_ioc. */
+        for (i = 0; i < oout; i++)
+            originv[i].count /= dt_size;
+
+        for (i = 0; i < tout; i++)
+            targetv[i].count /= dt_size;
+
+        msg.msg_iov = originv;
+        msg.iov_count = oout;
+        msg.rma_iov = targetv;
+        msg.rma_iov_count = tout;
+        MPIDI_OFI_CALL_RETRY2(MPIDI_OFI_INIT_CHUNK_CONTEXT(win, sigreq),
+                              fi_atomicmsg(ep, &msg, flags), rdma_atomicto);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_DO_ACCUMULATE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+  am_fallback:
+    /* Fall back to active message */
+    /* NOTE(review): a non-NULL sigreq is not handed to MPIDI_CH4U_accumulate;
+     * confirm that the caller's request still completes on this path. */
+    MPIDI_OFI_win_request_complete(req);
+    mpi_errno = MPIDI_CH4U_accumulate(origin_addr, origin_count, origin_datatype,
+                                      target_rank, target_disp, target_count,
+                                      target_datatype, op, win);
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_do_get_accumulate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_OFI_do_get_accumulate - get-accumulate implemented with libfabric
+ * fetching atomics.
+ *
+ * Allocates a window request and, when the basic datatype and the operation
+ * can be expressed as a libfabric fetch-atomic, posts one fi_fetch_atomicmsg
+ * per batch of segments.  For MPI_NO_OP only the result and target buffers
+ * are walked and no origin segments are sent.  Otherwise the request is
+ * completed and the operation is delegated to MPIDI_CH4U_get_accumulate.
+ *
+ * A zero-size result or an MPI_PROC_NULL target completes the request and
+ * returns without communicating.
+ *
+ * sigreq is forwarded to the request allocator and to
+ * MPIDI_OFI_INIT_CHUNK_CONTEXT; MPIDI_NM_fetch_and_op passes NULL and
+ * MPIDI_NM_rget_accumulate passes a request slot.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ */
+static inline int MPIDI_OFI_do_get_accumulate(const void *origin_addr,
+                                              int origin_count,
+                                              MPI_Datatype origin_datatype,
+                                              void *result_addr,
+                                              int result_count,
+                                              MPI_Datatype result_datatype,
+                                              int target_rank,
+                                              MPI_Aint target_disp,
+                                              int target_count,
+                                              MPI_Datatype target_datatype,
+                                              MPI_Op op, MPIR_Win * win, MPIR_Request ** sigreq)
+{
+    int rc, acccheck = 0, mpi_errno = MPI_SUCCESS;
+    uint64_t flags;
+    MPIDI_OFI_win_request_t *req;
+    size_t offset, max_size, dt_size, omax, rmax, tmax, tout, rout, oout;
+    struct fid_ep *ep;
+    MPI_Datatype rt, basic_type, basic_type_res;
+    enum fi_op fi_op;
+    enum fi_datatype fi_dt;
+    struct fi_msg_atomic msg;
+    struct fi_ioc *originv, *resultv;
+    struct fi_rma_ioc *targetv;
+    unsigned i;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_GET_ACCUMULATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_GET_ACCUMULATE);
+
+    MPIDI_CH4U_EPOCH_CHECK_SYNC(win, mpi_errno, goto fn_fail);
+
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_allocate_win_request_get_accumulate(win,
+                                                                         origin_count,
+                                                                         target_count,
+                                                                         result_count,
+                                                                         target_rank,
+                                                                         op,
+                                                                         origin_datatype,
+                                                                         target_datatype,
+                                                                         result_datatype,
+                                                                         &req, &flags, &ep,
+                                                                         sigreq));
+
+    if ((req->noncontig->result_dt.size == 0) || (target_rank == MPI_PROC_NULL)) {
+        MPIDI_OFI_win_request_complete(req);
+
+        if (sigreq)
+            MPIDI_CH4U_request_release(*sigreq);
+
+        goto fn_exit;
+    }
+
+    offset = target_disp * MPIDI_OFI_winfo_disp_unit(win, target_rank);
+
+    MPIDI_CH4U_EPOCH_START_CHECK(win, mpi_errno, goto fn_fail);
+
+    MPIDI_OFI_GET_BASIC_TYPE(target_datatype, basic_type);
+    rt = result_datatype;
+
+    /* Pair types are substituted by a single type of the same width;
+     * acccheck records that the substitution happened. */
+    switch (basic_type) {
+        /* 8 byte types */
+    case MPI_FLOAT_INT:
+    case MPI_2INT:
+    case MPI_LONG_INT:
+#ifdef HAVE_FORTRAN_BINDING
+    case MPI_2REAL:
+    case MPI_2INTEGER:
+#endif
+        {
+            basic_type = rt = MPI_LONG_LONG;
+            acccheck = 1;
+            break;
+        }
+
+        /* 16-byte types */
+#ifdef HAVE_FORTRAN_BINDING
+
+    case MPI_2DOUBLE_PRECISION:
+#endif
+#ifdef MPICH_DEFINE_2COMPLEX
+    case MPI_2COMPLEX:
+#endif
+        {
+            basic_type = rt = MPI_DOUBLE_COMPLEX;
+            acccheck = 1;
+            break;
+        }
+
+        /* Types with pads or too large to handle */
+    case MPI_DATATYPE_NULL:
+    case MPI_SHORT_INT:
+    case MPI_DOUBLE_INT:
+    case MPI_LONG_DOUBLE_INT:
+#ifdef MPICH_DEFINE_2COMPLEX
+    case MPI_2DOUBLE_COMPLEX:
+#endif
+        goto am_fallback;
+
+    default:
+        /* Any other basic type is used as is. */
+        break;
+    }
+
+    /* A substituted type is only accepted for MPI_REPLACE and MPI_NO_OP. */
+    if (acccheck && op != MPI_REPLACE && op != MPI_NO_OP)
+        goto am_fallback;
+
+    MPIDI_OFI_GET_BASIC_TYPE(rt, basic_type_res);
+    MPIR_Assert(basic_type_res != MPI_DATATYPE_NULL);
+
+    max_size = MPIDI_OFI_QUERY_FETCH_ATOMIC_COUNT;
+    MPIDI_OFI_query_datatype(basic_type_res, &fi_dt, op, &fi_op, &max_size, &dt_size);
+    /* Element count -> bytes; zero is treated as "no native fetch-atomic". */
+    max_size = max_size * dt_size;
+    if (max_size == 0)
+        goto am_fallback;
+
+    req->event_id = MPIDI_OFI_EVENT_RMA_DONE;
+    /* Push the request onto the head of the window's syncQ list. */
+    req->next = MPIDI_OFI_WIN(win).syncQ;
+    MPIDI_OFI_WIN(win).syncQ = req;
+
+    /* With an operation, origin, result and target are walked together;
+     * for MPI_NO_OP only result and target are. */
+    if (op != MPI_NO_OP)
+        MPIDI_OFI_init_iovec_state2(&req->noncontig->iovs,
+                                    (uintptr_t) origin_addr,
+                                    (uintptr_t) result_addr,
+                                    (uintptr_t) MPIDI_OFI_winfo_base(win,
+                                                                     req->target_rank) + offset,
+                                    req->noncontig->origin_dt.num_contig,
+                                    req->noncontig->result_dt.num_contig,
+                                    req->noncontig->target_dt.num_contig, max_size,
+                                    req->noncontig->origin_dt.map, req->noncontig->result_dt.map,
+                                    req->noncontig->target_dt.map);
+    else
+        MPIDI_OFI_init_iovec_state(&req->noncontig->iovs,
+                                   (uintptr_t) result_addr,
+                                   (uintptr_t) MPIDI_OFI_winfo_base(win, req->target_rank) + offset,
+                                   req->noncontig->result_dt.num_contig,
+                                   req->noncontig->target_dt.num_contig,
+                                   max_size,
+                                   req->noncontig->result_dt.map, req->noncontig->target_dt.map);
+
+    /* Fields of the message descriptor that are the same for every chunk. */
+    msg.desc = NULL;
+    msg.addr = MPIDI_OFI_comm_to_phys(win->comm_ptr, req->target_rank, MPIDI_OFI_API_CTR);
+    msg.context = NULL;
+    msg.data = 0;
+    msg.datatype = fi_dt;
+    msg.op = fi_op;
+    rc = MPIDI_OFI_IOV_EAGAIN;
+
+    while (rc == MPIDI_OFI_IOV_EAGAIN) {
+        originv = req->noncontig->buf.iov.get_accumulate.originv;
+        targetv = req->noncontig->buf.iov.get_accumulate.targetv;
+        resultv = req->noncontig->buf.iov.get_accumulate.resultv;
+        omax = rmax = MPIDI_Global.iov_limit;
+        tmax = MPIDI_Global.rma_iov_limit;
+
+        if (op != MPI_NO_OP)
+            rc = MPIDI_OFI_merge_iov_list2(&req->noncontig->iovs, (struct iovec *) originv,
+                                           omax, (struct iovec *) resultv, rmax,
+                                           (struct fi_rma_iov *) targetv, tmax, &oout, &rout,
+                                           &tout);
+        else {
+            /* No origin data is sent for MPI_NO_OP. */
+            oout = 0;
+            rc = MPIDI_OFI_merge_iov_list(&req->noncontig->iovs, (struct iovec *) resultv,
+                                          rmax, (struct fi_rma_iov *) targetv, tmax, &rout, &tout);
+        }
+
+        if (rc == MPIDI_OFI_IOV_DONE)
+            break;
+
+        MPIR_Assert(rc != MPIDI_OFI_IOV_ERROR);
+
+        /* Scale the merged lengths down by the element size to obtain the
+         * element counts carried by fi_ioc/fi_rma_ioc. */
+        for (i = 0; i < oout; i++)
+            originv[i].count /= dt_size;
+
+        for (i = 0; i < rout; i++)
+            resultv[i].count /= dt_size;
+
+        for (i = 0; i < tout; i++) {
+            targetv[i].count /= dt_size;
+            targetv[i].key = MPIDI_OFI_winfo_mr_key(win, target_rank);
+        }
+
+        msg.msg_iov = originv;
+        msg.iov_count = oout;
+        msg.rma_iov = targetv;
+        msg.rma_iov_count = tout;
+        MPIDI_OFI_CALL_RETRY2(MPIDI_OFI_INIT_CHUNK_CONTEXT(win, sigreq),
+                              fi_fetch_atomicmsg(ep, &msg, resultv,
+                                                 NULL, rout, flags), rdma_readfrom);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_GET_ACCUMULATE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+  am_fallback:
+    /* Fall back to active message */
+    /* NOTE(review): a non-NULL sigreq is not handed to
+     * MPIDI_CH4U_get_accumulate; confirm that the caller's request still
+     * completes on this path. */
+    MPIDI_OFI_win_request_complete(req);
+    mpi_errno = MPIDI_CH4U_get_accumulate(origin_addr, origin_count, origin_datatype,
+                                          result_addr, result_count, result_datatype,
+                                          target_rank, target_disp, target_count,
+                                          target_datatype, op, win);
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_raccumulate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_raccumulate - netmod entry point for a request-based accumulate.
+ *
+ * Forwards all arguments to MPIDI_OFI_do_accumulate together with the address
+ * of a local request pointer, then stores that pointer in *request.
+ *
+ * On return *request holds the request, or NULL if MPIDI_OFI_do_accumulate
+ * failed before storing one.  Returns the error code of
+ * MPIDI_OFI_do_accumulate.
+ */
+static inline int MPIDI_NM_raccumulate(const void *origin_addr,
+                                       int origin_count,
+                                       MPI_Datatype origin_datatype,
+                                       int target_rank,
+                                       MPI_Aint target_disp,
+                                       int target_count,
+                                       MPI_Datatype target_datatype,
+                                       MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    /* MPIDI_OFI_do_accumulate can bail out at its epoch check before a
+     * request exists; start from NULL so *request is never indeterminate. */
+    MPIR_Request *rreq = NULL;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_RACCUMULATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_RACCUMULATE);
+
+    mpi_errno = MPIDI_OFI_do_accumulate((void *) origin_addr,
+                                        origin_count,
+                                        origin_datatype,
+                                        target_rank,
+                                        target_disp,
+                                        target_count,
+                                        target_datatype,
+                                        op,
+                                        win,
+                                        &rreq);
+    *request = rreq;
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_RACCUMULATE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_rget_accumulate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_rget_accumulate - netmod entry point for a request-based
+ * get-accumulate.
+ *
+ * Forwards all arguments to MPIDI_OFI_do_get_accumulate together with the
+ * address of a local request pointer, then stores that pointer in *request.
+ *
+ * On return *request holds the request, or NULL if
+ * MPIDI_OFI_do_get_accumulate failed before storing one.  Returns the error
+ * code of MPIDI_OFI_do_get_accumulate.
+ */
+static inline int MPIDI_NM_rget_accumulate(const void *origin_addr,
+                                           int origin_count,
+                                           MPI_Datatype origin_datatype,
+                                           void *result_addr,
+                                           int result_count,
+                                           MPI_Datatype result_datatype,
+                                           int target_rank,
+                                           MPI_Aint target_disp,
+                                           int target_count,
+                                           MPI_Datatype target_datatype,
+                                           MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    /* MPIDI_OFI_do_get_accumulate can bail out at its epoch check before a
+     * request exists; start from NULL so *request is never indeterminate. */
+    MPIR_Request *rreq = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_RGET_ACCUMULATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_RGET_ACCUMULATE);
+
+    mpi_errno = MPIDI_OFI_do_get_accumulate(origin_addr, origin_count, origin_datatype,
+                                            result_addr, result_count, result_datatype,
+                                            target_rank, target_disp, target_count,
+                                            target_datatype, op, win, &rreq);
+    *request = rreq;
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_RGET_ACCUMULATE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_fetch_and_op
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_fetch_and_op - netmod entry point for fetch-and-op.
+ *
+ * Expressed as a get-accumulate in which origin, result and target are each
+ * one element of `datatype`; no request is returned to the caller.
+ *
+ * Returns the error code of MPIDI_OFI_do_get_accumulate.
+ */
+static inline int MPIDI_NM_fetch_and_op(const void *origin_addr,
+                                        void *result_addr,
+                                        MPI_Datatype datatype,
+                                        int target_rank,
+                                        MPI_Aint target_disp, MPI_Op op, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_FETCH_AND_OP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_FETCH_AND_OP);
+
+    /* Possible optimization: issue the libfabric fetch-atomic directly and
+     * skip the datatype processing done by the full get-accumulate path. */
+    mpi_errno = MPIDI_OFI_do_get_accumulate(origin_addr, 1, datatype,
+                                            result_addr, 1, datatype,
+                                            target_rank, target_disp,
+                                            1, datatype, op, win, NULL);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_FETCH_AND_OP);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_rget
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_rget - request-based get for the OFI netmod.
+ *
+ * Three cases are handled, in this order:
+ *   1. zero origin bytes or target_rank == MPI_PROC_NULL: nothing is moved
+ *      and an already-completed RMA request is handed back;
+ *   2. the target is the calling rank of the window communicator: the data
+ *      is copied out of the local window with MPIR_Localcopy() and a
+ *      completed RMA request is handed back;
+ *   3. otherwise the transfer is delegated to MPIDI_OFI_do_get().
+ *
+ * On return *request holds the request for the operation.  The return value
+ * is MPI_SUCCESS for case 1, and the result of MPIR_Localcopy() or
+ * MPIDI_OFI_do_get() for cases 2 and 3.
+ */
+static inline int MPIDI_NM_rget(void *origin_addr,
+                                int origin_count,
+                                MPI_Datatype origin_datatype,
+                                int target_rank,
+                                MPI_Aint target_disp,
+                                int target_count,
+                                MPI_Datatype target_datatype,
+                                MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    size_t origin_bytes;
+    size_t local_offset;
+    MPIR_Request *rreq;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_RGET);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_RGET);
+
+    MPIDI_Datatype_check_size(origin_datatype, origin_count, origin_bytes);
+
+    if (unlikely((origin_bytes == 0) || (target_rank == MPI_PROC_NULL))) {
+        /* Nothing to transfer: return a request that is already complete. */
+        rreq = MPIR_Request_create(MPIR_REQUEST_KIND__RMA);
+        MPIR_Request_add_ref(rreq);
+        MPIDI_CH4U_request_complete(rreq);
+    }
+    else if (target_rank == win->comm_ptr->rank) {
+        /* Self-targeted get: copy directly from the local window memory. */
+        rreq = MPIR_Request_create(MPIR_REQUEST_KIND__RMA);
+        MPIR_Request_add_ref(rreq);
+        local_offset = win->disp_unit * target_disp;
+        mpi_errno = MPIR_Localcopy((char *) win->base + local_offset,
+                                   target_count, target_datatype,
+                                   origin_addr, origin_count, origin_datatype);
+        MPIDI_CH4U_request_complete(rreq);
+    }
+    else {
+        /* Remote target: let the OFI get path build and post the request. */
+        mpi_errno = MPIDI_OFI_do_get(origin_addr, origin_count, origin_datatype,
+                                     target_rank, target_disp,
+                                     target_count, target_datatype, win, &rreq);
+    }
+
+    *request = rreq;
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_RGET);
+    return mpi_errno;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_get_accumulate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_get_accumulate - OFI netmod entry point for get-accumulate.
+ *
+ * All arguments are forwarded unchanged to MPIDI_OFI_do_get_accumulate();
+ * NULL is passed as the trailing request argument.
+ *
+ * Returns the error code produced by MPIDI_OFI_do_get_accumulate().
+ */
+static inline int MPIDI_NM_get_accumulate(const void *origin_addr,
+                                          int origin_count,
+                                          MPI_Datatype origin_datatype,
+                                          void *result_addr,
+                                          int result_count,
+                                          MPI_Datatype result_datatype,
+                                          int target_rank,
+                                          MPI_Aint target_disp,
+                                          int target_count,
+                                          MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_GET_ACCUMULATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_GET_ACCUMULATE);
+    mpi_errno = MPIDI_OFI_do_get_accumulate(origin_addr, origin_count, origin_datatype,
+                                            result_addr, result_count, result_datatype,
+                                            target_rank, target_disp, target_count,
+                                            target_datatype, op, win, NULL);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_GET_ACCUMULATE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_accumulate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_accumulate - OFI netmod entry point for accumulate.
+ *
+ * All arguments are forwarded unchanged to MPIDI_OFI_do_accumulate(); NULL
+ * is passed as the trailing request argument.
+ *
+ * Returns the error code produced by MPIDI_OFI_do_accumulate().
+ */
+static inline int MPIDI_NM_accumulate(const void *origin_addr,
+                                      int origin_count,
+                                      MPI_Datatype origin_datatype,
+                                      int target_rank,
+                                      MPI_Aint target_disp,
+                                      int target_count,
+                                      MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_ACCUMULATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_ACCUMULATE);
+    mpi_errno = MPIDI_OFI_do_accumulate(origin_addr, origin_count, origin_datatype,
+                                        target_rank, target_disp, target_count,
+                                        target_datatype, op, win, NULL);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_ACCUMULATE);
+    return mpi_errno;
+}
+
+#endif /* NETMOD_OFI_RMA_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_send.h b/src/mpid/ch4/netmod/ofi/ofi_send.h
new file mode 100644
index 0000000..8cc943d
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_send.h
@@ -0,0 +1,566 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_SEND_H_INCLUDED
+#define NETMOD_OFI_SEND_H_INCLUDED
+
+#include "ofi_impl.h"
+#include <../mpi/pt2pt/bsendutil.h>
+
+/*
+ * Shared parameter list (MPIDI_OFI_SENDPARAMS) and matching argument list
+ * (MPIDI_OFI_SENDARGS) used by the send entry points in this header, so that
+ * a wrapper can declare the common signature and forward it verbatim.
+ * The two lists must be kept in the same order.
+ */
+#define MPIDI_OFI_SENDPARAMS const void *buf,int count,MPI_Datatype datatype, \
+    int rank,int tag,MPIR_Comm *comm,                               \
+    int context_offset,MPIR_Request **request
+
+#define MPIDI_OFI_SENDARGS buf,count,datatype,rank,tag, \
+                 comm,context_offset,request
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_send_lightweight
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_OFI_send_lightweight - send a contiguous buffer of data_sz bytes
+ * without creating a request object.
+ *
+ * Match bits are built from the communicator context id (plus
+ * context_offset), the local rank and the tag, and the buffer is handed to
+ * MPIDI_OFI_send_handler() in MPIDI_OFI_DO_INJECT mode with a NULL context.
+ *
+ * Returns MPI_SUCCESS or the error reported by MPIDI_OFI_send_handler().
+ */
+__ALWAYS_INLINE__ int MPIDI_OFI_send_lightweight(const void *buf,
+                                                 size_t data_sz,
+                                                 int rank,
+                                                 int tag, MPIR_Comm * comm, int context_offset)
+{
+    int mpi_errno = MPI_SUCCESS;
+    uint64_t match_bits;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_SEND_LIGHTWEIGHT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_SEND_LIGHTWEIGHT);
+    /* The type field of the match bits is 0: this is a plain (non-sync) send. */
+    match_bits =
+        MPIDI_OFI_init_sendtag(comm->context_id + context_offset, comm->rank, tag, 0,
+                               MPIDI_OFI_ENABLE_DATA);
+    mpi_errno =
+        MPIDI_OFI_send_handler(MPIDI_OFI_EP_TX_TAG(0), buf, data_sz, NULL, comm->rank,
+                               MPIDI_OFI_comm_to_phys(comm, rank, MPIDI_OFI_API_TAG), match_bits,
+                               NULL, MPIDI_OFI_DO_INJECT, MPIDI_OFI_ENABLE_DATA,
+                               MPIDI_OFI_CALL_LOCK);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_SEND_LIGHTWEIGHT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_send_lightweight_request
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_OFI_send_lightweight_request - same injected send as
+ * MPIDI_OFI_send_lightweight(), but a request obtained from
+ * MPIDI_OFI_SEND_REQUEST_CREATE_LW() is stored in *request before the send
+ * is issued.
+ *
+ * Returns MPI_SUCCESS or the error reported by MPIDI_OFI_send_handler().
+ * NOTE(review): *request is already set when the send fails; confirm that
+ * callers release it on error.
+ */
+__ALWAYS_INLINE__ int MPIDI_OFI_send_lightweight_request(const void *buf,
+                                                         size_t data_sz,
+                                                         int rank,
+                                                         int tag,
+                                                         MPIR_Comm * comm,
+                                                         int context_offset,
+                                                         MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    uint64_t match_bits;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_SEND_LIGHTWEIGHT_REQUEST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_SEND_LIGHTWEIGHT_REQUEST);
+    MPIR_Request *r;
+    MPIDI_OFI_SEND_REQUEST_CREATE_LW(r);
+    *request = r;
+    /* The type field of the match bits is 0: this is a plain (non-sync) send. */
+    match_bits =
+        MPIDI_OFI_init_sendtag(comm->context_id + context_offset, comm->rank, tag, 0,
+                               MPIDI_OFI_ENABLE_DATA);
+    /* The request is not passed as context: NULL is given to the handler. */
+    mpi_errno =
+        MPIDI_OFI_send_handler(MPIDI_OFI_EP_TX_TAG(0), buf, data_sz, NULL, comm->rank,
+                               MPIDI_OFI_comm_to_phys(comm, rank, MPIDI_OFI_API_TAG), match_bits,
+                               NULL, MPIDI_OFI_DO_INJECT, MPIDI_OFI_ENABLE_DATA,
+                               MPIDI_OFI_CALL_LOCK);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_SEND_LIGHTWEIGHT_REQUEST);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_send_normal
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_OFI_send_normal - general send path that always creates a request.
+ *
+ * Parameters (beyond MPIDI_OFI_SENDPARAMS):
+ *   dt_contig  - non-zero when the datatype is contiguous
+ *   data_sz    - number of bytes to send
+ *   dt_ptr     - datatype object pointer (not used in this function)
+ *   dt_true_lb - true lower bound added to buf for contiguous data
+ *   type       - protocol bits folded into the match bits;
+ *                MPIDI_OFI_SYNC_SEND additionally posts a receive for the ack
+ *
+ * Steps:
+ *   1. create the send request and store it in *request;
+ *   2. for synchronous sends, post a zero-byte tagged receive for the ack and
+ *      bump the request's completion counter;
+ *   3. pack non-contiguous data into a freshly allocated buffer;
+ *   4. pick the transport by size: inject (<= max_buffered_send), regular
+ *      tagged send (<= max_send), or the "huge" protocol, which registers the
+ *      buffer for remote read, sends the first max_send bytes and follows up
+ *      with a control message.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.  On any failure raised while the
+ * huge-send critical section is held, the FI mutex is released before
+ * returning.
+ */
+__ALWAYS_INLINE__ int MPIDI_OFI_send_normal(MPIDI_OFI_SENDPARAMS,
+                                            int dt_contig,
+                                            size_t data_sz,
+                                            MPIR_Datatype * dt_ptr,
+                                            MPI_Aint dt_true_lb, uint64_t type)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int fi_mutex_held = 0;      /* set while inside the huge-send critical section */
+    MPIR_Request *sreq = NULL;
+    MPI_Aint last;
+    char *send_buf;
+    uint64_t match_bits;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_SEND_NORMAL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_SEND_NORMAL);
+
+    MPIDI_OFI_REQUEST_CREATE(sreq, MPIR_REQUEST_KIND__SEND);
+    *request = sreq;
+    match_bits =
+        MPIDI_OFI_init_sendtag(comm->context_id + context_offset, comm->rank, tag, type,
+                               MPIDI_OFI_ENABLE_DATA);
+    MPIDI_OFI_REQUEST(sreq, event_id) = MPIDI_OFI_EVENT_SEND;
+    MPIDI_OFI_REQUEST(sreq, datatype) = datatype;
+    dtype_add_ref_if_not_builtin(datatype);
+
+    if (type == MPIDI_OFI_SYNC_SEND) {  /* Branch should compile out */
+        int c = 1;
+        uint64_t ssend_match, ssend_mask;
+        MPIDI_OFI_ssendack_request_t *ackreq;
+        MPIDI_OFI_SSEND_ACKREQUEST_CREATE(ackreq);
+        ackreq->event_id = MPIDI_OFI_EVENT_SSEND_ACK;
+        ackreq->signal_req = sreq;
+        /* The send request must also wait for the ack to arrive. */
+        MPIR_cc_incr(sreq->cc_ptr, &c);
+        ssend_match =
+            MPIDI_OFI_init_recvtag(&ssend_mask, comm->context_id + context_offset, rank, tag,
+                                   MPIDI_OFI_ENABLE_DATA);
+        ssend_match |= MPIDI_OFI_SYNC_SEND_ACK;
+        MPIDI_OFI_CALL_RETRY(fi_trecv(MPIDI_OFI_EP_RX_TAG(0),   /* endpoint    */
+                                      NULL,     /* recvbuf     */
+                                      0,        /* data sz     */
+                                      NULL,     /* memregion descr  */
+                                      MPIDI_OFI_comm_to_phys(comm, rank, MPIDI_OFI_API_TAG),    /* remote proc */
+                                      ssend_match,      /* match bits  */
+                                      0ULL,     /* mask bits   */
+                                      (void *) &(ackreq->context)), trecvsync, MPIDI_OFI_CALL_LOCK);
+    }
+
+    send_buf = (char *) buf + dt_true_lb;
+
+    if (!dt_contig) {
+        /* Pack into a single allocation that holds the segment state followed
+         * by the packed payload. */
+        size_t segment_first;
+        segment_first = 0;
+        last = data_sz;
+        MPIDI_OFI_REQUEST(sreq, noncontig) =
+            (MPIDI_OFI_noncontig_t *) MPL_malloc(data_sz + sizeof(MPID_Segment));
+        MPIR_ERR_CHKANDJUMP1(MPIDI_OFI_REQUEST(sreq, noncontig) == NULL, mpi_errno, MPI_ERR_OTHER,
+                             "**nomem", "**nomem %s", "Send Pack buffer alloc");
+        MPID_Segment_init(buf, count, datatype, &MPIDI_OFI_REQUEST(sreq, noncontig->segment), 0);
+        MPID_Segment_pack(&MPIDI_OFI_REQUEST(sreq, noncontig->segment), segment_first, &last,
+                          MPIDI_OFI_REQUEST(sreq, noncontig->pack_buffer));
+        send_buf = MPIDI_OFI_REQUEST(sreq, noncontig->pack_buffer);
+    }
+    else
+        MPIDI_OFI_REQUEST(sreq, noncontig) = NULL;
+
+    if (data_sz <= MPIDI_Global.max_buffered_send) {
+        /* Injected send: no completion event is requested, so the send event
+         * handler is invoked directly. */
+        mpi_errno =
+            MPIDI_OFI_send_handler(MPIDI_OFI_EP_TX_TAG(0), send_buf, data_sz, NULL, comm->rank,
+                                   MPIDI_OFI_comm_to_phys(comm, rank, MPIDI_OFI_API_TAG),
+                                   match_bits, NULL, MPIDI_OFI_DO_INJECT, MPIDI_OFI_ENABLE_DATA,
+                                   MPIDI_OFI_CALL_LOCK);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+        MPIDI_OFI_send_event(NULL, sreq);
+    }
+    else if (data_sz <= MPIDI_Global.max_send) {
+        mpi_errno =
+            MPIDI_OFI_send_handler(MPIDI_OFI_EP_TX_TAG(0), send_buf, data_sz, NULL, comm->rank,
+                                   MPIDI_OFI_comm_to_phys(comm, rank, MPIDI_OFI_API_TAG),
+                                   match_bits, (void *) &(MPIDI_OFI_REQUEST(sreq, context)),
+                                   MPIDI_OFI_DO_SEND, MPIDI_OFI_ENABLE_DATA, MPIDI_OFI_CALL_LOCK);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+    else if (unlikely(1)) {
+        /* Huge message: expose the buffer for remote read, send the first
+         * max_send bytes and describe the rest in a control message. */
+        MPIDI_OFI_send_control_t ctrl;
+        int c;
+        uint64_t rma_key = 0;
+        MPIDI_OFI_huge_counter_t *cntr;
+        void *ptr;
+        c = 1;
+        MPIDI_OFI_REQUEST(sreq, event_id) = MPIDI_OFI_EVENT_SEND_HUGE;
+        MPIR_cc_incr(sreq->cc_ptr, &c);
+        ptr = MPIDI_OFI_map_lookup(MPIDI_OFI_COMM(comm).huge_send_counters, rank);
+
+        MPID_THREAD_CS_ENTER(POBJ, MPIDI_OFI_THREAD_FI_MUTEX);
+        fi_mutex_held = 1;
+
+        if (ptr == MPIDI_OFI_MAP_NOT_FOUND) {
+            ptr = MPL_malloc(sizeof(MPIDI_OFI_huge_counter_t));
+            /* Do not dereference a failed allocation. */
+            MPIR_ERR_CHKANDJUMP1(ptr == NULL, mpi_errno, MPI_ERR_OTHER,
+                                 "**nomem", "**nomem %s", "Huge send counter alloc");
+            cntr = (MPIDI_OFI_huge_counter_t *) ptr;
+            cntr->outstanding = 0;
+            cntr->counter = 0;
+            MPIDI_OFI_map_set(MPIDI_OFI_COMM(comm).huge_send_counters, rank, ptr);
+        }
+
+        cntr = (MPIDI_OFI_huge_counter_t *) ptr;
+
+        ctrl.rma_key = MPIDI_OFI_index_allocator_alloc(MPIDI_OFI_COMM(comm).rma_id_allocator);
+        MPIR_Assert(ctrl.rma_key < MPIDI_Global.max_huge_rmas);
+        if (MPIDI_OFI_ENABLE_MR_SCALABLE)
+            rma_key = ctrl.rma_key << MPIDI_Global.huge_rma_shift;
+        MPIDI_OFI_CALL_NOLOCK(fi_mr_reg(MPIDI_Global.domain,    /* In:  Domain Object       */
+                                        send_buf,       /* In:  Lower memory address */
+                                        data_sz,        /* In:  Length              */
+                                        FI_REMOTE_READ, /* In:  Expose MR for read  */
+                                        0ULL,   /* In:  offset(not used)    */
+                                        rma_key,        /* In:  requested key       */
+                                        0ULL,   /* In:  flags               */
+                                        &cntr->mr,      /* Out: memregion object    */
+                                        NULL), mr_reg); /* In:  context             */
+
+        if (!MPIDI_OFI_ENABLE_MR_SCALABLE) {
+            /* MR_BASIC */
+            ctrl.rma_key = fi_mr_key(cntr->mr);
+        }
+
+        cntr->outstanding++;
+        cntr->counter++;
+        MPIR_Assert(cntr->outstanding != USHRT_MAX);
+        MPIR_Assert(cntr->counter != USHRT_MAX);
+        MPIDI_OFI_REQUEST(sreq, util_comm) = comm;
+        MPIDI_OFI_REQUEST(sreq, util_id) = rank;
+        mpi_errno = MPIDI_OFI_send_handler(MPIDI_OFI_EP_TX_TAG(0), send_buf,
+                                           MPIDI_Global.max_send,
+                                           NULL,
+                                           comm->rank,
+                                           MPIDI_OFI_comm_to_phys(comm, rank, MPIDI_OFI_API_TAG),
+                                           match_bits,
+                                           (void *) &(MPIDI_OFI_REQUEST(sreq, context)),
+                                           MPIDI_OFI_DO_SEND, MPIDI_OFI_ENABLE_DATA,
+                                           MPIDI_OFI_CALL_NO_LOCK);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+        ctrl.type = MPIDI_OFI_CTRL_HUGE;
+        ctrl.seqno = cntr->counter - 1;
+        MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_do_control_send
+                               (&ctrl, send_buf, data_sz, rank, comm, sreq, FALSE));
+        fi_mutex_held = 0;
+        MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_FI_MUTEX);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_SEND_NORMAL);
+    return mpi_errno;
+  fn_fail:
+    /* An error raised inside the huge-send critical section must not leave
+     * the FI mutex held. */
+    if (fi_mutex_held) {
+        fi_mutex_held = 0;
+        MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_FI_MUTEX);
+    }
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_send
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_OFI_send - common dispatcher behind the MPIDI_NM_*send entry points.
+ *
+ *   noreq    - non-zero when the caller does not need a request for a send
+ *              that qualifies for the lightweight path
+ *   syncflag - protocol bits; any non-zero value forces the normal path
+ *
+ * A send takes the lightweight (inject) path only when it is not
+ * synchronous, the datatype is contiguous and the payload fits in
+ * MPIDI_Global.max_buffered_send.  Everything else goes through
+ * MPIDI_OFI_send_normal().
+ *
+ * Returns the error code of whichever helper was called.
+ */
+__ALWAYS_INLINE__ int MPIDI_OFI_send(MPIDI_OFI_SENDPARAMS, int noreq, uint64_t syncflag)
+{
+    int dt_contig, mpi_errno, use_lightweight;
+    size_t data_sz;
+    MPI_Aint dt_true_lb;
+    MPIR_Datatype *dt_ptr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_NM_SEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_NM_SEND);
+
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
+
+    use_lightweight = !syncflag && dt_contig && (data_sz <= MPIDI_Global.max_buffered_send);
+
+    if (likely(use_lightweight)) {
+        if (noreq) {
+            /* No request is created on this path; *request is left untouched. */
+            mpi_errno = MPIDI_OFI_send_lightweight((char *) buf + dt_true_lb, data_sz,
+                                                   rank, tag, comm, context_offset);
+        }
+        else {
+            mpi_errno = MPIDI_OFI_send_lightweight_request((char *) buf + dt_true_lb, data_sz,
+                                                           rank, tag, comm, context_offset,
+                                                           request);
+        }
+    }
+    else {
+        mpi_errno = MPIDI_OFI_send_normal(buf, count, datatype, rank, tag, comm,
+                                          context_offset, request, dt_contig,
+                                          data_sz, dt_ptr, dt_true_lb, syncflag);
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_NM_SEND);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_persistent_send
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_OFI_persistent_send - build a persistent send request.
+ *
+ * A MPIR_REQUEST_KIND__PREQUEST_SEND request is created, stored in *request
+ * and filled with the send arguments (buffer, count, datatype, rank, tag).
+ * No data is sent here.  The caller is expected to set util.persist.type
+ * afterwards.
+ *
+ * References are taken on the communicator and, for non-builtin datatypes,
+ * on the datatype.
+ *
+ * Always returns MPI_SUCCESS.
+ */
+__ALWAYS_INLINE__ int MPIDI_OFI_persistent_send(MPIDI_OFI_SENDPARAMS)
+{
+    MPIR_Request *sreq;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_NM_PSEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_NM_PSEND);
+
+    MPIDI_OFI_REQUEST_CREATE(sreq, MPIR_REQUEST_KIND__PREQUEST_SEND);
+    *request = sreq;
+
+    MPIR_Comm_add_ref(comm);
+    sreq->comm = comm;
+    MPIDI_OFI_REQUEST(sreq, util.persist.buf) = (void *) buf;
+    MPIDI_OFI_REQUEST(sreq, util.persist.count) = count;
+    MPIDI_OFI_REQUEST(sreq, datatype) = datatype;
+    MPIDI_OFI_REQUEST(sreq, util.persist.rank) = rank;
+    MPIDI_OFI_REQUEST(sreq, util.persist.tag) = tag;
+    MPIDI_OFI_REQUEST(sreq, util_comm) = comm;
+    /* Stored as an absolute context id; STARTALL_CASE subtracts the
+     * communicator's context id again to recover context_offset. */
+    MPIDI_OFI_REQUEST(sreq, util_id) = comm->context_id + context_offset;
+    /* No operation has been started yet for this persistent request. */
+    sreq->u.persist.real_request = NULL;
+    MPIDI_CH4U_request_complete(sreq);
+
+    if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
+        MPIR_Datatype *dt_ptr;
+        MPID_Datatype_get_ptr(datatype, dt_ptr);
+        MPID_Datatype_add_ref(dt_ptr);
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_NM_PSEND);
+    return MPI_SUCCESS;
+}
+
+/*
+ * STARTALL_CASE - emit one `case CASELABEL:` of the switch in
+ * MPIDI_NM_startall().
+ *
+ * FUNC is called with the arguments saved in the persistent request `preq`;
+ * the context offset is recovered as (saved util_id - CONTEXTID).  The result
+ * is assigned to `rc` and the newly started request is written to
+ * preq->u.persist.real_request.  Both `preq` and `rc` must be in scope at
+ * the expansion site.
+ */
+#define STARTALL_CASE(CASELABEL,FUNC,CONTEXTID) \
+  case CASELABEL:                               \
+  {                                             \
+    rc = FUNC(MPIDI_OFI_REQUEST(preq,util.persist.buf),              \
+              MPIDI_OFI_REQUEST(preq,util.persist.count),            \
+              MPIDI_OFI_REQUEST(preq,datatype),           \
+              MPIDI_OFI_REQUEST(preq,util.persist.rank),             \
+              MPIDI_OFI_REQUEST(preq,util.persist.tag),              \
+              preq->comm,                       \
+              MPIDI_OFI_REQUEST(preq,util_id) -           \
+              CONTEXTID,                        \
+              &preq->u.persist.real_request);          \
+    break;                                      \
+  }
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_send
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_send - blocking standard-mode send entry point.
+ *
+ * Forwards to MPIDI_OFI_send() with noreq = 1 and no sync flag, so a send
+ * that qualifies for the lightweight path creates no request and leaves
+ * *request untouched.
+ */
+__ALWAYS_INLINE__ int MPIDI_NM_send(MPIDI_OFI_SENDPARAMS)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_SEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_SEND);
+    mpi_errno = MPIDI_OFI_send(MPIDI_OFI_SENDARGS, 1, 0ULL);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_SEND);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_rsend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_rsend - blocking ready-mode send entry point.
+ *
+ * Issued exactly like MPIDI_NM_send(): MPIDI_OFI_send() is called with
+ * noreq = 1 and no sync flag.
+ */
+__ALWAYS_INLINE__ int MPIDI_NM_rsend(MPIDI_OFI_SENDPARAMS)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_RSEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_RSEND);
+    mpi_errno = MPIDI_OFI_send(MPIDI_OFI_SENDARGS, 1, 0ULL);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_RSEND);
+    return mpi_errno;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_irsend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_irsend - nonblocking ready-mode send entry point.
+ *
+ * Forwards to MPIDI_OFI_send() with noreq = 0 (a request is always produced
+ * in *request) and no sync flag.
+ */
+__ALWAYS_INLINE__ int MPIDI_NM_irsend(MPIDI_OFI_SENDPARAMS)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_IRSEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_IRSEND);
+    mpi_errno = MPIDI_OFI_send(MPIDI_OFI_SENDARGS, 0, 0ULL);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_IRSEND);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ssend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_ssend - blocking synchronous-mode send entry point.
+ *
+ * Forwards to MPIDI_OFI_send() with noreq = 0 and the MPIDI_OFI_SYNC_SEND
+ * flag, which routes the send through MPIDI_OFI_send_normal().
+ */
+__ALWAYS_INLINE__ int MPIDI_NM_ssend(MPIDI_OFI_SENDPARAMS)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_SSEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_SSEND);
+    mpi_errno = MPIDI_OFI_send(MPIDI_OFI_SENDARGS, 0, MPIDI_OFI_SYNC_SEND);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_SSEND);
+    return mpi_errno;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_isend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_isend - nonblocking standard-mode send entry point.
+ *
+ * Forwards to MPIDI_OFI_send() with noreq = 0 (a request is always produced
+ * in *request) and no sync flag.
+ */
+__ALWAYS_INLINE__ int MPIDI_NM_isend(MPIDI_OFI_SENDPARAMS)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_ISEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_ISEND);
+    mpi_errno = MPIDI_OFI_send(MPIDI_OFI_SENDARGS, 0, 0ULL);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_ISEND);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_issend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_issend - nonblocking synchronous-mode send entry point.
+ *
+ * Forwards to MPIDI_OFI_send() with noreq = 0 and the MPIDI_OFI_SYNC_SEND
+ * flag, which routes the send through MPIDI_OFI_send_normal().
+ */
+__ALWAYS_INLINE__ int MPIDI_NM_issend(MPIDI_OFI_SENDPARAMS)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_ISSEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_ISSEND);
+    mpi_errno = MPIDI_OFI_send(MPIDI_OFI_SENDARGS, 0, MPIDI_OFI_SYNC_SEND);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_ISSEND);
+    return mpi_errno;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_startall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_startall - start every persistent request in requests[0..count-1].
+ *
+ * For each request the saved operation type selects the nonblocking call
+ * that is issued with the saved arguments; the started request is recorded
+ * in u.persist.real_request.  Afterwards the persistent request's completion
+ * counter pointer is aimed at the real request (or, for buffered sends and
+ * failures, at its own counter, which is set to 0).
+ *
+ * Returns the status of the LAST request processed: `rc` is overwritten on
+ * every iteration, so an earlier failure is only visible through that
+ * request's status.MPI_ERROR.
+ */
+__ALWAYS_INLINE__ int MPIDI_NM_startall(int count, MPIR_Request * requests[])
+{
+    int rc = MPI_SUCCESS, i;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_STARTALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_STARTALL);
+
+    for (i = 0; i < count; i++) {
+        MPIR_Request *const preq = requests[i];
+
+        switch (MPIDI_OFI_REQUEST(preq, util.persist.type)) {
+            /* With shared memory built in, receives and standard sends call
+             * the netmod functions directly; otherwise the device-level
+             * functions are used. */
+#ifdef MPIDI_BUILD_CH4_SHM
+            STARTALL_CASE(MPIDI_PTYPE_RECV, MPIDI_NM_irecv, preq->comm->recvcontext_id);
+#else
+            STARTALL_CASE(MPIDI_PTYPE_RECV, MPIDI_Irecv, preq->comm->recvcontext_id);
+#endif
+
+#ifdef MPIDI_BUILD_CH4_SHM
+            STARTALL_CASE(MPIDI_PTYPE_SEND, MPIDI_NM_isend, preq->comm->context_id);
+#else
+            STARTALL_CASE(MPIDI_PTYPE_SEND, MPIDI_Isend, preq->comm->context_id);
+#endif
+            /* NOTE(review): synchronous sends use MPIDI_Issend in both build
+             * configurations, unlike the two cases above - confirm this is
+             * intended. */
+            STARTALL_CASE(MPIDI_PTYPE_SSEND, MPIDI_Issend, preq->comm->context_id);
+
+        case MPIDI_PTYPE_BSEND:{
+                /* Buffered sends go through the MPIR layer, which hands back
+                 * a handle that is converted to a request pointer. */
+                MPI_Request sreq_handle;
+                rc = MPIR_Ibsend_impl(MPIDI_OFI_REQUEST(preq, util.persist.buf),
+                                      MPIDI_OFI_REQUEST(preq, util.persist.count),
+                                      MPIDI_OFI_REQUEST(preq, datatype),
+                                      MPIDI_OFI_REQUEST(preq, util.persist.rank),
+                                      MPIDI_OFI_REQUEST(preq, util.persist.tag),
+                                      preq->comm, &sreq_handle);
+
+                if (rc == MPI_SUCCESS)
+                    MPIR_Request_get_ptr(sreq_handle, preq->u.persist.real_request);
+
+                break;
+            }
+
+        default:
+            /* Unknown persistent operation type. */
+            rc = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, __FUNCTION__,
+                                      __LINE__, MPI_ERR_INTERN, "**ch3|badreqtype",
+                                      "**ch3|badreqtype %d", MPIDI_OFI_REQUEST(preq,
+                                                                               util.persist.type));
+        }
+
+        if (rc == MPI_SUCCESS) {
+            preq->status.MPI_ERROR = MPI_SUCCESS;
+
+            if (MPIDI_OFI_REQUEST(preq, util.persist.type) == MPIDI_PTYPE_BSEND) {
+                /* Buffered send: the persistent request is marked complete
+                 * right away via its own counter. */
+                preq->cc_ptr = &preq->cc;
+                MPIR_cc_set(&preq->cc, 0);
+            }
+            else
+                /* Completion of the persistent request tracks the real one.
+                 * NOTE(review): assumes real_request is non-NULL whenever the
+                 * start call succeeds - confirm for every FUNC used above. */
+                preq->cc_ptr = &preq->u.persist.real_request->cc;
+        }
+        else {
+            /* Start failed: record the error in the persistent request and
+             * mark it complete with no real request attached. */
+            preq->u.persist.real_request = NULL;
+            preq->status.MPI_ERROR = rc;
+            preq->cc_ptr = &preq->cc;
+            MPIR_cc_set(&preq->cc, 0);
+        }
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_STARTALL);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_send_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_send_init - create a persistent standard-mode send request.
+ *
+ * Builds the request with MPIDI_OFI_persistent_send() and tags it as
+ * MPIDI_PTYPE_SEND for MPIDI_NM_startall().
+ */
+__ALWAYS_INLINE__ int MPIDI_NM_send_init(MPIDI_OFI_SENDPARAMS)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_SEND_INIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_SEND_INIT);
+    mpi_errno = MPIDI_OFI_persistent_send(MPIDI_OFI_SENDARGS);
+    MPIDI_OFI_REQUEST((*request), util.persist.type) = MPIDI_PTYPE_SEND;
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_SEND_INIT);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ssend_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_ssend_init - create a persistent synchronous-mode send request.
+ *
+ * Builds the request with MPIDI_OFI_persistent_send() and tags it as
+ * MPIDI_PTYPE_SSEND for MPIDI_NM_startall().
+ */
+__ALWAYS_INLINE__ int MPIDI_NM_ssend_init(MPIDI_OFI_SENDPARAMS)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_SSEND_INIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_SSEND_INIT);
+    mpi_errno = MPIDI_OFI_persistent_send(MPIDI_OFI_SENDARGS);
+    MPIDI_OFI_REQUEST((*request), util.persist.type) = MPIDI_PTYPE_SSEND;
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_SSEND_INIT);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_bsend_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_bsend_init - create a persistent buffered-mode send request.
+ *
+ * Builds the request with MPIDI_OFI_persistent_send() and tags it as
+ * MPIDI_PTYPE_BSEND for MPIDI_NM_startall().
+ */
+__ALWAYS_INLINE__ int MPIDI_NM_bsend_init(MPIDI_OFI_SENDPARAMS)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_BSEND_INIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_BSEND_INIT);
+    mpi_errno = MPIDI_OFI_persistent_send(MPIDI_OFI_SENDARGS);
+    MPIDI_OFI_REQUEST((*request), util.persist.type) = MPIDI_PTYPE_BSEND;
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_BSEND_INIT);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_rsend_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_rsend_init - create a persistent ready-mode send request.
+ *
+ * Builds the request with MPIDI_OFI_persistent_send() and tags it as
+ * MPIDI_PTYPE_SEND, i.e. it is started exactly like a standard-mode
+ * persistent send.
+ */
+__ALWAYS_INLINE__ int MPIDI_NM_rsend_init(MPIDI_OFI_SENDPARAMS)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_RSEND_INIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_RSEND_INIT);
+    mpi_errno = MPIDI_OFI_persistent_send(MPIDI_OFI_SENDARGS);
+    MPIDI_OFI_REQUEST((*request), util.persist.type) = MPIDI_PTYPE_SEND;
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_RSEND_INIT);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_cancel_send
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_cancel_send - cancel hook for send requests.
+ *
+ * This netmod does not cancel sends: the function leaves sreq untouched and
+ * always returns MPI_SUCCESS.
+ */
+__ALWAYS_INLINE__ int MPIDI_NM_cancel_send(MPIR_Request * sreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_CANCEL_SEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_CANCEL_SEND);
+    /* Sends cannot be cancelled */
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_CANCEL_SEND);
+    return mpi_errno;
+}
+
+#endif /* NETMOD_OFI_SEND_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_spawn.h b/src/mpid/ch4/netmod/ofi/ofi_spawn.h
new file mode 100644
index 0000000..9b109d8
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_spawn.h
@@ -0,0 +1,561 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_DYNPROC_H_INCLUDED
+#define NETMOD_OFI_DYNPROC_H_INCLUDED
+
+#include "ofi_impl.h"
+
+#define MPIDI_OFI_PORT_NAME_TAG_KEY "tag"
+#define MPIDI_OFI_CONNENTRY_TAG_KEY "connentry"
+
+/* FIXME: MPIDI_OFI_TABLE_INDEX_INCR() is defined as an empty macro in both branches below. */
+#ifdef MPIDI_OFI_CONFIG_USE_AV_TABLE
+#define MPIDI_OFI_TABLE_INDEX_INCR()
+#else
+#define MPIDI_OFI_TABLE_INDEX_INCR()
+#endif
+
+/*
+ * Release a port-name tag handed out by MPIDI_OFI_get_port_name_tag() by
+ * clearing its bit in MPIDI_Global.port_name_tag_mask.
+ *
+ * Bits are numbered from the most significant bit of each mask word: tag t
+ * lives in word (t / bits_per_word) at bit (bits_per_word - 1 - t % bits_per_word).
+ *
+ * Tags outside [0, MPIR_MAX_CONTEXT_MASK * bits_per_word) are ignored; this
+ * covers the -1 that MPIDI_OFI_get_port_name_tag() reports on failure and
+ * keeps the mask array from being indexed out of bounds.
+ */
+static inline void MPIDI_OFI_free_port_name_tag(int tag)
+{
+    const int bits_per_word = (int) (8 * sizeof(int));
+    int index, rem_tag;
+
+    if (tag < 0 || tag >= (int) (MPIR_MAX_CONTEXT_MASK * bits_per_word))
+        return;
+
+    index = tag / bits_per_word;
+    rem_tag = tag % bits_per_word;
+
+    /* Shift an unsigned 1: (1 << 31) on a signed int is undefined behavior. */
+    MPIDI_Global.port_name_tag_mask[index] &= ~(1U << (bits_per_word - 1 - rem_tag));
+}
+
+
+/*
+ * Reserve the lowest free port-name tag.
+ *
+ * Scans MPIDI_Global.port_name_tag_mask (MPIR_MAX_CONTEXT_MASK words) for the
+ * first word that is not all ones, then claims the first clear bit in that
+ * word, counting from the most significant bit.
+ *
+ * On success *port_name_tag receives the tag and MPI_SUCCESS is returned.
+ * When every tag is taken *port_name_tag is set to -1 and MPI_ERR_OTHER is
+ * returned.
+ */
+static inline int MPIDI_OFI_get_port_name_tag(int *port_name_tag)
+{
+    const unsigned bits_per_word = (unsigned) (8 * sizeof(int));
+    unsigned i, j;
+
+    for (i = 0; i < MPIR_MAX_CONTEXT_MASK; i++) {
+        if (MPIDI_Global.port_name_tag_mask[i] == ~0)
+            continue;           /* word is full */
+
+        for (j = 0; j < bits_per_word; j++) {
+            /* Unsigned shift: (1 << 31) on a signed int is undefined behavior. */
+            const unsigned bit = 1U << (bits_per_word - j - 1);
+
+            if (((unsigned) MPIDI_Global.port_name_tag_mask[i] & bit) == 0) {
+                MPIDI_Global.port_name_tag_mask[i] |= bit;
+                *port_name_tag = (int) ((i * bits_per_word) + j);
+                return MPI_SUCCESS;
+            }
+        }
+    }
+
+    *port_name_tag = -1;
+    return MPI_ERR_OTHER;
+}
+
+/*
+ * Extract the integer stored under MPIDI_OFI_PORT_NAME_TAG_KEY in port_name.
+ *
+ * An empty port_name is accepted: MPI_SUCCESS is returned and *port_name_tag
+ * is left untouched.  If the key cannot be parsed an MPI_ERR_OTHER class
+ * error ("**argstr_no_port_name_tag") is returned.
+ */
+static inline int MPIDI_OFI_get_tag_from_port(const char *port_name, int *port_name_tag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int str_errno = MPL_STR_SUCCESS;
+
+    if (port_name[0] != '\0') {
+        str_errno = MPL_str_get_int_arg(port_name, MPIDI_OFI_PORT_NAME_TAG_KEY, port_name_tag);
+        MPIR_ERR_CHKANDJUMP(str_errno, mpi_errno, MPI_ERR_OTHER, "**argstr_no_port_name_tag");
+    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+/*
+ * Decode the binary endpoint name stored under MPIDI_OFI_CONNENTRY_TAG_KEY in
+ * port_name into connname.
+ *
+ * At most MPIDI_Global.addrnamelen bytes are written to connname.  Returns
+ * MPI_SUCCESS on success and MPI_ERR_OTHER when the key is missing or cannot
+ * be decoded (previously the decoder's status was discarded and the caller
+ * went on to use whatever happened to be in connname).
+ */
+static inline int MPIDI_OFI_get_conn_name_from_port(const char *port_name, char *connname)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int str_errno;
+    int outlen = MPIDI_KVSAPPSTRLEN;    /* receives the decoded length */
+
+    str_errno = MPL_str_get_binary_arg(port_name,
+                                       MPIDI_OFI_CONNENTRY_TAG_KEY,
+                                       connname, MPIDI_Global.addrnamelen, &outlen);
+    if (str_errno != MPL_STR_SUCCESS)
+        mpi_errno = MPI_ERR_OTHER;
+
+    return mpi_errno;
+}
+
+/*
+ * Build the intercommunicator that results from a connect/accept.
+ *
+ *   port_name    - port string; its tag becomes the dynamic-process context id
+ *   addr_table   - packed endpoint names of the remote group (entries items)
+ *   node_table   - node ids of the remote group (entries items)
+ *   entries      - size of the remote group
+ *   comm_ptr     - local communicator; becomes the local group
+ *   newcomm      - receives the duplicated intercommunicator
+ *   is_low_group - stored in the temporary communicator's is_low_group field
+ *   api          - unused in this function
+ *
+ * A temporary intercommunicator is populated by hand (the local rank map is
+ * copied from comm_ptr, the remote map is a direct map into a freshly created
+ * address-vector table), committed, duplicated into *newcomm and released.
+ *
+ * Ownership: this function always frees addr_table and node_table before
+ * returning, on success and on failure alike, so callers must not free them.
+ */
+static inline int MPIDI_OFI_dynproc_create_intercomm(const char *port_name,
+                                                     char *addr_table,
+                                                     MPID_Node_id_t * node_table,
+                                                     int entries,
+                                                     MPIR_Comm * comm_ptr,
+                                                     MPIR_Comm ** newcomm,
+                                                     int is_low_group, char *api)
+{
+    int i, context_id_offset, mpi_errno = MPI_SUCCESS;
+    MPIR_Comm *tmp_comm_ptr = NULL;
+#ifndef MPIDI_OFI_CONFIG_USE_AV_TABLE
+    /* Declared up front and NULL-initialized so fn_exit can free it safely. */
+    fi_addr_t *mapped_table = NULL;
+#endif
+
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_get_tag_from_port(port_name, &context_id_offset));
+    MPIDI_OFI_MPI_CALL_POP(MPIR_Comm_create(&tmp_comm_ptr));
+
+    tmp_comm_ptr->context_id = MPIR_CONTEXT_SET_FIELD(DYNAMIC_PROC, context_id_offset, 1);
+    tmp_comm_ptr->recvcontext_id = tmp_comm_ptr->context_id;
+    tmp_comm_ptr->remote_size = entries;
+    tmp_comm_ptr->local_size = comm_ptr->local_size;
+    tmp_comm_ptr->rank = comm_ptr->rank;
+    tmp_comm_ptr->comm_kind = MPIR_COMM_KIND__INTERCOMM;
+    tmp_comm_ptr->local_comm = comm_ptr;
+    tmp_comm_ptr->is_low_group = is_low_group;
+
+    /* The local group's rank map is a copy of comm_ptr's map. */
+    MPIDII_COMM(tmp_comm_ptr, local_map).mode = MPIDII_COMM(comm_ptr, map).mode;
+    MPIDII_COMM(tmp_comm_ptr, local_map).size = MPIDII_COMM(comm_ptr, map).size;
+    MPIDII_COMM(tmp_comm_ptr, local_map).avtid = MPIDII_COMM(comm_ptr, map).avtid;
+    switch (MPIDII_COMM(comm_ptr, map).mode) {
+    case MPIDII_RANK_MAP_DIRECT:
+    case MPIDII_RANK_MAP_DIRECT_INTRA:
+        break;
+    case MPIDII_RANK_MAP_OFFSET:
+    case MPIDII_RANK_MAP_OFFSET_INTRA:
+        MPIDII_COMM(tmp_comm_ptr, local_map).reg.offset = MPIDII_COMM(comm_ptr, map).reg.offset;
+        break;
+    case MPIDII_RANK_MAP_STRIDE:
+    case MPIDII_RANK_MAP_STRIDE_INTRA:
+    case MPIDII_RANK_MAP_STRIDE_BLOCK:
+    case MPIDII_RANK_MAP_STRIDE_BLOCK_INTRA:
+        MPIDII_COMM(tmp_comm_ptr, local_map).reg.stride.stride =
+            MPIDII_COMM(comm_ptr, map).reg.stride.stride;
+        MPIDII_COMM(tmp_comm_ptr, local_map).reg.stride.blocksize =
+            MPIDII_COMM(comm_ptr, map).reg.stride.blocksize;
+        MPIDII_COMM(tmp_comm_ptr, local_map).reg.stride.offset =
+            MPIDII_COMM(comm_ptr, map).reg.stride.offset;
+        break;
+    case MPIDII_RANK_MAP_LUT:
+    case MPIDII_RANK_MAP_LUT_INTRA:
+        /* The lookup table is shared with comm_ptr, hence the extra reference. */
+        MPIDII_COMM(tmp_comm_ptr, local_map).irreg.lut.t = MPIDII_COMM(comm_ptr, map).irreg.lut.t;
+        MPIDII_COMM(tmp_comm_ptr, local_map).irreg.lut.lpid =
+            MPIDII_COMM(comm_ptr, map).irreg.lut.lpid;
+        MPIR_Object_add_ref(MPIDII_COMM(comm_ptr, map).irreg.lut.t);
+        break;
+    case MPIDII_RANK_MAP_MLUT:
+        MPIDII_COMM(tmp_comm_ptr, local_map).irreg.mlut.t = MPIDII_COMM(comm_ptr, map).irreg.mlut.t;
+        MPIDII_COMM(tmp_comm_ptr, local_map).irreg.mlut.gpid =
+            MPIDII_COMM(comm_ptr, map).irreg.mlut.gpid;
+        MPIR_Object_add_ref(MPIDII_COMM(comm_ptr, map).irreg.mlut.t);
+        break;
+    case MPIDII_RANK_MAP_NONE:
+        MPIR_Assert(0);
+        break;
+    }
+
+    int avtid;
+    avtid = 0;
+    MPIDIU_new_avt(entries, &avtid);
+
+#ifdef MPIDI_OFI_CONFIG_USE_AV_TABLE
+    MPIDI_OFI_CALL(fi_av_insert(MPIDI_Global.av, addr_table, entries, NULL, 0ULL, NULL), avmap);
+#else
+    mapped_table = (fi_addr_t *) MPL_malloc(entries * sizeof(fi_addr_t));
+    if (mapped_table == NULL && entries > 0) {
+        mpi_errno = MPI_ERR_NO_MEM;
+        goto fn_fail;
+    }
+
+    MPIDI_OFI_CALL(fi_av_insert(MPIDI_Global.av, addr_table, entries,
+                                mapped_table, 0ULL, NULL), avmap);
+    for (i = 0; i < entries; i++) {
+        MPIDI_OFI_AV(&MPIDIU_get_av(avtid, i)).dest = mapped_table[i];
+    }
+#endif
+
+    MPIDIU_update_node_map(avtid, entries, node_table);
+
+    /* set mapping for remote group */
+    MPIDII_COMM(tmp_comm_ptr, map).mode = MPIDII_RANK_MAP_DIRECT;
+    MPIDII_COMM(tmp_comm_ptr, map).size = entries;
+    MPIDII_COMM(tmp_comm_ptr, map).avtid = avtid;
+
+    MPIR_Comm_commit(tmp_comm_ptr);
+
+    MPIDI_OFI_MPI_CALL_POP(MPIR_Comm_dup_impl(tmp_comm_ptr, newcomm));
+
+    tmp_comm_ptr->local_comm = NULL;    /* avoid freeing local comm with comm_release */
+    MPIR_Comm_release(tmp_comm_ptr);
+
+  fn_exit:
+    /* Release the scratch and input tables on every path; previously they
+     * were leaked whenever one of the calls above failed. */
+#ifndef MPIDI_OFI_CONFIG_USE_AV_TABLE
+    if (mapped_table)
+        MPL_free(mapped_table);
+#endif
+    if (addr_table)
+        MPL_free(addr_table);
+    if (node_table)
+        MPL_free(node_table);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/*
+ * Broadcast the remote group's description from root to all of comm_ptr.
+ *
+ * Four broadcasts are issued in order: *out_root, *out_table_size, the
+ * address table (*out_table_size bytes) and the node-id table
+ * (entries * sizeof(MPID_Node_id_t) bytes, where
+ * entries = *out_table_size / MPIDI_Global.addrnamelen).
+ *
+ * Ranks that enter with *out_addr_table / *out_node_table equal to NULL get
+ * freshly allocated buffers of the right size; ownership passes to the caller.
+ */
+static inline int MPIDI_OFI_dynproc_bcast(int root,
+                                          MPIR_Comm * comm_ptr,
+                                          int *out_root,
+                                          ssize_t * out_table_size,
+                                          char **out_addr_table, MPID_Node_id_t ** out_node_table)
+{
+    int entries, mpi_errno = MPI_SUCCESS;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    /* Broadcast through a long long so the buffer really matches
+     * MPI_LONG_LONG_INT even where ssize_t is narrower than 64 bits. */
+    long long table_size = (long long) *out_table_size;
+
+    MPIDI_OFI_MPI_CALL_POP(MPIR_Bcast_intra(out_root, 1, MPI_INT, root, comm_ptr, &errflag));
+    MPIDI_OFI_MPI_CALL_POP(MPIR_Bcast_intra(&table_size, 1, MPI_LONG_LONG_INT,
+                                            root, comm_ptr, &errflag));
+    *out_table_size = (ssize_t) table_size;
+
+    if (*out_addr_table == NULL)
+        *out_addr_table = (char *) MPL_malloc(*out_table_size);
+
+    MPIDI_OFI_MPI_CALL_POP(MPIR_Bcast_intra(*out_addr_table, *out_table_size,
+                                            MPI_CHAR, root, comm_ptr, &errflag));
+
+    entries = *out_table_size / MPIDI_Global.addrnamelen;
+
+    /* Size the node table by its element type; it was previously sized with
+     * addrnamelen, which is unrelated to sizeof(MPID_Node_id_t). */
+    if (*out_node_table == NULL)
+        *out_node_table = (MPID_Node_id_t *) MPL_malloc(entries * sizeof(MPID_Node_id_t));
+
+    MPIDI_OFI_MPI_CALL_POP(MPIR_Bcast_intra(*out_node_table, entries * sizeof(MPID_Node_id_t),
+                                            MPI_CHAR, root, comm_ptr, &errflag));
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+/*
+ * Exchange address and node-id tables with the peer root of a connect/accept.
+ *
+ *   phase == 0  Receive.  Peeks (FI_PEEK) for a message matching port_id with
+ *               the MPIDI_OFI_DYNPROC_SEND protocol bit to learn the table
+ *               size, allocates *out_addr_table / *out_node_table, receives
+ *               both tables, stores the probed tag in *out_root, and copies
+ *               the sender's endpoint name (entry req[0].source of the
+ *               address table) into conname.
+ *   phase == 1  Send.  Builds this communicator's address table (via
+ *               fi_av_lookup) and node-id table and sends both to *conn.
+ *
+ * Returns MPI_SUCCESS or the first error encountered.
+ *
+ * NOTE(review): on error paths taken after an operation has been posted the
+ * temporary tables are deliberately not freed here, since the provider may
+ * still reference them.
+ */
+static inline int MPIDI_OFI_dynproc_exchange_map(int root,
+                                                 int phase,
+                                                 int port_id,
+                                                 fi_addr_t * conn,
+                                                 char *conname,
+                                                 MPIR_Comm * comm_ptr,
+                                                 ssize_t * out_table_size,
+                                                 int *out_root,
+                                                 char **out_addr_table,
+                                                 MPID_Node_id_t ** out_node_table)
+{
+    int i, mpi_errno = MPI_SUCCESS;
+
+    MPIDI_OFI_dynamic_process_request_t req[2];
+    uint64_t match_bits = 0;
+    uint64_t mask_bits = 0;
+    struct fi_msg_tagged msg;
+    req[0].done = MPIDI_OFI_PEEK_START;
+    req[0].event_id = MPIDI_OFI_EVENT_ACCEPT_PROBE;
+    req[1].done = MPIDI_OFI_PEEK_START;
+    req[1].event_id = MPIDI_OFI_EVENT_ACCEPT_PROBE;
+    match_bits = MPIDI_OFI_init_recvtag(&mask_bits, port_id,
+                                        MPI_ANY_SOURCE, MPI_ANY_TAG, MPIDI_OFI_ENABLE_DATA);
+    match_bits |= MPIDI_OFI_DYNPROC_SEND;
+
+    if (phase == 0) {
+        /* Receive the addresses                           */
+        /* We don't know the size, so probe for table size */
+        /* Receive phase updates the connection            */
+        /* With the probed address                         */
+        msg.msg_iov = NULL;
+        msg.desc = NULL;
+        msg.iov_count = 0;
+        msg.addr = FI_ADDR_UNSPEC;
+        msg.tag = match_bits;
+        msg.ignore = mask_bits;
+        msg.context = (void *) &req[0].context;
+        msg.data = 0;
+
+        /* Re-issue the peek until a matching message is found. */
+        while (req[0].done != MPIDI_OFI_PEEK_FOUND) {
+            req[0].done = MPIDI_OFI_PEEK_START;
+            MPIDI_OFI_CALL(fi_trecvmsg
+                           (MPIDI_OFI_EP_RX_TAG(0), &msg,
+                            FI_PEEK | FI_COMPLETION | MPIDI_OFI_ENABLE_DATA), trecv);
+            MPIDI_OFI_PROGRESS_WHILE(req[0].done == MPIDI_OFI_PEEK_START);
+        }
+
+        *out_table_size = req[0].msglen;
+        *out_root = req[0].tag;
+        *out_addr_table = (char *) MPL_malloc(*out_table_size);
+
+        int entries = req[0].msglen / MPIDI_Global.addrnamelen;
+        *out_node_table = (MPID_Node_id_t *) MPL_malloc(entries * sizeof(MPID_Node_id_t));
+
+        req[0].done = 0;
+        req[0].event_id = MPIDI_OFI_EVENT_DYNPROC_DONE;
+        req[1].done = 0;
+        req[1].event_id = MPIDI_OFI_EVENT_DYNPROC_DONE;
+
+        MPIDI_OFI_CALL_RETRY(fi_trecv(MPIDI_OFI_EP_RX_TAG(0),
+                                      *out_addr_table,
+                                      *out_table_size,
+                                      NULL,
+                                      FI_ADDR_UNSPEC,
+                                      match_bits,
+                                      mask_bits, &req[0].context), trecv, MPIDI_OFI_CALL_LOCK);
+        MPIDI_OFI_CALL_RETRY(fi_trecv(MPIDI_OFI_EP_RX_TAG(0),
+                                      *out_node_table,
+                                      entries * sizeof(MPID_Node_id_t),
+                                      NULL,
+                                      FI_ADDR_UNSPEC,
+                                      match_bits,
+                                      mask_bits, &req[1].context), trecv, MPIDI_OFI_CALL_LOCK);
+
+        MPIDI_OFI_PROGRESS_WHILE(!req[0].done || !req[1].done);
+        memcpy(conname, *out_addr_table + req[0].source * MPIDI_Global.addrnamelen,
+               MPIDI_Global.addrnamelen);
+    }
+
+    if (phase == 1) {
+        /* Send our table to the child */
+        /* Send phase maps the entry   */
+        char *my_addr_table;
+        int tag = root;
+        int tblsz = MPIDI_Global.addrnamelen * comm_ptr->local_size;
+        my_addr_table = (char *) MPL_malloc(tblsz);
+
+        MPID_Node_id_t *my_node_table;
+        /* Byte count of the node table.  It must be a size type: storing it
+         * in MPID_Node_id_t (a node-id type) can truncate it. */
+        size_t nodetblsz = sizeof(*my_node_table) * comm_ptr->local_size;
+        my_node_table = (MPID_Node_id_t *) MPL_malloc(nodetblsz);
+
+        match_bits = MPIDI_OFI_init_sendtag(port_id,
+                                            comm_ptr->rank,
+                                            tag, MPIDI_OFI_DYNPROC_SEND, MPIDI_OFI_ENABLE_DATA);
+
+        for (i = 0; i < comm_ptr->local_size; i++) {
+            size_t sz = MPIDI_Global.addrnamelen;
+            MPIDI_OFI_CALL(fi_av_lookup(MPIDI_Global.av,
+                                        MPIDI_OFI_COMM_TO_PHYS(comm_ptr, i),
+                                        my_addr_table + i * MPIDI_Global.addrnamelen,
+                                        &sz), avlookup);
+            MPIR_Assert(sz == MPIDI_Global.addrnamelen);
+        }
+
+        for (i = 0; i < comm_ptr->local_size; i++)
+            MPIDI_CH4U_get_node_id(comm_ptr, i, &my_node_table[i]);
+
+        /* fi_av_map here is not quite right for some providers */
+        /* we need to get this connection from the sockname     */
+        req[0].done = 0;
+        req[0].event_id = MPIDI_OFI_EVENT_DYNPROC_DONE;
+        req[1].done = 0;
+        req[1].event_id = MPIDI_OFI_EVENT_DYNPROC_DONE;
+        mpi_errno = MPIDI_OFI_send_handler(MPIDI_OFI_EP_TX_TAG(0),
+                                           my_addr_table,
+                                           tblsz,
+                                           NULL,
+                                           comm_ptr->rank,
+                                           *conn,
+                                           match_bits,
+                                           (void *) &req[0].context,
+                                           MPIDI_OFI_DO_SEND, MPIDI_OFI_ENABLE_DATA,
+                                           MPIDI_OFI_CALL_LOCK);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+        /* Capture the status of the second send as well; it used to be
+         * discarded, which made the check below dead code. */
+        mpi_errno = MPIDI_OFI_send_handler(MPIDI_OFI_EP_TX_TAG(0),
+                                           my_node_table,
+                                           nodetblsz,
+                                           NULL,
+                                           comm_ptr->rank,
+                                           *conn,
+                                           match_bits,
+                                           (void *) &req[1].context,
+                                           MPIDI_OFI_DO_SEND, MPIDI_OFI_ENABLE_DATA,
+                                           MPIDI_OFI_CALL_LOCK);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+
+        MPIDI_OFI_PROGRESS_WHILE(!req[0].done || !req[1].done);
+
+        MPL_free(my_addr_table);
+        MPL_free(my_node_table);
+    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_comm_connect
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Connect side of a dynamic-process connection.
+ *
+ * On the root rank: decode the peer endpoint name from port_name, insert it
+ * into the address vector, send this group's tables (exchange phase 1),
+ * receive the peer group's tables (exchange phase 0), then remove the
+ * temporary address-vector entry.  All ranks then take part in the broadcast
+ * of the peer tables and build the intercommunicator with is_low_group = 0.
+ *
+ * The whole operation runs inside the MPIDI_OFI_THREAD_SPAWN_MUTEX critical
+ * section, which is left in fn_exit.  The info argument is not used.
+ *
+ * NOTE(review): on the error paths the temporary AV entry and any tables
+ * already allocated are not released here.
+ */
+static inline int MPIDI_NM_comm_connect(const char *port_name,
+                                        MPIR_Info * info,
+                                        int root, MPIR_Comm * comm_ptr, MPIR_Comm ** newcomm)
+{
+    int entries, mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_COMM_CONNECT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_COMM_CONNECT);
+
+    char *parent_addr_table = NULL;
+    MPID_Node_id_t *parent_node_table = NULL;
+    ssize_t parent_table_sz = -1LL;
+    int parent_root = -1;
+    int rank = comm_ptr->rank;
+    int port_id;
+
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_OFI_THREAD_SPAWN_MUTEX);
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_get_tag_from_port(port_name, &port_id));
+
+    if (rank == root) {
+        fi_addr_t conn;
+        char conname[FI_NAME_MAX];
+        MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_get_conn_name_from_port(port_name, conname));
+        MPIDI_OFI_CALL(fi_av_insert(MPIDI_Global.av, conname, 1, &conn, 0ULL, NULL), avmap);
+        /* Connect sends first (phase 1) and then receives (phase 0). */
+        MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_dynproc_exchange_map
+                               (root, 1, port_id, &conn, conname, comm_ptr, &parent_table_sz,
+                                &parent_root, &parent_addr_table, &parent_node_table));
+        MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_dynproc_exchange_map
+                               (root, 0, port_id, &conn, conname, comm_ptr, &parent_table_sz,
+                                &parent_root, &parent_addr_table, &parent_node_table));
+        MPIDI_OFI_CALL(fi_av_remove(MPIDI_Global.av, &conn, 1, 0ULL), avmap);
+    }
+
+    /* Map the new address table */
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_dynproc_bcast(root, comm_ptr, &parent_root,
+                                                   &parent_table_sz,
+                                                   &parent_addr_table, &parent_node_table));
+
+    /* Now Create the New Intercomm */
+    entries = parent_table_sz / MPIDI_Global.addrnamelen;
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_dynproc_create_intercomm(port_name,
+                                                              parent_addr_table,
+                                                              parent_node_table,
+                                                              entries,
+                                                              comm_ptr,
+                                                              newcomm, 0, (char *) "Connect"));
+  fn_exit:
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_SPAWN_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_COMM_CONNECT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_comm_disconnect
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Disconnect a communicator: run a barrier over comm_ptr and then free it.
+ * Returns the first error reported by either step; the communicator is not
+ * freed if the barrier fails.
+ */
+static inline int MPIDI_NM_comm_disconnect(MPIR_Comm * comm_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_COMM_DISCONNECT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_COMM_DISCONNECT);
+
+    MPIDI_OFI_MPI_CALL_POP(MPIR_Barrier_impl(comm_ptr, &errflag));
+    MPIDI_OFI_MPI_CALL_POP(MPIR_Comm_free_impl(comm_ptr));
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_COMM_DISCONNECT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_open_port
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Open a port: reserve a port-name tag and write a port string into
+ * port_name (capacity MPI_MAX_PORT_NAME) holding
+ *   MPIDI_OFI_PORT_NAME_TAG_KEY  = the reserved tag
+ *   MPIDI_OFI_CONNENTRY_TAG_KEY  = this process's endpoint name
+ *                                  (MPIDI_Global.addrname, binary-encoded)
+ *
+ * info_ptr is not used.  If the string cannot be built, the reserved tag is
+ * handed back so it is not lost.
+ *
+ * NOTE(review): the logging state is still named ..._COMM_OPEN_PORT; only
+ * FUNCNAME was corrected to match the function.
+ */
+static inline int MPIDI_NM_open_port(MPIR_Info * info_ptr, char *port_name)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int str_errno = MPL_STR_SUCCESS;
+    int port_name_tag = 0;
+    int tag_reserved = 0;
+    int len = MPI_MAX_PORT_NAME;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_COMM_OPEN_PORT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_COMM_OPEN_PORT);
+
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_get_port_name_tag(&port_name_tag));
+    tag_reserved = 1;
+    MPIDI_OFI_STR_CALL(MPL_str_add_int_arg(&port_name, &len, MPIDI_OFI_PORT_NAME_TAG_KEY,
+                                           port_name_tag), port_str);
+    MPIDI_OFI_STR_CALL(MPL_str_add_binary_arg(&port_name, &len, MPIDI_OFI_CONNENTRY_TAG_KEY,
+                                              MPIDI_Global.addrname,
+                                              MPIDI_Global.addrnamelen), port_str);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_COMM_OPEN_PORT);
+    return mpi_errno;
+  fn_fail:
+    /* Do not leave the tag reserved when no usable port string was produced. */
+    if (tag_reserved)
+        MPIDI_OFI_free_port_name_tag(port_name_tag);
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_close_port
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Close a port: parse the tag out of port_name and release it.
+ *
+ * The tag is released only when it was actually parsed.  For an empty
+ * port_name MPIDI_OFI_get_tag_from_port() succeeds without writing the tag,
+ * and on a parse error the tag is likewise undefined; in both cases nothing
+ * is released (previously an uninitialized value was used to index the tag
+ * mask).  Returns the status of the parse.
+ */
+static inline int MPIDI_NM_close_port(const char *port_name)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int port_name_tag = -1;     /* sentinel: no tag parsed */
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_COMM_CLOSE_PORT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_COMM_CLOSE_PORT);
+
+    mpi_errno = MPIDI_OFI_get_tag_from_port(port_name, &port_name_tag);
+    if (mpi_errno == MPI_SUCCESS && port_name_tag >= 0)
+        MPIDI_OFI_free_port_name_tag(port_name_tag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_COMM_CLOSE_PORT);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_comm_accept
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Accept side of a dynamic-process connection.
+ *
+ * On the root rank: receive the connecting group's tables (exchange phase 0,
+ * which also fills conname with the sender's endpoint name), insert that
+ * endpoint into the address vector, send this group's tables back (exchange
+ * phase 1), then remove the temporary address-vector entry.  All ranks then
+ * take part in the broadcast of the peer tables and build the
+ * intercommunicator with is_low_group = 1.
+ *
+ * The whole operation runs inside the MPIDI_OFI_THREAD_SPAWN_MUTEX critical
+ * section, which is left in fn_exit.  The info argument is not used.
+ *
+ * NOTE(review): FUNCNAME used to be MPIDI_NM_comm_close_port (copy-paste), so
+ * errors were attributed to the wrong function.  The logging state below is
+ * still ..._COMM_CLOSE_PORT; rename it once a dedicated accept state exists.
+ */
+static inline int MPIDI_NM_comm_accept(const char *port_name,
+                                       MPIR_Info * info,
+                                       int root, MPIR_Comm * comm_ptr, MPIR_Comm ** newcomm)
+{
+    int entries, mpi_errno = MPI_SUCCESS;
+    char *child_addr_table = NULL;
+    MPID_Node_id_t *child_node_table = NULL;
+    ssize_t child_table_sz = -1LL;
+    int child_root = -1;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_COMM_CLOSE_PORT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_COMM_CLOSE_PORT);
+
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_OFI_THREAD_SPAWN_MUTEX);
+    int rank = comm_ptr->rank;
+
+    if (rank == root) {
+        fi_addr_t conn;
+        char conname[FI_NAME_MAX];
+        int port_id;
+        MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_get_tag_from_port(port_name, &port_id));
+        /* Accept receives first (phase 0) and then sends (phase 1). */
+        MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_dynproc_exchange_map
+                               (root, 0, port_id, &conn, conname, comm_ptr, &child_table_sz,
+                                &child_root, &child_addr_table, &child_node_table));
+        MPIDI_OFI_CALL(fi_av_insert(MPIDI_Global.av, conname, 1, &conn, 0ULL, NULL), avmap);
+        MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_dynproc_exchange_map
+                               (root, 1, port_id, &conn, conname, comm_ptr, &child_table_sz,
+                                &child_root, &child_addr_table, &child_node_table));
+        MPIDI_OFI_CALL(fi_av_remove(MPIDI_Global.av, &conn, 1, 0ULL), avmap);
+    }
+
+    /* Map the new address table */
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_dynproc_bcast(root, comm_ptr, &child_root,
+                                                   &child_table_sz,
+                                                   &child_addr_table, &child_node_table));
+    /* Now Create the New Intercomm */
+    entries = child_table_sz / MPIDI_Global.addrnamelen;
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_OFI_dynproc_create_intercomm(port_name,
+                                                              child_addr_table,
+                                                              child_node_table,
+                                                              entries,
+                                                              comm_ptr,
+                                                              newcomm, 1, (char *) "Accept"));
+  fn_exit:
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_SPAWN_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_COMM_CLOSE_PORT);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#endif /* NETMOD_OFI_DYNPROC_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h
new file mode 100644
index 0000000..a501d7e
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_types.h
@@ -0,0 +1,545 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_TYPES_H_INCLUDED
+#define NETMOD_OFI_TYPES_H_INCLUDED
+
+#include <netdb.h>
+#include <stddef.h>
+#include <inttypes.h>
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#include "ofi_pre.h"
+#include "ch4_types.h"
+#include "mpidch4r.h"
+#include "fi_list.h"
+
+/* Basename of the current source file: the text after the last '/' in
+ * __FILE__, or __FILE__ itself when it contains no '/'. */
+#define __SHORT_FILE__                          \
+    (strrchr(__FILE__,'/')                      \
+     ? strrchr(__FILE__,'/')+1                  \
+     : __FILE__                                 \
+)
+/* General limits and sizes for the OFI netmod. */
+#define MPIDI_OFI_MAP_NOT_FOUND            ((void*)(-1UL))
+#define MPIDI_OFI_MAJOR_VERSION            1
+#define MPIDI_OFI_MINOR_VERSION            0
+#define MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE  (16 * 1024)
+#define MPIDI_OFI_NUM_AM_BUFFERS           (8)
+#define MPIDI_OFI_AM_BUFF_SZ               (1 * 1024 * 1024)
+#define MPIDI_OFI_CACHELINE_SIZE           (64)
+#define MPIDI_OFI_IOV_MAX                  (32)
+#define MPIDI_OFI_BUF_POOL_SIZE            (1024)
+#define MPIDI_OFI_BUF_POOL_NUM             (1024)
+#define MPIDI_OFI_NUM_CQ_BUFFERED          (1024)
+/* Active-message handler ids: of the 24 slots, the top two (23 and 22) are
+ * the internal CONTROL and NEXT handlers, and MPIDI_OFI_MAX_AM_HANDLERS
+ * evaluates to 21. */
+#define MPIDI_OFI_MAX_AM_HANDLERS_TOTAL    (24)
+#define MPIDI_OFI_INTERNAL_HANDLER_CONTROL (MPIDI_OFI_MAX_AM_HANDLERS_TOTAL-1)
+#define MPIDI_OFI_INTERNAL_HANDLER_NEXT    (MPIDI_OFI_MAX_AM_HANDLERS_TOTAL-2)
+#define MPIDI_OFI_MAX_AM_HANDLERS          (MPIDI_OFI_INTERNAL_HANDLER_NEXT-1)
+
+/* Compile-time feature switches.  Each block turns a configuration macro
+ * into an MPIDI_OFI_ENABLE_* constant. */
+
+/* Only ENABLE_TAGGED depends on USE_OFI_TAGGED; AM and RMA are 1 either way. */
+#ifdef USE_OFI_TAGGED
+#define MPIDI_OFI_ENABLE_TAGGED          1
+#define MPIDI_OFI_ENABLE_AM              1
+#define MPIDI_OFI_ENABLE_RMA             1
+#else
+#define MPIDI_OFI_ENABLE_TAGGED          0
+#define MPIDI_OFI_ENABLE_AM              1
+#define MPIDI_OFI_ENABLE_RMA             1
+#endif
+
+#ifdef MPIDI_OFI_CONFIG_USE_SCALABLE_ENDPOINTS
+#define MPIDI_OFI_ENABLE_SCALABLE_ENDPOINTS 1
+#else
+#define MPIDI_OFI_ENABLE_SCALABLE_ENDPOINTS 0
+#endif
+
+#ifdef MPIDI_OFI_CONFIG_USE_AV_TABLE
+#define MPIDI_OFI_ENABLE_AV_TABLE 1
+#else
+#define MPIDI_OFI_ENABLE_AV_TABLE 0
+#endif
+
+/* Unlike the other switches this one is a flag value, not a boolean: it is
+ * either FI_REMOTE_CQ_DATA or 0. */
+#ifdef USE_OFI_IMMEDIATE_DATA
+#define MPIDI_OFI_ENABLE_DATA FI_REMOTE_CQ_DATA
+#else
+#define MPIDI_OFI_ENABLE_DATA 0
+#endif
+
+#ifdef USE_OFI_STX_RMA
+#define MPIDI_OFI_ENABLE_STX_RMA 1
+#else
+#define MPIDI_OFI_ENABLE_STX_RMA 0
+#endif
+
+#ifdef USE_OFI_MR_SCALABLE
+#define MPIDI_OFI_ENABLE_MR_SCALABLE 1
+#else
+#define MPIDI_OFI_ENABLE_MR_SCALABLE 0
+#endif
+
+/* Layout of the 64-bit match bits used for tagged messages.  Two layouts
+ * exist, selected by USE_OFI_IMMEDIATE_DATA.  The mask values are
+ * authoritative; the ASCII diagrams are only a sketch.
+ * NOTE(review): presumably the *_SHIFT values are field widths (TAG_SHIFT
+ * equals the number of tag bits in both layouts) -- confirm against the
+ * code that builds the match bits. */
+#ifdef USE_OFI_IMMEDIATE_DATA
+/* match/ignore bit manipulation
+ *
+ * 0123 4567 01234567 01234567 01234567 01234567 01234567 01234567 01234567
+ *     |             |                 |
+ * ^   |   Unused    |    context id   |           message tag
+ * |   |             |                 |
+ * +---- protocol
+ *
+ * Per the masks below: tag = bits 0..30 (31 bits), context id = bits 31..46
+ * (16 bits), so the diagram above is shifted by one bit.  SOURCE_MASK is
+ * zero: no match bits are reserved for the source rank in this layout.
+ */
+#define MPIDI_OFI_PROTOCOL_MASK (0x9000000000000000ULL)
+#define MPIDI_OFI_CONTEXT_MASK  (0x00007FFF80000000ULL)
+#define MPIDI_OFI_SOURCE_MASK   (0x0000000000000000ULL)
+#define MPIDI_OFI_TAG_MASK      (0x000000007FFFFFFFULL)
+#define MPIDI_OFI_SYNC_SEND     (0x1000000000000000ULL)
+#define MPIDI_OFI_SYNC_SEND_ACK (0x2000000000000000ULL)
+#define MPIDI_OFI_DYNPROC_SEND  (0x4000000000000000ULL)
+#define MPIDI_OFI_TAG_SHIFT     (31)
+#define MPIDI_OFI_SOURCE_SHIFT  (0)
+#else
+/* match/ignore bit manipulation
+ *
+ * 0123 4567 01234567 0123 4567 01234567 0123 4567 01234567 01234567 01234567
+ *     |                  |                  |
+ * ^   |    context id    |       source     |       message tag
+ * |   |                  |                  |
+ * +---- protocol
+ *
+ * Per the masks below: tag = bits 0..27 (28 bits), source = bits 28..43
+ * (16 bits), context id = bits 44..59 (16 bits).
+ */
+#define MPIDI_OFI_PROTOCOL_MASK (0x9000000000000000ULL)
+#define MPIDI_OFI_CONTEXT_MASK  (0x0FFFF00000000000ULL)
+#define MPIDI_OFI_SOURCE_MASK   (0x00000FFFF0000000ULL)
+#define MPIDI_OFI_TAG_MASK      (0x000000000FFFFFFFULL)
+#define MPIDI_OFI_SYNC_SEND     (0x1000000000000000ULL)
+#define MPIDI_OFI_SYNC_SEND_ACK (0x2000000000000000ULL)
+#define MPIDI_OFI_DYNPROC_SEND  (0x4000000000000000ULL)
+#define MPIDI_OFI_TAG_SHIFT     (28)
+#define MPIDI_OFI_SOURCE_SHIFT  (16)
+#endif
+
+/* RMA Key Space division
+ *    |                  |                  |                    |
+ *    ...     Context ID |   Huge RMA       |  Window Instance   |
+ *    |                  |                  |                    |
+ *
+ * For every key width below, the window instance occupies the low
+ * MAX_WINDOWS_BITS bits, the huge-RMA id starts at HUGE_RMA_SHIFT
+ * (= MAX_WINDOWS_BITS), and the context id starts at CONTEXT_SHIFT
+ * (= MAX_WINDOWS_BITS + MAX_HUGE_RMA_BITS).
+ */
+/* 64-bit key space                         */
+/* 2M  window instances per comm           */
+/* 2M  outstanding huge RMAS per comm      */
+/* 4M  communicators                       */
+#define MPIDI_OFI_MAX_WINDOWS_BITS_64  (21)
+#define MPIDI_OFI_MAX_HUGE_RMA_BITS_64 (21)
+#define MPIDI_OFI_MAX_HUGE_RMAS_64     (1<<(MPIDI_OFI_MAX_HUGE_RMA_BITS_64))
+#define MPIDI_OFI_MAX_WINDOWS_64       (1<<(MPIDI_OFI_MAX_WINDOWS_BITS_64))
+#define MPIDI_OFI_HUGE_RMA_SHIFT_64    (MPIDI_OFI_MAX_WINDOWS_BITS_64)
+#define MPIDI_OFI_CONTEXT_SHIFT_64     (MPIDI_OFI_MAX_WINDOWS_BITS_64+MPIDI_OFI_MAX_HUGE_RMA_BITS_64)
+
+/* 32-bit key space                         */
+/* 4096 window instances per comm           */
+/* 256  outstanding huge RMAS per comm      */
+/* 4096 communicators                       */
+#define MPIDI_OFI_MAX_WINDOWS_BITS_32  (12)
+#define MPIDI_OFI_MAX_HUGE_RMA_BITS_32 (8)
+#define MPIDI_OFI_MAX_HUGE_RMAS_32     (1<<(MPIDI_OFI_MAX_HUGE_RMA_BITS_32))
+#define MPIDI_OFI_MAX_WINDOWS_32       (1<<(MPIDI_OFI_MAX_WINDOWS_BITS_32))
+#define MPIDI_OFI_HUGE_RMA_SHIFT_32    (MPIDI_OFI_MAX_WINDOWS_BITS_32)
+#define MPIDI_OFI_CONTEXT_SHIFT_32     (MPIDI_OFI_MAX_WINDOWS_BITS_32+MPIDI_OFI_MAX_HUGE_RMA_BITS_32)
+
+/* 16-bit key space                         */
+/* 64 window instances per comm             */
+/* 16 outstanding huge RMAS per comm        */
+/* 64 communicators                         */
+#define MPIDI_OFI_MAX_WINDOWS_BITS_16  (6)
+#define MPIDI_OFI_MAX_HUGE_RMA_BITS_16 (4)
+#define MPIDI_OFI_MAX_HUGE_RMAS_16     (1<<(MPIDI_OFI_MAX_HUGE_RMA_BITS_16))
+#define MPIDI_OFI_MAX_WINDOWS_16       (1<<(MPIDI_OFI_MAX_WINDOWS_BITS_16))
+#define MPIDI_OFI_HUGE_RMA_SHIFT_16    (MPIDI_OFI_MAX_WINDOWS_BITS_16)
+#define MPIDI_OFI_CONTEXT_SHIFT_16     (MPIDI_OFI_MAX_WINDOWS_BITS_16+MPIDI_OFI_MAX_HUGE_RMA_BITS_16)
+
+/* Table dimensions: MPIDI_OFI_DT_SIZES depends on whether Fortran bindings
+ * (and the 2COMPLEX types) are built; MPIDI_OFI_OP_SIZES is fixed.
+ * NOTE(review): presumably these count the predefined datatypes and
+ * reduction ops -- confirm where the tables are declared. */
+#ifdef HAVE_FORTRAN_BINDING
+#ifdef MPICH_DEFINE_2COMPLEX
+#define MPIDI_OFI_DT_SIZES 62
+#else
+#define MPIDI_OFI_DT_SIZES 60
+#endif
+#else
+#define MPIDI_OFI_DT_SIZES 40
+#endif
+#define MPIDI_OFI_OP_SIZES 15
+
+/* Identifiers for the four API classes (tagged, RMA, message, counter). */
+#define MPIDI_OFI_API_TAG 0
+#define MPIDI_OFI_API_RMA 1
+#define MPIDI_OFI_API_MSG 2
+#define MPIDI_OFI_API_CTR 3
+
+/* Named slots of the MPIDI_Global.mutexes[] array. */
+#define MPIDI_OFI_THREAD_UTIL_MUTEX     MPIDI_Global.mutexes[0].m
+#define MPIDI_OFI_THREAD_PROGRESS_MUTEX MPIDI_Global.mutexes[1].m
+#define MPIDI_OFI_THREAD_FI_MUTEX       MPIDI_Global.mutexes[2].m
+#define MPIDI_OFI_THREAD_SPAWN_MUTEX    MPIDI_Global.mutexes[3].m
+
+/* Field accessor macros */
+#define MPIDI_OFI_GPID(gpid)               ((gpid)->dev.netmod.ofi)
+#define MPIDI_OFI_OBJECT_HEADER_SIZE       offsetof(MPIDI_OFI_offset_checker_t,  pad)
+#define MPIDI_OFI_AMREQUEST(req,field)     ((req)->dev.ch4.ch4u.netmod_am.ofi.field)
+#define MPIDI_OFI_AMREQUEST_HDR(req,field) ((req)->dev.ch4.ch4u.netmod_am.ofi.req_hdr->field)
+#define MPIDI_OFI_AMREQUEST_HDR_PTR(req)   ((req)->dev.ch4.ch4u.netmod_am.ofi.req_hdr)
+#define MPIDI_OFI_REQUEST(req,field)       ((req)->dev.ch4.netmod.ofi.field)
+#define MPIDI_OFI_AV(av)                   ((av)->netmod.ofi)
+
+#define MPIDI_OFI_DATATYPE(dt)   ((dt)->dev.netmod.ofi)
+#define MPIDI_OFI_COMM(comm)     ((comm)->dev.ch4.netmod.ofi)
+
+/* Endpoint selection.  With scalable endpoints each macro picks the
+ * per-context endpoint MPIDI_Global.ctx[x].*; otherwise the argument is
+ * ignored and every macro expands to the single MPIDI_Global.ep. */
+#ifdef MPIDI_OFI_CONFIG_USE_SCALABLE_ENDPOINTS
+#define MPIDI_OFI_COMM_TO_EP(comm,rank)  MPIDI_OFI_AV(MPIDIU_comm_rank_to_av(comm, rank)).ep_idx
+#define MPIDI_OFI_EP_TX_TAG(x) MPIDI_Global.ctx[x].tx_tag
+#define MPIDI_OFI_EP_TX_RMA(x) MPIDI_Global.ctx[x].tx_rma
+#define MPIDI_OFI_EP_TX_MSG(x) MPIDI_Global.ctx[x].tx_msg
+#define MPIDI_OFI_EP_TX_CTR(x) MPIDI_Global.ctx[x].tx_ctr
+#define MPIDI_OFI_EP_RX_TAG(x) MPIDI_Global.ctx[x].rx_tag
+#define MPIDI_OFI_EP_RX_RMA(x) MPIDI_Global.ctx[x].rx_rma
+#define MPIDI_OFI_EP_RX_MSG(x) MPIDI_Global.ctx[x].rx_msg
+#define MPIDI_OFI_EP_RX_CTR(x) MPIDI_Global.ctx[x].rx_ctr
+#else
+#define MPIDI_OFI_COMM_TO_EP(comm,rank) 0
+#define MPIDI_OFI_EP_TX_TAG(x) MPIDI_Global.ep
+#define MPIDI_OFI_EP_TX_RMA(x) MPIDI_Global.ep
+#define MPIDI_OFI_EP_TX_MSG(x) MPIDI_Global.ep
+#define MPIDI_OFI_EP_TX_CTR(x) MPIDI_Global.ep
+#define MPIDI_OFI_EP_RX_TAG(x) MPIDI_Global.ep
+#define MPIDI_OFI_EP_RX_RMA(x) MPIDI_Global.ep
+#define MPIDI_OFI_EP_RX_MSG(x) MPIDI_Global.ep
+#define MPIDI_OFI_EP_RX_CTR(x) MPIDI_Global.ep
+#endif
+
+/* Send-mode selectors (MPIDI_OFI_DO_SEND is passed to
+ * MPIDI_OFI_send_handler() in ofi_spawn.h) and a completion-queue entry
+ * count. */
+#define MPIDI_OFI_DO_SEND        0
+#define MPIDI_OFI_DO_INJECT      1
+#define MPIDI_OFI_NUM_CQ_ENTRIES 8
+
+/* Typedefs */
+/* Control message type codes (e.g. assigned to MPIDI_OFI_win_control_t.type) */
+enum {
+    MPIDI_OFI_CTRL_ASSERT,    /**< NOTE(review): original comment duplicated LOCKACK's; meaning not shown here */
+    MPIDI_OFI_CTRL_LOCKACK,   /**< Lock acknowledge      */
+    MPIDI_OFI_CTRL_LOCKALLACK,/**< Lock all acknowledge  */
+    MPIDI_OFI_CTRL_LOCKREQ,   /**< Lock window           */
+    MPIDI_OFI_CTRL_LOCKALLREQ,/**< Lock all window       */
+    MPIDI_OFI_CTRL_UNLOCK,    /**< Unlock window         */
+    MPIDI_OFI_CTRL_UNLOCKACK, /**< Unlock acknowledge    */
+    MPIDI_OFI_CTRL_UNLOCKALL, /**< Unlock all window     */
+    MPIDI_OFI_CTRL_UNLOCKALLACK,
+    /**< Unlock all acknowledge */
+    MPIDI_OFI_CTRL_COMPLETE,  /**< End a START epoch     */
+    MPIDI_OFI_CTRL_POST,      /**< Begin POST epoch      */
+    MPIDI_OFI_CTRL_HUGE,      /**< Huge message          */
+    MPIDI_OFI_CTRL_HUGEACK,   /**< Huge message ack      */
+    MPIDI_OFI_CTRL_HUGE_CLEANUP
+    /**< Huge message cleanup  */
+};
+
+/* Event identifiers, numbered from 0 (ABORT) upward.  Presumably these are
+ * the values stored in the `event_id` field of the request structs in this
+ * header -- TODO confirm against the event dispatch code. */
+enum {
+    MPIDI_OFI_EVENT_ABORT,
+    MPIDI_OFI_EVENT_SEND,
+    MPIDI_OFI_EVENT_RECV,
+    MPIDI_OFI_EVENT_RMA_DONE,
+    MPIDI_OFI_EVENT_AM_SEND,
+    MPIDI_OFI_EVENT_AM_RECV,
+    MPIDI_OFI_EVENT_AM_READ,
+    MPIDI_OFI_EVENT_AM_MULTI,
+    MPIDI_OFI_EVENT_PEEK,
+    MPIDI_OFI_EVENT_RECV_HUGE,
+    MPIDI_OFI_EVENT_SEND_HUGE,
+    MPIDI_OFI_EVENT_SSEND_ACK,
+    MPIDI_OFI_EVENT_GET_HUGE,
+    MPIDI_OFI_EVENT_CHUNK_DONE,
+    MPIDI_OFI_EVENT_INJECT_EMU,
+    MPIDI_OFI_EVENT_DYNPROC_DONE,
+    MPIDI_OFI_EVENT_ACCEPT_PROBE
+};
+
+/* Two request kinds: single-target lock (0) and lock-all (1) */
+enum {
+    MPIDI_OFI_REQUEST_LOCK,
+    MPIDI_OFI_REQUEST_LOCKALL
+};
+
+/* Three-valued peek status: started (0), not found (1), found (2) */
+enum {
+    MPIDI_OFI_PEEK_START,
+    MPIDI_OFI_PEEK_NOT_FOUND,
+    MPIDI_OFI_PEEK_FOUND
+};
+
+/*
+ * Request-shaped record carrying an extra `index`.  It shares the leading
+ * layout (pad, context, event_id) with the other request structs in this
+ * header; MPIDI_OFI_global_t.am_reqs[] holds MPIDI_OFI_NUM_AM_BUFFERS of them.
+ */
+typedef struct {
+    char pad[MPIDI_REQUEST_HDR_SIZE];
+    struct fi_context context;  /* fixed field, do not move */
+    int event_id;               /* fixed field, do not move */
+    int index;
+} MPIDI_OFI_am_repost_request_t;
+
+/*
+ * Request-shaped record (same pad/context/event_id prefix as the other
+ * request structs here) that additionally points at an MPIR_Request.
+ */
+typedef struct {
+    char pad[MPIDI_REQUEST_HDR_SIZE];
+    struct fi_context context;  /* fixed field, do not move */
+    int event_id;               /* fixed field, do not move */
+    MPIR_Request *signal_req;
+} MPIDI_OFI_ssendack_request_t;
+
+/*
+ * Request-shaped record (same pad/context/event_id prefix as the other
+ * request structs here) with a `done` flag and tag/source/length fields.
+ */
+typedef struct {
+    char pad[MPIDI_REQUEST_HDR_SIZE];
+    struct fi_context context;  /* fixed field, do not move */
+    int event_id;               /* fixed field, do not move */
+    int done;
+    uint32_t tag;
+    uint32_t source;
+    uint64_t msglen;
+} MPIDI_OFI_dynamic_process_request_t;
+
+/*
+ * Element type of MPIDI_OFI_global_t.win_op_table[][].  The three *_valid
+ * flags are 2-bit fields and dtsize is a 10-bit field (values 0..1023).
+ */
+typedef struct {
+    uint8_t op;
+    uint8_t dt;
+    unsigned atomic_valid:2;
+    unsigned fetch_atomic_valid:2;
+    unsigned compare_atomic_valid:2;
+    unsigned dtsize:10;
+    uint64_t max_atomic_count;
+    uint64_t max_compare_atomic_count;
+    uint64_t max_fetch_atomic_count;
+} MPIDI_OFI_atomic_valid_t;
+
+/*
+ * One TX and one RX endpoint per API kind (tag, rma, msg, ctr).  Entries of
+ * MPIDI_OFI_global_t.ctx[] are read through the MPIDI_OFI_EP_TX_xxx and
+ * MPIDI_OFI_EP_RX_xxx macros when MPIDI_OFI_CONFIG_USE_SCALABLE_ENDPOINTS is
+ * defined.
+ */
+typedef struct {
+    struct fid_ep *tx_tag;
+    struct fid_ep *rx_tag;
+
+    struct fid_ep *tx_rma;
+    struct fid_ep *rx_rma;
+
+    struct fid_ep *tx_msg;
+    struct fid_ep *rx_msg;
+
+    struct fid_ep *tx_ctr;
+    struct fid_ep *rx_ctr;
+
+    int ctx_offset;
+} MPIDI_OFI_context_t;
+
+/*
+ * A mutex overlaid with a MPIDI_OFI_CACHELINE_SIZE-byte array and aligned to
+ * that same size, so the union is at least one cache line wide (presumably
+ * so that each mutex in an array sits on its own cache line).
+ */
+typedef union {
+    MPID_Thread_mutex_t m;
+    char cacheline[MPIDI_OFI_CACHELINE_SIZE];
+} MPIDI_OFI_cacheline_mutex_t __attribute__ ((aligned(MPIDI_OFI_CACHELINE_SIZE)));
+
+/* A tagged completion-queue entry plus its source address, linkable into an
+ * slist through `entry`. */
+typedef struct {
+    struct fi_cq_tagged_entry cq_entry;
+    fi_addr_t source;
+    struct slist_entry entry;
+} MPIDI_OFI_cq_list_t;
+
+/* Element type of MPIDI_OFI_global_t.cq_buffered[]: one tagged CQ entry */
+typedef struct {
+    struct fi_cq_tagged_entry cq_entry;
+} MPIDI_OFI_cq_buff_entry_t;
+
+/* Global state data */
+/* Size in bytes of MPIDI_OFI_global_t.kvsname */
+#define MPIDI_KVSAPPSTRLEN 1024
+/*
+ * Netmod-wide state; the single instance is declared as `MPIDI_Global`
+ * (see the extern at the end of this header).
+ */
+typedef struct {
+    /* OFI objects */
+    int avtid;
+    struct fi_info *prov_use;
+    struct fid_domain *domain;
+    struct fid_fabric *fabric;
+    struct fid_av *av;
+    struct fid_ep *ep;          /* target of the MPIDI_OFI_EP_xxx macros in the non-scalable build */
+    struct fid_cq *p2p_cq;
+    struct fid_cntr *rma_cmpl_cntr;
+    struct fid_stx *stx_ctx;    /* shared TX context for RMA */
+
+    /* Queryable limits */
+    uint64_t max_buffered_send;
+    uint64_t max_buffered_write;
+    uint64_t max_send;
+    uint64_t max_write;
+    uint64_t max_short_send;
+    uint64_t max_mr_key_size;
+    int max_windows_bits;
+    int max_huge_rma_bits;
+    int max_huge_rmas;
+    int huge_rma_shift;
+    int context_shift;
+    size_t iov_limit;
+    size_t rma_iov_limit;
+
+    /* Mutexes and endpoints */
+    /* mutexes[0..3] are named by the MPIDI_OFI_THREAD_*_MUTEX macros */
+    MPIDI_OFI_cacheline_mutex_t mutexes[4];
+    MPIDI_OFI_context_t ctx[MPIDI_OFI_MAX_ENDPOINTS];
+
+    /* Window/RMA Globals */
+    void *win_map;
+    uint64_t rma_issued_cntr;
+    MPIDI_OFI_atomic_valid_t win_op_table[MPIDI_OFI_DT_SIZES][MPIDI_OFI_OP_SIZES];
+
+    /* Active Message Globals */
+    struct iovec am_iov[MPIDI_OFI_NUM_AM_BUFFERS];
+    struct fi_msg am_msg[MPIDI_OFI_NUM_AM_BUFFERS];
+    void *am_bufs[MPIDI_OFI_NUM_AM_BUFFERS];
+    MPIDI_OFI_am_repost_request_t am_reqs[MPIDI_OFI_NUM_AM_BUFFERS];
+    MPIDI_NM_am_target_handler_fn am_handlers[MPIDI_OFI_MAX_AM_HANDLERS_TOTAL];
+    MPIDI_NM_am_origin_handler_fn am_send_cmpl_handlers[MPIDI_OFI_MAX_AM_HANDLERS_TOTAL];
+    MPIU_buf_pool_t *am_buf_pool;
+    OPA_int_t am_inflight_inject_emus;
+    OPA_int_t am_inflight_rma_send_mrs;
+
+    /* Completion queue buffering */
+    MPIDI_OFI_cq_buff_entry_t cq_buffered[MPIDI_OFI_NUM_CQ_BUFFERED];
+    struct slist cq_buff_list;
+    int cq_buff_head;
+    int cq_buff_tail;
+
+    /* Process management and PMI globals */
+    int pname_set;
+    int pname_len;
+    int jobid;
+    char addrname[FI_NAME_MAX];
+    size_t addrnamelen;
+    char kvsname[MPIDI_KVSAPPSTRLEN];
+    char pname[MPI_MAX_PROCESSOR_NAME];
+    int port_name_tag_mask[MPIR_MAX_CONTEXT_MASK];
+} MPIDI_OFI_global_t;
+
+/* Per-datatype OFI data: a single 32-bit index (reached via MPIDI_OFI_DATATYPE) */
+typedef struct {
+    uint32_t index;
+} MPIDI_OFI_datatype_t;
+/* These control structures have to be the same size */
+/* NOTE(review): nothing in this header enforces the equal-size requirement
+ * between MPIDI_OFI_win_control_t and MPIDI_OFI_send_control_t.  With 8-byte
+ * pointers and size_t both appear to come to 48 bytes; with 4-byte pointers
+ * they would differ -- confirm, or add a compile-time assert. */
+typedef struct {
+    int16_t type;               /* one of the MPIDI_OFI_CTRL_* codes */
+    int16_t lock_type;
+    int origin_rank;
+    uint64_t win_id;
+    int dummy[8];               /* presumably padding to reach the shared size -- confirm */
+} MPIDI_OFI_win_control_t;
+
+/*
+ * Send-side control message.  Its first and third fields (type, origin_rank)
+ * match MPIDI_OFI_win_control_t in type and position.  An instance is
+ * embedded in MPIDI_OFI_huge_recv_t as `remote_info`.
+ */
+typedef struct {
+    int16_t type;
+    int16_t seqno;
+    int origin_rank;
+    MPIR_Request *ackreq;
+    uintptr_t send_buf;
+    size_t msgsize;
+    int comm_id;
+    int endpoint_id;
+    uint64_t rma_key;
+} MPIDI_OFI_send_control_t;
+
+/*
+ * Layout probe: MPIDI_OFI_OBJECT_HEADER_SIZE is defined as
+ * offsetof(MPIDI_OFI_offset_checker_t, pad), i.e. the offset of the first
+ * pointer-typed member placed after MPIR_OBJECT_HEADER.
+ */
+typedef struct {
+    MPIR_OBJECT_HEADER;
+    void *pad;
+} MPIDI_OFI_offset_checker_t;
+
+/*
+ * Bookkeeping for three iovec lists (target, origin, result): for each there
+ * is a base address, a count, the iovec array pointer, and a current
+ * index/address/size triple.  buf_limit and buf_limit_left are size budgets;
+ * their exact use is not visible in this header.
+ */
+typedef struct {
+    uintptr_t target_base_addr;
+    uintptr_t origin_base_addr;
+    uintptr_t result_base_addr;
+    size_t target_count;
+    size_t origin_count;
+    size_t result_count;
+    struct iovec *target_iov;
+    struct iovec *origin_iov;
+    struct iovec *result_iov;
+    size_t target_idx;
+    uintptr_t target_addr;
+    uintptr_t target_size;
+    size_t origin_idx;
+    uintptr_t origin_addr;
+    uintptr_t origin_size;
+    size_t result_idx;
+    uintptr_t result_addr;
+    uintptr_t result_size;
+    size_t buf_limit;
+    size_t buf_limit_left;
+} MPIDI_OFI_iovec_state_t;
+
+/*
+ * Description of one datatype argument of an RMA operation (used three times
+ * in MPIDI_OFI_win_noncontig_t: origin, target, result).
+ */
+typedef struct {
+    MPIR_Datatype *pointer;
+    MPI_Datatype type;
+    int count;
+    int contig;
+    MPI_Aint true_lb;
+    size_t size;
+    int num_contig;
+    DLOOP_VECTOR *map;
+    DLOOP_VECTOR __map;         /* presumably inline storage `map` can point at -- confirm */
+} MPIDI_OFI_win_datatype_t;
+
+/*
+ * Request-shaped record (same pad/context/event_id prefix as the other
+ * request structs here) followed by a per-operation union of iovec/ioc
+ * pointers and a trailing zero-length byte array.
+ *
+ * NOTE(review): `next` is declared with the tag `struct MPIDI_Iovec_array`,
+ * which this anonymous typedef does not define -- confirm whether that tag
+ * exists elsewhere or the pointer is only ever used opaquely/cast.
+ */
+typedef struct {
+    char pad[MPIDI_REQUEST_HDR_SIZE];
+    struct fi_context context;  /* fixed field, do not move */
+    int event_id;               /* fixed field, do not move */
+    struct MPIDI_Iovec_array *next;
+    union {
+        struct {
+            struct iovec *originv;
+            struct fi_rma_iov *targetv;
+        } put_get;
+        struct {
+            struct fi_ioc *originv;
+            struct fi_rma_ioc *targetv;
+            struct fi_ioc *resultv;
+            struct fi_ioc *comparev;
+        } cas;
+        struct {
+            struct fi_ioc *originv;
+            struct fi_rma_ioc *targetv;
+        } accumulate;
+        struct {
+            struct fi_ioc *originv;
+            struct fi_rma_ioc *targetv;
+            struct fi_ioc *resultv;
+        } get_accumulate;
+    } iov;
+    /* Zero-length array (a compiler extension, not an ISO C flexible array
+     * member); it must remain the last member. */
+    char iov_store[0];          /* Flexible array, do not move */
+} MPIDI_OFI_iovec_array_t;
+
+/*
+ * State for a noncontiguous RMA operation: iovec cursor state, the three
+ * datatype descriptions, and the iovec array.  `buf` ends in a zero-length
+ * array, so it has to stay the last member of this struct.
+ */
+typedef struct {
+    MPIDI_OFI_iovec_state_t iovs;
+    MPIDI_OFI_win_datatype_t origin_dt;
+    MPIDI_OFI_win_datatype_t target_dt;
+    MPIDI_OFI_win_datatype_t result_dt;
+    MPIDI_OFI_iovec_array_t buf;        /* Do not move me, flexible array */
+} MPIDI_OFI_win_noncontig_t;
+
+/*
+ * Window (RMA) request, linkable into a singly linked list through `next`.
+ * It starts with a real MPIR_OBJECT_HEADER; the pad is shortened by
+ * MPIDI_OFI_OBJECT_HEADER_SIZE, presumably so that `context` lands at the
+ * same offset as in the other request structs (assumes no extra padding is
+ * inserted -- confirm).
+ */
+typedef struct MPIDI_OFI_win_request {
+    MPIR_OBJECT_HEADER;
+    char pad[MPIDI_REQUEST_HDR_SIZE - MPIDI_OFI_OBJECT_HEADER_SIZE];
+    struct fi_context context;  /* fixed field, do not move */
+    int event_id;               /* fixed field, do not move */
+    struct MPIDI_OFI_win_request *next;
+    int target_rank;
+    MPIDI_OFI_win_noncontig_t *noncontig;
+} MPIDI_OFI_win_request_t;
+
+/*
+ * Request-shaped record (same pad/context/event_id prefix as the other
+ * request structs here) that refers back to a parent MPIR_Request.
+ */
+typedef struct {
+    char pad[MPIDI_REQUEST_HDR_SIZE];
+    struct fi_context context;  /* fixed field, do not move */
+    int event_id;               /* fixed field, do not move */
+    MPIR_Request *parent;       /* Parent request           */
+} MPIDI_OFI_chunk_request;
+
+/*
+ * Request-shaped record (same pad/context/event_id prefix as the other
+ * request structs here) for a huge-message receive: a completion callback,
+ * a copy of the sender's control message, the current offset, and the
+ * communicator / local request / saved CQ entry it refers to.
+ */
+typedef struct {
+    char pad[MPIDI_REQUEST_HDR_SIZE];
+    struct fi_context context;  /* fixed field, do not move */
+    int event_id;               /* fixed field, do not move */
+    int (*done_fn) (struct fi_cq_tagged_entry * wc, MPIR_Request * req);
+    MPIDI_OFI_send_control_t remote_info;
+    size_t cur_offset;
+    MPIR_Comm *comm_ptr;
+    MPIR_Request *localreq;
+    struct fi_cq_tagged_entry wc;
+} MPIDI_OFI_huge_recv_t;
+
+/* Two 16-bit counters paired with a memory-region handle */
+typedef struct MPIDI_OFI_huge_counter_t {
+    uint16_t counter;
+    uint16_t outstanding;
+    struct fid_mr *mr;
+} MPIDI_OFI_huge_counter_t;
+
+/* Externs */
+/* The single netmod-wide state object; defined outside this header */
+extern MPIDI_OFI_global_t MPIDI_Global;
+extern int MPIR_Datatype_init_names(void);
+
+#endif /* NETMOD_OFI_IMPL_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_unimpl.h b/src/mpid/ch4/netmod/ofi/ofi_unimpl.h
new file mode 100644
index 0000000..db8e66b
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_unimpl.h
@@ -0,0 +1,19 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/mpid/ch4/netmod/ofi/ofi_win.h b/src/mpid/ch4/netmod/ofi/ofi_win.h
new file mode 100644
index 0000000..2975845
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_win.h
@@ -0,0 +1,1254 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_OFI_WIN_H_INCLUDED
+#define NETMOD_OFI_WIN_H_INCLUDED
+
+#include "ofi_impl.h"
+#include <opa_primitives.h>
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_win_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Register the window's memory with the OFI domain and allgather per-rank
+ * window info over win->comm_ptr.
+ *
+ *   win       - window being set up; its OFI fields mr_key, mr and winfo are written
+ *   base      - start of the local window memory (win->size bytes are registered)
+ *   disp_unit - local displacement unit, published to every rank
+ *
+ * Returns MPI_SUCCESS, or an MPI error code if the key space is too small,
+ * registration fails, memory cannot be allocated or the allgather fails.
+ */
+static inline int MPIDI_OFI_win_allgather(MPIR_Win * win, void *base, int disp_unit)
+{
+    int i, same_disp, mpi_errno = MPI_SUCCESS;
+    uint32_t first;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    MPIR_Comm *comm_ptr = win->comm_ptr;
+    int raw_prefix, idx, bitpos;
+    unsigned gen_id;
+    MPIDI_OFI_win_targetinfo_t *winfo;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_OFI_WIN_ALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_OFI_WIN_ALLGATHER);
+
+    /* Calculate a canonical context id */
+    raw_prefix = MPIR_CONTEXT_READ_FIELD(PREFIX, comm_ptr->context_id);
+    idx = raw_prefix / MPIR_CONTEXT_INT_BITS;
+    bitpos = raw_prefix % MPIR_CONTEXT_INT_BITS;
+    gen_id = (idx * MPIR_CONTEXT_INT_BITS) + (31 - bitpos);
+
+    int total_bits_avail = MPIDI_Global.max_mr_key_size * 8;
+    uint64_t window_instance = (uint64_t) (MPIDI_OFI_WIN(win).win_id) >> 32;
+    int bits_for_instance_id = MPIDI_Global.max_windows_bits;
+    int bits_for_context_id;
+    uint64_t max_contexts_allowed;
+    uint64_t max_instances_allowed;
+
+    bits_for_context_id = total_bits_avail -
+        MPIDI_Global.max_windows_bits - MPIDI_Global.max_huge_rma_bits;
+
+    /* A negative width means the key cannot hold the requested layout */
+    MPIR_ERR_CHKANDSTMT(bits_for_context_id < 0 || bits_for_instance_id < 0,
+                        mpi_errno, MPI_ERR_OTHER, goto fn_fail, "**ofid_mr_reg");
+
+    /* Do the shifts in 64 bits: a plain `1 << n` is an int shift and is
+     * undefined once n reaches the width of int.  Widths of 64 or more
+     * saturate, which makes the corresponding range check always pass. */
+    max_contexts_allowed = (bits_for_context_id >= 64) ?
+        ~(uint64_t) 0 : ((uint64_t) 1 << bits_for_context_id);
+    max_instances_allowed = (bits_for_instance_id >= 64) ?
+        ~(uint64_t) 0 : ((uint64_t) 1 << bits_for_instance_id);
+    MPIR_ERR_CHKANDSTMT(gen_id >= max_contexts_allowed, mpi_errno, MPI_ERR_OTHER,
+                        goto fn_fail, "**ofid_mr_reg");
+    MPIR_ERR_CHKANDSTMT(window_instance >= max_instances_allowed, mpi_errno, MPI_ERR_OTHER,
+                        goto fn_fail, "**ofid_mr_reg");
+
+    if (MPIDI_OFI_ENABLE_MR_SCALABLE) {
+        /* Context id shifted up by context_shift, window instance in the low
+         * bits.  Widen gen_id first so the shift cannot drop high bits. */
+        MPIDI_OFI_WIN(win).mr_key =
+            ((uint64_t) gen_id << MPIDI_Global.context_shift) | window_instance;
+    }
+    else {
+        MPIDI_OFI_WIN(win).mr_key = 0;
+    }
+
+    MPIDI_OFI_CALL(fi_mr_reg(MPIDI_Global.domain,       /* In:  Domain Object       */
+                             base,      /* In:  Lower memory address */
+                             win->size, /* In:  Length              */
+                             FI_REMOTE_READ | FI_REMOTE_WRITE,  /* In:  Expose MR for read  */
+                             0ULL,      /* In:  offset(not used)    */
+                             MPIDI_OFI_WIN(win).mr_key, /* In:  requested key       */
+                             0ULL,      /* In:  flags               */
+                             &MPIDI_OFI_WIN(win).mr,    /* Out: memregion object    */
+                             NULL), mr_reg);    /* In:  context             */
+
+    MPIDI_OFI_WIN(win).winfo = MPL_malloc(sizeof(*winfo) * comm_ptr->local_size);
+
+    winfo = MPIDI_OFI_WIN(win).winfo;
+    /* The table is written immediately below, so fail cleanly on OOM */
+    MPIR_ERR_CHKANDSTMT(winfo == NULL, mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+    winfo[comm_ptr->rank].disp_unit = disp_unit;
+
+    /* NOTE(review): this is a compile-time switch while the branches above
+     * and below test MPIDI_OFI_ENABLE_MR_SCALABLE -- confirm they agree. */
+#ifndef USE_OFI_MR_SCALABLE
+    /* MR_BASIC */
+    MPIDI_OFI_WIN(win).mr_key = fi_mr_key(MPIDI_OFI_WIN(win).mr);
+    winfo[comm_ptr->rank].mr_key = MPIDI_OFI_WIN(win).mr_key;
+    winfo[comm_ptr->rank].base = (uintptr_t) base;
+#endif
+
+    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0,
+                                    MPI_DATATYPE_NULL,
+                                    winfo, sizeof(*winfo), MPI_BYTE, comm_ptr, &errflag);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    if (MPIDI_OFI_ENABLE_MR_SCALABLE) {
+        /* If every rank uses the same disp_unit the table is dropped */
+        first = winfo[0].disp_unit;
+        same_disp = 1;
+        for (i = 1; i < comm_ptr->local_size; i++) {
+            if (winfo[i].disp_unit != first) {
+                same_disp = 0;
+                break;
+            }
+        }
+        if (same_disp) {
+            MPL_free(MPIDI_OFI_WIN(win).winfo);
+            MPIDI_OFI_WIN(win).winfo = NULL;
+        }
+    }
+
+  fn_exit:
+    /* Must name the same state as the DECL/ENTER pair above */
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_OFI_WIN_ALLGATHER);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_win_init_generic
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Create a window through MPIDI_CH4R_win_init, assign it a window id, record
+ * it in MPIDI_Global.win_map, and choose its endpoint/counter set.
+ *
+ * When do_stx_rma is non-zero and a shared TX context exists, a per-window
+ * endpoint, an alias of it, and a completion counter are created; if creating
+ * the endpoint or the alias fails, or in every other case, the window uses
+ * the global endpoint and counters instead.
+ *
+ * *win_ptr receives the new window.  Returns MPI_SUCCESS or an MPI error code.
+ */
+static inline int MPIDI_OFI_win_init_generic(MPI_Aint length,
+                                             int disp_unit,
+                                             MPIR_Win ** win_ptr,
+                                             MPIR_Info * info,
+                                             MPIR_Comm * comm_ptr,
+                                             int create_flavor, int model, int do_stx_rma)
+{
+    int mpi_errno = MPI_SUCCESS;
+    uint64_t window_instance;
+    MPIR_Win *win;
+    struct fi_info *finfo;
+    struct fi_cntr_attr cntr_attr;
+
+    CH4_COMPILE_TIME_ASSERT(sizeof(MPIDI_Devwin_t) >= sizeof(MPIDI_OFI_win_t));
+    CH4_COMPILE_TIME_ASSERT(sizeof(MPIDI_Devdt_t) >= sizeof(MPIDI_OFI_datatype_t));
+
+    /* Note: MPIDI_CH4R_win_init will interpret the info object */
+    mpi_errno = MPIDI_CH4R_win_init(length, disp_unit, &win, info, comm_ptr, create_flavor, model);
+    /* Any failure from the call above is reported as MPI_ERR_NO_MEM */
+    MPIR_ERR_CHKANDSTMT(mpi_errno != MPI_SUCCESS,
+                        mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+    *win_ptr = win;
+
+    memset(&MPIDI_OFI_WIN(win), 0, sizeof(MPIDI_OFI_win_t));
+
+    /* context id lower bits, window instance upper bits */
+    window_instance =
+        MPIDI_OFI_index_allocator_alloc(MPIDI_OFI_COMM(win->comm_ptr).win_id_allocator);
+    MPIDI_OFI_WIN(win).win_id = ((uint64_t) comm_ptr->context_id) | (window_instance << 32);
+    MPIDI_OFI_map_set(MPIDI_Global.win_map, MPIDI_OFI_WIN(win).win_id, win);
+
+    if (do_stx_rma && MPIDI_Global.stx_ctx != NULL) {
+        /* Activate per-window EP/counter */
+        int ret;
+
+        /* Work on a private copy of the provider info; it is freed right
+         * after the endpoint call regardless of the outcome. */
+        finfo = fi_dupinfo(MPIDI_Global.prov_use);
+        MPIR_Assert(finfo);
+        finfo->ep_attr->tx_ctx_cnt = FI_SHARED_CONTEXT; /* Request a shared context */
+        MPIDI_OFI_CALL_RETURN(fi_endpoint(MPIDI_Global.domain,
+                                          finfo, &MPIDI_OFI_WIN(win).ep, NULL), ret);
+        fi_freeinfo(finfo);
+        if (ret < 0) {
+            MPL_DBG_MSG(MPIDI_CH4_DBG_GENERAL, VERBOSE,
+                        "Failed to create per-window EP (with completion), "
+                        "falling back to global EP/counter scheme");
+            /* Jumps into the else-branch below */
+            goto fallback_global;
+        }
+
+        memset(&cntr_attr, 0, sizeof(cntr_attr));
+        cntr_attr.events = FI_CNTR_EVENTS_COMP;
+        MPIDI_OFI_CALL(fi_cntr_open(MPIDI_Global.domain,        /* In:  Domain Object        */
+                                    &cntr_attr, /* In:  Configuration object */
+                                    &MPIDI_OFI_WIN(win).cmpl_cntr,      /* Out: Counter Object       */
+                                    NULL), openct);     /* Context: counter events   */
+        MPIDI_OFI_WIN(win).issued_cntr = &MPIDI_OFI_WIN(win).issued_cntr_v;
+
+        /* Bind the shared TX context, the p2p CQ, the new counter and the
+         * address vector to the per-window endpoint. */
+        MPIDI_OFI_CALL(fi_ep_bind(MPIDI_OFI_WIN(win).ep, &MPIDI_Global.stx_ctx->fid, 0), bind);
+        MPIDI_OFI_CALL(fi_ep_bind(MPIDI_OFI_WIN(win).ep,
+                                  &MPIDI_Global.p2p_cq->fid, FI_TRANSMIT | FI_SELECTIVE_COMPLETION),
+                       bind);
+        MPIDI_OFI_CALL(fi_ep_bind
+                       (MPIDI_OFI_WIN(win).ep, &MPIDI_OFI_WIN(win).cmpl_cntr->fid,
+                        FI_READ | FI_WRITE), bind);
+        MPIDI_OFI_CALL(fi_ep_bind(MPIDI_OFI_WIN(win).ep, &MPIDI_Global.av->fid, 0), bind);
+
+        MPIDI_OFI_CALL_RETURN(fi_ep_alias(MPIDI_OFI_WIN(win).ep, &MPIDI_OFI_WIN(win).ep_nocmpl,
+                                          FI_TRANSMIT), ret);
+        if (ret < 0) {
+            MPL_DBG_MSG(MPIDI_CH4_DBG_GENERAL, VERBOSE,
+                        "Failed to create an EP alias, "
+                        "falling back to global EP/counter scheme");
+            /* Undo the per-window objects before falling back; the fallback
+             * branch overwrites ep, cmpl_cntr and issued_cntr. */
+            MPIDI_OFI_CALL(fi_close(&MPIDI_OFI_WIN(win).ep->fid), epclose);
+            MPIDI_OFI_CALL(fi_close(&MPIDI_OFI_WIN(win).cmpl_cntr->fid), epclose);
+            goto fallback_global;
+        }
+
+        MPIDI_OFI_CALL(fi_enable(MPIDI_OFI_WIN(win).ep), ep_enable);
+        MPIDI_OFI_CALL(fi_enable(MPIDI_OFI_WIN(win).ep_nocmpl), ep_enable);
+    }
+    else {
+      fallback_global:
+        /* Fallback for the traditional global EP/counter model */
+        MPIDI_OFI_WIN(win).ep = MPIDI_OFI_EP_TX_RMA(0);
+        MPIDI_OFI_WIN(win).ep_nocmpl = MPIDI_OFI_EP_TX_CTR(0);
+        MPIDI_OFI_WIN(win).cmpl_cntr = MPIDI_Global.rma_cmpl_cntr;
+        MPIDI_OFI_WIN(win).issued_cntr = &MPIDI_Global.rma_issued_cntr;
+    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_win_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Thin wrapper around MPIDI_OFI_win_init_generic.  The per-window shared-TX
+ * RMA path is requested only when scalable endpoints are disabled and
+ * MPIDI_OFI_ENABLE_STX_RMA is set.
+ *
+ * Returns whatever MPIDI_OFI_win_init_generic returns.
+ */
+static inline int MPIDI_OFI_win_init(MPI_Aint length,
+                                     int disp_unit,
+                                     MPIR_Win ** win_ptr,
+                                     MPIR_Info * info,
+                                     MPIR_Comm * comm_ptr, int create_flavor, int model)
+{
+    int mpi_errno;
+    int use_stx_rma = MPIDI_OFI_ENABLE_SCALABLE_ENDPOINTS ? 0 : MPIDI_OFI_ENABLE_STX_RMA;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_OFI_WIN_INIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_OFI_WIN_INIT);
+
+    mpi_errno = MPIDI_OFI_win_init_generic(length,
+                                           disp_unit,
+                                           win_ptr,
+                                           info, comm_ptr, create_flavor, model, use_stx_rma);
+
+    /* Must name the same state as the DECL/ENTER pair above */
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_OFI_WIN_INIT);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_progress_fence
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Drive progress until the window's RMA operations are complete:
+ *   1. wait for the completion counter to reach the issued count sampled on
+ *      entry,
+ *   2. wait for the window's outstanding_ops to drop to zero,
+ *   3. run MPIDI_OFI_rma_done_event on every entry of the window's syncQ and
+ *      empty the queue.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ *
+ * NOTE(review): the first loop drops MPIDI_OFI_THREAD_FI_MUTEX around
+ * MPIDI_OFI_PROGRESS(), but the second loop calls it with the mutex held.
+ * Also, fn_exit always releases the mutex; if MPIDI_OFI_PROGRESS() can jump
+ * to fn_fail (macro body not visible here) from inside the first loop, the
+ * mutex would be released without being held -- confirm.
+ */
+static inline int MPIDI_Win_progress_fence(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int itercount = 0;
+    int ret;
+    uint64_t tcount, donecount;
+    MPIDI_OFI_win_request_t *r;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_OFI_PROGRESS_WIN_COUNTER_FENCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_OFI_PROGRESS_WIN_COUNTER_FENCE);
+
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_OFI_THREAD_FI_MUTEX);
+    /* The issued count is sampled once; only donecount is re-read below */
+    tcount = *MPIDI_OFI_WIN(win).issued_cntr;
+    donecount = fi_cntr_read(MPIDI_OFI_WIN(win).cmpl_cntr);
+
+    MPIR_Assert(donecount <= tcount);
+
+    while (tcount > donecount) {
+        MPIR_Assert(donecount <= tcount);
+        MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_FI_MUTEX);
+        MPIDI_OFI_PROGRESS();
+        MPID_THREAD_CS_ENTER(POBJ, MPIDI_OFI_THREAD_FI_MUTEX);
+        donecount = fi_cntr_read(MPIDI_OFI_WIN(win).cmpl_cntr);
+        itercount++;
+
+        /* Every 1000 polls, call fi_cntr_wait with a zero timeout; a
+         * timeout is tolerated, any other negative return is an error. */
+        if (itercount == 1000) {
+            ret = fi_cntr_wait(MPIDI_OFI_WIN(win).cmpl_cntr, tcount, 0);
+            MPIDI_OFI_ERR(ret < 0 && ret != -FI_ETIMEDOUT,
+                          mpi_errno,
+                          MPI_ERR_RMA_RANGE,
+                          "**ofid_cntr_wait",
+                          "**ofid_cntr_wait %s %d %s %s",
+                          __SHORT_FILE__, __LINE__, FCNAME, fi_strerror(-ret));
+            itercount = 0;
+        }
+    }
+
+    while (OPA_load_int(&MPIDI_CH4U_WIN(win, outstanding_ops)) != 0)
+        MPIDI_OFI_PROGRESS();
+
+    r = MPIDI_OFI_WIN(win).syncQ;
+
+    /* Fetch `next` before handing the entry to the done-event handler */
+    while (r) {
+        MPIDI_OFI_win_request_t *next = r->next;
+        MPIDI_OFI_rma_done_event(NULL, (MPIR_Request *) r);
+        r = next;
+    }
+
+    MPIDI_OFI_WIN(win).syncQ = NULL;
+  fn_exit:
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_FI_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_OFI_PROGRESS_WIN_COUNTER_FENCE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_set_info
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod entry point for setting window info: forwards to
+ * MPIDI_CH4R_win_set_info and returns its error code.
+ */
+static inline int MPIDI_NM_win_set_info(MPIR_Win * win, MPIR_Info * info)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_SET_INFO);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_SET_INFO);
+
+    /* Delegate; nothing OFI-specific to do here */
+    mpi_errno = MPIDI_CH4R_win_set_info(win, info);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_SET_INFO);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_start
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Begin a START access epoch on `win` for the processes in `group`.
+ *
+ * Drives progress until the window's pw.count equals group->size, resets that
+ * count, then records `group` (taking a reference) as the window's sc.group
+ * and sets the origin epoch type to START.  `assert` is not used.
+ *
+ * Returns MPI_SUCCESS, or an MPI error code if the epoch check fails or a
+ * start group is already recorded on the window.
+ */
+static inline int MPIDI_NM_win_start(MPIR_Group * group, int assert, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_START);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_START);
+
+    MPIDI_CH4U_EPOCH_CHECK_TYPE(win, mpi_errno, goto fn_fail);
+
+    MPIDI_OFI_PROGRESS_WHILE(group->size != (int) MPIDI_CH4U_WIN(win, sync).pw.count);
+
+    MPIDI_CH4U_WIN(win, sync).pw.count = 0;
+
+    MPIR_ERR_CHKANDJUMP((MPIDI_CH4U_WIN(win, sync).sc.group != NULL),
+                        mpi_errno, MPI_ERR_GROUP, "**group");
+
+    /* Take the reference only once nothing can fail any more; taking it
+     * before the checks above leaked a group reference on every error path. */
+    MPIR_Group_add_ref(group);
+    MPIDI_CH4U_WIN(win, sync).sc.group = group;
+    MPIDI_CH4U_WIN(win, sync).origin_epoch_type = MPIDI_CH4U_EPOTYPE_START;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_START);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_complete
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * End a START access epoch: flush the window's outstanding RMA via
+ * MPIDI_Win_progress_fence, send a COMPLETE control message to every member
+ * of the recorded start group, then release that group and clear it.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ */
+static inline int MPIDI_NM_win_complete(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int i, peer;
+    MPIR_Group *group;
+    MPIDI_OFI_win_control_t msg;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_COMPLETE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_COMPLETE);
+
+    MPIDI_CH4U_EPOCH_START_CHECK2(win, mpi_errno, goto fn_fail);
+
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_Win_progress_fence(win));
+
+    group = MPIDI_CH4U_WIN(win, sync).sc.group;
+    MPIR_Assert(group != NULL);
+
+    /* Only the type is filled in here */
+    msg.type = MPIDI_OFI_CTRL_COMPLETE;
+
+    for (i = 0; i < group->size; i++) {
+        peer = group->lrank_to_lpid[i].lpid;
+        mpi_errno = MPIDI_OFI_do_control_win(&msg, peer, win, 0, 1);
+
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+        }
+    }
+
+    MPIDI_CH4U_EPOCH_TARGET_EVENT(win);
+
+    MPIR_Group_release(MPIDI_CH4U_WIN(win, sync).sc.group);
+    MPIDI_CH4U_WIN(win, sync).sc.group = NULL;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_COMPLETE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_post
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Begin a POST exposure epoch on `win` for the processes in `group`.
+ *
+ * Records `group` (taking a reference) as the window's pw.group, sends a POST
+ * control message to every member, and sets the target epoch type to POST.
+ * `assert` is not used.
+ *
+ * Returns MPI_SUCCESS, or an MPI error code if the epoch check fails, a post
+ * group is already recorded on the window, or a control message cannot be
+ * sent.
+ */
+static inline int MPIDI_NM_win_post(MPIR_Group * group, int assert, MPIR_Win * win)
+{
+    int peer, index, mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_win_control_t msg;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_POST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_POST);
+
+    MPIDI_CH4U_EPOCH_POST_CHECK(win, mpi_errno, goto fn_fail);
+
+    /* Validate before touching the group: the assert used to come after the
+     * pointer had already been used, and the reference used to be taken
+     * before a check that could fail, leaking it. */
+    MPIR_Assert(group != NULL);
+    MPIR_ERR_CHKANDJUMP((MPIDI_CH4U_WIN(win, sync).pw.group != NULL),
+                        mpi_errno, MPI_ERR_GROUP, "**group");
+
+    MPIR_Group_add_ref(group);
+    MPIDI_CH4U_WIN(win, sync).pw.group = group;
+
+    /* Only the type is filled in here */
+    msg.type = MPIDI_OFI_CTRL_POST;
+
+    for (index = 0; index < group->size; ++index) {
+        peer = group->lrank_to_lpid[index].lpid;
+        mpi_errno = MPIDI_OFI_do_control_win(&msg, peer, win, 0, 1);
+
+        if (mpi_errno != MPI_SUCCESS)
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+    }
+
+    MPIDI_CH4U_WIN(win, sync).target_epoch_type = MPIDI_CH4U_EPOTYPE_POST;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_POST);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_wait
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * End a POST exposure epoch: drive progress until the window's sc.count
+ * equals the size of the recorded post group, then reset the count, clear
+ * and release the group.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ */
+static inline int MPIDI_NM_win_wait(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_WAIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_WAIT);
+
+    /* Leave through fn_fail so the ENTER above is always paired with an EXIT */
+    MPIDI_CH4U_EPOCH_TARGET_CHECK(win, MPIDI_CH4U_EPOTYPE_POST, mpi_errno, goto fn_fail);
+
+    MPIR_Group *group;
+    group = MPIDI_CH4U_WIN(win, sync).pw.group;
+
+    MPIDI_OFI_PROGRESS_WHILE(group->size != (int) MPIDI_CH4U_WIN(win, sync).sc.count);
+
+    MPIDI_CH4U_WIN(win, sync).sc.count = 0;
+    MPIDI_CH4U_WIN(win, sync).pw.group = NULL;
+
+    MPIR_Group_release(group);
+
+    MPIDI_CH4U_EPOCH_ORIGIN_EVENT(win);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_WAIT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_test
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Non-blocking counterpart of MPIDI_NM_win_wait.
+ *
+ * If the window's sc.count already equals the size of the recorded post
+ * group, the epoch is closed (count reset, group cleared and released) and
+ * *flag is set to 1.  Otherwise one round of progress is made and *flag is
+ * set to 0.  *flag is left untouched when the epoch check fails.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ */
+static inline int MPIDI_NM_win_test(MPIR_Win * win, int *flag)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_TEST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_TEST);
+
+    /* Leave through fn_fail so the ENTER above is always paired with an EXIT */
+    MPIDI_CH4U_EPOCH_TARGET_CHECK(win, MPIDI_CH4U_EPOTYPE_POST, mpi_errno, goto fn_fail);
+
+    MPIR_Group *group;
+    group = MPIDI_CH4U_WIN(win, sync).pw.group;
+
+    if (group->size == (int) MPIDI_CH4U_WIN(win, sync).sc.count) {
+        MPIDI_CH4U_WIN(win, sync).sc.count = 0;
+        MPIDI_CH4U_WIN(win, sync).pw.group = NULL;
+        *flag = 1;
+        MPIR_Group_release(group);
+        MPIDI_CH4U_EPOCH_ORIGIN_EVENT(win);
+    }
+    else {
+        MPIDI_OFI_PROGRESS();
+        *flag = 0;
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_TEST);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_lock
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Start a passive-target LOCK epoch on `rank`.
+ *
+ * For a real target, a LOCKREQ control message carrying lock_type is sent
+ * and progress is driven until the window's remote lock state reads locked.
+ * For MPI_PROC_NULL no message is sent.  In both cases the origin epoch type
+ * becomes LOCK.  `assert` is not used.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ */
+static inline int MPIDI_NM_win_lock(int lock_type, int rank, int assert, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_LOCK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_LOCK);
+
+    MPIDI_CH4U_win_sync_lock *slock = &MPIDI_CH4U_WIN(win, sync).lock;
+    MPIDI_CH4U_EPOCH_CHECK_TYPE(win, mpi_errno, goto fn_fail);
+
+    if (rank != MPI_PROC_NULL) {
+        MPIDI_OFI_win_control_t msg;
+
+        msg.type = MPIDI_OFI_CTRL_LOCKREQ;
+        msg.lock_type = lock_type;
+
+        mpi_errno = MPIDI_OFI_do_control_win(&msg, rank, win, 1, 1);
+
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+        }
+
+        /* Spin on progress until the lock is reported as held */
+        MPIDI_OFI_PROGRESS_WHILE(!slock->remote.locked);
+    }
+
+    MPIDI_CH4U_WIN(win, sync).origin_epoch_type = MPIDI_CH4U_EPOTYPE_LOCK;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_LOCK);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_unlock
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * End a passive-target LOCK epoch on `rank`.
+ *
+ * For a real target, outstanding RMA is flushed via MPIDI_Win_progress_fence,
+ * an UNLOCK control message is sent, and progress is driven until the
+ * window's remote lock state no longer reads locked.  For MPI_PROC_NULL none
+ * of that happens.  In both cases the origin and target epoch types are
+ * reset to NONE.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ */
+static inline int MPIDI_NM_win_unlock(int rank, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_UNLOCK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_UNLOCK);
+
+    /* Leave through fn_fail so the ENTER above is always paired with an EXIT */
+    MPIDI_CH4U_EPOCH_ORIGIN_CHECK(win, MPIDI_CH4U_EPOTYPE_LOCK, mpi_errno, goto fn_fail);
+
+    if (rank == MPI_PROC_NULL)
+        goto fn_exit0;
+
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_Win_progress_fence(win));
+
+    MPIDI_OFI_win_control_t msg;
+    msg.type = MPIDI_OFI_CTRL_UNLOCK;
+    mpi_errno = MPIDI_OFI_do_control_win(&msg, rank, win, 1, 1);
+
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+
+    MPIDI_OFI_PROGRESS_WHILE(MPIDI_CH4U_WIN(win, sync).lock.remote.locked);
+  fn_exit0:
+    MPIDI_CH4U_WIN(win, sync).origin_epoch_type = MPIDI_CH4U_EPOTYPE_NONE;
+    MPIDI_CH4U_WIN(win, sync).target_epoch_type = MPIDI_CH4U_EPOTYPE_NONE;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_UNLOCK);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_get_info
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod entry point for querying window info: forwards to
+ * MPIDI_CH4R_win_get_info and returns its error code.
+ */
+static inline int MPIDI_NM_win_get_info(MPIR_Win * win, MPIR_Info ** info_p_p)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_GET_INFO);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_GET_INFO);
+
+    /* Delegate; nothing OFI-specific to do here */
+    mpi_errno = MPIDI_CH4R_win_get_info(win, info_p_p);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_GET_INFO);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_free
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_win_free - release the OFI resources attached to *win_ptr.
+ *
+ * After the epoch check and a barrier over the window communicator, the
+ * routine
+ *   - returns the window instance (upper 32 bits of win_id) to the
+ *     communicator's win_id allocator and removes win_id from the global
+ *     window map,
+ *   - closes the per-window endpoints and completion counter, but only
+ *     when they differ from the shared global ones,
+ *   - closes the memory region and frees the winfo table,
+ *   - hands the window to MPIDI_CH4R_win_finalize().
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ */
+static inline int MPIDI_NM_win_free(MPIR_Win ** win_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    MPIR_Win *win = *win_ptr;
+    uint32_t window_instance;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_FREE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_FREE);
+
+    /* Fail through fn_fail rather than a bare return so that the ENTER
+     * trace above is always paired with the EXIT trace below. */
+    MPIDI_CH4U_EPOCH_FREE_CHECK(win, mpi_errno, goto fn_fail);
+
+    mpi_errno = MPIR_Barrier_impl(win->comm_ptr, &errflag);
+
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+    /* The window instance occupies the upper half of the 64-bit win_id. */
+    window_instance = (uint32_t) (MPIDI_OFI_WIN(win).win_id >> 32);
+
+    MPIDI_OFI_index_allocator_free(MPIDI_OFI_COMM(win->comm_ptr).win_id_allocator, window_instance);
+    MPIDI_OFI_map_erase(MPIDI_Global.win_map, MPIDI_OFI_WIN(win).win_id);
+
+    /* Objects equal to the global defaults are shared and must stay open. */
+    if (MPIDI_OFI_WIN(win).ep_nocmpl != MPIDI_OFI_EP_TX_CTR(0))
+        MPIDI_OFI_CALL(fi_close(&MPIDI_OFI_WIN(win).ep_nocmpl->fid), epclose);
+    if (MPIDI_OFI_WIN(win).ep != MPIDI_OFI_EP_TX_RMA(0))
+        MPIDI_OFI_CALL(fi_close(&MPIDI_OFI_WIN(win).ep->fid), epclose);
+    if (MPIDI_OFI_WIN(win).cmpl_cntr != MPIDI_Global.rma_cmpl_cntr)
+        MPIDI_OFI_CALL(fi_close(&MPIDI_OFI_WIN(win).cmpl_cntr->fid), cqclose);
+    MPIDI_OFI_CALL(fi_close(&MPIDI_OFI_WIN(win).mr->fid), mr_unreg);
+
+    if (MPIDI_OFI_WIN(win).winfo) {
+        MPL_free(MPIDI_OFI_WIN(win).winfo);
+        MPIDI_OFI_WIN(win).winfo = NULL;
+    }
+
+    /* NOTE(review): any status returned by MPIDI_CH4R_win_finalize() is
+     * ignored here - confirm whether it should be propagated. */
+    MPIDI_CH4R_win_finalize(win_ptr);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_FREE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_fence
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_win_fence - OFI netmod implementation of MPI_Win_fence.
+ *
+ *   massert - assertion flags, forwarded to MPIDI_CH4U_EPOCH_FENCE_EVENT
+ *   win     - window being fenced
+ *
+ * Validates the epoch, calls MPIDI_Win_progress_fence(), records the fence
+ * event and finally synchronizes all ranks of the window communicator.
+ * The barrier result is the return value.
+ */
+static inline int MPIDI_NM_win_fence(int massert, MPIR_Win * win)
+{
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_FENCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_FENCE);
+
+    MPIDI_CH4U_EPOCH_FENCE_CHECK(win, mpi_errno, goto fn_fail);
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_Win_progress_fence(win));
+    MPIDI_CH4U_EPOCH_FENCE_EVENT(win, massert);
+
+    /*
+     * The barrier is unconditional, even when MPI_MODE_NOPRECEDE is given.
+     * Unlike CH3, RMA operations are no longer deferred until the next
+     * synchronization call, so without the barrier a sequence such as the
+     * following (cf. f77/rma/wingetf) would break:
+     *
+     * Rank 0                          Rank 1
+     * ----                            ----
+     * Store to local mem in window
+     * MPI_Win_fence(MODE_NOPRECEDE)   MPI_Win_fence(MODE_NOPRECEDE)
+     * MPI_Get(from rank 1)
+     */
+    mpi_errno = MPIR_Barrier_impl(win->comm_ptr, &errflag);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_FENCE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_create
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_win_create - create a window over caller-provided memory
+ * (flavor MPI_WIN_FLAVOR_CREATE, memory model MPI_WIN_UNIFIED).
+ *
+ *   base, length, disp_unit - local window memory and displacement unit
+ *   info, comm_ptr          - info hints and communicator of the window
+ *   win_ptr                 - receives the new window object
+ *
+ * Steps: MPIDI_OFI_win_init(), store the base address,
+ * MPIDI_OFI_win_allgather(), then a barrier on comm_ptr.  The first
+ * failing step determines the returned error code.
+ */
+static inline int MPIDI_NM_win_create(void *base,
+                                      MPI_Aint length,
+                                      int disp_unit,
+                                      MPIR_Info * info, MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
+{
+    MPIR_Win *new_win;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_CREATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_CREATE);
+
+    mpi_errno = MPIDI_OFI_win_init(length, disp_unit, win_ptr, info, comm_ptr,
+                                   MPI_WIN_FLAVOR_CREATE, MPI_WIN_UNIFIED);
+    if (mpi_errno != MPI_SUCCESS) {
+        goto fn_fail;
+    }
+
+    new_win = *win_ptr;
+    new_win->base = base;
+
+    mpi_errno = MPIDI_OFI_win_allgather(new_win, base, disp_unit);
+    if (mpi_errno != MPI_SUCCESS) {
+        goto fn_fail;
+    }
+
+    mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);
+    if (mpi_errno != MPI_SUCCESS) {
+        goto fn_fail;
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_CREATE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_attach
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_win_attach - netmod hook for attaching memory to a window.
+ *
+ * The only work done here is a flavor check: a window that was not created
+ * as MPI_WIN_FLAVOR_DYNAMIC yields MPI_ERR_RMA_FLAVOR.  `base` and `size`
+ * are not used by this implementation.
+ */
+static inline int MPIDI_NM_win_attach(MPIR_Win * win, void *base, MPI_Aint size)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_ATTACH);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_ATTACH);
+
+    /* Attaching is only meaningful for dynamic windows. */
+    MPIR_ERR_CHKANDSTMT(win->create_flavor != MPI_WIN_FLAVOR_DYNAMIC,
+                        mpi_errno, MPI_ERR_RMA_FLAVOR, goto fn_fail, "**rmaflavor");
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_ATTACH);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_allocate_shared
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_win_allocate_shared - allocate a shared-memory window
+ * (flavor MPI_WIN_FLAVOR_SHARED, memory model MPI_WIN_UNIFIED).
+ *
+ *   size, disp_unit - size of the local segment and its displacement unit
+ *   info_ptr        - info hints
+ *   comm_ptr        - communicator of the window
+ *   base_ptr        - receives the address of the local segment (NULL when
+ *                     size is 0)
+ *   win_ptr         - receives the new window object
+ *
+ * All ranks exchange their (size, disp_unit) pair through an in-place
+ * allgather.  If the summed size is non-zero, one POSIX shared memory
+ * object named "/mpi-<jobid>-<win_id>" is created by whichever rank wins
+ * the O_EXCL race and opened by the others; rank 0 maps it at an address
+ * obtained from MPIDI_CH4R_generate_random_addr() and broadcasts that
+ * address so every rank can map the object at the same place with
+ * MAP_FIXED.  The local segment starts at the sum of the sizes of all
+ * lower ranks.
+ *
+ * The file descriptor and the shm name are released on every path that
+ * reaches fn_exit; the mapping itself stays in place and is recorded in
+ * the window (mmap_addr / mmap_sz).
+ *
+ * Returns MPI_SUCCESS or an MPI error code (MPI_ERR_NO_MEM for allocation,
+ * shm_open, ftruncate and mmap failures).
+ */
+static inline int MPIDI_NM_win_allocate_shared(MPI_Aint size,
+                                               int disp_unit,
+                                               MPIR_Info * info_ptr,
+                                               MPIR_Comm * comm_ptr,
+                                               void **base_ptr, MPIR_Win ** win_ptr)
+{
+    int i = 0, fd = -1, rc, first = 0, mpi_errno = MPI_SUCCESS;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    void *baseP = NULL;
+    MPIR_Win *win = NULL;
+    ssize_t total_size = 0LL;
+    MPI_Aint size_out = 0;
+    char shm_key[64];
+    void *map_ptr = NULL;
+    size_t page_sz, mapsize;
+    MPIDI_CH4U_win_shared_info_t *shared_table = NULL;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_ALLOCATE_SHARED);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_ALLOCATE_SHARED);
+
+    mpi_errno = MPIDI_OFI_win_init(size, disp_unit, win_ptr, info_ptr, comm_ptr,
+                                   MPI_WIN_FLAVOR_SHARED, MPI_WIN_UNIFIED);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_POP(mpi_errno);
+    win = *win_ptr;
+
+    shared_table =
+        (MPIDI_CH4U_win_shared_info_t *) MPL_malloc(sizeof(MPIDI_CH4U_win_shared_info_t) *
+                                                    comm_ptr->local_size);
+    MPIDI_CH4U_WIN(win, shared_table) = shared_table;
+
+    /* The table is indexed immediately below; do not dereference NULL. */
+    if (shared_table == NULL)
+        MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+
+    shared_table[comm_ptr->rank].size = size;
+    shared_table[comm_ptr->rank].disp_unit = disp_unit;
+
+    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE,
+                                    0,
+                                    MPI_DATATYPE_NULL,
+                                    shared_table,
+                                    sizeof(MPIDI_CH4U_win_shared_info_t),
+                                    MPI_BYTE, comm_ptr, &errflag);
+
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+    /* No allreduce here because this is a shared memory domain
+     * and should be a relatively small number of processes
+     * and a non performance sensitive API.
+     */
+    for (i = 0; i < comm_ptr->local_size; i++)
+        total_size += shared_table[i].size;
+
+    /* Nobody asked for memory: skip the shm object and the mapping. */
+    if (total_size == 0)
+        goto fn_zero;
+
+    snprintf(shm_key, sizeof(shm_key), "/mpi-%X-%" PRIx64, MPIDI_Global.jobid,
+             MPIDI_OFI_WIN(win).win_id);
+
+    /* Exactly one rank succeeds with O_EXCL and becomes the creator
+     * ("first"); it is the one that unlinks the name at the end. */
+    fd = shm_open(shm_key, O_CREAT | O_EXCL | O_RDWR, 0600);
+    first = (fd != -1);
+
+    if (!first) {
+        fd = shm_open(shm_key, O_RDWR, 0);
+
+        if (fd == -1) {
+            shm_unlink(shm_key);
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+        }
+    }
+
+    /* Make the addresses symmetric by using MAP_FIXED */
+    mapsize = MPIDI_CH4R_get_mapsize(total_size, &page_sz);
+    rc = ftruncate(fd, mapsize);
+
+    /* fd and (for the creator) the shm name are released at fn_exit. */
+    if (rc == -1)
+        MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+
+    /* Rank 0 picks the address and maps first ... */
+    if (comm_ptr->rank == 0) {
+        map_ptr = MPIDI_CH4R_generate_random_addr(mapsize);
+        map_ptr = mmap(map_ptr, mapsize, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0);
+
+        if (map_ptr == NULL || map_ptr == MAP_FAILED)
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+    }
+
+    /* ... and publishes it.
+     * NOTE(review): the pointer travels as one MPI_UNSIGNED_LONG, which
+     * assumes sizeof(void *) == sizeof(unsigned long) - confirm for all
+     * supported platforms. */
+    mpi_errno = MPIR_Bcast_impl(&map_ptr, 1, MPI_UNSIGNED_LONG, 0, comm_ptr, &errflag);
+
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+    if (comm_ptr->rank != 0) {
+        rc = MPIDI_CH4R_check_maprange_ok(map_ptr, mapsize);
+        /* If we hit this assert, we need to iterate
+         * trying more addresses
+         */
+        MPIR_Assert(rc == 1);
+        map_ptr = mmap(map_ptr, mapsize, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0);
+
+        if (map_ptr == NULL || map_ptr == MAP_FAILED)
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+    }
+
+    /* Only a successful mapping is recorded in the window. */
+    MPIDI_CH4U_WIN(win, mmap_addr) = map_ptr;
+    MPIDI_CH4U_WIN(win, mmap_sz) = mapsize;
+
+    /* Scan for my offset into the buffer             */
+    /* Could use exscan if this is expensive at scale */
+    for (i = 0; i < comm_ptr->rank; i++)
+        size_out += shared_table[i].size;
+
+  fn_zero:
+
+    /* map_ptr is only meaningful when total_size != 0, which is implied
+     * by size != 0. */
+    baseP = (size == 0) ? NULL : (void *) ((char *) map_ptr + size_out);
+    win->base = baseP;
+    win->size = size;
+    mpi_errno = MPIDI_OFI_win_allgather(win, baseP, disp_unit);
+
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+    *(void **) base_ptr = (void *) win->base;
+    mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);
+
+  fn_exit:
+    /* The mapping keeps the memory alive; the descriptor and the name are
+     * no longer needed once every rank is past the barrier (or failed). */
+    if (fd != -1)
+        close(fd);
+
+    if (first)
+        shm_unlink(shm_key);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_ALLOCATE_SHARED);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_detach
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_win_detach - netmod hook for detaching memory from a window.
+ *
+ * Mirrors MPIDI_NM_win_attach: the only work is the flavor check, which
+ * yields MPI_ERR_RMA_FLAVOR for any window that is not
+ * MPI_WIN_FLAVOR_DYNAMIC.  `base` is not used by this implementation.
+ */
+static inline int MPIDI_NM_win_detach(MPIR_Win * win, const void *base)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_DETACH);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_DETACH);
+
+    /* Detaching is only meaningful for dynamic windows. */
+    MPIR_ERR_CHKANDSTMT(win->create_flavor != MPI_WIN_FLAVOR_DYNAMIC,
+                        mpi_errno, MPI_ERR_RMA_FLAVOR, goto fn_fail, "**rmaflavor");
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_DETACH);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_shared_query
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_win_shared_query - report size, displacement unit and local
+ * address of the segment that `rank` contributed to a shared window.
+ *
+ *   win       - shared window (its shared_table and mmap_addr are read)
+ *   rank      - rank to query; a negative value (MPI_PROC_NULL) selects
+ *               the lowest rank whose segment size is greater than zero,
+ *               as the MPI standard requires.  If every segment is empty,
+ *               rank 0 is reported.
+ *   size      - receives the segment size
+ *   disp_unit - receives the displacement unit
+ *   baseptr   - really a void **; receives the segment address, or NULL
+ *               when the segment is empty
+ *
+ * The address is the mapping base plus the sizes of all lower ranks.
+ * Always returns MPI_SUCCESS.
+ */
+static inline int MPIDI_NM_win_shared_query(MPIR_Win * win,
+                                            int rank,
+                                            MPI_Aint * size, int *disp_unit, void *baseptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int offset = rank;
+    int i;
+    uintptr_t base = (uintptr_t) MPIDI_CH4U_WIN(win, mmap_addr);
+    MPIDI_CH4U_win_shared_info_t *shared_table = MPIDI_CH4U_WIN(win, shared_table);
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_SHARED_QUERY);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_SHARED_QUERY);
+
+    if (rank < 0) {
+        /* MPI_PROC_NULL: pick the first rank that owns any memory and
+         * fall back to rank 0 when the whole window is empty. */
+        offset = 0;
+
+        for (i = 0; i < win->comm_ptr->local_size; i++) {
+            if (shared_table[i].size > 0) {
+                offset = i;
+                break;
+            }
+        }
+    }
+
+    *size = shared_table[offset].size;
+    *disp_unit = shared_table[offset].disp_unit;
+
+    if (*size > 0) {
+        /* Segments are laid out back to back in rank order. */
+        for (i = 0; i < offset; i++)
+            base += shared_table[i].size;
+        *(void **) baseptr = (void *) base;
+    }
+    else
+        *(void **) baseptr = NULL;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_SHARED_QUERY);
+    return mpi_errno;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_allocate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_win_allocate - create a window whose memory is allocated by the
+ * library (flavor MPI_WIN_FLAVOR_ALLOCATE, memory model MPI_WIN_UNIFIED).
+ *
+ *   size, disp_unit - size of the local memory and its displacement unit
+ *   info, comm      - info hints and communicator of the window
+ *   baseptr         - really a void **; receives the allocated address
+ *   win_ptr         - receives the new window object
+ *
+ * Steps: MPIDI_OFI_win_init(), MPIDI_CH4R_get_symmetric_heap() for the
+ * memory, MPIDI_OFI_win_allgather(), then a barrier on comm.  The first
+ * failing step determines the returned error code.
+ */
+static inline int MPIDI_NM_win_allocate(MPI_Aint size,
+                                        int disp_unit,
+                                        MPIR_Info * info,
+                                        MPIR_Comm * comm, void *baseptr, MPIR_Win ** win_ptr)
+{
+    MPIR_Win *new_win;
+    void *heap_base;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_ALLOCATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_ALLOCATE);
+
+    mpi_errno = MPIDI_OFI_win_init(size, disp_unit, win_ptr, info, comm,
+                                   MPI_WIN_FLAVOR_ALLOCATE, MPI_WIN_UNIFIED);
+    if (mpi_errno != MPI_SUCCESS) {
+        goto fn_fail;
+    }
+
+    mpi_errno = MPIDI_CH4R_get_symmetric_heap(size, comm, &heap_base, *win_ptr);
+    if (mpi_errno != MPI_SUCCESS) {
+        goto fn_fail;
+    }
+
+    new_win = *win_ptr;
+    new_win->base = heap_base;
+
+    mpi_errno = MPIDI_OFI_win_allgather(new_win, heap_base, disp_unit);
+    if (mpi_errno != MPI_SUCCESS) {
+        goto fn_fail;
+    }
+
+    /* Hand the address back before synchronizing, as the caller expects. */
+    *(void **) baseptr = (void *) new_win->base;
+
+    mpi_errno = MPIR_Barrier_impl(comm, &errflag);
+    if (mpi_errno != MPI_SUCCESS) {
+        goto fn_fail;
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_ALLOCATE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_flush
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_win_flush - OFI netmod implementation of MPI_Win_flush.
+ *
+ * Checks that a lock epoch is open and then calls
+ * MPIDI_Win_progress_fence() on the window.  `rank` is not consulted: the
+ * call is made for the window as a whole.
+ */
+static inline int MPIDI_NM_win_flush(int rank, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_FLUSH);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_FLUSH);
+
+    MPIDI_CH4U_EPOCH_LOCK_CHECK(win, mpi_errno, goto fn_fail);
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_Win_progress_fence(win));
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_FLUSH);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_flush_local_all
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_win_flush_local_all - OFI netmod implementation of
+ * MPI_Win_flush_local_all.
+ *
+ * Checks that a lock epoch is open and then calls
+ * MPIDI_Win_progress_fence() on the window.
+ */
+static inline int MPIDI_NM_win_flush_local_all(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_FLUSH_LOCAL_ALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_FLUSH_LOCAL_ALL);
+
+    MPIDI_CH4U_EPOCH_LOCK_CHECK(win, mpi_errno, goto fn_fail);
+
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_Win_progress_fence(win));
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_FLUSH_LOCAL_ALL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_unlock_all
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_win_unlock_all - OFI netmod implementation of
+ * MPI_Win_unlock_all.
+ *
+ * Requires an open LOCK_ALL origin epoch.  After
+ * MPIDI_Win_progress_fence() an MPIDI_OFI_CTRL_UNLOCKALL control message
+ * is sent to every rank of the window communicator, using the lockQ array
+ * allocated by MPIDI_NM_win_lock_all() for per-peer bookkeeping.  The
+ * routine then loops in MPIDI_OFI_PROGRESS_WHILE on
+ * sync.lock.remote.allLocked and finally resets both epoch types to
+ * MPIDI_CH4U_EPOTYPE_NONE.
+ *
+ * Returns MPI_SUCCESS or an MPI error code; a failed control message is
+ * reported as MPI_ERR_RMA_SYNC.
+ */
+static inline int MPIDI_NM_win_unlock_all(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int i;
+    MPIDI_CH4U_win_lock_info *lockQ;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_UNLOCK_ALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_UNLOCK_ALL);
+
+    /* Fail through fn_fail rather than a bare return so that the ENTER
+     * trace above is always paired with the EXIT trace below. */
+    MPIDI_CH4U_EPOCH_ORIGIN_CHECK(win, MPIDI_CH4U_EPOTYPE_LOCK_ALL, mpi_errno, goto fn_fail);
+
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_Win_progress_fence(win));
+
+    MPIR_Assert(MPIDI_CH4U_WIN(win, lockQ) != NULL);
+
+    lockQ = (MPIDI_CH4U_win_lock_info *) MPIDI_CH4U_WIN(win, lockQ);
+
+    for (i = 0; i < win->comm_ptr->local_size; i++) {
+        MPIDI_OFI_win_control_t msg;
+        lockQ[i].done = 0;
+        lockQ[i].peer = i;
+        lockQ[i].win = win;
+        msg.type = MPIDI_OFI_CTRL_UNLOCKALL;
+        mpi_errno = MPIDI_OFI_do_control_win(&msg, lockQ[i].peer, win, 1, 1);
+
+        if (mpi_errno != MPI_SUCCESS)
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+
+        if (MPIDI_CH4U_WIN(win, sync).lock.remote.allLocked == 1)
+            lockQ[i].done = 1;
+    }
+
+    MPIDI_OFI_PROGRESS_WHILE(MPIDI_CH4U_WIN(win, sync).lock.remote.allLocked);
+
+    MPIDI_CH4U_WIN(win, sync).origin_epoch_type = MPIDI_CH4U_EPOTYPE_NONE;
+    MPIDI_CH4U_WIN(win, sync).target_epoch_type = MPIDI_CH4U_EPOTYPE_NONE;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_UNLOCK_ALL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_create_dynamic
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_win_create_dynamic - create a dynamic window
+ * (flavor MPI_WIN_FLAVOR_DYNAMIC, memory model MPI_WIN_UNIFIED).
+ *
+ *   info, comm - info hints and communicator of the window
+ *   win_ptr    - receives the new window object
+ *
+ * The window is initialized with base MPI_BOTTOM, displacement unit 1 and
+ * a length of UINTPTR_MAX - MPI_BOTTOM.  MPIDI_OFI_win_allgather() and a
+ * barrier on comm follow.
+ *
+ * Returns MPI_SUCCESS or the error code of the first failing step.  All
+ * steps report through mpi_errno, so a failure in window initialization
+ * or in the allgather is no longer returned as MPI_SUCCESS.
+ */
+static inline int MPIDI_NM_win_create_dynamic(MPIR_Info * info,
+                                              MPIR_Comm * comm, MPIR_Win ** win_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    MPIR_Win *win;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_CREATE_DYNAMIC);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_CREATE_DYNAMIC);
+
+    mpi_errno = MPIDI_OFI_win_init((uintptr_t) UINTPTR_MAX - (uintptr_t) MPI_BOTTOM,
+                                   1, win_ptr, info, comm, MPI_WIN_FLAVOR_DYNAMIC,
+                                   MPI_WIN_UNIFIED);
+
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+    win = *win_ptr;
+    win->base = MPI_BOTTOM;
+
+    mpi_errno = MPIDI_OFI_win_allgather(win, win->base, 1);
+
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+    mpi_errno = MPIR_Barrier_impl(comm, &errflag);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_CREATE_DYNAMIC);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_flush_local
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_win_flush_local - OFI netmod implementation of
+ * MPI_Win_flush_local.
+ *
+ * Checks that a lock epoch is open and then calls
+ * MPIDI_Win_progress_fence() on the window.  `rank` is not consulted: the
+ * call is made for the window as a whole.
+ */
+static inline int MPIDI_NM_win_flush_local(int rank, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_FLUSH_LOCAL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_FLUSH_LOCAL);
+
+    MPIDI_CH4U_EPOCH_LOCK_CHECK(win, mpi_errno, goto fn_fail);
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_Win_progress_fence(win));
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_FLUSH_LOCAL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_sync
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_win_sync - OFI netmod implementation of MPI_Win_sync.
+ *
+ * Checks that a lock epoch is open and then issues
+ * OPA_read_write_barrier().  No communication is performed here.
+ */
+static inline int MPIDI_NM_win_sync(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_SYNC);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_SYNC);
+
+    MPIDI_CH4U_EPOCH_LOCK_CHECK(win, mpi_errno, goto fn_fail);
+    OPA_read_write_barrier();
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_SYNC);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_flush_all
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_win_flush_all - OFI netmod implementation of
+ * MPI_Win_flush_all.
+ *
+ * Checks that a lock epoch is open and then calls
+ * MPIDI_Win_progress_fence() on the window.
+ */
+static inline int MPIDI_NM_win_flush_all(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_FLUSH_ALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_FLUSH_ALL);
+
+    MPIDI_CH4U_EPOCH_LOCK_CHECK(win, mpi_errno, goto fn_fail);
+
+    MPIDI_OFI_MPI_CALL_POP(MPIDI_Win_progress_fence(win));
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_FLUSH_ALL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_win_lock_all
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_win_lock_all - OFI netmod implementation of MPI_Win_lock_all.
+ *
+ *   assert - assertion flags (not used by this implementation)
+ *   win    - window to lock
+ *
+ * On first use a zero-initialized lockQ array with one entry per rank of
+ * the window communicator is allocated and cached in the window.  Each
+ * rank is then sent an MPIDI_OFI_CTRL_LOCKALLREQ control message asking
+ * for a shared lock, and the routine loops in MPIDI_OFI_PROGRESS_WHILE
+ * until sync.lock.remote.allLocked equals the communicator size.  On
+ * success the origin epoch type becomes MPIDI_CH4U_EPOTYPE_LOCK_ALL.
+ *
+ * Returns MPI_SUCCESS or an MPI error code; a failed control message is
+ * reported as MPI_ERR_RMA_SYNC.
+ */
+static inline int MPIDI_NM_win_lock_all(int assert, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int size;
+    int i;
+    MPIDI_CH4U_win_lock_info *lockQ;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_WIN_LOCK_ALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_WIN_LOCK_ALL);
+
+    MPIDI_CH4U_EPOCH_CHECK_TYPE(win, mpi_errno, goto fn_fail);
+
+    size = win->comm_ptr->local_size;
+
+    /* The queue is created lazily and reused by later lock_all calls. */
+    if (MPIDI_CH4U_WIN(win, lockQ) == NULL) {
+        MPIDI_CH4U_WIN(win, lockQ) =
+            (MPIDI_CH4U_win_lock_info *) MPL_calloc(size, sizeof(MPIDI_CH4U_win_lock_info));
+        MPIR_Assert(MPIDI_CH4U_WIN(win, lockQ) != NULL);
+    }
+
+    lockQ = MPIDI_CH4U_WIN(win, lockQ);
+
+    for (i = 0; i < size; i++) {
+        MPIDI_CH4U_win_lock_info *entry = &lockQ[i];
+        MPIDI_OFI_win_control_t msg;
+
+        entry->done = 0;
+        entry->peer = i;
+        entry->win = win;
+        entry->lock_type = MPI_LOCK_SHARED;
+
+        msg.type = MPIDI_OFI_CTRL_LOCKALLREQ;
+        msg.lock_type = MPI_LOCK_SHARED;
+        mpi_errno = MPIDI_OFI_do_control_win(&msg, entry->peer, entry->win, 1, 1);
+
+        if (mpi_errno != MPI_SUCCESS)
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+
+        if (MPIDI_CH4U_WIN(win, sync).lock.remote.allLocked == 1)
+            entry->done = 1;
+    }
+
+    /* Wait until every rank has been counted in allLocked. */
+    MPIDI_OFI_PROGRESS_WHILE(size != (int) MPIDI_CH4U_WIN(win, sync).lock.remote.allLocked);
+
+    MPIDI_CH4U_WIN(win, sync).origin_epoch_type = MPIDI_CH4U_EPOTYPE_LOCK_ALL;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_WIN_LOCK_ALL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#endif /* NETMOD_OFI_WIN_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/subconfigure.m4 b/src/mpid/ch4/netmod/ofi/subconfigure.m4
new file mode 100644
index 0000000..35e4a03
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/subconfigure.m4
@@ -0,0 +1,153 @@
+[#] start of __file__
+dnl MPICH_SUBCFG_AFTER=src/mpid/ch4
+
+dnl PREREQ stage of the CH4 OFI netmod configuration.
+dnl
+dnl When CH4 is being built and "ofi" is listed in $ch4_netmods, this
+dnl stage marks the netmod for building, defines HAVE_CH4_NETMOD_OFI and
+dnl MPIDI_BUILD_CH4_LOCALITY_INFO, and parses the colon-separated option
+dnl list given through --with-ch4-netmod-ofi-args into preprocessor flags
+dnl and config.h defines.  It also sets the BUILD_CH4_NETMOD_OFI automake
+dnl conditional.
+AC_DEFUN([PAC_SUBCFG_PREREQ_]PAC_SUBCFG_AUTO_SUFFIX,[
+    AM_COND_IF([BUILD_CH4],[
+        for net in $ch4_netmods ; do
+            AS_CASE([$net],[ofi],[build_ch4_netmod_ofi=yes])
+            if test $net = "ofi" ; then
+               AC_DEFINE(HAVE_CH4_NETMOD_OFI,1,[OFI netmod is built])
+               AC_DEFINE(MPIDI_BUILD_CH4_LOCALITY_INFO, 1, [CH4 should build locality info])
+            fi
+        done
+
+        AC_ARG_WITH(ch4-netmod-ofi-args,
+        [  --with-ch4-netmod-ofi-args=arg1:arg2:arg3
+        CH4 OFI netmod arguments:
+                scalable-endpoints - Use OFI scalable endpoint mode
+                av-table           - Use OFI AV table (logical addressing mode).  Default is av-map mode
+                mr-basic           - USE OFI MR_BASIC mode. Default is MR_SCALABLE mode.
+                direct-provider    - USE OFI FI_DIRECT to compile in a single OFI direct provider
+                no-tagged          - Do not use OFI fi_tagged interfaces.
+                no-data            - Disable immediate data field
+                no-stx-rma         - Disable per-window EP & counter for RMA
+                ],
+                [ofi_netmod_args=$withval],
+                [ofi_netmod_args=])
+
+dnl Parse the device arguments
+dnl IFS is switched to a colon so that the for loop below splits the
+dnl option list on colons; it is restored right after the loop.
+    SAVE_IFS=$IFS
+    IFS=':'
+    args_array=$ofi_netmod_args
+    do_scalable_endpoints=false
+    do_direct_provider=false
+    do_av_table=false
+    do_tagged=true
+    do_data=true
+    do_stx_rma=true
+    do_mr_scalable=true
+    echo "Parsing Arguments for OFI Netmod"
+    for arg in $args_array; do
+    case ${arg} in
+      scalable-endpoints)
+              do_scalable_endpoints=true
+              echo " ---> CH4::OFI Provider : $arg"
+              ;;
+dnl The help text documents this option as av-table.  The older av_table
+dnl spelling is still accepted so existing configure lines keep working.
+      av-table|av_table)
+              do_av_table=true
+              echo " ---> CH4::OFI Provider AV table : $arg"
+              ;;
+      direct-provider)
+              do_direct_provider=true
+              echo " ---> CH4::OFI Direct OFI Provider requested : $arg"
+              ;;
+      no-tagged)
+              do_tagged=false
+              echo " ---> CH4::OFI Disable fi_tagged interfaces : $arg"
+              ;;
+      no-data)
+              do_data=false
+              echo " ---> CH4::OFI Disable immediate data field : $arg"
+              ;;
+      no-stx-rma)
+              do_stx_rma=false
+              echo " ---> CH4::OFI Disable per-window EP & counter for RMA : $arg"
+              ;;
+      mr-basic)
+              do_mr_scalable=false
+              echo " ---> CH4::OFI Switching to MR_BASIC mode : $arg"
+              ;;
+    esac
+    done
+    IFS=$SAVE_IFS
+
+    if [test "$do_scalable_endpoints" = "true"]; then
+       AC_MSG_NOTICE([Enabling OFI netmod scalable endpoints])
+       PAC_APPEND_FLAG([-DMPIDI_OFI_CONFIG_USE_SCALABLE_ENDPOINTS], [CPPFLAGS])
+    fi
+
+    if [test "$do_av_table" = "true"]; then
+       AC_MSG_NOTICE([Enabling OFI netmod AV table])
+       PAC_APPEND_FLAG([-DMPIDI_OFI_CONFIG_USE_AV_TABLE], [CPPFLAGS])
+    fi
+
+    if [test "$do_direct_provider" = "true"]; then
+       AC_MSG_NOTICE([Enabling OFI netmod direct provider])
+       PAC_APPEND_FLAG([-DFABRIC_DIRECT],[CPPFLAGS])
+    fi
+
+    if [test "$do_tagged" = "true"]; then
+       AC_DEFINE([USE_OFI_TAGGED], [1], [Define to use fi_tagged interfaces])
+       AC_MSG_NOTICE([Enabling fi_tagged interfaces])
+    fi
+
+    if [test "$do_data" = "true"]; then
+       AC_DEFINE([USE_OFI_IMMEDIATE_DATA], [1], [Define to use immediate data field])
+       AC_MSG_NOTICE([Enabling immediate data field])
+    fi
+
+    if [test "$do_stx_rma" = "true"]; then
+       AC_DEFINE([USE_OFI_STX_RMA], [1], [Define to use per-window EP & counter])
+       AC_MSG_NOTICE([Enabling per-window EP & counter])
+    fi
+
+    if [test "$do_mr_scalable" = "true"]; then
+       AC_DEFINE([USE_OFI_MR_SCALABLE], [1], [Define to use MR_SCALABLE])
+       AC_MSG_NOTICE([Enabling MR_SCALABLE])
+    fi
+])
+    AM_CONDITIONAL([BUILD_CH4_NETMOD_OFI],[test "X$build_ch4_netmod_ofi" = "Xyes"])
+])dnl
+
+dnl BODY stage of the CH4 OFI netmod configuration.  Runs only when the
+dnl BUILD_CH4_NETMOD_OFI conditional is true.  It locates libfabric, either
+dnl an external installation or the copy embedded in the source tree, and
+dnl appends the libraries needed by the compiler wrappers to WRAPPER_LIBS.
+AC_DEFUN([PAC_SUBCFG_BODY_]PAC_SUBCFG_AUTO_SUFFIX,[
+AM_COND_IF([BUILD_CH4_NETMOD_OFI],[
+    AC_MSG_NOTICE([RUNNING CONFIGURE FOR ch4:ofi])
+
+dnl Both variables stay empty unless the embedded libfabric is selected.
+    ofisrcdir=""
+    AC_SUBST([ofisrcdir])
+    ofilib=""
+    AC_SUBST([ofilib])
+
+dnl LIBS is saved and restored around the probe so that the check itself
+dnl does not leave anything behind in LIBS.
+    PAC_SET_HEADER_LIB_PATH(libfabric)
+    PAC_PUSH_FLAG(LIBS)
+    PAC_CHECK_HEADER_LIB([rdma/fabric.h], [fabric], [fi_getinfo], [have_libfabric=yes], [have_libfabric=no])
+    PAC_POP_FLAG(LIBS)
+    if test "${have_libfabric}" = "yes" ; then
+        AC_MSG_NOTICE([CH4 OFI Netmod:  Using an external libfabric])
+        PAC_APPEND_FLAG([-lfabric],[WRAPPER_LIBS])
+dnl A location was given explicitly but the probe failed.  This is treated
+dnl as an error instead of silently falling back to the embedded copy.
+    elif test ! -z "${with_libfabric}" ; then
+        AC_MSG_ERROR([Provided libfabric installation (--with-libfabric=${with_libfabric}) could not be configured.])
+    else
+        # fallback to embedded libfabric
+        AC_MSG_NOTICE([CH4 OFI Netmod:  Using an embedded libfabric])
+        PAC_CONFIG_SUBDIR_ARGS([src/mpid/ch4/netmod/ofi/libfabric],[],[],[AC_MSG_ERROR(libfabric configure failed)])
+        PAC_APPEND_FLAG([-I${master_top_builddir}/src/mpid/ch4/netmod/ofi/libfabric/include], [CPPFLAGS])
+        PAC_APPEND_FLAG([-I${use_top_srcdir}/src/mpid/ch4/netmod/ofi/libfabric/include], [CPPFLAGS])
+
+        ofisrcdir="${master_top_builddir}/src/mpid/ch4/netmod/ofi/libfabric"
+        ofilib="src/mpid/ch4/netmod/ofi/libfabric/src/libfabric.la"
+    fi
+
+dnl The darwin branch omits -lrt; every other host gets it.
+    case $host_os in
+      darwin* )
+        PAC_APPEND_FLAG([-lstdc++ -ldl],[WRAPPER_LIBS])
+        ;;
+      *)
+        PAC_APPEND_FLAG([-lstdc++ -ldl -lrt],[WRAPPER_LIBS])
+        ;;
+    esac
+
+])dnl end AM_COND_IF(BUILD_CH4_NETMOD_OFI,...)
+])dnl end _BODY
+
+[#] end of __file__
diff --git a/src/mpid/ch4/netmod/ofi/util.c b/src/mpid/ch4/netmod/ofi/util.c
new file mode 100644
index 0000000..6c30026
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/util.c
@@ -0,0 +1,822 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#include <mpidimpl.h>
+#include "ofi_impl.h"
+#include "ofi_events.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_handle_cq_error_util
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Non-inline entry point that forwards `ret` to MPIDI_OFI_handle_cq_error()
+ * and returns whatever that call returns, bracketed by the function-trace
+ * enter/exit macros. */
+int MPIDI_OFI_handle_cq_error_util(ssize_t ret)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_OFI_HANDLE_CQ_ERROR);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_OFI_HANDLE_CQ_ERROR);
+    rc = MPIDI_OFI_handle_cq_error(ret);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_HANDLE_CQ_ERROR);
+
+    return rc;
+}
+
+/* Out-of-line wrapper: runs MPIDI_Progress_test() once and returns its result. */
+int MPIDI_OFI_progress_test_no_inline()
+{
+    int progress_rc = MPIDI_Progress_test();
+
+    return progress_rc;
+}
+
+typedef struct {
+    uint64_t key;               /* lookup key; the map functions hash sizeof(uint64_t) bytes of it */
+    void *value;                /* caller-supplied payload, stored and returned unchanged */
+    MPL_UT_hash_handle hh;      /* makes this structure hashable */
+} MPIDI_OFI_map_entry_t;
+
+typedef struct MPIDI_OFI_map_t {
+    MPIDI_OFI_map_entry_t *head;        /* hash table head; MPIDI_OFI_map_create starts it at NULL */
+
+} MPIDI_OFI_map_t;
+
+/* Allocate an empty map and return it through out_map as an opaque pointer.
+ * A failed allocation is caught by MPIR_Assert. */
+void MPIDI_OFI_map_create(void **out_map)
+{
+    MPIDI_OFI_map_t *fresh = MPL_malloc(sizeof(*fresh));
+
+    MPIR_Assert(fresh != NULL);
+    fresh->head = NULL;
+
+    *out_map = fresh;
+}
+
+/* Tear down a map created by MPIDI_OFI_map_create: clear the hash table and
+ * free the map object, all under the util mutex.  No per-entry MPL_free is
+ * issued here. */
+void MPIDI_OFI_map_destroy(void *in_map)
+{
+    MPIDI_OFI_map_t *doomed;
+
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+
+    doomed = (MPIDI_OFI_map_t *) in_map;
+    MPL_HASH_CLEAR(hh, doomed->head);
+    MPL_free(doomed);
+
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+}
+
+/* Insert the pair (id -> val) into the map.  A new entry is always allocated
+ * and added; the allocation is checked with MPIR_Assert.  The whole operation
+ * runs under the util mutex. */
+void MPIDI_OFI_map_set(void *in_map, uint64_t id, void *val)
+{
+    MPIDI_OFI_map_t *table;
+    MPIDI_OFI_map_entry_t *entry;
+
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+
+    table = (MPIDI_OFI_map_t *) in_map;
+
+    entry = MPL_malloc(sizeof(*entry));
+    MPIR_Assert(entry != NULL);
+    entry->value = val;
+    entry->key = id;
+
+    /* `key` names the struct member that holds the hash key */
+    MPL_HASH_ADD(hh, table->head, key, sizeof(uint64_t), entry);
+
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+}
+
+/* Remove the entry stored under `id` and free it.  The entry must exist:
+ * a missing key trips MPIR_Assert.  Runs under the util mutex. */
+void MPIDI_OFI_map_erase(void *in_map, uint64_t id)
+{
+    MPIDI_OFI_map_t *table;
+    MPIDI_OFI_map_entry_t *victim;
+
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+
+    table = (MPIDI_OFI_map_t *) in_map;
+    MPL_HASH_FIND(hh, table->head, &id, sizeof(uint64_t), victim);
+    MPIR_Assert(victim != NULL);
+
+    MPL_HASH_DELETE(hh, table->head, victim);
+    MPL_free(victim);
+
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+}
+
+/* Return the value stored under `id`, or MPIDI_OFI_MAP_NOT_FOUND when the key
+ * is absent.  The search runs under the util mutex. */
+void *MPIDI_OFI_map_lookup(void *in_map, uint64_t id)
+{
+    MPIDI_OFI_map_t *table;
+    MPIDI_OFI_map_entry_t *hit;
+    void *found;
+
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+
+    table = (MPIDI_OFI_map_t *) in_map;
+    MPL_HASH_FIND(hh, table->head, &id, sizeof(uint64_t), hit);
+
+    if (hit != NULL)
+        found = hit->value;
+    else
+        found = MPIDI_OFI_MAP_NOT_FOUND;
+
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+
+    return found;
+}
+
+typedef struct MPIDI_OFI_index_allocator_t {
+    int chunk_size;             /* number of uint64_t words added each time the bitmask grows */
+    int num_ints;               /* current number of uint64_t words in bitmask */
+    int start;                  /* offset added to every index handed out by the allocator */
+    int last_free_index;        /* word at which the next allocation scan begins */
+    uint64_t *bitmask;          /* one bit per index; a set bit means the index is free */
+} MPIDI_OFI_index_allocator_t;
+
+/* Create an index allocator whose first index is `start` and return it through
+ * indexmap as an opaque pointer.
+ *
+ * The allocator begins with chunk_size (128) 64-bit words, every bit set,
+ * i.e. every index free.  Both allocations are asserted before they are
+ * touched; previously the check ran only after the allocator had already been
+ * dereferenced, and the bitmask was never checked. */
+void MPIDI_OFI_index_allocator_create(void **indexmap, int start)
+{
+    MPIDI_OFI_index_allocator_t *allocator;
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+    allocator = MPL_malloc(sizeof(MPIDI_OFI_index_allocator_t));
+    MPIR_Assert(allocator != NULL);
+    allocator->chunk_size = 128;
+    allocator->num_ints = allocator->chunk_size;
+    allocator->start = start;
+    allocator->last_free_index = 0;
+    allocator->bitmask = MPL_malloc(sizeof(uint64_t) * allocator->num_ints);
+    MPIR_Assert(allocator->bitmask != NULL);
+    /* all bits set == all indices free */
+    memset(allocator->bitmask, 0xFF, sizeof(uint64_t) * allocator->num_ints);
+    *indexmap = allocator;
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+}
+
+#define MPIDI_OFI_INDEX_CALC(val,nval,shift,mask) \
+    if ((val & mask) == 0) {                              \
+        val >>= shift##ULL;                               \
+        nval += shift;                                    \
+    }
+int MPIDI_OFI_index_allocator_alloc(void *indexmap)
+{
+    int i;
+    MPIDI_OFI_index_allocator_t *allocator = indexmap;
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+    for (i = allocator->last_free_index; i < allocator->num_ints; i++) {
+        if (allocator->bitmask[i]) {
+            register uint64_t val, nval;
+            val = allocator->bitmask[i];
+            nval = 2;
+            MPIDI_OFI_INDEX_CALC(val, nval, 32, 0xFFFFFFFFULL);
+            MPIDI_OFI_INDEX_CALC(val, nval, 16, 0xFFFFULL);
+            MPIDI_OFI_INDEX_CALC(val, nval, 8, 0xFFULL);
+            MPIDI_OFI_INDEX_CALC(val, nval, 4, 0xFULL);
+            MPIDI_OFI_INDEX_CALC(val, nval, 2, 0x3ULL);
+            nval -= val & 0x1ULL;
+            allocator->bitmask[i] &= ~(0x1ULL << (nval - 1));
+            allocator->last_free_index = i;
+            MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+            return i * sizeof(uint64_t) * 8 + (nval - 1) + allocator->start;
+        }
+        if (i == allocator->num_ints - 1) {
+            allocator->num_ints += allocator->chunk_size;
+            allocator->bitmask = MPL_realloc(allocator->bitmask,
+                                             sizeof(uint64_t) * allocator->num_ints);
+            memset(&allocator->bitmask[i + 1], 0xFF, sizeof(uint64_t) * allocator->chunk_size);
+        }
+    }
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+    return -1;
+}
+
+/* Return `index` (a value previously produced by
+ * MPIDI_OFI_index_allocator_alloc) to the free pool.
+ *
+ * The allocator hands out word * 64 + bit + start, so the word and the bit
+ * are the quotient and remainder of (index - start) by 64.  The word was
+ * previously computed from (index + 1 - start), which selected the *next*
+ * word whenever the bit position was 63 and therefore freed the wrong bit. */
+void MPIDI_OFI_index_allocator_free(void *indexmap, int index)
+{
+    int int_index, bitpos, numbits, offset;
+    MPIDI_OFI_index_allocator_t *allocator = indexmap;
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+    numbits = sizeof(uint64_t) * 8;
+    offset = index - allocator->start;
+    int_index = offset / numbits;
+    bitpos = offset % numbits;
+
+    /* let the next allocation scan start no later than the word just freed */
+    allocator->last_free_index = MPL_MIN(int_index, allocator->last_free_index);
+    allocator->bitmask[int_index] |= (0x1ULL << bitpos);
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+}
+
+/* Release an index allocator: free its bitmask, then the allocator object,
+ * under the util mutex. */
+void MPIDI_OFI_index_allocator_destroy(void *indexmap)
+{
+    MPIDI_OFI_index_allocator_t *doomed;
+
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+
+    doomed = (MPIDI_OFI_index_allocator_t *) indexmap;
+    MPL_free(doomed->bitmask);
+    MPL_free(doomed);
+
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_OFI_THREAD_UTIL_MUTEX);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_OFI_win_lock_advance
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Grant the request at the head of the window's local lock queue if it is
+ * compatible with the current state, then recurse to try the next one.
+ *
+ * A request is granted when nobody holds the lock (count == 0) or when both
+ * the current holders and the request are MPI_LOCK_SHARED.  Granting dequeues
+ * the request, bumps the holder count, records the lock type and sends the
+ * matching LOCKACK / LOCKALLACK control message to the requester.
+ *
+ * Returns MPI_SUCCESS, or an MPI_ERR_RMA_SYNC error code when the control
+ * message cannot be sent or the queued request has an unknown mtype.
+ *
+ * The dequeued request is freed on every path (it used to leak on the error
+ * paths), and a failed LOCKALLACK send is now reported instead of being
+ * overwritten by the result of the recursive call. */
+static inline int MPIDI_OFI_win_lock_advance(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    struct MPIDI_CH4U_win_sync_lock *slock = &MPIDI_CH4U_WIN(win, sync).lock;
+    struct MPIDI_CH4U_win_queue *q = &slock->local.requested;
+
+    if ((q->head != NULL) &&
+        ((slock->local.count == 0) ||
+         ((slock->local.type == MPI_LOCK_SHARED) && (q->head->type == MPI_LOCK_SHARED)))) {
+        struct MPIDI_CH4U_win_lock *lock = q->head;
+        MPIDI_OFI_win_control_t info;
+
+        /* unlink the request from the queue */
+        q->head = lock->next;
+
+        if (q->head == NULL)
+            q->tail = NULL;
+
+        ++slock->local.count;
+        slock->local.type = lock->type;
+
+        if (lock->mtype == MPIDI_OFI_REQUEST_LOCK)
+            info.type = MPIDI_OFI_CTRL_LOCKACK;
+        else if (lock->mtype == MPIDI_OFI_REQUEST_LOCKALL)
+            info.type = MPIDI_OFI_CTRL_LOCKALLACK;
+        else {
+            /* the request is already off the queue, so nobody else can free it */
+            MPL_free(lock);
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+        }
+
+        mpi_errno = MPIDI_OFI_do_control_win(&info, lock->rank, win, 1, 0);
+        MPL_free(lock);
+
+        if (mpi_errno != MPI_SUCCESS)
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+
+        /* several shared requests may be grantable back to back */
+        mpi_errno = MPIDI_OFI_win_lock_advance(win);
+
+        if (mpi_errno != MPI_SUCCESS)
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/* Handle an incoming LOCKREQ / LOCKALLREQ control message: build a queue
+ * entry describing the request, append it to the window's local lock queue
+ * and try to grant queued requests via MPIDI_OFI_win_lock_advance().
+ *
+ * info - control message; type, origin_rank and lock_type are read
+ * win  - window the request targets
+ * peer - not used by this function
+ *
+ * Returns MPI_SUCCESS or an MPI_ERR_RMA_SYNC error code.  The queue entry
+ * allocation is now asserted before it is dereferenced.  For any other
+ * info->type the entry's mtype keeps the zero value left by calloc. */
+static inline int MPIDI_OFI_win_lock_request_proc(const MPIDI_OFI_win_control_t * info,
+                                                  MPIR_Win * win, unsigned peer)
+{
+    int mpi_errno;
+    struct MPIDI_CH4U_win_lock *lock =
+        (struct MPIDI_CH4U_win_lock *) MPL_calloc(1, sizeof(struct MPIDI_CH4U_win_lock));
+    MPIR_Assert(lock != NULL);
+
+    if (info->type == MPIDI_OFI_CTRL_LOCKREQ)
+        lock->mtype = MPIDI_OFI_REQUEST_LOCK;
+    else if (info->type == MPIDI_OFI_CTRL_LOCKALLREQ)
+        lock->mtype = MPIDI_OFI_REQUEST_LOCKALL;
+
+    lock->rank = info->origin_rank;
+    lock->type = info->lock_type;
+    struct MPIDI_CH4U_win_queue *q = &MPIDI_CH4U_WIN(win, sync).lock.local.requested;
+    /* head and tail must be both NULL (empty queue) or both non-NULL */
+    MPIR_Assert((q->head != NULL) ^ (q->tail == NULL));
+
+    if (q->tail == NULL)
+        q->head = lock;
+    else
+        q->tail->next = lock;
+
+    q->tail = lock;
+
+    mpi_errno = MPIDI_OFI_win_lock_advance(win);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/* Record a lock acknowledgement on the origin side: a LOCKACK marks the
+ * remote lock as held, a LOCKALLACK adds one to the lock-all counter.  Any
+ * other message type is ignored.  `peer` is not used. */
+static inline void MPIDI_OFI_win_lock_ack_proc(const MPIDI_OFI_win_control_t * info,
+                                               MPIR_Win * win, unsigned peer)
+{
+    switch (info->type) {
+    case MPIDI_OFI_CTRL_LOCKACK:
+        MPIDI_CH4U_WIN(win, sync).lock.remote.locked = 1;
+        break;
+
+    case MPIDI_OFI_CTRL_LOCKALLACK:
+        MPIDI_CH4U_WIN(win, sync).lock.remote.allLocked += 1;
+        break;
+
+    default:
+        break;
+    }
+}
+
+
+/* Handle an incoming unlock request on the target side: drop one holder from
+ * the local lock, give queued requests a chance to be granted, then send an
+ * UNLOCKACK control message back to `peer`.
+ *
+ * Returns MPI_SUCCESS or an MPI_ERR_RMA_SYNC error code.  `info` is not read. */
+static inline int MPIDI_OFI_win_unlock_proc(const MPIDI_OFI_win_control_t * info,
+                                            MPIR_Win * win, unsigned peer)
+{
+    int mpi_errno;
+    MPIDI_OFI_win_control_t ack;
+
+    --MPIDI_CH4U_WIN(win, sync).lock.local.count;
+    MPIR_Assert((int) MPIDI_CH4U_WIN(win, sync).lock.local.count >= 0);
+
+    mpi_errno = MPIDI_OFI_win_lock_advance(win);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+    }
+
+    ack.type = MPIDI_OFI_CTRL_UNLOCKACK;
+    mpi_errno = MPIDI_OFI_do_control_win(&ack, peer, win, 1, 0);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/* Count one incoming COMPLETE control message on the window (sync.sc.count).
+ * `info` and `peer` are not used. */
+static inline void MPIDI_OFI_win_complete_proc(const MPIDI_OFI_win_control_t * info,
+                                               MPIR_Win * win, unsigned peer)
+{
+    MPIDI_CH4U_WIN(win, sync).sc.count += 1;
+}
+
+/* Count one incoming POST control message on the window (sync.pw.count).
+ * `info` and `peer` are not used. */
+static inline void MPIDI_OFI_win_post_proc(const MPIDI_OFI_win_control_t * info,
+                                           MPIR_Win * win, unsigned peer)
+{
+    MPIDI_CH4U_WIN(win, sync).pw.count += 1;
+}
+
+
+/* Handle an unlock acknowledgement on the origin side.  In a LOCK epoch the
+ * remote lock flag is cleared; in a LOCK_ALL epoch the lock-all counter (which
+ * must be positive) is decremented; any other epoch type trips an assertion.
+ * `info` and `peer` are not used. */
+static inline void MPIDI_OFI_win_unlock_done_proc(const MPIDI_OFI_win_control_t * info,
+                                                  MPIR_Win * win, unsigned peer)
+{
+    if (MPIDI_CH4U_WIN(win, sync).origin_epoch_type == MPIDI_CH4U_EPOTYPE_LOCK) {
+        MPIDI_CH4U_WIN(win, sync).lock.remote.locked = 0;
+        return;
+    }
+
+    if (MPIDI_CH4U_WIN(win, sync).origin_epoch_type != MPIDI_CH4U_EPOTYPE_LOCK_ALL) {
+        MPIR_Assert(0);
+        return;
+    }
+
+    MPIR_Assert((int) MPIDI_CH4U_WIN(win, sync).lock.remote.allLocked > 0);
+    MPIDI_CH4U_WIN(win, sync).lock.remote.allLocked -= 1;
+}
+
+/* Drop the huge-receive bookkeeping kept for info->origin_rank on the
+ * communicator named by the control message: look the record up, remove it
+ * from the communicator's huge_recv_counters map and free it.
+ *
+ * NOTE(review): the communicator id is composed here from endpoint_id and
+ * comm_id, whereas MPIDI_OFI_get_huge passes comm_id alone -- confirm which
+ * form the senders of each message type intend. */
+static inline void MPIDI_OFI_get_huge_cleanup(MPIDI_OFI_send_control_t * info)
+{
+    MPIR_Comm *comm_ptr;
+    MPIDI_OFI_huge_recv_t *stale;
+    uint64_t mapid;
+
+    /* Resolve the communicator */
+    mapid = info->comm_id | ((uint64_t) info->endpoint_id << 32);
+    comm_ptr = MPIDI_CH4U_context_id_to_comm(mapid);
+
+    /* Fetch the per-origin record, unlink it, then release it */
+    stale = (MPIDI_OFI_huge_recv_t *)
+        MPIDI_OFI_map_lookup(MPIDI_OFI_COMM(comm_ptr).huge_recv_counters, info->origin_rank);
+    MPIDI_OFI_map_erase(MPIDI_OFI_COMM(comm_ptr).huge_recv_counters, info->origin_rank);
+    MPL_free(stale);
+}
+
+/* Start (or continue) fetching a huge message announced by a HUGE control
+ * message.
+ *
+ * The per-origin record is looked up in the communicator's
+ * huge_recv_counters map and created on first use.  It is then filled in with
+ * the event id, the starting offset (MPIDI_Global.max_send), a copy of the
+ * control message and the communicator, and handed to
+ * MPIDI_OFI_get_huge_event().
+ *
+ * The record allocation now uses the (count, size) argument order of calloc
+ * and is asserted before the record is stored and written to. */
+static inline void MPIDI_OFI_get_huge(MPIDI_OFI_send_control_t * info)
+{
+    MPIDI_OFI_huge_recv_t *recv;
+    MPIR_Comm *comm_ptr;
+    /* Look up the communicator */
+    comm_ptr = MPIDI_CH4U_context_id_to_comm(info->comm_id);
+    /* Look up the per destination receive queue object */
+    recv =
+        (MPIDI_OFI_huge_recv_t *) MPIDI_OFI_map_lookup(MPIDI_OFI_COMM(comm_ptr).huge_recv_counters,
+                                                       info->origin_rank);
+    if (recv == MPIDI_OFI_MAP_NOT_FOUND) {
+        recv = (MPIDI_OFI_huge_recv_t *) MPL_calloc(1, sizeof(*recv));
+        MPIR_Assert(recv != NULL);
+        MPIDI_OFI_map_set(MPIDI_OFI_COMM(comm_ptr).huge_recv_counters, info->origin_rank, recv);
+    }
+    recv->event_id = MPIDI_OFI_EVENT_GET_HUGE;
+    recv->cur_offset = MPIDI_Global.max_send;
+    recv->remote_info = *info;
+    recv->comm_ptr = comm_ptr;
+    /* the record is passed where a request pointer is expected */
+    MPIDI_OFI_get_huge_event(NULL, (MPIR_Request *) recv);
+}
+
+/* Active-message handler for OFI control messages.
+ *
+ * am_hdr is interpreted according to its type field.  The three huge-message
+ * types are dispatched first and return immediately.  Every other type is a
+ * window control message: the window is looked up by win_id in
+ * MPIDI_Global.win_map (it must exist) and the message is routed to the
+ * matching win_*_proc helper.  An unknown type is reported on stderr and
+ * asserted.
+ *
+ * *req and *cmpl_handler_fn are always set to NULL; data, data_sz and
+ * is_contig are not touched.  Returns MPI_SUCCESS or the error code of the
+ * lock-request / unlock helper. */
+int MPIDI_OFI_control_handler(void *am_hdr,
+                              void **data,
+                              size_t * data_sz,
+                              int *is_contig,
+                              MPIDI_NM_am_completion_handler_fn * cmpl_handler_fn,
+                              MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int senderrank;
+    MPIR_Win *win;
+    MPIDI_OFI_win_control_t *control = (MPIDI_OFI_win_control_t *) am_hdr;
+    MPIDI_OFI_send_control_t *ctrlsend = (MPIDI_OFI_send_control_t *) am_hdr;
+
+    *req = NULL;
+    *cmpl_handler_fn = NULL;
+
+    /* Huge-message traffic carries no window and is finished right here */
+    switch (control->type) {
+    case MPIDI_OFI_CTRL_HUGEACK:
+        MPIDI_OFI_dispatch_function(NULL, ctrlsend->ackreq, 0);
+        return mpi_errno;
+
+    case MPIDI_OFI_CTRL_HUGE:
+        MPIDI_OFI_get_huge(ctrlsend);
+        return mpi_errno;
+
+    case MPIDI_OFI_CTRL_HUGE_CLEANUP:
+        MPIDI_OFI_get_huge_cleanup(ctrlsend);
+        return mpi_errno;
+    }
+
+    /* Everything else targets a window */
+    senderrank = control->origin_rank;
+    win = (MPIR_Win *) MPIDI_OFI_map_lookup(MPIDI_Global.win_map, control->win_id);
+    MPIR_Assert(win != MPIDI_OFI_MAP_NOT_FOUND);
+
+    switch (control->type) {
+    case MPIDI_OFI_CTRL_LOCKREQ:
+    case MPIDI_OFI_CTRL_LOCKALLREQ:
+        mpi_errno = MPIDI_OFI_win_lock_request_proc(control, win, senderrank);
+        break;
+
+    case MPIDI_OFI_CTRL_LOCKACK:
+    case MPIDI_OFI_CTRL_LOCKALLACK:
+        MPIDI_OFI_win_lock_ack_proc(control, win, senderrank);
+        break;
+
+    case MPIDI_OFI_CTRL_UNLOCK:
+    case MPIDI_OFI_CTRL_UNLOCKALL:
+        mpi_errno = MPIDI_OFI_win_unlock_proc(control, win, senderrank);
+        break;
+
+    case MPIDI_OFI_CTRL_UNLOCKACK:
+    case MPIDI_OFI_CTRL_UNLOCKALLACK:
+        MPIDI_OFI_win_unlock_done_proc(control, win, senderrank);
+        break;
+
+    case MPIDI_OFI_CTRL_COMPLETE:
+        MPIDI_OFI_win_complete_proc(control, win, senderrank);
+        break;
+
+    case MPIDI_OFI_CTRL_POST:
+        MPIDI_OFI_win_post_proc(control, win, senderrank);
+        break;
+
+    default:
+        fprintf(stderr, "Bad control type: 0x%08x  %d\n", control->type, control->type);
+        MPIR_Assert(0);
+    }
+
+    return mpi_errno;
+}
+
+
+/* MPI Datatype Processing for RMA */
+/* Each predicate groups the MPI datatype handles that share one
+ * representation, so that mpi_to_ofi() can map the whole group to a single
+ * libfabric datatype.  Arguments are fully parenthesized. */
+#define isS_INT(x) ((x)==MPI_INTEGER ||                                \
+    (x) == MPI_INT32_T || (x) == MPI_INTEGER4 ||       \
+                     (x) == MPI_INT)
+#define isUS_INT(x) ((x) == MPI_UINT32_T || (x) == MPI_UNSIGNED)
+#define isS_SHORT(x) ((x) == MPI_SHORT || (x) == MPI_INT16_T ||        \
+                       (x) == MPI_INTEGER2)
+#define isUS_SHORT(x) ((x) == MPI_UNSIGNED_SHORT || (x) == MPI_UINT16_T)
+#define isS_CHAR(x) ((x) == MPI_SIGNED_CHAR || (x) == MPI_INT8_T ||    \
+                      (x) == MPI_INTEGER1 || (x) == MPI_CHAR)
+#define isUS_CHAR(x) ((x) == MPI_BYTE ||                               \
+                       (x) == MPI_UNSIGNED_CHAR || (x) == MPI_UINT8_T)
+#define isS_LONG(x) ((x) == MPI_LONG || (x) == MPI_AINT)
+#define isUS_LONG(x) ((x) == MPI_UNSIGNED_LONG)
+#define isS_LONG_LONG(x) ((x) == MPI_INT64_T || (x) == MPI_OFFSET ||   \
+    (x) == MPI_INTEGER8 || (x) == MPI_LONG_LONG || \
+                           (x) == MPI_LONG_LONG_INT || (x) == MPI_COUNT)
+#define isUS_LONG_LONG(x) ((x) == MPI_UINT64_T || (x) == MPI_UNSIGNED_LONG_LONG)
+#define isFLOAT(x) ((x) == MPI_FLOAT || (x) == MPI_REAL)
+#define isDOUBLE(x) ((x) == MPI_DOUBLE || (x) == MPI_DOUBLE_PRECISION)
+#define isLONG_DOUBLE(x) ((x) == MPI_LONG_DOUBLE)
+#define isLOC_TYPE(x) ((x) == MPI_2REAL || (x) == MPI_2DOUBLE_PRECISION || \
+    (x) == MPI_2INTEGER || (x) == MPI_FLOAT_INT ||  \
+    (x) == MPI_DOUBLE_INT || (x) == MPI_LONG_INT || \
+    (x) == MPI_2INT || (x) == MPI_SHORT_INT ||      \
+                        (x) == MPI_LONG_DOUBLE_INT)
+#define isBOOL(x) ((x) == MPI_C_BOOL)
+#define isLOGICAL(x) ((x) == MPI_LOGICAL)
+/* MPI_COMPLEX8 is the 8-byte complex (two 4-byte reals), so it belongs with
+ * the single-precision group; the 16-byte MPI_COMPLEX16 is the
+ * double-precision one. */
+#define isSINGLE_COMPLEX(x) ((x) == MPI_COMPLEX || (x) == MPI_C_FLOAT_COMPLEX || \
+                              (x) == MPI_COMPLEX8)
+#define isDOUBLE_COMPLEX(x) ((x) == MPI_DOUBLE_COMPLEX || (x) == MPI_COMPLEX16 || \
+                              (x) == MPI_C_DOUBLE_COMPLEX)
+
+
+#undef FUNCNAME
+#define FUNCNAME mpi_to_ofi
+#undef FCNAME
+#define FCNAME DECL_FUNC(mpi_to_ofi)
+/* Translate an (MPI datatype, MPI op) pair into the libfabric atomic
+ * datatype and operation.
+ *
+ * On return *fi_dt holds the mapped datatype, or FI_DATATYPE_LAST when the
+ * datatype has no mapping (pair types are deliberately unmapped), and *fi_op
+ * holds the mapped operation, or FI_ATOMIC_OP_LAST when the operation has no
+ * mapping or the datatype was rejected.  The logical operations are refused
+ * for MPI_LONG_DOUBLE.  MPI_OP_NULL stands for compare-and-swap.
+ *
+ * Returns MPI_SUCCESS when both halves were mapped, -1 otherwise. */
+static inline int mpi_to_ofi(MPI_Datatype dt, enum fi_datatype *fi_dt, MPI_Op op, enum fi_op *fi_op)
+{
+    enum fi_datatype mapped_dt = FI_DATATYPE_LAST;
+
+    /* Pessimistic defaults: both outputs read "unsupported" until proven otherwise */
+    *fi_dt = FI_DATATYPE_LAST;
+    *fi_op = FI_ATOMIC_OP_LAST;
+
+    /* ---- datatype ---- */
+    if (isS_INT(dt)) {
+        mapped_dt = FI_INT32;
+    } else if (isUS_INT(dt)) {
+        mapped_dt = FI_UINT32;
+    } else if (isFLOAT(dt)) {
+        mapped_dt = FI_FLOAT;
+    } else if (isDOUBLE(dt)) {
+        mapped_dt = FI_DOUBLE;
+    } else if (isLONG_DOUBLE(dt)) {
+        mapped_dt = FI_LONG_DOUBLE;
+    } else if (isS_CHAR(dt)) {
+        mapped_dt = FI_INT8;
+    } else if (isUS_CHAR(dt)) {
+        mapped_dt = FI_UINT8;
+    } else if (isS_SHORT(dt)) {
+        mapped_dt = FI_INT16;
+    } else if (isUS_SHORT(dt)) {
+        mapped_dt = FI_UINT16;
+    } else if (isS_LONG(dt)) {
+        mapped_dt = FI_INT64;
+    } else if (isUS_LONG(dt)) {
+        mapped_dt = FI_UINT64;
+    } else if (isS_LONG_LONG(dt)) {
+        mapped_dt = FI_INT64;
+    } else if (isUS_LONG_LONG(dt)) {
+        mapped_dt = FI_UINT64;
+    } else if (isSINGLE_COMPLEX(dt)) {
+        mapped_dt = FI_FLOAT_COMPLEX;
+    } else if (isDOUBLE_COMPLEX(dt)) {
+        mapped_dt = FI_DOUBLE_COMPLEX;
+    } else if (isLOC_TYPE(dt)) {
+        /* value/location pairs have no libfabric counterpart */
+        mapped_dt = FI_DATATYPE_LAST;
+    } else if (isLOGICAL(dt)) {
+        mapped_dt = FI_UINT32;
+    } else if (isBOOL(dt)) {
+        mapped_dt = FI_UINT8;
+    }
+
+    *fi_dt = mapped_dt;
+    if (mapped_dt == FI_DATATYPE_LAST)
+        return -1;
+
+    /* ---- operation ---- */
+    switch (op) {
+    case MPI_SUM:
+        *fi_op = FI_SUM;
+        break;
+
+    case MPI_PROD:
+        *fi_op = FI_PROD;
+        break;
+
+    case MPI_MAX:
+        *fi_op = FI_MAX;
+        break;
+
+    case MPI_MIN:
+        *fi_op = FI_MIN;
+        break;
+
+    case MPI_BAND:
+        *fi_op = FI_BAND;
+        break;
+
+    case MPI_BOR:
+        *fi_op = FI_BOR;
+        break;
+
+    case MPI_BXOR:
+        *fi_op = FI_BXOR;
+        break;
+
+    case MPI_LAND:
+        if (isLONG_DOUBLE(dt))
+            return -1;
+        *fi_op = FI_LAND;
+        break;
+
+    case MPI_LOR:
+        if (isLONG_DOUBLE(dt))
+            return -1;
+        *fi_op = FI_LOR;
+        break;
+
+    case MPI_LXOR:
+        if (isLONG_DOUBLE(dt))
+            return -1;
+        *fi_op = FI_LXOR;
+        break;
+
+    case MPI_REPLACE:
+        *fi_op = FI_ATOMIC_WRITE;
+        break;
+
+    case MPI_NO_OP:
+        *fi_op = FI_ATOMIC_READ;
+        break;
+
+    case MPI_OP_NULL:
+        *fi_op = FI_CSWAP;
+        break;
+
+    default:
+        return -1;
+    }
+
+    return MPI_SUCCESS;
+}
+
+static MPI_Datatype mpi_dtypes[] = {    /* create_dt_map() uses entry i as the datatype for row i of win_op_table */
+    MPI_CHAR, MPI_UNSIGNED_CHAR, MPI_SIGNED_CHAR, MPI_BYTE,
+    MPI_WCHAR, MPI_SHORT, MPI_UNSIGNED_SHORT, MPI_INT,
+    MPI_UNSIGNED, MPI_LONG, MPI_UNSIGNED_LONG, MPI_FLOAT,
+    MPI_DOUBLE, MPI_LONG_DOUBLE, MPI_LONG_LONG, MPI_UNSIGNED_LONG_LONG,
+    MPI_PACKED, MPI_LB, MPI_UB, MPI_2INT,
+
+    MPI_INT8_T, MPI_INT16_T, MPI_INT32_T,
+    MPI_INT64_T, MPI_UINT8_T, MPI_UINT16_T,
+    MPI_UINT32_T, MPI_UINT64_T, MPI_C_BOOL,
+    MPI_C_FLOAT_COMPLEX, MPI_C_DOUBLE_COMPLEX, MPI_C_LONG_DOUBLE_COMPLEX,
+    /* address/offset/count types */
+    MPI_AINT, MPI_OFFSET, MPI_COUNT,
+    /* Fortran types */
+#ifdef HAVE_FORTRAN_BINDING
+    MPI_COMPLEX, MPI_DOUBLE_COMPLEX, MPI_LOGICAL, MPI_REAL,
+    MPI_DOUBLE_PRECISION, MPI_INTEGER, MPI_2INTEGER,
+
+#ifdef MPICH_DEFINE_2COMPLEX
+    MPI_2COMPLEX, MPI_2DOUBLE_COMPLEX,
+#endif
+    MPI_2REAL, MPI_2DOUBLE_PRECISION, MPI_CHARACTER,
+    MPI_REAL4, MPI_REAL8, MPI_REAL16, MPI_COMPLEX8, MPI_COMPLEX16,
+    MPI_COMPLEX32, MPI_INTEGER1, MPI_INTEGER2, MPI_INTEGER4, MPI_INTEGER8,
+    MPI_INTEGER16,
+#endif
+    MPI_FLOAT_INT, MPI_DOUBLE_INT,
+    MPI_LONG_INT, MPI_SHORT_INT,
+    MPI_LONG_DOUBLE_INT,
+    (MPI_Datatype) - 1,         /* trailing marker entry; presumably an end-of-list sentinel -- confirm against users */
+};
+
+static MPI_Op mpi_ops[] = {     /* create_dt_map() uses entry j as the operation for column j of win_op_table */
+    MPI_MAX, MPI_MIN, MPI_SUM, MPI_PROD,
+    MPI_LAND, MPI_BAND, MPI_LOR, MPI_BOR,
+    MPI_LXOR, MPI_BXOR, MPI_MINLOC, MPI_MAXLOC,        /* MINLOC/MAXLOC have no case in mpi_to_ofi() and fall to its default */
+    MPI_REPLACE, MPI_NO_OP, MPI_OP_NULL,
+};
+
+#define _TBL MPIDI_Global.win_op_table[i][j]
+#define CHECK_ATOMIC(fcn,field1,field2)            \
+  atomic_count = 0;                                \
+  ret = fcn(MPIDI_OFI_EP_TX_RMA(0),                          \
+    fi_dt,                                 \
+    fi_op,                                 \
+            &atomic_count);                        \
+  if (ret == 0 && atomic_count != 0)                \
+    {                                              \
+  _TBL.field1 = 1;                             \
+  _TBL.field2 = atomic_count;                  \
+    }
+
+static inline void create_dt_map()
+{
+    int i, j;
+    size_t dtsize[FI_DATATYPE_LAST];
+    dtsize[FI_INT8] = sizeof(int8_t);
+    dtsize[FI_UINT8] = sizeof(uint8_t);
+    dtsize[FI_INT16] = sizeof(int16_t);
+    dtsize[FI_UINT16] = sizeof(uint16_t);
+    dtsize[FI_INT32] = sizeof(int32_t);
+    dtsize[FI_UINT32] = sizeof(uint32_t);
+    dtsize[FI_INT64] = sizeof(int64_t);
+    dtsize[FI_UINT64] = sizeof(uint64_t);
+    dtsize[FI_FLOAT] = sizeof(float);
+    dtsize[FI_DOUBLE] = sizeof(double);
+    dtsize[FI_FLOAT_COMPLEX] = sizeof(float complex);
+    dtsize[FI_DOUBLE_COMPLEX] = sizeof(double complex);
+    dtsize[FI_LONG_DOUBLE] = sizeof(long double);
+    dtsize[FI_LONG_DOUBLE_COMPLEX] = sizeof(long double complex);
+
+    for (i = 0; i < MPIDI_OFI_DT_SIZES; i++)
+        for (j = 0; j < MPIDI_OFI_OP_SIZES; j++) {
+            enum fi_datatype fi_dt = (enum fi_datatype) -1;
+            enum fi_op fi_op = (enum fi_op) -1;
+            mpi_to_ofi(mpi_dtypes[i], &fi_dt, mpi_ops[j], &fi_op);
+            MPIR_Assert(fi_dt != (enum fi_datatype) -1);
+            MPIR_Assert(fi_op != (enum fi_op) -1);
+            _TBL.dt = fi_dt;
+            _TBL.op = fi_op;
+            _TBL.atomic_valid = 0;
+            _TBL.max_atomic_count = 0;
+            _TBL.max_fetch_atomic_count = 0;
+            _TBL.max_compare_atomic_count = 0;
+            ssize_t ret;
+            size_t atomic_count;
+
+            if (fi_dt != FI_DATATYPE_LAST && fi_op != FI_ATOMIC_OP_LAST) {
+                CHECK_ATOMIC(fi_atomicvalid, atomic_valid, max_atomic_count);
+                CHECK_ATOMIC(fi_fetch_atomicvalid, fetch_atomic_valid, max_fetch_atomic_count);
+                CHECK_ATOMIC(fi_compare_atomicvalid, compare_atomic_valid,
+                             max_compare_atomic_count);
+                _TBL.dtsize = dtsize[fi_dt];
+            }
+        }
+}
+
+/* Store the current value of *index in the OFI-specific part of `datatype`'s
+ * descriptor, then advance *index by one.
+ *
+ * When `datatype` is MPI_DATATYPE_NULL (an optional type this build does not
+ * provide) there is no descriptor to write to, so the slot is only reserved:
+ * *index still advances and later types keep their positions.  This is the
+ * treatment MPIDI_OFI_index_datatypes already applies by hand to
+ * MPI_INTEGER16. */
+static inline void add_index(MPI_Datatype datatype, int *index)
+{
+    if (datatype != MPI_DATATYPE_NULL) {
+        MPIR_Datatype *dt_ptr;
+        MPID_Datatype_get_ptr(datatype, dt_ptr);
+        MPIDI_OFI_DATATYPE(dt_ptr).index = *index;
+    }
+    (*index)++;
+}
+
+void MPIDI_OFI_index_datatypes()        /* number the predefined datatypes in mpi_dtypes[] order, then build the atomics table */
+{
+    int index = 0;              /* next index to assign; add_index() stores it and advances it */
+
+    add_index(MPI_CHAR, &index);
+    add_index(MPI_UNSIGNED_CHAR, &index);
+    add_index(MPI_SIGNED_CHAR, &index);
+    add_index(MPI_BYTE, &index);
+    add_index(MPI_WCHAR, &index);
+    add_index(MPI_SHORT, &index);
+    add_index(MPI_UNSIGNED_SHORT, &index);
+    add_index(MPI_INT, &index);
+    add_index(MPI_UNSIGNED, &index);
+    add_index(MPI_LONG, &index);
+    add_index(MPI_UNSIGNED_LONG, &index);       /* 10 */
+
+    add_index(MPI_FLOAT, &index);
+    add_index(MPI_DOUBLE, &index);
+    add_index(MPI_LONG_DOUBLE, &index);
+    add_index(MPI_LONG_LONG, &index);
+    add_index(MPI_UNSIGNED_LONG_LONG, &index);
+    add_index(MPI_PACKED, &index);
+    add_index(MPI_LB, &index);
+    add_index(MPI_UB, &index);
+    add_index(MPI_2INT, &index);
+
+    /* C99 types */
+    add_index(MPI_INT8_T, &index);      /* 20 */
+    add_index(MPI_INT16_T, &index);
+    add_index(MPI_INT32_T, &index);
+    add_index(MPI_INT64_T, &index);
+    add_index(MPI_UINT8_T, &index);
+    add_index(MPI_UINT16_T, &index);
+    add_index(MPI_UINT32_T, &index);
+    add_index(MPI_UINT64_T, &index);
+    add_index(MPI_C_BOOL, &index);
+    add_index(MPI_C_FLOAT_COMPLEX, &index);
+    add_index(MPI_C_DOUBLE_COMPLEX, &index);    /* 30 */
+    add_index(MPI_C_LONG_DOUBLE_COMPLEX, &index);
+
+    /* address/offset/count types */
+    add_index(MPI_AINT, &index);
+    add_index(MPI_OFFSET, &index);
+    add_index(MPI_COUNT, &index);
+
+    /* Fortran types */
+#ifdef HAVE_FORTRAN_BINDING
+    add_index(MPI_COMPLEX, &index);
+    add_index(MPI_DOUBLE_COMPLEX, &index);
+    add_index(MPI_LOGICAL, &index);
+    add_index(MPI_REAL, &index);
+    add_index(MPI_DOUBLE_PRECISION, &index);
+    add_index(MPI_INTEGER, &index);     /* 40 */
+    add_index(MPI_2INTEGER, &index);
+#ifdef MPICH_DEFINE_2COMPLEX
+    add_index(MPI_2COMPLEX, &index);
+    add_index(MPI_2DOUBLE_COMPLEX, &index);
+#endif
+    add_index(MPI_2REAL, &index);
+    add_index(MPI_2DOUBLE_PRECISION, &index);
+    add_index(MPI_CHARACTER, &index);
+    add_index(MPI_REAL4, &index);
+    add_index(MPI_REAL8, &index);
+    add_index(MPI_REAL16, &index);
+    add_index(MPI_COMPLEX8, &index);    /* 50 when MPICH_DEFINE_2COMPLEX is defined, otherwise 48 */
+    add_index(MPI_COMPLEX16, &index);
+    add_index(MPI_COMPLEX32, &index);
+    add_index(MPI_INTEGER1, &index);
+    add_index(MPI_INTEGER2, &index);
+    add_index(MPI_INTEGER4, &index);
+    add_index(MPI_INTEGER8, &index);
+
+    if (MPI_INTEGER16 == MPI_DATATYPE_NULL)     /* type unavailable: keep its slot but do not touch a descriptor */
+        index++;
+    else
+        add_index(MPI_INTEGER16, &index);
+
+#endif
+    add_index(MPI_FLOAT_INT, &index);
+    add_index(MPI_DOUBLE_INT, &index);
+    add_index(MPI_LONG_INT, &index);
+    add_index(MPI_SHORT_INT, &index);   /* 61 with Fortran and MPICH_DEFINE_2COMPLEX, 59 with Fortran only, 38 without Fortran */
+    add_index(MPI_LONG_DOUBLE_INT, &index);
+    create_dt_map();            /* fill MPIDI_Global.win_op_table once every type has its index */
+}
diff --git a/src/mpid/ch4/netmod/portals4/Makefile.mk b/src/mpid/ch4/netmod/portals4/Makefile.mk
new file mode 100644
index 0000000..af89854
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/Makefile.mk
@@ -0,0 +1,7 @@
+if BUILD_CH4_NETMOD_PORTALS4
+
+mpi_core_sources   += src/mpid/ch4/netmod/portals4/globals.c \
+                      src/mpid/ch4/netmod/portals4/func_table.c
+errnames_txt_files += src/mpid/ch4/netmod/portals4/errnames.txt
+
+endif
diff --git a/src/mpid/ch4/netmod/portals4/errnames.txt b/src/mpid/ch4/netmod/portals4/errnames.txt
new file mode 100644
index 0000000..ff024e3
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/errnames.txt
@@ -0,0 +1,10 @@
+**PtlInit:PtlInit failed
+**PtlInit %s %d %s %s:PtlInit failed (%s %d %s %s)
+**PtlNIInit:PtlNIInit failed
+**PtlNIInit %s %d %s %s:PtlNIInit failed (%s %d %s %s)
+**PtlEQAlloc:PtlEQAlloc failed
+**PtlEQAlloc %s %d %s %s:PtlEQAlloc failed (%s %d %s %s)
+**PtlPTAlloc:PtlPTAlloc failed
+**PtlPTAlloc %s %d %s %s:PtlPTAlloc failed (%s %d %s %s)
+**PtlMDBind:PtlMDBind failed
+**PtlMDBind %s %d %s %s:PtlMDBind failed (%s %d %s %s)
diff --git a/src/mpid/ch4/netmod/portals4/func_table.c b/src/mpid/ch4/netmod/portals4/func_table.c
new file mode 100644
index 0000000..1c123eb
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/func_table.c
@@ -0,0 +1,157 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_DIRECT
+#define NETMOD_DISABLE_INLINES
+#include "mpidimpl.h"
+#include "netmod_direct.h"
+
+MPIDI_NM_funcs_t MPIDI_NM_portals4_funcs = {   /* positional initializer: entries must follow the member order of MPIDI_NM_funcs_t */
+    MPIDI_NM_init,
+    MPIDI_NM_finalize,
+    MPIDI_NM_progress,
+    MPIDI_NM_comm_connect,
+    MPIDI_NM_comm_disconnect,
+    MPIDI_NM_open_port,
+    MPIDI_NM_close_port,
+    MPIDI_NM_comm_accept,
+    MPIDI_NM_comm_get_lpid,
+    MPIDI_NM_gpid_get,
+    MPIDI_NM_get_node_id,
+    MPIDI_NM_get_max_node_id,
+    MPIDI_NM_getallincomm,
+    MPIDI_NM_gpid_tolpidarray,
+    MPIDI_NM_create_intercomm_from_lpids,
+    MPIDI_NM_comm_create,
+    MPIDI_NM_comm_destroy,
+    MPIDI_NM_am_request_init,
+    MPIDI_NM_am_request_finalize,
+    MPIDI_NM_reg_hdr_handler,
+    MPIDI_NM_send_am_hdr,
+    MPIDI_NM_inject_am_hdr,
+    MPIDI_NM_send_am,
+    MPIDI_NM_send_amv,
+    MPIDI_NM_send_amv_hdr,
+    MPIDI_NM_send_am_hdr_reply,
+    MPIDI_NM_inject_am_hdr_reply,
+    MPIDI_NM_send_am_reply,
+    MPIDI_NM_send_amv_reply,
+    MPIDI_NM_am_hdr_max_sz,
+    MPIDI_NM_am_inject_max_sz,
+    MPIDI_NM_am_recv,
+};
+
+MPIDI_NM_native_funcs_t MPIDI_NM_native_portals4_funcs = {     /* positional initializer: entries must follow the member order of MPIDI_NM_native_funcs_t */
+    MPIDI_NM_send,
+    MPIDI_NM_ssend,
+    MPIDI_NM_startall,
+    MPIDI_NM_send_init,
+    MPIDI_NM_ssend_init,
+    MPIDI_NM_rsend_init,
+    MPIDI_NM_bsend_init,
+    MPIDI_NM_isend,
+    MPIDI_NM_issend,
+    MPIDI_NM_cancel_send,
+    MPIDI_NM_recv_init,
+    MPIDI_NM_recv,
+    MPIDI_NM_irecv,
+    MPIDI_NM_imrecv,
+    MPIDI_NM_cancel_recv,
+    MPIDI_NM_alloc_mem,
+    MPIDI_NM_free_mem,
+    MPIDI_NM_improbe,
+    MPIDI_NM_iprobe,
+    MPIDI_NM_win_set_info,
+    MPIDI_NM_win_shared_query,
+    MPIDI_NM_put,
+    MPIDI_NM_win_start,
+    MPIDI_NM_win_complete,
+    MPIDI_NM_win_post,
+    MPIDI_NM_win_wait,
+    MPIDI_NM_win_test,
+    MPIDI_NM_win_lock,
+    MPIDI_NM_win_unlock,
+    MPIDI_NM_win_get_info,
+    MPIDI_NM_get,
+    MPIDI_NM_win_free,
+    MPIDI_NM_win_fence,
+    MPIDI_NM_win_create,
+    MPIDI_NM_accumulate,
+    MPIDI_NM_win_attach,
+    MPIDI_NM_win_allocate_shared,
+    MPIDI_NM_rput,
+    MPIDI_NM_win_flush_local,
+    MPIDI_NM_win_detach,
+    MPIDI_NM_compare_and_swap,
+    MPIDI_NM_raccumulate,
+    MPIDI_NM_rget_accumulate,
+    MPIDI_NM_fetch_and_op,
+    MPIDI_NM_win_allocate,
+    MPIDI_NM_win_flush,
+    MPIDI_NM_win_flush_local_all,
+    MPIDI_NM_win_unlock_all,
+    MPIDI_NM_win_create_dynamic,
+    MPIDI_NM_rget,
+    MPIDI_NM_win_sync,
+    MPIDI_NM_win_flush_all,
+    MPIDI_NM_get_accumulate,
+    MPIDI_NM_win_lock_all,
+    MPIDI_NM_rank_is_local,
+    MPIDI_NM_barrier,
+    MPIDI_NM_bcast,
+    MPIDI_NM_allreduce,
+    MPIDI_NM_allgather,
+    MPIDI_NM_allgatherv,
+    MPIDI_NM_scatter,
+    MPIDI_NM_scatterv,
+    MPIDI_NM_gather,
+    MPIDI_NM_gatherv,
+    MPIDI_NM_alltoall,
+    MPIDI_NM_alltoallv,
+    MPIDI_NM_alltoallw,
+    MPIDI_NM_reduce,
+    MPIDI_NM_reduce_scatter,
+    MPIDI_NM_reduce_scatter_block,
+    MPIDI_NM_scan,
+    MPIDI_NM_exscan,
+    MPIDI_NM_neighbor_allgather,
+    MPIDI_NM_neighbor_allgatherv,
+    MPIDI_NM_neighbor_alltoall,
+    MPIDI_NM_neighbor_alltoallv,
+    MPIDI_NM_neighbor_alltoallw,
+    MPIDI_NM_ineighbor_allgather,
+    MPIDI_NM_ineighbor_allgatherv,
+    MPIDI_NM_ineighbor_alltoall,
+    MPIDI_NM_ineighbor_alltoallv,
+    MPIDI_NM_ineighbor_alltoallw,
+    MPIDI_NM_ibarrier,
+    MPIDI_NM_ibcast,
+    MPIDI_NM_iallgather,
+    MPIDI_NM_iallgatherv,
+    MPIDI_NM_iallreduce,
+    MPIDI_NM_ialltoall,
+    MPIDI_NM_ialltoallv,
+    MPIDI_NM_ialltoallw,
+    MPIDI_NM_iexscan,
+    MPIDI_NM_igather,
+    MPIDI_NM_igatherv,
+    MPIDI_NM_ireduce_scatter_block,
+    MPIDI_NM_ireduce_scatter,
+    MPIDI_NM_ireduce,
+    MPIDI_NM_iscan,
+    MPIDI_NM_iscatter,
+    MPIDI_NM_iscatterv,
+    MPIDI_NM_datatype_commit,
+    MPIDI_NM_datatype_dup,
+    MPIDI_NM_datatype_destroy,
+    MPIDI_NM_op_commit,
+    MPIDI_NM_op_destroy
+};
+#endif
diff --git a/src/mpid/ch4/netmod/portals4/globals.c b/src/mpid/ch4/netmod/portals4/globals.c
new file mode 100644
index 0000000..0275cdd
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/globals.c
@@ -0,0 +1,14 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#include "ptl_impl.h"
+#include "ptl_types.h"
+
+/* Definition of the Portals4 netmod's global state object.  The `{ 0 }`
+ * initializer leaves every member zero-initialized. */
+MPIDI_PTL_global_t MPIDI_PTL_global = { 0 };
diff --git a/src/mpid/ch4/netmod/portals4/netmod_direct.h b/src/mpid/ch4/netmod/portals4/netmod_direct.h
new file mode 100644
index 0000000..edae2b3
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/netmod_direct.h
@@ -0,0 +1,30 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_DIRECT_H_INCLUDED
+#define NETMOD_DIRECT_H_INCLUDED
+#include "ptl_init.h"
+#include "ptl_types.h"
+#include "ptl_probe.h"
+#include "ptl_progress.h"
+#include "ptl_recv.h"
+#include "ptl_request.h"
+#include "ptl_send.h"
+#include "ptl_win.h"
+#include "ptl_rma.h"
+#include "ptl_am.h"
+#include "ptl_spawn.h"
+#include "ptl_comm.h"
+#include "ptl_unimpl.h"
+#include "ptl_proc.h"
+#include "ptl_coll.h"
+#include "ptl_datatype.h"
+#include "ptl_op.h"
+#endif /* NETMOD_DIRECT_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/portals4_pre.h b/src/mpid/ch4/netmod/portals4/portals4_pre.h
new file mode 100644
index 0000000..4649cc5
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/portals4_pre.h
@@ -0,0 +1,51 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#ifndef NETMOD_PORTALS4_PRE_H_INCLUDED
+#define NETMOD_PORTALS4_PRE_H_INCLUDED
+
+#include "portals4.h"
+
+/* Portals4 active-message state kept per request.  The send routines in
+ * ptl_am.h access members with these names through
+ * req->dev.ch4.ch4u.netmod_am.portals4. */
+typedef struct {
+    int handler_id;             /* active-message handler id passed to the send call */
+    char *pack_buffer;          /* MPL_malloc'ed copy of the header (plus packed data) */
+    ptl_handle_md_t md;         /* MD handle produced by PtlMDBind on the contiguous path */
+} MPIDI_PORTALS4_am_request_t;
+
+/* Portals4 data attached to a request, going by the type name.
+ * NOTE(review): `dummy` looks like a placeholder that only keeps the struct
+ * non-empty -- confirm nothing reads it. */
+typedef struct {
+    int dummy;
+} MPIDI_PORTALS4_request_t;
+
+/* Portals4 data attached to a communicator, going by the type name.
+ * NOTE(review): `dummy` looks like a placeholder that only keeps the struct
+ * non-empty -- confirm nothing reads it. */
+typedef struct {
+    int dummy;
+} MPIDI_PORTALS4_comm_t;
+
+/* Portals4 data attached to a datatype, going by the type name.
+ * NOTE(review): `dummy` looks like a placeholder that only keeps the struct
+ * non-empty -- confirm nothing reads it. */
+typedef struct {
+    int dummy;
+} MPIDI_PORTALS4_dt_t;
+
+/* Portals4 data attached to an RMA window, going by the type name.
+ * NOTE(review): `dummy` looks like a placeholder that only keeps the struct
+ * non-empty -- confirm nothing reads it. */
+typedef struct {
+    int dummy;
+} MPIDI_PORTALS4_win_t;
+
+/* Portals4 part of a global process id, going by the type name.
+ * NOTE(review): `dummy` looks like a placeholder that only keeps the struct
+ * non-empty -- confirm nothing reads it. */
+typedef struct {
+    int dummy;
+} MPIDI_PORTALS4_gpid_t;
+
+/* Portals4 address record, going by the type name.
+ * NOTE(review): `dummy` looks like a placeholder that only keeps the struct
+ * non-empty -- confirm nothing reads it. */
+typedef struct {
+    int dummy;
+} MPIDI_PORTALS4_addr_t;
+
+/* Portals4 data attached to a reduction op, going by the type name.
+ * NOTE(review): `dummy` looks like a placeholder that only keeps the struct
+ * non-empty -- confirm nothing reads it. */
+typedef struct {
+    int dummy;
+} MPIDI_PORTALS4_op_t;
+
+#endif /* NETMOD_PORTALS4_PRE_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_am.h b/src/mpid/ch4/netmod/portals4/ptl_am.h
new file mode 100644
index 0000000..6b12443
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_am.h
@@ -0,0 +1,370 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_PTL_AM_H_INCLUDED
+#define NETMOD_PTL_AM_H_INCLUDED
+
+#include "ptl_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_reg_hdr_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Record the two callbacks for active-message id handler_id in MPIDI_PTL_global:
+ * target_handler_fn is stored in am_handlers[] and origin_handler_fn in
+ * send_cmpl_handlers[].  handler_id is used as the array index without a range
+ * check.  Always returns MPI_SUCCESS. */
+static inline int MPIDI_NM_reg_hdr_handler(int handler_id,
+                                           MPIDI_NM_am_origin_handler_fn origin_handler_fn,
+                                           MPIDI_NM_am_target_handler_fn target_handler_fn)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_REG_HDR_HANDLER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_REG_HDR_HANDLER);
+    MPIDI_PTL_global.am_handlers[handler_id] = target_handler_fn;
+    MPIDI_PTL_global.send_cmpl_handlers[handler_id] = origin_handler_fn;
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_REG_HDR_HANDLER);
+    return MPI_SUCCESS;
+}
+
+/*
+ * MPIDI_NM_send_am_hdr - send an active message that consists of a header only.
+ *
+ * A heap copy of the am_hdr_sz header bytes is handed to PtlPut on
+ * MPIDI_PTL_global.md, addressed with the process/pt pair stored for `rank` in
+ * MPIDI_PTL_global.addr_table, with an ACK requested and sreq as user pointer.
+ * sreq records handler_id and the copy (pack_buffer), and its completion
+ * counter is raised by one for the outstanding put.  netmod_context is unused.
+ *
+ * Returns MPI_SUCCESS once the put has been issued.  If the copy cannot be
+ * allocated or PtlPut reports an error, an MPI_ERR_OTHER-class error is
+ * returned and neither the buffer nor the extra completion count stays
+ * attached to sreq.
+ *
+ * NOTE(review): on success the copy is presumably released when the put
+ * completes, outside this function -- confirm against the progress code.
+ */
+static inline int MPIDI_NM_send_am_hdr(int rank,
+                                       MPIR_Comm * comm,
+                                       int handler_id,
+                                       const void *am_hdr,
+                                       size_t am_hdr_sz, MPIR_Request * sreq, void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS, ret, c;
+    ptl_hdr_data_t ptl_hdr;
+    ptl_match_bits_t match_bits;
+    char *send_buf = NULL;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_SEND_AM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_SEND_AM);
+
+    ptl_hdr = MPIDI_PTL_init_am_hdr(handler_id, 0);
+    match_bits = MPIDI_PTL_init_tag(comm->context_id, MPIDI_PTL_AM_TAG);
+
+    /* Stage the header before touching sreq so that an allocation failure needs
+     * no roll-back.  A NULL result for a zero-byte request is not an error. */
+    send_buf = MPL_malloc(am_hdr_sz);
+    MPIR_ERR_CHKANDJUMP(send_buf == NULL && am_hdr_sz > 0, mpi_errno, MPI_ERR_OTHER, "**nomem");
+    MPIR_Memcpy(send_buf, am_hdr, am_hdr_sz);
+
+    sreq->dev.ch4.ch4u.netmod_am.portals4.handler_id = handler_id;
+    sreq->dev.ch4.ch4u.netmod_am.portals4.pack_buffer = send_buf;
+    MPIR_cc_incr(sreq->cc_ptr, &c);
+
+    ret = PtlPut(MPIDI_PTL_global.md, (ptl_size_t) send_buf, am_hdr_sz,
+                 PTL_ACK_REQ, MPIDI_PTL_global.addr_table[rank].process,
+                 MPIDI_PTL_global.addr_table[rank].pt, match_bits, 0, sreq, ptl_hdr);
+    if (ret != PTL_OK) {
+        /* The put was not issued, so nothing will complete it later: undo the
+         * bookkeeping done above before reporting the failure. */
+        MPIR_cc_decr(sreq->cc_ptr, &c);
+        sreq->dev.ch4.ch4u.netmod_am.portals4.pack_buffer = NULL;
+        MPL_free(send_buf);
+        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**fail");
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_SEND_AM);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/*
+ * MPIDI_NM_send_am - send an active message made of a header plus user data.
+ *
+ * The header is always copied to the heap first.
+ *  - Contiguous datatype: a two-entry iovec {header copy, data + true_lb} is
+ *    bound as a PTL_IOVEC memory descriptor; header and data leave in a single
+ *    PtlPut on that descriptor, whose handle is stored in sreq.
+ *  - Any other datatype: the data is packed right behind the header in the
+ *    same heap buffer, which is put through MPIDI_PTL_global.md.
+ * The put targets the process/pt pair stored for `rank` in
+ * MPIDI_PTL_global.addr_table, requests an ACK and carries sreq as user
+ * pointer.  sreq records handler_id and the heap buffer (pack_buffer) and its
+ * completion counter is raised by one.  netmod_context is unused.
+ *
+ * Returns MPI_SUCCESS once the put has been issued.  On an allocation failure
+ * or a Portals error an MPI_ERR_OTHER-class error is returned; the buffer is
+ * freed, a bound descriptor is released and the completion count is restored.
+ *
+ * NOTE(review): the iovec array lives in this stack frame while the bound
+ * descriptor outlives the call -- confirm the Portals implementation in use
+ * does not keep referring to the array after PtlMDBind/PtlPut return.
+ */
+static inline int MPIDI_NM_send_am(int rank,
+                                   MPIR_Comm * comm,
+                                   int handler_id,
+                                   const void *am_hdr,
+                                   size_t am_hdr_sz,
+                                   const void *data,
+                                   MPI_Count count,
+                                   MPI_Datatype datatype, MPIR_Request * sreq, void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS, ret, c;
+    int md_bound = 0;           /* set once PtlMDBind succeeded, for roll-back */
+    size_t data_sz, buf_sz;
+    MPI_Aint dt_true_lb;
+    MPIR_Datatype *dt_ptr;
+    int dt_contig;
+    ptl_hdr_data_t ptl_hdr;
+    ptl_match_bits_t match_bits;
+    ptl_handle_md_t put_md;     /* descriptor the put is issued on */
+    ptl_size_t put_offset;      /* offset of the payload inside put_md */
+    ptl_md_t md;
+    ptl_iovec_t iovec[2];
+    char *send_buf = NULL;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_SEND_AM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_SEND_AM);
+
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
+    match_bits = MPIDI_PTL_init_tag(comm->context_id, MPIDI_PTL_AM_TAG);
+    ptl_hdr = MPIDI_PTL_init_am_hdr(handler_id, data_sz);
+
+    /* Everything up to the put is staged without touching sreq, so a failure
+     * in this part only has local resources to release. */
+    buf_sz = dt_contig ? am_hdr_sz : am_hdr_sz + data_sz;
+    send_buf = MPL_malloc(buf_sz);
+    MPIR_ERR_CHKANDJUMP(send_buf == NULL && buf_sz > 0, mpi_errno, MPI_ERR_OTHER, "**nomem");
+    MPIR_Memcpy(send_buf, am_hdr, am_hdr_sz);
+
+    if (dt_contig) {
+        /* create a two element iovec: header copy followed by the user data */
+        iovec[0].iov_base = send_buf;
+        iovec[0].iov_len = am_hdr_sz;
+        iovec[1].iov_base = (char *) data + dt_true_lb;
+        iovec[1].iov_len = data_sz;
+        md.start = iovec;
+        md.length = 2;
+        md.options = PTL_IOVEC;
+        md.eq_handle = MPIDI_PTL_global.eqs[0];
+        md.ct_handle = PTL_CT_NONE;
+
+        ret = PtlMDBind(MPIDI_PTL_global.ni, &md, &put_md);
+        MPIR_ERR_CHKANDJUMP(ret != PTL_OK, mpi_errno, MPI_ERR_OTHER, "**fail");
+        md_bound = 1;
+        put_offset = 0;
+    }
+    else {
+        /* pack the data right behind the header in the heap buffer */
+        MPID_Segment *segment;
+        MPI_Aint last = data_sz;
+
+        segment = MPID_Segment_alloc();
+        MPIR_ERR_CHKANDJUMP(segment == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
+        MPID_Segment_init(data, count, datatype, segment, 0);
+        MPID_Segment_pack(segment, 0, &last, send_buf + am_hdr_sz);
+        MPIR_Assert(last == data_sz);
+        MPID_Segment_free(segment);
+
+        put_md = MPIDI_PTL_global.md;
+        put_offset = (ptl_size_t) send_buf;
+    }
+
+    /* Staging succeeded: publish the state on sreq and account for the put. */
+    sreq->dev.ch4.ch4u.netmod_am.portals4.handler_id = handler_id;
+    sreq->dev.ch4.ch4u.netmod_am.portals4.pack_buffer = send_buf;
+    if (dt_contig)
+        sreq->dev.ch4.ch4u.netmod_am.portals4.md = put_md;
+    MPIR_cc_incr(sreq->cc_ptr, &c);
+
+    ret = PtlPut(put_md, put_offset, am_hdr_sz + data_sz,
+                 PTL_ACK_REQ, MPIDI_PTL_global.addr_table[rank].process,
+                 MPIDI_PTL_global.addr_table[rank].pt, match_bits, 0, sreq, ptl_hdr);
+    if (ret != PTL_OK) {
+        /* The put was not issued, so nothing will complete it later. */
+        MPIR_cc_decr(sreq->cc_ptr, &c);
+        sreq->dev.ch4.ch4u.netmod_am.portals4.pack_buffer = NULL;
+        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**fail");
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_SEND_AM);
+    return mpi_errno;
+  fn_fail:
+    if (md_bound)
+        PtlMDRelease(put_md);
+    if (send_buf != NULL)
+        MPL_free(send_buf);
+    goto fn_exit;
+}
+
+/* Active-message send taking the header as an iovec array (am_hdr, iov_len).
+ * The body reads none of its arguments: it trips MPIR_Assert(0) and then
+ * returns MPI_SUCCESS.
+ * NOTE(review): looks like a not-yet-implemented placeholder -- confirm. */
+static inline int MPIDI_NM_send_amv(int rank,
+                                    MPIR_Comm * comm,
+                                    int handler_id,
+                                    struct iovec *am_hdr,
+                                    size_t iov_len,
+                                    const void *data,
+                                    MPI_Count count,
+                                    MPI_Datatype datatype,
+                                    MPIR_Request * sreq, void *netmod_context)
+{
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+/* Header-only active-message send taking the header as an iovec array.
+ * The body reads none of its arguments: it trips MPIR_Assert(0) and then
+ * returns MPI_SUCCESS.
+ * NOTE(review): looks like a not-yet-implemented placeholder -- confirm. */
+static inline int MPIDI_NM_send_amv_hdr(int rank,
+                                        MPIR_Comm * comm,
+                                        int handler_id,
+                                        struct iovec *am_hdr,
+                                        size_t iov_len, MPIR_Request * sreq, void *netmod_context)
+{
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+/* Header-only active-message reply addressed by (context_id, src_rank).
+ * The body reads none of its arguments: it trips MPIR_Assert(0) and then
+ * returns MPI_SUCCESS.
+ * NOTE(review): looks like a not-yet-implemented placeholder -- confirm. */
+static inline int MPIDI_NM_send_am_hdr_reply(MPIR_Context_id_t context_id,
+                                             int src_rank,
+                                             int handler_id,
+                                             const void *am_hdr,
+                                             size_t am_hdr_sz, MPIR_Request * sreq)
+{
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+/*
+ * MPIDI_NM_send_am_reply - send an active-message reply (header plus data).
+ *
+ * The communicator is looked up from context_id with
+ * MPIDI_CH4U_context_id_to_comm and its context id selects the match bits.
+ * The header is always copied to the heap first.
+ *  - Contiguous datatype: a two-entry iovec {header copy, data + true_lb} is
+ *    bound as a PTL_IOVEC memory descriptor; header and data leave in a single
+ *    PtlPut on that descriptor, whose handle is stored in sreq.
+ *  - Any other datatype: the data is packed right behind the header in the
+ *    same heap buffer, which is put through MPIDI_PTL_global.md.
+ * The put targets the process/pt pair stored for `src_rank` in
+ * MPIDI_PTL_global.addr_table, requests an ACK and carries sreq as user
+ * pointer.  sreq records handler_id and the heap buffer (pack_buffer) and its
+ * completion counter is raised by one.
+ *
+ * Returns MPI_SUCCESS once the put has been issued.  On an allocation failure
+ * or a Portals error an MPI_ERR_OTHER-class error is returned; the buffer is
+ * freed, a bound descriptor is released and the completion count is restored.
+ *
+ * NOTE(review): the iovec array lives in this stack frame while the bound
+ * descriptor outlives the call -- confirm the Portals implementation in use
+ * does not keep referring to the array after PtlMDBind/PtlPut return.
+ */
+static inline int MPIDI_NM_send_am_reply(MPIR_Context_id_t context_id,
+                                         int src_rank,
+                                         int handler_id,
+                                         const void *am_hdr,
+                                         size_t am_hdr_sz,
+                                         const void *data,
+                                         MPI_Count count,
+                                         MPI_Datatype datatype, MPIR_Request * sreq)
+{
+    int mpi_errno = MPI_SUCCESS, ret, c;
+    int md_bound = 0;           /* set once PtlMDBind succeeded, for roll-back */
+    size_t data_sz, buf_sz;
+    MPI_Aint dt_true_lb;
+    MPIR_Datatype *dt_ptr;
+    int dt_contig;
+    ptl_hdr_data_t ptl_hdr;
+    ptl_match_bits_t match_bits;
+    ptl_handle_md_t put_md;     /* descriptor the put is issued on */
+    ptl_size_t put_offset;      /* offset of the payload inside put_md */
+    ptl_md_t md;
+    ptl_iovec_t iovec[2];
+    MPIR_Comm *use_comm;
+    char *send_buf = NULL;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_SEND_AM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_SEND_AM);
+
+    use_comm = MPIDI_CH4U_context_id_to_comm(context_id);
+
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
+    match_bits = MPIDI_PTL_init_tag(use_comm->context_id, MPIDI_PTL_AM_TAG);
+    ptl_hdr = MPIDI_PTL_init_am_hdr(handler_id, data_sz);
+
+    /* Everything up to the put is staged without touching sreq, so a failure
+     * in this part only has local resources to release. */
+    buf_sz = dt_contig ? am_hdr_sz : am_hdr_sz + data_sz;
+    send_buf = MPL_malloc(buf_sz);
+    MPIR_ERR_CHKANDJUMP(send_buf == NULL && buf_sz > 0, mpi_errno, MPI_ERR_OTHER, "**nomem");
+    MPIR_Memcpy(send_buf, am_hdr, am_hdr_sz);
+
+    if (dt_contig) {
+        /* create a two element iovec: header copy followed by the user data */
+        iovec[0].iov_base = send_buf;
+        iovec[0].iov_len = am_hdr_sz;
+        iovec[1].iov_base = (char *) data + dt_true_lb;
+        iovec[1].iov_len = data_sz;
+        md.start = iovec;
+        md.length = 2;
+        md.options = PTL_IOVEC;
+        md.eq_handle = MPIDI_PTL_global.eqs[0];
+        md.ct_handle = PTL_CT_NONE;
+
+        ret = PtlMDBind(MPIDI_PTL_global.ni, &md, &put_md);
+        MPIR_ERR_CHKANDJUMP(ret != PTL_OK, mpi_errno, MPI_ERR_OTHER, "**fail");
+        md_bound = 1;
+        put_offset = 0;
+    }
+    else {
+        /* pack the data right behind the header in the heap buffer */
+        MPID_Segment *segment;
+        MPI_Aint last = data_sz;
+
+        segment = MPID_Segment_alloc();
+        MPIR_ERR_CHKANDJUMP(segment == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
+        MPID_Segment_init(data, count, datatype, segment, 0);
+        MPID_Segment_pack(segment, 0, &last, send_buf + am_hdr_sz);
+        MPIR_Assert(last == data_sz);
+        MPID_Segment_free(segment);
+
+        put_md = MPIDI_PTL_global.md;
+        put_offset = (ptl_size_t) send_buf;
+    }
+
+    /* Staging succeeded: publish the state on sreq and account for the put. */
+    sreq->dev.ch4.ch4u.netmod_am.portals4.handler_id = handler_id;
+    sreq->dev.ch4.ch4u.netmod_am.portals4.pack_buffer = send_buf;
+    if (dt_contig)
+        sreq->dev.ch4.ch4u.netmod_am.portals4.md = put_md;
+    MPIR_cc_incr(sreq->cc_ptr, &c);
+
+    ret = PtlPut(put_md, put_offset, am_hdr_sz + data_sz,
+                 PTL_ACK_REQ, MPIDI_PTL_global.addr_table[src_rank].process,
+                 MPIDI_PTL_global.addr_table[src_rank].pt, match_bits, 0, sreq, ptl_hdr);
+    if (ret != PTL_OK) {
+        /* The put was not issued, so nothing will complete it later. */
+        MPIR_cc_decr(sreq->cc_ptr, &c);
+        sreq->dev.ch4.ch4u.netmod_am.portals4.pack_buffer = NULL;
+        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**fail");
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_SEND_AM);
+    return mpi_errno;
+  fn_fail:
+    if (md_bound)
+        PtlMDRelease(put_md);
+    if (send_buf != NULL)
+        MPL_free(send_buf);
+    goto fn_exit;
+}
+
+/* Active-message reply taking the header as an iovec array (am_hdr, iov_len).
+ * The body reads none of its arguments: it trips MPIR_Assert(0) and then
+ * returns MPI_SUCCESS.
+ * NOTE(review): looks like a not-yet-implemented placeholder -- confirm. */
+static inline int MPIDI_NM_send_amv_reply(MPIR_Context_id_t context_id,
+                                          int src_rank,
+                                          int handler_id,
+                                          struct iovec *am_hdr,
+                                          size_t iov_len,
+                                          const void *data,
+                                          MPI_Count count,
+                                          MPI_Datatype datatype, MPIR_Request * sreq)
+{
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+/* Query for the maximum active-message header size, going by the name.
+ * The body trips MPIR_Assert(0) and then returns 0.
+ * NOTE(review): looks like a not-yet-implemented placeholder; a caller that
+ * gets the 0 back would see "no header bytes allowed" -- confirm. */
+static inline size_t MPIDI_NM_am_hdr_max_sz(void)
+{
+    MPIR_Assert(0);
+    return 0;
+}
+
+/*
+ * MPIDI_NM_inject_am_hdr - send a header-only active message without a
+ * caller-supplied request.
+ *
+ * A heap copy of the am_hdr_sz header bytes is recorded (pack_buffer) on an
+ * internal request created with MPIR_Request_create and initialised with
+ * MPIDI_NM_am_request_init.  The copy is put through MPIDI_PTL_global.md to
+ * the process/pt pair stored for `rank` in MPIDI_PTL_global.addr_table, with
+ * an ACK requested and the internal request as user pointer.
+ * netmod_context is unused.
+ *
+ * Returns MPI_SUCCESS once the put has been issued, or an MPI_ERR_OTHER-class
+ * error if the copy or the request cannot be allocated or PtlPut fails; the
+ * copy is freed in those cases.
+ *
+ * NOTE(review): when PtlPut fails the internal request is left allocated
+ * because its release routine is not visible from here -- confirm which call
+ * is appropriate and add it to the failure path.
+ */
+static inline int MPIDI_NM_inject_am_hdr(int rank,
+                                         MPIR_Comm * comm,
+                                         int handler_id,
+                                         const void *am_hdr, size_t am_hdr_sz, void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS, ret;
+    ptl_hdr_data_t ptl_hdr;
+    ptl_match_bits_t match_bits;
+    char *send_buf = NULL;
+    MPIR_Request *inject_req;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_SEND_AM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_SEND_AM);
+
+    ptl_hdr = MPIDI_PTL_init_am_hdr(handler_id, 0);
+    match_bits = MPIDI_PTL_init_tag(comm->context_id, MPIDI_PTL_AM_TAG);
+
+    /* Stage the header first; a NULL result for a zero-byte request is not an
+     * error. */
+    send_buf = MPL_malloc(am_hdr_sz);
+    MPIR_ERR_CHKANDJUMP(send_buf == NULL && am_hdr_sz > 0, mpi_errno, MPI_ERR_OTHER, "**nomem");
+    MPIR_Memcpy(send_buf, am_hdr, am_hdr_sz);
+
+    /* create an internal request for the inject */
+    inject_req = MPIR_Request_create(MPIR_REQUEST_KIND__UNDEFINED);
+    MPIR_ERR_CHKANDJUMP(inject_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
+    MPIDI_NM_am_request_init(inject_req);
+    inject_req->dev.ch4.ch4u.netmod_am.portals4.pack_buffer = send_buf;
+
+    ret = PtlPut(MPIDI_PTL_global.md, (ptl_size_t) send_buf, am_hdr_sz,
+                 PTL_ACK_REQ, MPIDI_PTL_global.addr_table[rank].process,
+                 MPIDI_PTL_global.addr_table[rank].pt, match_bits, 0, inject_req, ptl_hdr);
+    if (ret != PTL_OK) {
+        /* The buffer is freed below; do not leave a dangling pointer behind. */
+        inject_req->dev.ch4.ch4u.netmod_am.portals4.pack_buffer = NULL;
+        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**fail");
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_SEND_AM);
+    return mpi_errno;
+  fn_fail:
+    if (send_buf != NULL)
+        MPL_free(send_buf);
+    goto fn_exit;
+}
+
+/*
+ * MPIDI_NM_inject_am_hdr_reply - send a header-only active-message reply
+ * without a caller-supplied request.
+ *
+ * The communicator is looked up from context_id with
+ * MPIDI_CH4U_context_id_to_comm and its context id selects the match bits.
+ * A heap copy of the am_hdr_sz header bytes is recorded (pack_buffer) on an
+ * internal request created with MPIR_Request_create and initialised with
+ * MPIDI_NM_am_request_init.  The copy is put through MPIDI_PTL_global.md to
+ * the process/pt pair stored for `src_rank` in MPIDI_PTL_global.addr_table,
+ * with an ACK requested and the internal request as user pointer.
+ *
+ * Returns MPI_SUCCESS once the put has been issued, or an MPI_ERR_OTHER-class
+ * error if the copy or the request cannot be allocated or PtlPut fails; the
+ * copy is freed in those cases.
+ *
+ * NOTE(review): when PtlPut fails the internal request is left allocated
+ * because its release routine is not visible from here -- confirm which call
+ * is appropriate and add it to the failure path.
+ */
+static inline int MPIDI_NM_inject_am_hdr_reply(MPIR_Context_id_t context_id,
+                                               int src_rank,
+                                               int handler_id, const void *am_hdr, size_t am_hdr_sz)
+{
+    int mpi_errno = MPI_SUCCESS, ret;
+    ptl_hdr_data_t ptl_hdr;
+    ptl_match_bits_t match_bits;
+    MPIR_Comm *use_comm;
+    char *send_buf = NULL;
+    MPIR_Request *inject_req;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_SEND_AM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_SEND_AM);
+
+    use_comm = MPIDI_CH4U_context_id_to_comm(context_id);
+
+    ptl_hdr = MPIDI_PTL_init_am_hdr(handler_id, 0);
+    match_bits = MPIDI_PTL_init_tag(use_comm->context_id, MPIDI_PTL_AM_TAG);
+
+    /* Stage the header first; a NULL result for a zero-byte request is not an
+     * error. */
+    send_buf = MPL_malloc(am_hdr_sz);
+    MPIR_ERR_CHKANDJUMP(send_buf == NULL && am_hdr_sz > 0, mpi_errno, MPI_ERR_OTHER, "**nomem");
+    MPIR_Memcpy(send_buf, am_hdr, am_hdr_sz);
+
+    /* create an internal request for the inject */
+    inject_req = MPIR_Request_create(MPIR_REQUEST_KIND__UNDEFINED);
+    MPIR_ERR_CHKANDJUMP(inject_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
+    MPIDI_NM_am_request_init(inject_req);
+    inject_req->dev.ch4.ch4u.netmod_am.portals4.pack_buffer = send_buf;
+
+    ret = PtlPut(MPIDI_PTL_global.md, (ptl_size_t) send_buf, am_hdr_sz,
+                 PTL_ACK_REQ, MPIDI_PTL_global.addr_table[src_rank].process,
+                 MPIDI_PTL_global.addr_table[src_rank].pt, match_bits, 0, inject_req, ptl_hdr);
+    if (ret != PTL_OK) {
+        /* The buffer is freed below; do not leave a dangling pointer behind. */
+        inject_req->dev.ch4.ch4u.netmod_am.portals4.pack_buffer = NULL;
+        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**fail");
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_SEND_AM);
+    return mpi_errno;
+  fn_fail:
+    if (send_buf != NULL)
+        MPL_free(send_buf);
+    goto fn_exit;
+}
+
+/* Query for the maximum size accepted by the inject calls, going by the name.
+ * The body trips MPIR_Assert(0) and then returns 0.
+ * NOTE(review): looks like a not-yet-implemented placeholder -- confirm. */
+static inline size_t MPIDI_NM_am_inject_max_sz(void)
+{
+    MPIR_Assert(0);
+    return 0;
+}
+
+/* Active-message receive hook.  `req` is not read: the body trips
+ * MPIR_Assert(0) and then returns 0.
+ * NOTE(review): looks like a not-yet-implemented placeholder -- confirm. */
+static inline int MPIDI_NM_am_recv(MPIR_Request * req)
+{
+    MPIR_Assert(0);
+    return 0;
+}
+
+#endif /* NETMOD_PTL_AM_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_coll.h b/src/mpid/ch4/netmod/portals4/ptl_coll.h
new file mode 100644
index 0000000..5d7073f
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_coll.h
@@ -0,0 +1,871 @@
+
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_PTL_COLL_H_INCLUDED
+#define NETMOD_PTL_COLL_H_INCLUDED
+
+#include "ptl_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_barrier
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Barrier hook of this netmod: hands both arguments to MPIR_Barrier unchanged. */
+static inline int MPIDI_NM_barrier(MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_BARRIER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_BARRIER);
+    rc = MPIR_Barrier(comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_BARRIER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_bcast
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Broadcast hook of this netmod: passes every argument straight to MPIR_Bcast. */
+static inline int MPIDI_NM_bcast(void *buffer, int count, MPI_Datatype datatype,
+                                 int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_BCAST);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_BCAST);
+    rc = MPIR_Bcast(buffer, count, datatype, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_BCAST);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_allreduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Allreduce hook of this netmod: forwards to MPIR_Allreduce and returns its result. */
+static inline int MPIDI_NM_allreduce(const void *sendbuf, void *recvbuf, int count,
+                                     MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                     MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLREDUCE);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLREDUCE);
+    rc = MPIR_Allreduce(sendbuf, recvbuf, count, datatype, op, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLREDUCE);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Allgather hook of this netmod: forwards to MPIR_Allgather and returns its result. */
+static inline int MPIDI_NM_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLGATHER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLGATHER);
+    rc = MPIR_Allgather(sendbuf, sendcount, sendtype,
+                        recvbuf, recvcount, recvtype, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLGATHER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Allgatherv hook of this netmod: forwards to MPIR_Allgatherv and returns its result. */
+static inline int MPIDI_NM_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                      void *recvbuf, const int *recvcounts, const int *displs,
+                                      MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                      MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLGATHERV);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLGATHERV);
+    rc = MPIR_Allgatherv(sendbuf, sendcount, sendtype, recvbuf,
+                         recvcounts, displs, recvtype, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLGATHERV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_gather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Gather hook of this netmod: forwards to MPIR_Gather and returns its result. */
+static inline int MPIDI_NM_gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                  void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                  int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_GATHER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_GATHER);
+    rc = MPIR_Gather(sendbuf, sendcount, sendtype,
+                     recvbuf, recvcount, recvtype, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_GATHER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_gatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Gatherv hook of this netmod: forwards to MPIR_Gatherv and returns its result. */
+static inline int MPIDI_NM_gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, const int *recvcounts, const int *displs,
+                                   MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr,
+                                   MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_GATHERV);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_GATHERV);
+    rc = MPIR_Gatherv(sendbuf, sendcount, sendtype, recvbuf,
+                      recvcounts, displs, recvtype, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_GATHERV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Scatter hook of this netmod: forwards to MPIR_Scatter and returns its result. */
+static inline int MPIDI_NM_scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                   int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_SCATTER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_SCATTER);
+    rc = MPIR_Scatter(sendbuf, sendcount, sendtype, recvbuf,
+                      recvcount, recvtype, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_SCATTER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_scatterv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Scatterv hook of this netmod: forwards to MPIR_Scatterv and returns its result. */
+static inline int MPIDI_NM_scatterv(const void *sendbuf, const int *sendcounts,
+                                    const int *displs, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                    int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_SCATTERV);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_SCATTERV);
+    rc = MPIR_Scatterv(sendbuf, sendcounts, displs, sendtype,
+                       recvbuf, recvcount, recvtype, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_SCATTERV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Alltoall hook of this netmod: forwards to MPIR_Alltoall and returns its result. */
+static inline int MPIDI_NM_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                    MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLTOALL);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLTOALL);
+    rc = MPIR_Alltoall(sendbuf, sendcount, sendtype,
+                       recvbuf, recvcount, recvtype, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLTOALL);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Alltoallv hook of this netmod: forwards to MPIR_Alltoallv and returns its result. */
+static inline int MPIDI_NM_alltoallv(const void *sendbuf, const int *sendcounts,
+                                     const int *sdispls, MPI_Datatype sendtype,
+                                     void *recvbuf, const int *recvcounts,
+                                     const int *rdispls, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLTOALLV);
+    rc = MPIR_Alltoallv(sendbuf, sendcounts, sdispls, sendtype,
+                        recvbuf, recvcounts, rdispls, recvtype, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLTOALLV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Alltoallw hook of this netmod: forwards to MPIR_Alltoallw and returns its result. */
+static inline int MPIDI_NM_alltoallw(const void *sendbuf, const int sendcounts[],
+                                     const int sdispls[], const MPI_Datatype sendtypes[],
+                                     void *recvbuf, const int recvcounts[],
+                                     const int rdispls[], const MPI_Datatype recvtypes[],
+                                     MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLTOALLW);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLTOALLW);
+    rc = MPIR_Alltoallw(sendbuf, sendcounts, sdispls, sendtypes,
+                        recvbuf, recvcounts, rdispls, recvtypes, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLTOALLW);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_reduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Reduce hook of this netmod: forwards to MPIR_Reduce and returns its result. */
+static inline int MPIDI_NM_reduce(const void *sendbuf, void *recvbuf, int count,
+                                  MPI_Datatype datatype, MPI_Op op, int root,
+                                  MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_REDUCE);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_REDUCE);
+    rc = MPIR_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_REDUCE);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_reduce_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Reduce_scatter hook of this netmod: forwards to MPIR_Reduce_scatter. */
+static inline int MPIDI_NM_reduce_scatter(const void *sendbuf, void *recvbuf,
+                                          const int recvcounts[], MPI_Datatype datatype,
+                                          MPI_Op op, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_REDUCE_SCATTER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_REDUCE_SCATTER);
+    rc = MPIR_Reduce_scatter(sendbuf, recvbuf, recvcounts, datatype, op, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_REDUCE_SCATTER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_reduce_scatter_block
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Reduce_scatter_block hook of this netmod: forwards to MPIR_Reduce_scatter_block. */
+static inline int MPIDI_NM_reduce_scatter_block(const void *sendbuf, void *recvbuf,
+                                                int recvcount, MPI_Datatype datatype,
+                                                MPI_Op op, MPIR_Comm * comm_ptr,
+                                                MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_REDUCE_SCATTER_BLOCK);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_REDUCE_SCATTER_BLOCK);
+    rc = MPIR_Reduce_scatter_block(sendbuf, recvbuf, recvcount, datatype,
+                                   op, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_REDUCE_SCATTER_BLOCK);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_scan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Scan hook of this netmod: forwards to MPIR_Scan and returns its result. */
+static inline int MPIDI_NM_scan(const void *sendbuf, void *recvbuf, int count,
+                                MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_SCAN);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_SCAN);
+    rc = MPIR_Scan(sendbuf, recvbuf, count, datatype, op, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_SCAN);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_exscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Exscan hook of this netmod: forwards to MPIR_Exscan and returns its result. */
+static inline int MPIDI_NM_exscan(const void *sendbuf, void *recvbuf, int count,
+                                  MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                  MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_EXSCAN);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_EXSCAN);
+    rc = MPIR_Exscan(sendbuf, recvbuf, count, datatype, op, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_EXSCAN);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Neighbor_allgather hook of this netmod: forwards every argument to
+ * MPIR_Neighbor_allgather_impl and returns its result. */
+static inline int MPIDI_NM_neighbor_allgather(const void *sendbuf, int sendcount,
+                                              MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                              MPI_Datatype recvtype, MPIR_Comm * comm_ptr)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLGATHER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLGATHER);
+    rc = MPIR_Neighbor_allgather_impl(sendbuf, sendcount, sendtype,
+                                      recvbuf, recvcount, recvtype, comm_ptr);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLGATHER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Neighbor_allgatherv hook: forwards to MPIR_Neighbor_allgatherv_impl. */
+static inline int MPIDI_NM_neighbor_allgatherv(const void *sendbuf, int sendcount,
+                                               MPI_Datatype sendtype, void *recvbuf,
+                                               const int recvcounts[], const int displs[],
+                                               MPI_Datatype recvtype, MPIR_Comm * comm_ptr)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLGATHERV);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLGATHERV);
+    rc = MPIR_Neighbor_allgatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                       recvcounts, displs, recvtype, comm_ptr);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLGATHERV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the blocking neighborhood alltoall.  Forwards all
+ * arguments to MPIR_Neighbor_alltoall_impl and returns its error code.
+ */
+static inline int MPIDI_NM_neighbor_alltoall(const void *sendbuf, int sendcount,
+                                             MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                             MPI_Datatype recvtype, MPIR_Comm * comm_ptr)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLTOALL);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Neighbor_alltoall_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                     recvcount, recvtype, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLTOALL);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the blocking neighborhood alltoallv.  Forwards all
+ * arguments to MPIR_Neighbor_alltoallv_impl and returns its error code.
+ */
+static inline int MPIDI_NM_neighbor_alltoallv(const void *sendbuf, const int sendcounts[],
+                                              const int sdispls[], MPI_Datatype sendtype,
+                                              void *recvbuf, const int recvcounts[],
+                                              const int rdispls[], MPI_Datatype recvtype,
+                                              MPIR_Comm * comm_ptr)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLTOALLV);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Neighbor_alltoallv_impl(sendbuf, sendcounts, sdispls, sendtype, recvbuf,
+                                      recvcounts, rdispls, recvtype, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLTOALLV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the blocking neighborhood alltoallw (MPI_Aint
+ * displacements, per-neighbor datatypes).  Forwards all arguments to
+ * MPIR_Neighbor_alltoallw_impl and returns its error code.
+ */
+static inline int MPIDI_NM_neighbor_alltoallw(const void *sendbuf, const int sendcounts[],
+                                              const MPI_Aint sdispls[],
+                                              const MPI_Datatype sendtypes[], void *recvbuf,
+                                              const int recvcounts[], const MPI_Aint rdispls[],
+                                              const MPI_Datatype recvtypes[], MPIR_Comm * comm_ptr)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLTOALLW);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLTOALLW);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Neighbor_alltoallw_impl(sendbuf, sendcounts, sdispls, sendtypes, recvbuf,
+                                      recvcounts, rdispls, recvtypes, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLTOALLW);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking neighborhood allgather.  Forwards all
+ * arguments (including the request out-parameter) to
+ * MPIR_Ineighbor_allgather_impl and returns its error code.
+ */
+static inline int MPIDI_NM_ineighbor_allgather(const void *sendbuf, int sendcount,
+                                               MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                               MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                               MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLGATHER);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Ineighbor_allgather_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                       recvcount, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLGATHER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking neighborhood allgatherv.  Forwards all
+ * arguments to MPIR_Ineighbor_allgatherv_impl and returns its error code.
+ */
+static inline int MPIDI_NM_ineighbor_allgatherv(const void *sendbuf, int sendcount,
+                                                MPI_Datatype sendtype, void *recvbuf,
+                                                const int recvcounts[], const int displs[],
+                                                MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                                MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLGATHERV);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Ineighbor_allgatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                        recvcounts, displs, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLGATHERV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking neighborhood alltoall.  Forwards all
+ * arguments to MPIR_Ineighbor_alltoall_impl and returns its error code.
+ */
+static inline int MPIDI_NM_ineighbor_alltoall(const void *sendbuf, int sendcount,
+                                              MPI_Datatype sendtype, void *recvbuf,
+                                              int recvcount, MPI_Datatype recvtype,
+                                              MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLTOALL);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Ineighbor_alltoall_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                      recvcount, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLTOALL);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking neighborhood alltoallv.  Forwards all
+ * arguments to MPIR_Ineighbor_alltoallv_impl and returns its error code.
+ */
+static inline int MPIDI_NM_ineighbor_alltoallv(const void *sendbuf, const int sendcounts[],
+                                               const int sdispls[], MPI_Datatype sendtype,
+                                               void *recvbuf, const int recvcounts[],
+                                               const int rdispls[], MPI_Datatype recvtype,
+                                               MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLTOALLV);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Ineighbor_alltoallv_impl(sendbuf, sendcounts, sdispls, sendtype, recvbuf,
+                                       recvcounts, rdispls, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLTOALLV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking neighborhood alltoallw (MPI_Aint
+ * displacements, per-neighbor datatypes).  Forwards all arguments to
+ * MPIR_Ineighbor_alltoallw_impl and returns its error code.
+ */
+static inline int MPIDI_NM_ineighbor_alltoallw(const void *sendbuf, const int sendcounts[],
+                                               const MPI_Aint sdispls[],
+                                               const MPI_Datatype sendtypes[], void *recvbuf,
+                                               const int recvcounts[], const MPI_Aint rdispls[],
+                                               const MPI_Datatype recvtypes[], MPIR_Comm * comm_ptr,
+                                               MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLTOALLW);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLTOALLW);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Ineighbor_alltoallw_impl(sendbuf, sendcounts, sdispls, sendtypes, recvbuf,
+                                       recvcounts, rdispls, recvtypes, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLTOALLW);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ibarrier
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking barrier.  Forwards the communicator and
+ * request pointer to MPIR_Ibarrier_impl and returns its error code.
+ */
+static inline int MPIDI_NM_ibarrier(MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IBARRIER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IBARRIER);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Ibarrier_impl(comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IBARRIER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ibcast
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking broadcast.  Forwards all arguments to
+ * MPIR_Ibcast_impl and returns its error code.
+ */
+static inline int MPIDI_NM_ibcast(void *buffer, int count, MPI_Datatype datatype,
+                                  int root, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IBCAST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IBCAST);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Ibcast_impl(buffer, count, datatype,
+                          root, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IBCAST);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iallgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking allgather.  Forwards all arguments to
+ * MPIR_Iallgather_impl and returns its error code.
+ */
+static inline int MPIDI_NM_iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                      void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLGATHER);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Iallgather_impl(sendbuf, sendcount, sendtype,
+                              recvbuf, recvcount, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLGATHER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iallreduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking allreduce.  Forwards all arguments to
+ * MPIR_Iallreduce_impl and returns its error code.
+ *
+ * The function-state id is MPID_STATE_NM_IALLREDUCE so that enter/exit
+ * tracing is attributed to this routine rather than to MPIDI_NM_ireduce,
+ * which owns MPID_STATE_NM_IREDUCE.
+ */
+static inline int MPIDI_NM_iallreduce(const void *sendbuf, void *recvbuf, int count,
+                                      MPI_Datatype datatype, MPI_Op op,
+                                      MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLREDUCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLREDUCE);
+
+    /* delegate to the generic collective */
+    mpi_errno = MPIR_Iallreduce_impl(sendbuf, recvbuf, count, datatype, op, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLREDUCE);
+    return mpi_errno;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iallgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking allgatherv.  Forwards all arguments to
+ * MPIR_Iallgatherv_impl and returns its error code.
+ */
+static inline int MPIDI_NM_iallgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                       void *recvbuf, const int *recvcounts, const int *displs,
+                                       MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                       MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLGATHERV);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Iallgatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
+                               recvcounts, displs, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLGATHERV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ialltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking alltoall.  Forwards all arguments to
+ * MPIR_Ialltoall_impl and returns its error code.
+ */
+static inline int MPIDI_NM_ialltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLTOALL);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Ialltoall_impl(sendbuf, sendcount, sendtype,
+                             recvbuf, recvcount, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLTOALL);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ialltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking alltoallv.  Forwards all arguments to
+ * MPIR_Ialltoallv_impl and returns its error code.
+ */
+static inline int MPIDI_NM_ialltoallv(const void *sendbuf, const int *sendcounts,
+                                      const int *sdispls, MPI_Datatype sendtype,
+                                      void *recvbuf, const int *recvcounts,
+                                      const int *rdispls, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLTOALLV);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Ialltoallv_impl(sendbuf, sendcounts, sdispls, sendtype,
+                              recvbuf, recvcounts, rdispls, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLTOALLV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ialltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking alltoallw (int displacements, per-peer
+ * datatypes).  Forwards all arguments to MPIR_Ialltoallw_impl and returns
+ * its error code.
+ */
+static inline int MPIDI_NM_ialltoallw(const void *sendbuf, const int *sendcounts,
+                                      const int *sdispls, const MPI_Datatype sendtypes[],
+                                      void *recvbuf, const int *recvcounts,
+                                      const int *rdispls, const MPI_Datatype recvtypes[],
+                                      MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLTOALLW);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLTOALLW);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Ialltoallw_impl(sendbuf, sendcounts, sdispls, sendtypes,
+                              recvbuf, recvcounts, rdispls, recvtypes, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLTOALLW);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iexscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking exclusive scan.  Forwards all arguments
+ * to MPIR_Iexscan_impl and returns its error code.
+ */
+static inline int MPIDI_NM_iexscan(const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                   MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IEXSCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IEXSCAN);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Iexscan_impl(sendbuf, recvbuf, count,
+                           datatype, op, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IEXSCAN);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_igather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking gather.  Forwards all arguments to
+ * MPIR_Igather_impl and returns its error code.
+ */
+static inline int MPIDI_NM_igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                   int root, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IGATHER);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Igather_impl(sendbuf, sendcount, sendtype,
+                           recvbuf, recvcount, recvtype, root, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IGATHER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_igatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking gatherv.  Forwards all arguments to
+ * MPIR_Igatherv_impl and returns its error code.
+ */
+static inline int MPIDI_NM_igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, const int *recvcounts, const int *displs,
+                                    MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr,
+                                    MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IGATHERV);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Igatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
+                            recvcounts, displs, recvtype, root, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IGATHERV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ireduce_scatter_block
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking reduce_scatter_block.  Forwards all
+ * arguments to MPIR_Ireduce_scatter_block_impl and returns its error code.
+ */
+static inline int MPIDI_NM_ireduce_scatter_block(const void *sendbuf, void *recvbuf,
+                                                 int recvcount, MPI_Datatype datatype,
+                                                 MPI_Op op, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IREDUCE_SCATTER_BLOCK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IREDUCE_SCATTER_BLOCK);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Ireduce_scatter_block_impl(sendbuf, recvbuf, recvcount, datatype,
+                                         op, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IREDUCE_SCATTER_BLOCK);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ireduce_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking reduce_scatter.  Forwards all arguments
+ * to MPIR_Ireduce_scatter_impl and returns its error code.
+ */
+static inline int MPIDI_NM_ireduce_scatter(const void *sendbuf, void *recvbuf,
+                                           const int recvcounts[], MPI_Datatype datatype,
+                                           MPI_Op op, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IREDUCE_SCATTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IREDUCE_SCATTER);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Ireduce_scatter_impl(sendbuf, recvbuf, recvcounts,
+                                   datatype, op, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IREDUCE_SCATTER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ireduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking reduce.  Forwards all arguments to
+ * MPIR_Ireduce_impl and returns its error code.
+ */
+static inline int MPIDI_NM_ireduce(const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, int root,
+                                   MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IREDUCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IREDUCE);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Ireduce_impl(sendbuf, recvbuf, count,
+                           datatype, op, root, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IREDUCE);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking inclusive scan.  Forwards all arguments
+ * to MPIR_Iscan_impl and returns its error code.
+ */
+static inline int MPIDI_NM_iscan(const void *sendbuf, void *recvbuf, int count,
+                                 MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                 MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ISCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ISCAN);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Iscan_impl(sendbuf, recvbuf, count,
+                         datatype, op, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ISCAN);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iscatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking scatter.  Forwards all arguments to
+ * MPIR_Iscatter_impl and returns its error code.
+ */
+static inline int MPIDI_NM_iscatter(const void *sendbuf, int sendcount,
+                                    MPI_Datatype sendtype, void *recvbuf,
+                                    int recvcount, MPI_Datatype recvtype,
+                                    int root, MPIR_Comm * comm, MPI_Request * request)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ISCATTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ISCATTER);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Iscatter_impl(sendbuf, sendcount, sendtype,
+                            recvbuf, recvcount, recvtype, root, comm, request);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ISCATTER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iscatterv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook for the nonblocking scatterv.  Forwards all arguments to
+ * MPIR_Iscatterv_impl and returns its error code.
+ */
+static inline int MPIDI_NM_iscatterv(const void *sendbuf, const int *sendcounts,
+                                     const int *displs, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount,
+                                     MPI_Datatype recvtype, int root,
+                                     MPIR_Comm * comm, MPI_Request * request)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ISCATTERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ISCATTERV);
+
+    /* delegate to the generic collective */
+    rc = MPIR_Iscatterv_impl(sendbuf, sendcounts, displs, sendtype, recvbuf,
+                             recvcount, recvtype, root, comm, request);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ISCATTERV);
+    return rc;
+}
+
+#endif /* NETMOD_PTL_COLL_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_comm.h b/src/mpid/ch4/netmod/portals4/ptl_comm.h
new file mode 100644
index 0000000..651ab1f
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_comm.h
@@ -0,0 +1,39 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_PTL_COMM_H_INCLUDED
+#define NETMOD_PTL_COMM_H_INCLUDED
+
+#include "ptl_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_comm_create
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook run when a communicator is created.  The only work done here
+ * is the call to MPIDI_CH4U_init_comm, whose error code is returned.
+ */
+static inline int MPIDI_NM_comm_create(MPIR_Comm * comm)
+{
+    return MPIDI_CH4U_init_comm(comm);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_comm_destroy
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Netmod hook run when a communicator is destroyed.  The only work done
+ * here is the call to MPIDI_CH4U_destroy_comm, whose error code is returned.
+ */
+static inline int MPIDI_NM_comm_destroy(MPIR_Comm * comm)
+{
+    return MPIDI_CH4U_destroy_comm(comm);
+}
+
+
+#endif /* NETMOD_PTL_COMM_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_datatype.h b/src/mpid/ch4/netmod/portals4/ptl_datatype.h
new file mode 100644
index 0000000..3e529d6
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_datatype.h
@@ -0,0 +1,28 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ */
+#ifndef NETMOD_PTL_DATATYPE_H_INCLUDED
+#define NETMOD_PTL_DATATYPE_H_INCLUDED
+
+#include "ptl_impl.h"
+
+/* Netmod hook for datatype destruction; this implementation does no work. */
+static inline void MPIDI_NM_datatype_destroy(MPIR_Datatype * datatype_p)
+{
+    /* intentionally empty */
+}
+
+/* Netmod hook for datatype commit; this implementation does no work. */
+static inline void MPIDI_NM_datatype_commit(MPIR_Datatype * datatype_p)
+{
+    /* intentionally empty */
+}
+
+/* Netmod hook for datatype duplication; this implementation does no work. */
+static inline void MPIDI_NM_datatype_dup(MPIR_Datatype * old_datatype_p,
+                                         MPIR_Datatype * new_datatype_p)
+{
+    /* intentionally empty */
+}
+
+#endif /* NETMOD_PTL_DATATYPE_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_impl.h b/src/mpid/ch4/netmod/portals4/ptl_impl.h
new file mode 100644
index 0000000..ddcf713
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_impl.h
@@ -0,0 +1,58 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_PTL_IMPL_H_INCLUDED
+#define NETMOD_PTL_IMPL_H_INCLUDED
+
+#include <mpidimpl.h>
+#include "portals4.h"
+
+#define MPIDI_PTL_CASE_STR(x) case x: return #x
+
+/*
+ * Translate a Portals4 return code into the spelling of its constant
+ * (e.g. PTL_NO_SPACE -> "PTL_NO_SPACE").  Codes not listed below yield
+ * "UNKNOWN".  The returned pointer refers to a string literal.
+ */
+static inline const char *MPIDI_PTL_strerror(int ret)
+{
+    /* each MPIDI_PTL_CASE_STR(x) expands to: case x: return "x" */
+    switch (ret) {
+        MPIDI_PTL_CASE_STR(PTL_OK);
+        MPIDI_PTL_CASE_STR(PTL_ARG_INVALID);
+        MPIDI_PTL_CASE_STR(PTL_CT_NONE_REACHED);
+        MPIDI_PTL_CASE_STR(PTL_EQ_DROPPED);
+        MPIDI_PTL_CASE_STR(PTL_EQ_EMPTY);
+        MPIDI_PTL_CASE_STR(PTL_FAIL);
+        MPIDI_PTL_CASE_STR(PTL_IN_USE);
+        MPIDI_PTL_CASE_STR(PTL_INTERRUPTED);
+        MPIDI_PTL_CASE_STR(PTL_IGNORED);
+        MPIDI_PTL_CASE_STR(PTL_LIST_TOO_LONG);
+        MPIDI_PTL_CASE_STR(PTL_NO_INIT);
+        MPIDI_PTL_CASE_STR(PTL_NO_SPACE);
+        MPIDI_PTL_CASE_STR(PTL_PID_IN_USE);
+        MPIDI_PTL_CASE_STR(PTL_PT_FULL);
+        MPIDI_PTL_CASE_STR(PTL_PT_EQ_NEEDED);
+        MPIDI_PTL_CASE_STR(PTL_PT_IN_USE);
+    default:
+        break;
+    }
+
+    /* no listed code matched */
+    return "UNKNOWN";
+}
+
+#define MPIDI_PTL_ERR  MPIR_ERR_CHKANDJUMP4
+
+#define MPIDI_PTL_CHK_STATUS(STATUS,STR)                        \
+    do {                                                                \
+        MPIDI_PTL_ERR(STATUS!=PTL_OK,                           \
+                              mpi_errno,                                \
+                              MPI_ERR_OTHER,                            \
+                              "**"#STR,                                 \
+                              "**"#STR" %s %d %s %s",                   \
+                              __FILE__,                                 \
+                              __LINE__,                                 \
+                              FCNAME,                                   \
+                              MPIDI_PTL_strerror(STATUS));      \
+    } while (0)
+
+#endif /* NETMOD_PTL_IMPL_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_init.h b/src/mpid/ch4/netmod/portals4/ptl_init.h
new file mode 100644
index 0000000..da36d4e
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_init.h
@@ -0,0 +1,271 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_PTL_INIT_H_INCLUDED
+#define NETMOD_PTL_INIT_H_INCLUDED
+
+#include "mpidch4r.h"
+#include "ptl_types.h"
+#include "ptl_impl.h"
+#include "portals4.h"
+
+/*
+ * Post overflow buffer number `i` as a match-list entry on the netmod's
+ * portal table entry.
+ *
+ * The entry covers MPIDI_PTL_global.overflow_bufs[i], accepts puts from any
+ * nid/pid/uid, and ignores every match bit.  The index `i` is passed as the
+ * entry's user pointer, and the resulting handle is stored in
+ * MPIDI_PTL_global.overflow_me_handles[i].
+ *
+ * Returns the PtlMEAppend status code unchanged.
+ */
+static inline int MPIDI_PTL_append_overflow(size_t i)
+{
+    ptl_process_t any_proc;
+    ptl_me_t entry;
+
+    /* wildcard initiator: any node, any process */
+    any_proc.phys.nid = PTL_NID_ANY;
+    any_proc.phys.pid = PTL_PID_ANY;
+
+    /* memory region backing this entry */
+    entry.start = MPIDI_PTL_global.overflow_bufs[i];
+    entry.length = MPIDI_PTL_OVERFLOW_BUFFER_SZ;
+    entry.min_free = MPIDI_PTL_MAX_AM_EAGER_SZ;
+
+    /* matching criteria: match everything */
+    entry.match_id = any_proc;
+    entry.uid = PTL_UID_ANY;
+    entry.match_bits = 0;
+    entry.ignore_bits = ~((ptl_match_bits_t) 0);
+
+    /* behavior flags; no counting event attached */
+    entry.ct_handle = PTL_CT_NONE;
+    entry.options = (PTL_ME_OP_PUT | PTL_ME_MANAGE_LOCAL | PTL_ME_NO_TRUNCATE |
+                     PTL_ME_MAY_ALIGN | PTL_ME_IS_ACCESSIBLE | PTL_ME_EVENT_LINK_DISABLE);
+
+    return PtlMEAppend(MPIDI_PTL_global.ni, MPIDI_PTL_global.pt, &entry,
+                       PTL_OVERFLOW_LIST, (void *) i,
+                       &MPIDI_PTL_global.overflow_me_handles[i]);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_init - bring up the Portals4 netmod.
+ *
+ * In order, this routine:
+ *   1. asserts that ptl_iovec_t and MPL_IOV have the same size and layout;
+ *   2. initializes Portals (PtlInit, PtlNIInit), allocates two event queues
+ *      (index 0 origin, index 1 target), a portal table entry, and binds an
+ *      MD spanning all of memory;
+ *   3. publishes this process's NID/PID/portal index through the PMI KVS
+ *      under the key "PTL-<rank>", then reads every rank's card into
+ *      MPIDI_PTL_global.addr_table;
+ *   4. calls MPIDI_CH4U_init and posts the overflow buffers;
+ *   5. builds the node map via MPIDI_CH4U_build_nodemap.
+ *
+ * Parameters appnum, tag_ub and spawned are not used by this function.
+ *
+ * Returns MPI_SUCCESS, or the error code set by a failed
+ * MPIDI_PTL_CHK_STATUS check or by MPIDI_CH4U_build_nodemap.
+ *
+ * NOTE(review): return codes of the PMI_*, PtlGetId, MPL_str_* and
+ * MPIDI_PTL_append_overflow calls are stored or dropped but never checked,
+ * and MPL_malloc results are used without a NULL check.
+ */
+static inline int MPIDI_NM_init(int rank,
+                                int size,
+                                int appnum,
+                                int *tag_ub,
+                                MPIR_Comm * comm_world,
+                                MPIR_Comm * comm_self,
+                                int spawned, int num_contexts, void **netmod_contexts)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int ret;
+    ptl_md_t md;
+    ptl_ni_limits_t desired;    /* only referenced by the disabled pre-init code below */
+    ptl_process_t my_ptl_id;
+    int key_max_sz;
+    int val_max_sz, val_sz_left;
+    int name_max_sz;
+    int len, i;
+    /* keyS/valS must start out NULL: a failed MPIDI_PTL_CHK_STATUS before the
+     * buffers are allocated reaches fn_exit, which frees both pointers. */
+    char *keyS = NULL, *valS = NULL, *buscard;
+
+    /* Make sure our IOV is the same as portals4's IOV */
+    MPIR_Assert(sizeof(ptl_iovec_t) == sizeof(MPL_IOV));
+    MPIR_Assert(((void *) &(((ptl_iovec_t *) 0)->iov_base)) ==
+                ((void *) &(((MPL_IOV *) 0)->MPL_IOV_BUF)));
+    MPIR_Assert(((void *) &(((ptl_iovec_t *) 0)->iov_len)) ==
+                ((void *) &(((MPL_IOV *) 0)->MPL_IOV_LEN)));
+    MPIR_Assert(sizeof(((ptl_iovec_t *) 0)->iov_len) == sizeof(((MPL_IOV *) 0)->MPL_IOV_LEN));
+
+    /* init portals */
+    ret = PtlInit();
+    MPIDI_PTL_CHK_STATUS(ret, PtlInit);
+
+    /* /\* do an interface pre-init to get the default limits struct *\/ */
+    /* ret = PtlNIInit(PTL_IFACE_DEFAULT, PTL_NI_MATCHING | PTL_NI_PHYSICAL, */
+    /*                 PTL_PID_ANY, NULL, &desired, &MPIDI_PTL_global.ni_handle); */
+    /* MPIDI_PTL_CHK_STATUS(ret, PtlNIInit); */
+
+    /* /\* finalize the interface so we can re-init with our desired maximums *\/ */
+    /* ret = PtlNIFini(MPIDI_PTL_global.ni_handle); */
+    /* MPIDI_PTL_CHK_STATUS(ret, PtlNIFini); */
+
+    /* /\* set higher limits if they are determined to be too low *\/ */
+    /* if (desired.max_unexpected_headers < UNEXPECTED_HDR_COUNT && getenv("PTL_LIM_MAX_UNEXPECTED_HEADERS") == NULL) */
+    /*     desired.max_unexpected_headers = UNEXPECTED_HDR_COUNT; */
+    /* if (desired.max_list_size < LIST_SIZE && getenv("PTL_LIM_MAX_LIST_SIZE") == NULL) */
+    /*     desired.max_list_size = LIST_SIZE; */
+    /* if (desired.max_entries < ENTRY_COUNT && getenv("PTL_LIM_MAX_ENTRIES") == NULL) */
+    /*     desired.max_entries = ENTRY_COUNT; */
+
+    /* do the real init */
+    ret = PtlNIInit(PTL_IFACE_DEFAULT, PTL_NI_MATCHING | PTL_NI_PHYSICAL,
+                    PTL_PID_ANY, NULL, &MPIDI_PTL_global.ni_limits, &MPIDI_PTL_global.ni);
+    MPIDI_PTL_CHK_STATUS(ret, PtlNIInit);
+
+    /* allocate EQs: 0 is origin, 1 is target */
+    ret = PtlEQAlloc(MPIDI_PTL_global.ni, MPIDI_PTL_EVENT_COUNT, &MPIDI_PTL_global.eqs[0]);
+    MPIDI_PTL_CHK_STATUS(ret, PtlEQAlloc);
+    ret = PtlEQAlloc(MPIDI_PTL_global.ni, MPIDI_PTL_EVENT_COUNT, &MPIDI_PTL_global.eqs[1]);
+    MPIDI_PTL_CHK_STATUS(ret, PtlEQAlloc);
+
+    /* allocate portal */
+    ret =
+        PtlPTAlloc(MPIDI_PTL_global.ni,
+                   PTL_PT_ONLY_USE_ONCE | PTL_PT_ONLY_TRUNCATE | PTL_PT_FLOWCTRL,
+                   MPIDI_PTL_global.eqs[1], PTL_PT_ANY, &MPIDI_PTL_global.pt);
+    MPIDI_PTL_CHK_STATUS(ret, PtlPTAlloc);
+
+    /* create an MD that covers all of memory */
+    md.start = NULL;
+    md.length = PTL_SIZE_MAX;
+    md.options = 0x0;
+    md.eq_handle = MPIDI_PTL_global.eqs[0];
+    md.ct_handle = PTL_CT_NONE;
+    ret = PtlMDBind(MPIDI_PTL_global.ni, &md, &MPIDI_PTL_global.md);
+    MPIDI_PTL_CHK_STATUS(ret, PtlMDBind);
+
+    /* create business card */
+    ret = PMI_KVS_Get_key_length_max(&key_max_sz);
+    ret = PMI_KVS_Get_value_length_max(&val_max_sz);
+    ret = PMI_KVS_Get_name_length_max(&name_max_sz);
+    MPIDI_PTL_global.kvsname = MPL_malloc(name_max_sz);
+    ret = PMI_KVS_Get_my_name(MPIDI_PTL_global.kvsname, name_max_sz);
+
+    keyS = MPL_malloc(key_max_sz);
+    valS = MPL_malloc(val_max_sz);
+    /* buscard is the write cursor into valS; val_sz_left tracks remaining room */
+    buscard = valS;
+    val_sz_left = val_max_sz;
+
+    ret = PtlGetId(MPIDI_PTL_global.ni, &my_ptl_id);
+    ret =
+        MPL_str_add_binary_arg(&buscard, &val_sz_left, "NID", (char *) &my_ptl_id.phys.nid,
+                               sizeof(my_ptl_id.phys.nid));
+    ret =
+        MPL_str_add_binary_arg(&buscard, &val_sz_left, "PID", (char *) &my_ptl_id.phys.pid,
+                               sizeof(my_ptl_id.phys.pid));
+    ret =
+        MPL_str_add_binary_arg(&buscard, &val_sz_left, "PTI", (char *) &MPIDI_PTL_global.pt,
+                               sizeof(MPIDI_PTL_global.pt));
+
+    sprintf(keyS, "PTL-%d", rank);
+    /* rewind to the start of the assembled card before publishing it */
+    buscard = valS;
+    ret = PMI_KVS_Put(MPIDI_PTL_global.kvsname, keyS, buscard);
+    ret = PMI_KVS_Commit(MPIDI_PTL_global.kvsname);
+    ret = PMI_Barrier();
+
+    /* get and store business cards in address table */
+    MPIDI_PTL_global.addr_table = MPL_malloc(size * sizeof(MPIDI_PTL_addr_t));
+    for (i = 0; i < size; i++) {
+        sprintf(keyS, "PTL-%d", i);
+        ret = PMI_KVS_Get(MPIDI_PTL_global.kvsname, keyS, valS, val_max_sz);
+        MPL_str_get_binary_arg(valS, "NID",
+                               (char *) &MPIDI_PTL_global.addr_table[i].process.phys.nid,
+                               sizeof(MPIDI_PTL_global.addr_table[i].process.phys.nid), &len);
+        MPL_str_get_binary_arg(valS, "PID",
+                               (char *) &MPIDI_PTL_global.addr_table[i].process.phys.pid,
+                               sizeof(MPIDI_PTL_global.addr_table[i].process.phys.pid), &len);
+        MPL_str_get_binary_arg(valS, "PTI", (char *) &MPIDI_PTL_global.addr_table[i].pt,
+                               sizeof(MPIDI_PTL_global.addr_table[i].pt), &len);
+    }
+
+    /* Setup CH4R Active Messages */
+    MPIDI_CH4U_init(comm_world, comm_self, num_contexts, netmod_contexts);
+    for (i = 0; i < MPIDI_PTL_NUM_OVERFLOW_BUFFERS; i++) {
+        MPIDI_PTL_global.overflow_bufs[i] = MPL_malloc(MPIDI_PTL_OVERFLOW_BUFFER_SZ);
+        MPIDI_PTL_append_overflow(i);
+    }
+
+    MPIDI_PTL_global.node_map = MPL_malloc(size * sizeof(*MPIDI_PTL_global.node_map));
+    mpi_errno =
+        MPIDI_CH4U_build_nodemap(rank, comm_world, size, MPIDI_PTL_global.node_map,
+                                 &MPIDI_PTL_global.max_node_id);
+
+  fn_exit:
+    /* both pointers are either NULL or owned by this function at this point */
+    MPL_free(keyS);
+    MPL_free(valS);
+
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+static inline int MPIDI_NM_finalize(void)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int ret, i;
+    /* Tears the netmod down: communicators, CH4U state, Portals objects, then heap tables. */
+    MPIR_Comm_release(MPIR_Process.comm_world);
+    MPIR_Comm_release(MPIR_Process.comm_self);
+    /* NOTE(review): 'ret' is assigned below but never checked; Portals errors are ignored. */
+    MPIDI_CH4U_finalize();
+    /* Unlink every overflow ME handle and free the overflow buffer stored at the same index. */
+    for (i = 0; i < MPIDI_PTL_NUM_OVERFLOW_BUFFERS; i++) {
+        ret = PtlMEUnlink(MPIDI_PTL_global.overflow_me_handles[i]);
+        MPL_free(MPIDI_PTL_global.overflow_bufs[i]);
+    }
+    ret = PtlMDRelease(MPIDI_PTL_global.md);
+    ret = PtlPTFree(MPIDI_PTL_global.ni, MPIDI_PTL_global.pt);
+    ret = PtlEQFree(MPIDI_PTL_global.eqs[1]);
+    ret = PtlEQFree(MPIDI_PTL_global.eqs[0]);
+    ret = PtlNIFini(MPIDI_PTL_global.ni);
+    PtlFini();
+    /* Free the node map, address table and KVS name held in MPIDI_PTL_global. */
+    MPL_free(MPIDI_PTL_global.node_map);
+    MPL_free(MPIDI_PTL_global.addr_table);
+    MPL_free(MPIDI_PTL_global.kvsname);
+    /* mpi_errno is never changed above, so this always returns MPI_SUCCESS. */
+    return mpi_errno;
+}
+
+
+static inline int MPIDI_NM_comm_get_lpid(MPIR_Comm * comm_ptr,
+                                         int idx, int *lpid_ptr, MPL_bool is_remote)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; *lpid_ptr is never written */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_NM_gpid_get(MPIR_Comm * comm_ptr, int rank, MPIR_Gpid * gpid)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; *gpid is never written */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_NM_get_node_id(MPIR_Comm * comm, int rank, MPID_Node_id_t * id_p)
+{
+    *id_p = MPIDI_PTL_global.node_map[rank];    /* 'comm' is unused; 'rank' is not range-checked */
+    return MPI_SUCCESS;
+}
+
+static inline int MPIDI_NM_get_max_node_id(MPIR_Comm * comm, MPID_Node_id_t * max_id_p)
+{
+    *max_id_p = MPIDI_PTL_global.max_node_id;   /* global value; 'comm' is unused */
+    return MPI_SUCCESS;
+}
+
+static inline int MPIDI_NM_getallincomm(MPIR_Comm * comm_ptr,
+                                        int local_size, MPIR_Gpid local_gpids[], int *singleAVT)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; no output argument is written */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_NM_gpid_tolpidarray(int size, MPIR_Gpid gpid[], int lpid[])
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; lpid[] is never filled */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_NM_create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr,
+                                                       int size, const int lpids[])
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; newcomm_ptr is not modified */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_NM_free_mem(void *ptr)
+{
+    return MPIDI_CH4U_free_mem(ptr);    /* plain pass-through to the CH4U layer */
+}
+
+static inline void *MPIDI_NM_alloc_mem(size_t size, MPIR_Info * info_ptr)
+{
+    return MPIDI_CH4U_alloc_mem(size, info_ptr);        /* plain pass-through to the CH4U layer */
+}
+
+
+#endif /* NETMOD_PTL_INIT_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_op.h b/src/mpid/ch4/netmod/portals4/ptl_op.h
new file mode 100644
index 0000000..a202468
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_op.h
@@ -0,0 +1,24 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ */
+#ifndef NETMOD_PTL_OP_H_INCLUDED
+#define NETMOD_PTL_OP_H_INCLUDED
+
+#include "ptl_impl.h"
+
+static inline void MPIDI_NM_op_destroy(MPIR_Op * op_p)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; 'op_p' is not touched */
+    return;
+}
+
+static inline void MPIDI_NM_op_commit(MPIR_Op * op_p)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; 'op_p' is not touched */
+    return;
+}
+
+#endif /* NETMOD_PTL_OP_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_probe.h b/src/mpid/ch4/netmod/portals4/ptl_probe.h
new file mode 100644
index 0000000..df4eae2
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_probe.h
@@ -0,0 +1,39 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_PTL_PROBE_H_INCLUDED
+#define NETMOD_PTL_PROBE_H_INCLUDED
+
+#include "ptl_impl.h"
+
+static inline int MPIDI_NM_probe(int source,
+                                 int tag, MPIR_Comm * comm, int context_offset, MPI_Status * status)
+{
+    return MPIDI_CH4U_probe(source, tag, comm, context_offset, status); /* plain pass-through */
+}
+
+/* Netmod improbe entry point: hands every argument, unchanged and in the same order, to
+ * MPIDI_CH4U_improbe and returns that call's result. */
+static inline int MPIDI_NM_improbe(int source, int tag, MPIR_Comm * comm, int context_offset,
+                                   int *flag, MPIR_Request ** message, MPI_Status * status)
+{
+    int mpi_errno = MPIDI_CH4U_improbe(source, tag, comm, context_offset, flag, message, status);
+    return mpi_errno;
+}
+
+/* Forwards all arguments unchanged to MPIDI_CH4U_iprobe and returns its result. */
+static inline int MPIDI_NM_iprobe(int source, int tag, MPIR_Comm * comm, int context_offset,
+                                  int *flag, MPI_Status * status)
+{
+    int mpi_errno = MPIDI_CH4U_iprobe(source, tag, comm, context_offset, flag, status);
+    return mpi_errno;
+}
+
+#endif /* NETMOD_PTL_PROBE_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_proc.h b/src/mpid/ch4/netmod/portals4/ptl_proc.h
new file mode 100644
index 0000000..9c59030
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_proc.h
@@ -0,0 +1,29 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_PTL_PROC_H_INCLUDED
+#define NETMOD_PTL_PROC_H_INCLUDED
+
+#include "ptl_impl.h"
+
+static inline int MPIDI_NM_rank_is_local(int rank, MPIR_Comm * comm)
+{
+    int ret;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPIDI_NETMOD_RANK_IS_LOCAL);
+    MPIR_FUNC_VERBOSE_ENTER(MPIDI_NETMOD_RANK_IS_LOCAL);
+    /* Stub: asserts unconditionally and never looks at 'rank' or 'comm'. */
+    MPIR_Assert(0);
+    ret = 0;                    /* value returned if the assertion does not abort */
+
+    MPIR_FUNC_VERBOSE_EXIT(MPIDI_NETMOD_RANK_IS_LOCAL);
+    return ret;
+}
+
+#endif /* NETMOD_PTL_PROC_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_progress.h b/src/mpid/ch4/netmod/portals4/ptl_progress.h
new file mode 100644
index 0000000..50163cb
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_progress.h
@@ -0,0 +1,185 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_PTL_PROGRESS_H_INCLUDED
+#define NETMOD_PTL_PROGRESS_H_INCLUDED
+
+#include "ptl_impl.h"
+
+/* Dispatches one incoming active message described by the Portals event 'e'.
+ *
+ * e->hdr_data carries the payload size (bits selected by MPIDI_PTL_MSG_SZ_MASK) and, in its
+ * top 8 bits, the index of the target handler.  The payload is the last data_sz bytes of the
+ * e->mlength bytes at e->start; e->start itself is passed to the handler.  The handler may
+ * repoint p_data/data_sz at a contiguous buffer or, when it clears is_contig, at an iovec
+ * array; the payload is then copied there.  Truncation is reported only through
+ * rreq->status.MPI_ERROR; the function itself returns MPI_SUCCESS. */
+static inline int MPIDI_PTL_am_handler(ptl_event_t * e)
+{
+    int mpi_errno = MPI_SUCCESS;        /* previously returned uninitialized */
+    MPIR_Request *rreq = NULL;
+    void *p_data, *in_data;
+    size_t data_sz, in_data_sz, done, curr_len, rem;
+    MPIDI_NM_am_completion_handler_fn cmpl_handler_fn = NULL;
+    struct iovec *iov;
+    int i, is_contig, iov_len;
+    int handler_id = e->hdr_data >> 56;
+
+    in_data_sz = data_sz = (e->hdr_data & MPIDI_PTL_MSG_SZ_MASK);
+    /* step in bytes through char *: arithmetic on void * is only a compiler extension */
+    in_data = p_data = ((char *) e->start + (e->mlength - data_sz));
+
+    /* an 8-bit index can exceed the handler table, so check it before indexing */
+    MPIR_Assert(handler_id >= 0 && handler_id < MPIDI_PTL_MAX_AM_HANDLERS);
+    MPIDI_PTL_global.am_handlers[handler_id] (e->start,
+                                              &p_data, &data_sz,
+                                              &is_contig, &cmpl_handler_fn, &rreq);
+
+    if (!rreq)
+        goto fn_exit;
+
+    /* no destination buffer or zero size: nothing to copy, just run the completion handler */
+    if ((!p_data || !data_sz) && cmpl_handler_fn) {
+        cmpl_handler_fn(rreq);
+        goto fn_exit;
+    }
+
+    if (is_contig) {
+        /* data_sz, as updated by the handler, bounds the copy */
+        rreq->status.MPI_ERROR = (in_data_sz > data_sz) ? MPI_ERR_TRUNCATE : MPI_SUCCESS;
+        data_sz = MPL_MIN(data_sz, in_data_sz);
+        MPIR_Memcpy(p_data, in_data, data_sz);
+        MPIR_STATUS_SET_COUNT(rreq->status, data_sz);
+    }
+    else {
+        /* p_data is an iovec array of data_sz entries; fill the entries in order */
+        done = 0;
+        rem = in_data_sz;
+        iov = (struct iovec *) p_data;
+        iov_len = data_sz;
+        for (i = 0; i < iov_len && rem > 0; i++) {
+            curr_len = MPL_MIN(rem, iov[i].iov_len);
+            MPIR_Memcpy(iov[i].iov_base, (char *) in_data + done, curr_len);
+            rem -= curr_len;
+            done += curr_len;
+        }
+
+        /* bytes still remaining did not fit into the iovec entries */
+        rreq->status.MPI_ERROR = rem ? MPI_ERR_TRUNCATE : MPI_SUCCESS;
+        MPIR_STATUS_SET_COUNT(rreq->status, done);
+    }
+
+    if (cmpl_handler_fn) {
+        cmpl_handler_fn(rreq);
+    }
+
+  fn_exit:
+    return mpi_errno;
+}
+
+/* Polls both event queues (zero timeout; 'netmod_context' and 'blocking' are unused) and
+ * dispatches each event until PtlEQPoll stops delivering one.  Always returns MPI_SUCCESS. */
+static inline int MPIDI_NM_progress(void *netmod_context, int blocking)
+{
+    ptl_event_t e;
+    unsigned int which;
+    int ret;                    /* PtlEQPoll status; only PTL_OK / PTL_EQ_DROPPED fill 'e' */
+    while ((ret = PtlEQPoll(MPIDI_PTL_global.eqs, 2, 0, &e, &which)) == PTL_OK
+           || ret == PTL_EQ_DROPPED) {  /* stop on PTL_EQ_EMPTY and on errors alike */
+        switch (e.type) {
+        case PTL_EVENT_PUT:     /* incoming active message */
+            MPIR_Assert(e.ptl_list == PTL_OVERFLOW_LIST);
+            MPIDI_PTL_am_handler(&e);
+            break;
+        case PTL_EVENT_ACK:     /* user_ptr carries the send request */
+            {
+                int count;
+                MPIR_Request *sreq = (MPIR_Request *) e.user_ptr;
+                int handler_id = sreq->dev.ch4.ch4u.netmod_am.portals4.handler_id;
+                MPIR_cc_decr(sreq->cc_ptr, &count);
+                MPIR_Assert(count >= 0);
+                if (count == 0) {
+                    MPIDI_CH4U_request_release(sreq);
+                    break;      /* leaves the switch without calling the send handler */
+                }
+                MPIDI_PTL_global.send_cmpl_handlers[handler_id] (sreq);
+            }
+            break;
+        case PTL_EVENT_AUTO_UNLINK:     /* user_ptr carries the overflow slot index */
+            MPIDI_PTL_global.overflow_me_handles[(size_t) e.user_ptr] = PTL_INVALID_HANDLE;
+            break;
+        case PTL_EVENT_AUTO_FREE:       /* re-append the overflow buffer of that slot */
+            MPIDI_PTL_append_overflow((size_t) e.user_ptr);
+            break;
+        case PTL_EVENT_SEND:
+            break;
+        default:
+            printf("ABORT: event = %d\n", e.type);
+            abort();
+        }
+    }
+    return MPI_SUCCESS;
+}
+
+static inline int MPIDI_NM_progress_test(void)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; no events are polled here */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_NM_progress_poke(void)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; no events are polled here */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+static inline void MPIDI_NM_progress_start(MPID_Progress_state * state)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; 'state' is not touched */
+    return;
+}
+
+static inline void MPIDI_NM_progress_end(MPID_Progress_state * state)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; 'state' is not touched */
+    return;
+}
+
+static inline int MPIDI_NM_progress_wait(MPID_Progress_state * state)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; 'state' is not touched */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_NM_progress_register(int (*progress_fn) (int *), int *id)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; *id is never written */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_NM_progress_deregister(int id)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; 'id' is ignored */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_NM_progress_activate(int id)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; 'id' is ignored */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_NM_progress_deactivate(int id)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; 'id' is ignored */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+#endif /* NETMOD_PTL_PROGRESS_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_recv.h b/src/mpid/ch4/netmod/portals4/ptl_recv.h
new file mode 100644
index 0000000..a239a3a
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_recv.h
@@ -0,0 +1,63 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_PTL_RECV_H_INCLUDED
+#define NETMOD_PTL_RECV_H_INCLUDED
+
+#include "ptl_impl.h"
+
+/* Netmod recv entry point: hands every argument, unchanged and in the same order, to
+ * MPIDI_CH4U_recv and returns that call's result. */
+static inline int MPIDI_NM_recv(void *buf, int count, MPI_Datatype datatype, int rank, int tag,
+                                MPIR_Comm * comm, int context_offset, MPI_Status * status,
+                                MPIR_Request ** request)
+{
+    int mpi_errno = MPIDI_CH4U_recv(buf, count, datatype, rank, tag, comm,
+                                    context_offset, status, request);
+    return mpi_errno;
+}
+
+static inline int MPIDI_NM_recv_init(void *buf,
+                                     int count,
+                                     MPI_Datatype datatype,
+                                     int rank,
+                                     int tag,
+                                     MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; *request is never written */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_NM_imrecv(void *buf,
+                                  int count,
+                                  MPI_Datatype datatype,
+                                  MPIR_Request * message, MPIR_Request ** rreqp)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; *rreqp is never written */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+/* Netmod irecv entry point: hands every argument, unchanged and in the same order, to
+ * MPIDI_CH4U_irecv and returns that call's result. */
+static inline int MPIDI_NM_irecv(void *buf, int count, MPI_Datatype datatype, int rank, int tag,
+                                 MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno = MPIDI_CH4U_irecv(buf, count, datatype, rank, tag, comm,
+                                     context_offset, request);
+    return mpi_errno;
+}
+
+static inline int MPIDI_NM_cancel_recv(MPIR_Request * rreq)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; 'rreq' is not touched */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+#endif /* NETMOD_PTL_RECV_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_request.h b/src/mpid/ch4/netmod/portals4/ptl_request.h
new file mode 100644
index 0000000..0e4a796
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_request.h
@@ -0,0 +1,32 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_PTL_REQUEST_H_INCLUDED
+#define NETMOD_PTL_REQUEST_H_INCLUDED
+
+#include "ptl_impl.h"
+
+static inline void MPIDI_NM_am_request_init(MPIR_Request * req)
+{
+    req->dev.ch4.ch4u.netmod_am.portals4.md = PTL_INVALID_HANDLE;       /* no MD attached yet */
+    req->dev.ch4.ch4u.netmod_am.portals4.pack_buffer = NULL;    /* no pack buffer yet */
+}
+
+static inline void MPIDI_NM_am_request_finalize(MPIR_Request * req)
+{
+    if (req->dev.ch4.ch4u.netmod_am.portals4.pack_buffer != NULL) {     /* a buffer is attached */
+        MPL_free(req->dev.ch4.ch4u.netmod_am.portals4.pack_buffer);
+    }
+    if (req->dev.ch4.ch4u.netmod_am.portals4.md != PTL_INVALID_HANDLE) {        /* an MD is set */
+        PtlMDRelease(req->dev.ch4.ch4u.netmod_am.portals4.md);
+    }
+}
+
+#endif /* NETMOD_PTL_REQUEST_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_rma.h b/src/mpid/ch4/netmod/portals4/ptl_rma.h
new file mode 100644
index 0000000..8be0450
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_rma.h
@@ -0,0 +1,148 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_PTL_RMA_H_INCLUDED
+#define NETMOD_PTL_RMA_H_INCLUDED
+
+#include "ptl_impl.h"
+
+/* Netmod put entry point: hands every argument, unchanged and in the same order, to
+ * MPIDI_CH4U_put and returns that call's result. */
+static inline int MPIDI_NM_put(const void *origin_addr, int origin_count,
+                               MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp,
+                               int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    int mpi_errno = MPIDI_CH4U_put(origin_addr, origin_count, origin_datatype, target_rank,
+                                   target_disp, target_count, target_datatype, win);
+    return mpi_errno;
+}
+
+/* Netmod get entry point: hands every argument, unchanged and in the same order, to
+ * MPIDI_CH4U_get and returns that call's result. */
+static inline int MPIDI_NM_get(void *origin_addr, int origin_count,
+                               MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp,
+                               int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    int mpi_errno = MPIDI_CH4U_get(origin_addr, origin_count, origin_datatype, target_rank,
+                                   target_disp, target_count, target_datatype, win);
+    return mpi_errno;
+}
+
+/* Netmod rput entry point: hands every argument, unchanged and in the same order, to
+ * MPIDI_CH4U_rput and returns that call's result. */
+static inline int MPIDI_NM_rput(const void *origin_addr, int origin_count,
+                                MPI_Datatype origin_datatype, int target_rank,
+                                MPI_Aint target_disp, int target_count,
+                                MPI_Datatype target_datatype, MPIR_Win * win,
+                                MPIR_Request ** request)
+{
+    int mpi_errno = MPIDI_CH4U_rput(origin_addr, origin_count, origin_datatype, target_rank,
+                                    target_disp, target_count, target_datatype, win, request);
+    return mpi_errno;
+}
+
+
+/* Forwards all arguments unchanged to MPIDI_CH4U_compare_and_swap and returns its result. */
+static inline int MPIDI_NM_compare_and_swap(const void *origin_addr, const void *compare_addr,
+                                            void *result_addr, MPI_Datatype datatype,
+                                            int target_rank, MPI_Aint target_disp, MPIR_Win * win)
+{
+    int mpi_errno = MPIDI_CH4U_compare_and_swap(origin_addr, compare_addr, result_addr,
+                                                datatype, target_rank, target_disp, win);
+    return mpi_errno;
+}
+
+/* Netmod raccumulate entry point: hands every argument, unchanged and in the same order, to
+ * MPIDI_CH4U_raccumulate and returns that call's result. */
+static inline int MPIDI_NM_raccumulate(const void *origin_addr, int origin_count,
+                                       MPI_Datatype origin_datatype, int target_rank,
+                                       MPI_Aint target_disp, int target_count,
+                                       MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win,
+                                       MPIR_Request ** request)
+{
+    int mpi_errno = MPIDI_CH4U_raccumulate(origin_addr, origin_count, origin_datatype,
+                                           target_rank, target_disp, target_count,
+                                           target_datatype, op, win, request);
+    return mpi_errno;
+}
+
+/* Netmod rget_accumulate entry point: hands every argument, unchanged and in the same order,
+ * to MPIDI_CH4U_rget_accumulate and returns that call's result. */
+static inline int MPIDI_NM_rget_accumulate(const void *origin_addr, int origin_count,
+                                           MPI_Datatype origin_datatype, void *result_addr,
+                                           int result_count, MPI_Datatype result_datatype,
+                                           int target_rank, MPI_Aint target_disp,
+                                           int target_count, MPI_Datatype target_datatype,
+                                           MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4U_rget_accumulate(origin_addr, origin_count, origin_datatype,
+                                           result_addr, result_count, result_datatype,
+                                           target_rank, target_disp, target_count,
+                                           target_datatype, op, win, request);
+    return mpi_errno;
+}
+
+/* Forwards all arguments unchanged to MPIDI_CH4U_fetch_and_op and returns its result. */
+static inline int MPIDI_NM_fetch_and_op(const void *origin_addr, void *result_addr,
+                                        MPI_Datatype datatype, int target_rank,
+                                        MPI_Aint target_disp, MPI_Op op, MPIR_Win * win)
+{
+    int mpi_errno = MPIDI_CH4U_fetch_and_op(origin_addr, result_addr, datatype,
+                                            target_rank, target_disp, op, win);
+    return mpi_errno;
+}
+
+
+/* Netmod rget entry point: hands every argument, unchanged and in the same order, to
+ * MPIDI_CH4U_rget and returns that call's result. */
+static inline int MPIDI_NM_rget(void *origin_addr, int origin_count,
+                                MPI_Datatype origin_datatype, int target_rank,
+                                MPI_Aint target_disp, int target_count,
+                                MPI_Datatype target_datatype, MPIR_Win * win,
+                                MPIR_Request ** request)
+{
+    int mpi_errno = MPIDI_CH4U_rget(origin_addr, origin_count, origin_datatype, target_rank,
+                                    target_disp, target_count, target_datatype, win, request);
+    return mpi_errno;
+}
+
+
+/* Netmod get_accumulate entry point: hands every argument, unchanged and in the same order,
+ * to MPIDI_CH4U_get_accumulate and returns that call's result. */
+static inline int MPIDI_NM_get_accumulate(const void *origin_addr, int origin_count,
+                                          MPI_Datatype origin_datatype, void *result_addr,
+                                          int result_count, MPI_Datatype result_datatype,
+                                          int target_rank, MPI_Aint target_disp,
+                                          int target_count, MPI_Datatype target_datatype,
+                                          MPI_Op op, MPIR_Win * win)
+{
+    int mpi_errno = MPIDI_CH4U_get_accumulate(origin_addr, origin_count, origin_datatype,
+                                              result_addr, result_count, result_datatype,
+                                              target_rank, target_disp, target_count,
+                                              target_datatype, op, win);
+
+    return mpi_errno;
+}
+
+/* Forwards all arguments unchanged to MPIDI_CH4U_accumulate and returns its result. */
+static inline int MPIDI_NM_accumulate(const void *origin_addr, int origin_count,
+                                      MPI_Datatype origin_datatype, int target_rank,
+                                      MPI_Aint target_disp, int target_count,
+                                      MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win)
+{
+    int mpi_errno = MPIDI_CH4U_accumulate(origin_addr, origin_count, origin_datatype,
+                                          target_rank, target_disp, target_count,
+                                          target_datatype, op, win);
+    return mpi_errno;
+}
+
+#endif /* NETMOD_PTL_RMA_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_send.h b/src/mpid/ch4/netmod/portals4/ptl_send.h
new file mode 100644
index 0000000..d910222
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_send.h
@@ -0,0 +1,128 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_PTL_SEND_H_INCLUDED
+#define NETMOD_PTL_SEND_H_INCLUDED
+
+#include "ptl_impl.h"
+
+/* Forwards all arguments unchanged to MPIDI_CH4U_send and returns its result. */
+static inline int MPIDI_NM_send(const void *buf, int count, MPI_Datatype datatype,
+                                int rank, int tag, MPIR_Comm * comm, int context_offset,
+                                MPIR_Request ** request)
+{
+    int mpi_errno = MPIDI_CH4U_send(buf, count, datatype, rank, tag, comm,
+                                    context_offset, request);
+    return mpi_errno;
+}
+
+/* Forwards all arguments unchanged to MPIDI_CH4U_rsend and returns its result. */
+static inline int MPIDI_NM_rsend(const void *buf, int count, MPI_Datatype datatype,
+                                 int rank, int tag, MPIR_Comm * comm, int context_offset,
+                                 MPIR_Request ** request)
+{
+    int mpi_errno = MPIDI_CH4U_rsend(buf, count, datatype, rank, tag, comm,
+                                     context_offset, request);
+    return mpi_errno;
+}
+
+
+
+/* Forwards all arguments unchanged to MPIDI_CH4U_irsend and returns its result. */
+static inline int MPIDI_NM_irsend(const void *buf, int count, MPI_Datatype datatype,
+                                  int rank, int tag, MPIR_Comm * comm, int context_offset,
+                                  MPIR_Request ** request)
+{
+    int mpi_errno = MPIDI_CH4U_irsend(buf, count, datatype, rank, tag, comm,
+                                      context_offset, request);
+    return mpi_errno;
+}
+
+/* Forwards all arguments unchanged to MPIDI_CH4U_ssend and returns its result. */
+static inline int MPIDI_NM_ssend(const void *buf, int count, MPI_Datatype datatype,
+                                 int rank, int tag, MPIR_Comm * comm, int context_offset,
+                                 MPIR_Request ** request)
+{
+    int mpi_errno = MPIDI_CH4U_ssend(buf, count, datatype, rank, tag, comm,
+                                     context_offset, request);
+    return mpi_errno;
+}
+
+static inline int MPIDI_NM_startall(int count, MPIR_Request * requests[])
+{
+    return MPIDI_CH4U_startall(count, requests);        /* plain pass-through to the CH4U layer */
+}
+
+/* Forwards all arguments unchanged to MPIDI_CH4U_send_init and returns its result. */
+static inline int MPIDI_NM_send_init(const void *buf, int count, MPI_Datatype datatype,
+                                     int rank, int tag, MPIR_Comm * comm, int context_offset,
+                                     MPIR_Request ** request)
+{
+    int mpi_errno = MPIDI_CH4U_send_init(buf, count, datatype, rank, tag, comm,
+                                         context_offset, request);
+    return mpi_errno;
+}
+
+/* Forwards all arguments unchanged to MPIDI_CH4U_ssend_init and returns its result. */
+static inline int MPIDI_NM_ssend_init(const void *buf, int count, MPI_Datatype datatype,
+                                      int rank, int tag, MPIR_Comm * comm, int context_offset,
+                                      MPIR_Request ** request)
+{
+    int mpi_errno = MPIDI_CH4U_ssend_init(buf, count, datatype, rank, tag, comm,
+                                          context_offset, request);
+    return mpi_errno;
+}
+
+/* Forwards all arguments unchanged to MPIDI_CH4U_bsend_init and returns its result. */
+static inline int MPIDI_NM_bsend_init(const void *buf, int count, MPI_Datatype datatype,
+                                      int rank, int tag, MPIR_Comm * comm, int context_offset,
+                                      MPIR_Request ** request)
+{
+    int mpi_errno = MPIDI_CH4U_bsend_init(buf, count, datatype, rank, tag, comm,
+                                          context_offset, request);
+    return mpi_errno;
+}
+
+/* Forwards all arguments unchanged to MPIDI_CH4U_rsend_init and returns its result. */
+static inline int MPIDI_NM_rsend_init(const void *buf, int count, MPI_Datatype datatype,
+                                      int rank, int tag, MPIR_Comm * comm, int context_offset,
+                                      MPIR_Request ** request)
+{
+    int mpi_errno = MPIDI_CH4U_rsend_init(buf, count, datatype, rank, tag, comm,
+                                          context_offset, request);
+    return mpi_errno;
+}
+
+/* Forwards all arguments unchanged to MPIDI_CH4U_isend and returns its result. */
+static inline int MPIDI_NM_isend(const void *buf, int count, MPI_Datatype datatype,
+                                 int rank, int tag, MPIR_Comm * comm, int context_offset,
+                                 MPIR_Request ** request)
+{
+    int mpi_errno = MPIDI_CH4U_isend(buf, count, datatype, rank, tag, comm,
+                                     context_offset, request);
+    return mpi_errno;
+}
+
+/* Forwards all arguments unchanged to MPIDI_CH4U_issend and returns its result. */
+static inline int MPIDI_NM_issend(const void *buf, int count, MPI_Datatype datatype,
+                                  int rank, int tag, MPIR_Comm * comm, int context_offset,
+                                  MPIR_Request ** request)
+{
+    int mpi_errno = MPIDI_CH4U_issend(buf, count, datatype, rank, tag, comm,
+                                      context_offset, request);
+    return mpi_errno;
+}
+
+static inline int MPIDI_NM_cancel_send(MPIR_Request * sreq)
+{
+    return MPIDI_CH4U_cancel_send(sreq);        /* plain pass-through to the CH4U layer */
+}
+
+#endif /* NETMOD_PTL_SEND_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_spawn.h b/src/mpid/ch4/netmod/portals4/ptl_spawn.h
new file mode 100644
index 0000000..b0ef76a
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_spawn.h
@@ -0,0 +1,50 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_PTL_SPAWN_H_INCLUDED
+#define NETMOD_PTL_SPAWN_H_INCLUDED
+
+#include "ptl_impl.h"
+
+static inline int MPIDI_NM_comm_connect(const char *port_name,
+                                        MPIR_Info * info,
+                                        int root, MPIR_Comm * comm, MPIR_Comm ** newcomm_ptr)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; *newcomm_ptr is never written */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_NM_comm_disconnect(MPIR_Comm * comm_ptr)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; 'comm_ptr' is not touched */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_NM_open_port(MPIR_Info * info_ptr, char *port_name)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; 'port_name' is never filled in */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_NM_close_port(const char *port_name)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; 'port_name' is ignored */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_NM_comm_accept(const char *port_name,
+                                       MPIR_Info * info,
+                                       int root, MPIR_Comm * comm, MPIR_Comm ** newcomm_ptr)
+{
+    MPIR_Assert(0);     /* stub: asserts unconditionally; *newcomm_ptr is never written */
+    return MPI_SUCCESS; /* reached only if the assertion does not abort */
+}
+
+#endif /* NETMOD_PTL_SPAWN_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_types.h b/src/mpid/ch4/netmod/portals4/ptl_types.h
new file mode 100644
index 0000000..be00d9a
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_types.h
@@ -0,0 +1,76 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ */
+
+#ifndef NETMOD_PTL_TYPES_H_INCLUDED
+#define NETMOD_PTL_TYPES_H_INCLUDED
+
+#include "mpidimpl.h"
+#include "portals4.h"
+
+/* Portals 4 Limits */
+/* Each of the three limits below evaluates to 65536. */
+#define MPIDI_PTL_EVENT_COUNT          (1024*64)
+#define MPIDI_PTL_UNEXPECTED_HDR_COUNT (1024*64)
+#define MPIDI_PTL_LIST_SIZE            (1024*64)
+
+/* Active Message Stuff */
+/* MPIDI_PTL_NUM_OVERFLOW_BUFFERS and MPIDI_PTL_MAX_AM_HANDLERS are the array lengths used
+ * for the overflow and handler arrays of MPIDI_PTL_global_t in this header.
+ * MPIDI_PTL_OVERFLOW_BUFFER_SZ is 1048576 and MPIDI_PTL_MAX_AM_EAGER_SZ is 65536
+ * (presumably byte counts -- confirm at the use sites). MPIDI_PTL_AM_TAG has only bit 28 set. */
+#define MPIDI_PTL_NUM_OVERFLOW_BUFFERS (8)
+#define MPIDI_PTL_OVERFLOW_BUFFER_SZ   (1024*1024)
+#define MPIDI_PTL_MAX_AM_EAGER_SZ      (64*1024)
+#define MPIDI_PTL_AM_TAG               (1 << 28)
+#define MPIDI_PTL_MAX_AM_HANDLERS      (64)
+
+/* Pairs a Portals process identifier (ptl_process_t) with a portal table index
+ * (ptl_pt_index_t). MPIDI_PTL_global_t keeps a pointer to a table of these. */
+typedef struct {
+    ptl_process_t process;
+    ptl_pt_index_t pt;
+} MPIDI_PTL_addr_t;
+
+/* State kept by the Portals 4 netmod in the single object MPIDI_PTL_global declared below.
+ * Notes marked "presumably" are inferred from names and types only and should be confirmed
+ * against the code that fills these fields in. */
+typedef struct {
+    /* presumably one address entry per peer process -- confirm */
+    MPIDI_PTL_addr_t *addr_table;
+    MPID_Node_id_t *node_map;
+    MPID_Node_id_t max_node_id;
+    char *kvsname;
+    char pname[MPI_MAX_PROCESSOR_NAME];
+    /* The two overflow arrays below each have MPIDI_PTL_NUM_OVERFLOW_BUFFERS slots. */
+    void *overflow_bufs[MPIDI_PTL_NUM_OVERFLOW_BUFFERS];
+    ptl_handle_me_t overflow_me_handles[MPIDI_PTL_NUM_OVERFLOW_BUFFERS];
+    /* The two handler tables below each have MPIDI_PTL_MAX_AM_HANDLERS slots
+     * (presumably indexed by active-message handler id -- confirm). */
+    MPIDI_NM_am_target_handler_fn am_handlers[MPIDI_PTL_MAX_AM_HANDLERS];
+    MPIDI_NM_am_origin_handler_fn send_cmpl_handlers[MPIDI_PTL_MAX_AM_HANDLERS];
+    /* Portals handles and limits: network interface, its limits, two event queues, a portal
+     * table index and a memory descriptor handle. */
+    ptl_handle_ni_t ni;
+    ptl_ni_limits_t ni_limits;
+    ptl_handle_eq_t eqs[2];
+    ptl_pt_index_t pt;
+    ptl_handle_md_t md;
+} MPIDI_PTL_global_t;
+
+/* Declaration only; the definition is not part of this header. */
+extern MPIDI_PTL_global_t MPIDI_PTL_global;
+
+/* Match-bit layout used by MPIDI_PTL_init_tag(): the tag occupies the low 32 bits
+ * (MPIDI_PTL_TAG_MASK) and the context id the high 32 bits (MPIDI_PTL_CTX_MASK).
+ * MPIDI_PTL_TAG_SHIFT is the left shift that moves the context id above the tag. */
+#define MPIDI_PTL_CONTEXT_ID_BITS 32
+#define MPIDI_PTL_TAG_BITS 32
+
+#define MPIDI_PTL_TAG_MASK      (0x00000000FFFFFFFFULL)
+#define MPIDI_PTL_CTX_MASK      (0xFFFFFFFF00000000ULL)
+#define MPIDI_PTL_TAG_SHIFT     (MPIDI_PTL_TAG_BITS)
+
+/* Packs a context id and a tag into one match-bits value: contextid is shifted left by
+ * MPIDI_PTL_TAG_SHIFT and the bits of tag selected by MPIDI_PTL_TAG_MASK fill the low part. */
+static inline ptl_match_bits_t MPIDI_PTL_init_tag(MPIR_Context_id_t contextid, int tag)
+{
+    ptl_match_bits_t ctx_part = contextid;
+    ptl_match_bits_t tag_part = (MPIDI_PTL_TAG_MASK & tag);
+
+    ctx_part <<= MPIDI_PTL_TAG_SHIFT;
+
+    return ctx_part | tag_part;
+}
+
+#define MPIDI_PTL_MSG_SZ_MASK   (0x00FFFFFFFFFFFFFFULL)
+
+/* Builds active-message header data: handler_id is shifted left by 56 bits and the bits of
+ * msg_sz selected by MPIDI_PTL_MSG_SZ_MASK (the low 56 bits) fill the remainder. */
+static inline ptl_hdr_data_t MPIDI_PTL_init_am_hdr(int handler_id, size_t msg_sz)
+{
+    ptl_hdr_data_t id_part = (ptl_hdr_data_t) handler_id << 56;
+    ptl_hdr_data_t sz_part = (MPIDI_PTL_MSG_SZ_MASK & msg_sz);
+
+    return id_part | sz_part;
+}
+
+#endif /* NETMOD_PTL_TYPES_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/ptl_unimpl.h b/src/mpid/ch4/netmod/portals4/ptl_unimpl.h
new file mode 100644
index 0000000..2b0cd20
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_unimpl.h
@@ -0,0 +1,19 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/mpid/ch4/netmod/portals4/ptl_win.h b/src/mpid/ch4/netmod/portals4/ptl_win.h
new file mode 100644
index 0000000..c325315
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/ptl_win.h
@@ -0,0 +1,160 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_PTL_WIN_H_INCLUDED
+#define NETMOD_PTL_WIN_H_INCLUDED
+
+#include "ptl_impl.h"
+
+/* Forwards win and info unchanged to MPIDI_CH4R_win_set_info() and returns its result. */
+static inline int MPIDI_NM_win_set_info(MPIR_Win * win, MPIR_Info * info)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_set_info(win, info);
+    return mpi_errno;
+}
+
+
+/* Forwards group, assert and win unchanged to MPIDI_CH4R_win_start() and returns its result. */
+static inline int MPIDI_NM_win_start(MPIR_Group * group, int assert, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_start(group, assert, win);
+    return mpi_errno;
+}
+
+
+/* Forwards win unchanged to MPIDI_CH4R_win_complete() and returns its result. */
+static inline int MPIDI_NM_win_complete(MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_complete(win);
+    return mpi_errno;
+}
+
+/* Forwards group, assert and win unchanged to MPIDI_CH4R_win_post() and returns its result. */
+static inline int MPIDI_NM_win_post(MPIR_Group * group, int assert, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_post(group, assert, win);
+    return mpi_errno;
+}
+
+
+/* Forwards win unchanged to MPIDI_CH4R_win_wait() and returns its result. */
+static inline int MPIDI_NM_win_wait(MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_wait(win);
+    return mpi_errno;
+}
+
+
+/* Forwards win and flag unchanged to MPIDI_CH4R_win_test() and returns its result. */
+static inline int MPIDI_NM_win_test(MPIR_Win * win, int *flag)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_test(win, flag);
+    return mpi_errno;
+}
+
+/* Forwards lock_type, rank, assert and win unchanged to MPIDI_CH4R_win_lock() and returns
+ * its result. */
+static inline int MPIDI_NM_win_lock(int lock_type, int rank, int assert, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_lock(lock_type, rank, assert, win);
+    return mpi_errno;
+}
+
+
+/* Forwards rank and win unchanged to MPIDI_CH4R_win_unlock() and returns its result. */
+static inline int MPIDI_NM_win_unlock(int rank, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_unlock(rank, win);
+    return mpi_errno;
+}
+
+/* Forwards win and info_p_p unchanged to MPIDI_CH4R_win_get_info() and returns its result. */
+static inline int MPIDI_NM_win_get_info(MPIR_Win * win, MPIR_Info ** info_p_p)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_get_info(win, info_p_p);
+    return mpi_errno;
+}
+
+
+/* Forwards win_ptr unchanged to MPIDI_CH4R_win_free() and returns its result. */
+static inline int MPIDI_NM_win_free(MPIR_Win ** win_ptr)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_free(win_ptr);
+    return mpi_errno;
+}
+
+/* Forwards assert and win unchanged to MPIDI_CH4R_win_fence() and returns its result. */
+static inline int MPIDI_NM_win_fence(int assert, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_fence(assert, win);
+    return mpi_errno;
+}
+
+/* Forwards all six arguments unchanged, in the same order, to MPIDI_CH4R_win_create() and
+ * returns its result. */
+static inline int MPIDI_NM_win_create(void *base, MPI_Aint length, int disp_unit,
+                                      MPIR_Info * info, MPIR_Comm * comm_ptr,
+                                      MPIR_Win ** win_ptr)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_create(base, length, disp_unit, info, comm_ptr, win_ptr);
+    return mpi_errno;
+}
+
+/* Forwards win, base and size unchanged to MPIDI_CH4R_win_attach() and returns its result. */
+static inline int MPIDI_NM_win_attach(MPIR_Win * win, void *base, MPI_Aint size)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_attach(win, base, size);
+    return mpi_errno;
+}
+
+/* Forwards all six arguments unchanged, in the same order, to
+ * MPIDI_CH4R_win_allocate_shared() and returns its result. */
+static inline int MPIDI_NM_win_allocate_shared(MPI_Aint size, int disp_unit,
+                                               MPIR_Info * info_ptr, MPIR_Comm * comm_ptr,
+                                               void **base_ptr, MPIR_Win ** win_ptr)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_allocate_shared(size, disp_unit, info_ptr, comm_ptr,
+                                               base_ptr, win_ptr);
+    return mpi_errno;
+}
+
+/* Forwards win and base unchanged to MPIDI_CH4R_win_detach() and returns its result. */
+static inline int MPIDI_NM_win_detach(MPIR_Win * win, const void *base)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_detach(win, base);
+    return mpi_errno;
+}
+
+/* Forwards all five arguments unchanged, in the same order, to
+ * MPIDI_CH4R_win_shared_query() and returns its result. */
+static inline int MPIDI_NM_win_shared_query(MPIR_Win * win, int rank, MPI_Aint * size,
+                                            int *disp_unit, void *baseptr)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_shared_query(win, rank, size, disp_unit, baseptr);
+    return mpi_errno;
+}
+
+/* Forwards all six arguments unchanged, in the same order, to MPIDI_CH4R_win_allocate() and
+ * returns its result. */
+static inline int MPIDI_NM_win_allocate(MPI_Aint size, int disp_unit, MPIR_Info * info,
+                                        MPIR_Comm * comm, void *baseptr, MPIR_Win ** win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_allocate(size, disp_unit, info, comm, baseptr, win);
+    return mpi_errno;
+}
+
+/* Forwards rank and win unchanged to MPIDI_CH4R_win_flush() and returns its result. */
+static inline int MPIDI_NM_win_flush(int rank, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_flush(rank, win);
+    return mpi_errno;
+}
+
+/* Forwards win unchanged to MPIDI_CH4R_win_flush_local_all() and returns its result. */
+static inline int MPIDI_NM_win_flush_local_all(MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_flush_local_all(win);
+    return mpi_errno;
+}
+
+/* Forwards win unchanged to MPIDI_CH4R_win_unlock_all() and returns its result. */
+static inline int MPIDI_NM_win_unlock_all(MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_unlock_all(win);
+    return mpi_errno;
+}
+
+/* Forwards info, comm and win unchanged to MPIDI_CH4R_win_create_dynamic() and returns its
+ * result. */
+static inline int MPIDI_NM_win_create_dynamic(MPIR_Info * info, MPIR_Comm * comm, MPIR_Win ** win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_create_dynamic(info, comm, win);
+    return mpi_errno;
+}
+
+/* Forwards rank and win unchanged to MPIDI_CH4R_win_flush_local() and returns its result. */
+static inline int MPIDI_NM_win_flush_local(int rank, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_flush_local(rank, win);
+    return mpi_errno;
+}
+
+/* Forwards win unchanged to MPIDI_CH4R_win_sync() and returns its result. */
+static inline int MPIDI_NM_win_sync(MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_sync(win);
+    return mpi_errno;
+}
+
+/* Forwards win unchanged to MPIDI_CH4R_win_flush_all() and returns its result. */
+static inline int MPIDI_NM_win_flush_all(MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_flush_all(win);
+    return mpi_errno;
+}
+
+/* Forwards assert and win unchanged to MPIDI_CH4R_win_lock_all() and returns its result. */
+static inline int MPIDI_NM_win_lock_all(int assert, MPIR_Win * win)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_CH4R_win_lock_all(assert, win);
+    return mpi_errno;
+}
+
+
+#endif /* NETMOD_PTL_WIN_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/portals4/subconfigure.m4 b/src/mpid/ch4/netmod/portals4/subconfigure.m4
new file mode 100644
index 0000000..21a60b1
--- /dev/null
+++ b/src/mpid/ch4/netmod/portals4/subconfigure.m4
@@ -0,0 +1,33 @@
+[#] start of __file__
+dnl MPICH_SUBCFG_AFTER=src/mpid/ch4
+
+AC_DEFUN([PAC_SUBCFG_PREREQ_]PAC_SUBCFG_AUTO_SUFFIX,[
+    dnl When CH4 is being built scan the requested netmods. For portals4 record that
+    dnl the netmod must be built and define HAVE_CH4_NETMOD_PORTALS4. Both actions
+    dnl live in one AS_CASE branch so that the netmod name is matched only once and
+    dnl no unquoted shell variable is handed to test.
+    AM_COND_IF([BUILD_CH4],[
+        for net in $ch4_netmods ; do
+            AS_CASE([$net],[portals4],[
+                build_ch4_netmod_portals4=yes
+                AC_DEFINE(HAVE_CH4_NETMOD_PORTALS4,1,[Portals4 netmod is built])
+                # if test "$build_ch4_locality_info" != "yes" ; then
+                #    AC_DEFINE(MPIDI_BUILD_CH4_LOCALITY_INFO, 1, [CH4 should build locality info])
+                #    build_ch4_locality_info="yes"
+                # fi
+            ])
+        done
+    ])
+    AM_CONDITIONAL([BUILD_CH4_NETMOD_PORTALS4],[test "X$build_ch4_netmod_portals4" = "Xyes"])
+])dnl
+
+AC_DEFUN([PAC_SUBCFG_BODY_]PAC_SUBCFG_AUTO_SUFFIX,[
+dnl Everything below is guarded by the BUILD_CH4_NETMOD_PORTALS4 conditional.
+AM_COND_IF([BUILD_CH4_NETMOD_PORTALS4],[
+    AC_MSG_NOTICE([RUNNING CONFIGURE FOR ch4:portals4])
+
+    dnl Check for portals4.h and for PtlInit in the portals library and append
+    dnl -lportals to WRAPPER_LIBS. The PAC macros are defined elsewhere.
+    dnl PAC_PUSH_FLAG and PAC_POP_FLAG presumably save and restore LIBS around the
+    dnl check and the FATAL variant presumably aborts configure on failure. Confirm
+    dnl both against the PAC macro definitions.
+    PAC_SET_HEADER_LIB_PATH(portals4)
+    PAC_PUSH_FLAG(LIBS)
+    PAC_CHECK_HEADER_LIB_FATAL(portals4, portals4.h, portals, PtlInit)
+    PAC_APPEND_FLAG([-lportals],[WRAPPER_LIBS])
+    PAC_POP_FLAG(LIBS)
+
+])dnl end AM_COND_IF(BUILD_CH4_NETMOD_PORTALS4,...)
+])dnl end _BODY
+
+[#] end of __file__
diff --git a/src/mpid/ch4/netmod/stubnm/Makefile.mk b/src/mpid/ch4/netmod/stubnm/Makefile.mk
new file mode 100644
index 0000000..244ba01
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/Makefile.mk
@@ -0,0 +1,6 @@
+# When the BUILD_CH4_NETMOD_STUBNM conditional is true, add the stub netmod's globals.c to
+# mpi_core_sources.
+if BUILD_CH4_NETMOD_STUBNM
+
+mpi_core_sources += src/mpid/ch4/netmod/stubnm/globals.c
+# NOTE(review): the disabled line below points at netmod/stub/, not netmod/stubnm/ --
+# confirm the path before enabling it.
+# errnames_txt_files += src/mpid/ch4/netmod/stub/errnames.txt
+
+endif
diff --git a/src/mpid/ch4/netmod/stubnm/globals.c b/src/mpid/ch4/netmod/stubnm/globals.c
new file mode 100644
index 0000000..b830d4b
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/globals.c
@@ -0,0 +1,157 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#ifndef NETMOD_DIRECT
+#define NETMOD_DISABLE_INLINES
+#include <mpidimpl.h>
+#include "netmod_direct.h"
+/* Function table of the stub netmod; compiled only when NETMOD_DIRECT is not defined (see
+ * the enclosing #ifndef). The initializers are positional, so their order has to match the
+ * member order of MPIDI_NM_funcs_t, which is declared elsewhere -- verify against that
+ * declaration when adding or reordering entries. */
+MPIDI_NM_funcs_t MPIDI_NM_stubnm_funcs = {
+    MPIDI_NM_init,
+    MPIDI_NM_finalize,
+    MPIDI_NM_progress,
+    MPIDI_NM_comm_connect,
+    MPIDI_NM_comm_disconnect,
+    MPIDI_NM_open_port,
+    MPIDI_NM_close_port,
+    MPIDI_NM_comm_accept,
+    MPIDI_NM_comm_get_lpid,
+    MPIDI_NM_gpid_get,
+    MPIDI_NM_get_node_id,
+    MPIDI_NM_get_max_node_id,
+    MPIDI_NM_getallincomm,
+    MPIDI_NM_gpid_tolpidarray,
+    MPIDI_NM_create_intercomm_from_lpids,
+    MPIDI_NM_comm_create,
+    MPIDI_NM_comm_destroy,
+    MPIDI_NM_am_request_init,
+    MPIDI_NM_am_request_finalize,
+    MPIDI_NM_reg_hdr_handler,
+    MPIDI_NM_send_am_hdr,
+    MPIDI_NM_inject_am_hdr,
+    MPIDI_NM_send_am,
+    MPIDI_NM_send_amv,
+    MPIDI_NM_send_amv_hdr,
+    MPIDI_NM_send_am_hdr_reply,
+    MPIDI_NM_inject_am_hdr_reply,
+    MPIDI_NM_send_am_reply,
+    MPIDI_NM_send_amv_reply,
+    MPIDI_NM_am_hdr_max_sz,
+    MPIDI_NM_am_inject_max_sz,
+    MPIDI_NM_am_recv,
+};
+
+/* Native function table of the stub netmod (point-to-point, RMA/window, collective,
+ * datatype and op entry points); compiled only when NETMOD_DIRECT is not defined. The
+ * initializers are positional, so their order has to match the member order of
+ * MPIDI_NM_native_funcs_t, which is declared elsewhere -- verify against that declaration
+ * when adding or reordering entries. */
+MPIDI_NM_native_funcs_t MPIDI_NM_native_stubnm_funcs = {
+    MPIDI_NM_send,
+    MPIDI_NM_ssend,
+    MPIDI_NM_startall,
+    MPIDI_NM_send_init,
+    MPIDI_NM_ssend_init,
+    MPIDI_NM_rsend_init,
+    MPIDI_NM_bsend_init,
+    MPIDI_NM_isend,
+    MPIDI_NM_issend,
+    MPIDI_NM_cancel_send,
+    MPIDI_NM_recv_init,
+    MPIDI_NM_recv,
+    MPIDI_NM_irecv,
+    MPIDI_NM_imrecv,
+    MPIDI_NM_cancel_recv,
+    MPIDI_NM_alloc_mem,
+    MPIDI_NM_free_mem,
+    MPIDI_NM_improbe,
+    MPIDI_NM_iprobe,
+    MPIDI_NM_win_set_info,
+    MPIDI_NM_win_shared_query,
+    MPIDI_NM_put,
+    MPIDI_NM_win_start,
+    MPIDI_NM_win_complete,
+    MPIDI_NM_win_post,
+    MPIDI_NM_win_wait,
+    MPIDI_NM_win_test,
+    MPIDI_NM_win_lock,
+    MPIDI_NM_win_unlock,
+    MPIDI_NM_win_get_info,
+    MPIDI_NM_get,
+    MPIDI_NM_win_free,
+    MPIDI_NM_win_fence,
+    MPIDI_NM_win_create,
+    MPIDI_NM_accumulate,
+    MPIDI_NM_win_attach,
+    MPIDI_NM_win_allocate_shared,
+    MPIDI_NM_rput,
+    MPIDI_NM_win_flush_local,
+    MPIDI_NM_win_detach,
+    MPIDI_NM_compare_and_swap,
+    MPIDI_NM_raccumulate,
+    MPIDI_NM_rget_accumulate,
+    MPIDI_NM_fetch_and_op,
+    MPIDI_NM_win_allocate,
+    MPIDI_NM_win_flush,
+    MPIDI_NM_win_flush_local_all,
+    MPIDI_NM_win_unlock_all,
+    MPIDI_NM_win_create_dynamic,
+    MPIDI_NM_rget,
+    MPIDI_NM_win_sync,
+    MPIDI_NM_win_flush_all,
+    MPIDI_NM_get_accumulate,
+    MPIDI_NM_win_lock_all,
+    MPIDI_NM_rank_is_local,
+    MPIDI_NM_barrier,
+    MPIDI_NM_bcast,
+    MPIDI_NM_allreduce,
+    MPIDI_NM_allgather,
+    MPIDI_NM_allgatherv,
+    MPIDI_NM_scatter,
+    MPIDI_NM_scatterv,
+    MPIDI_NM_gather,
+    MPIDI_NM_gatherv,
+    MPIDI_NM_alltoall,
+    MPIDI_NM_alltoallv,
+    MPIDI_NM_alltoallw,
+    MPIDI_NM_reduce,
+    MPIDI_NM_reduce_scatter,
+    MPIDI_NM_reduce_scatter_block,
+    MPIDI_NM_scan,
+    MPIDI_NM_exscan,
+    MPIDI_NM_neighbor_allgather,
+    MPIDI_NM_neighbor_allgatherv,
+    MPIDI_NM_neighbor_alltoall,
+    MPIDI_NM_neighbor_alltoallv,
+    MPIDI_NM_neighbor_alltoallw,
+    MPIDI_NM_ineighbor_allgather,
+    MPIDI_NM_ineighbor_allgatherv,
+    MPIDI_NM_ineighbor_alltoall,
+    MPIDI_NM_ineighbor_alltoallv,
+    MPIDI_NM_ineighbor_alltoallw,
+    MPIDI_NM_ibarrier,
+    MPIDI_NM_ibcast,
+    MPIDI_NM_iallgather,
+    MPIDI_NM_iallgatherv,
+    MPIDI_NM_iallreduce,
+    MPIDI_NM_ialltoall,
+    MPIDI_NM_ialltoallv,
+    MPIDI_NM_ialltoallw,
+    MPIDI_NM_iexscan,
+    MPIDI_NM_igather,
+    MPIDI_NM_igatherv,
+    MPIDI_NM_ireduce_scatter_block,
+    MPIDI_NM_ireduce_scatter,
+    MPIDI_NM_ireduce,
+    MPIDI_NM_iscan,
+    MPIDI_NM_iscatter,
+    MPIDI_NM_iscatterv,
+    MPIDI_NM_datatype_commit,
+    MPIDI_NM_datatype_dup,
+    MPIDI_NM_datatype_destroy,
+    MPIDI_NM_op_commit,
+    MPIDI_NM_op_destroy,
+};
+#endif
diff --git a/src/mpid/ch4/netmod/stubnm/netmod_direct.h b/src/mpid/ch4/netmod/stubnm/netmod_direct.h
new file mode 100644
index 0000000..f2873ad
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/netmod_direct.h
@@ -0,0 +1,29 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_DIRECT_H_INCLUDED
+#define NETMOD_DIRECT_H_INCLUDED
+#include "stubnm_init.h"
+#include "stubnm_probe.h"
+#include "stubnm_progress.h"
+#include "stubnm_recv.h"
+#include "stubnm_request.h"
+#include "stubnm_send.h"
+#include "stubnm_win.h"
+#include "stubnm_rma.h"
+#include "stubnm_am.h"
+#include "stubnm_spawn.h"
+#include "stubnm_comm.h"
+#include "stubnm_unimpl.h"
+#include "stubnm_proc.h"
+#include "stubnm_coll.h"
+#include "stubnm_datatype.h"
+#include "stubnm_op.h"
+#endif /* NETMOD_DIRECT_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_am.h b/src/mpid/ch4/netmod/stubnm/stubnm_am.h
new file mode 100644
index 0000000..2d7d9f2
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_am.h
@@ -0,0 +1,138 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_STUBNM_AM_H_INCLUDED
+#define NETMOD_STUBNM_AM_H_INCLUDED
+
+#include "stubnm_impl.h"
+
+/* Stub netmod placeholder: ignores its arguments, evaluates MPIR_Assert(0), and then returns
+ * MPI_SUCCESS. */
+static inline int MPIDI_NM_reg_hdr_handler(int handler_id,
+                                           MPIDI_NM_am_origin_handler_fn origin_handler_fn,
+                                           MPIDI_NM_am_target_handler_fn target_handler_fn)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Stub netmod placeholder: ignores its arguments, evaluates MPIR_Assert(0), and then returns
+ * MPI_SUCCESS. */
+static inline int MPIDI_NM_send_am_hdr(int rank, MPIR_Comm * comm, int handler_id,
+                                       const void *am_hdr, size_t am_hdr_sz,
+                                       MPIR_Request * sreq, void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Stub netmod placeholder: ignores its arguments, evaluates MPIR_Assert(0), and then returns
+ * MPI_SUCCESS. */
+static inline int MPIDI_NM_send_am(int rank, MPIR_Comm * comm, int handler_id,
+                                   const void *am_hdr, size_t am_hdr_sz,
+                                   const void *data, MPI_Count count, MPI_Datatype datatype,
+                                   MPIR_Request * sreq, void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Stub netmod placeholder: ignores its arguments, evaluates MPIR_Assert(0), and then returns
+ * MPI_SUCCESS. */
+static inline int MPIDI_NM_send_amv(int rank, MPIR_Comm * comm, int handler_id,
+                                    struct iovec *am_hdr, size_t iov_len,
+                                    const void *data, MPI_Count count, MPI_Datatype datatype,
+                                    MPIR_Request * sreq, void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Stub netmod placeholder: ignores its arguments, evaluates MPIR_Assert(0), and then returns
+ * MPI_SUCCESS. */
+static inline int MPIDI_NM_send_amv_hdr(int rank, MPIR_Comm * comm, int handler_id,
+                                        struct iovec *am_hdr, size_t iov_len,
+                                        MPIR_Request * sreq, void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Stub netmod placeholder: ignores its arguments, evaluates MPIR_Assert(0), and then returns
+ * MPI_SUCCESS. */
+static inline int MPIDI_NM_send_am_hdr_reply(MPIR_Context_id_t context_id,
+                                             int src_rank, int handler_id,
+                                             const void *am_hdr, size_t am_hdr_sz,
+                                             MPIR_Request * sreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Stub netmod placeholder: ignores its arguments, evaluates MPIR_Assert(0), and then returns
+ * MPI_SUCCESS. */
+static inline int MPIDI_NM_send_am_reply(MPIR_Context_id_t context_id,
+                                         int src_rank, int handler_id,
+                                         const void *am_hdr, size_t am_hdr_sz,
+                                         const void *data, MPI_Count count,
+                                         MPI_Datatype datatype, MPIR_Request * sreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Stub netmod placeholder: ignores its arguments, evaluates MPIR_Assert(0), and then returns
+ * MPI_SUCCESS. */
+static inline int MPIDI_NM_send_amv_reply(MPIR_Context_id_t context_id,
+                                          int src_rank, int handler_id,
+                                          struct iovec *am_hdr, size_t iov_len,
+                                          const void *data, MPI_Count count,
+                                          MPI_Datatype datatype, MPIR_Request * sreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Stub netmod placeholder: evaluates MPIR_Assert(0) and then reports a size of 0. */
+static inline size_t MPIDI_NM_am_hdr_max_sz(void)
+{
+    const size_t max_sz = 0;
+
+    MPIR_Assert(0);
+
+    return max_sz;
+}
+
+/* Stub netmod placeholder: ignores its arguments, evaluates MPIR_Assert(0), and then returns
+ * MPI_SUCCESS. */
+static inline int MPIDI_NM_inject_am_hdr(int rank, MPIR_Comm * comm, int handler_id,
+                                         const void *am_hdr, size_t am_hdr_sz,
+                                         void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Stub netmod placeholder: ignores its arguments, evaluates MPIR_Assert(0), and then returns
+ * MPI_SUCCESS. */
+static inline int MPIDI_NM_inject_am_hdr_reply(MPIR_Context_id_t context_id,
+                                               int src_rank, int handler_id,
+                                               const void *am_hdr, size_t am_hdr_sz)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Stub netmod placeholder: evaluates MPIR_Assert(0) and then reports a size of 0. */
+static inline size_t MPIDI_NM_am_inject_max_sz(void)
+{
+    const size_t max_sz = 0;
+
+    MPIR_Assert(0);
+
+    return max_sz;
+}
+
+/* Stub netmod placeholder: ignores req, evaluates MPIR_Assert(0), and then reports success.
+ * The result is spelled MPI_SUCCESS, as in the other int-returning stubs of this header,
+ * rather than as a bare 0. */
+static inline int MPIDI_NM_am_recv(MPIR_Request * req)
+{
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+#endif /* NETMOD_STUBNM_AM_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_coll.h b/src/mpid/ch4/netmod/stubnm/stubnm_coll.h
new file mode 100644
index 0000000..cf76ab6
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_coll.h
@@ -0,0 +1,871 @@
+
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_STUBNM_COLL_H_INCLUDED
+#define NETMOD_STUBNM_COLL_H_INCLUDED
+
+#include "stubnm_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_barrier
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub netmod barrier: calls MPIR_Barrier() with the same arguments between the
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT macros and returns its error code. */
+static inline int MPIDI_NM_barrier(MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_BARRIER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_BARRIER);
+    mpi_errno = MPIR_Barrier(comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_BARRIER);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_bcast
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub netmod broadcast: calls MPIR_Bcast() with the same arguments between the
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT macros and returns its error code. */
+static inline int MPIDI_NM_bcast(void *buffer, int count, MPI_Datatype datatype, int root,
+                                 MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_BCAST);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_BCAST);
+    mpi_errno = MPIR_Bcast(buffer, count, datatype, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_BCAST);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_allreduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub netmod allreduce: calls MPIR_Allreduce() with the same arguments between the
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT macros and returns its error code. */
+static inline int MPIDI_NM_allreduce(const void *sendbuf, void *recvbuf, int count,
+                                     MPI_Datatype datatype, MPI_Op op,
+                                     MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLREDUCE);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLREDUCE);
+    mpi_errno = MPIR_Allreduce(sendbuf, recvbuf, count, datatype, op, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLREDUCE);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub netmod allgather: calls MPIR_Allgather() with the same arguments between the
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT macros and returns its error code. */
+static inline int MPIDI_NM_allgather(const void *sendbuf, int sendcount,
+                                     MPI_Datatype sendtype, void *recvbuf,
+                                     int recvcount, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLGATHER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLGATHER);
+    mpi_errno = MPIR_Allgather(sendbuf, sendcount, sendtype,
+                               recvbuf, recvcount, recvtype, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLGATHER);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub netmod allgatherv: calls MPIR_Allgatherv() with the same arguments between the
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT macros and returns its error code. */
+static inline int MPIDI_NM_allgatherv(const void *sendbuf, int sendcount,
+                                      MPI_Datatype sendtype, void *recvbuf,
+                                      const int *recvcounts, const int *displs,
+                                      MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                      MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLGATHERV);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLGATHERV);
+    mpi_errno = MPIR_Allgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts,
+                                displs, recvtype, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLGATHERV);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_gather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub netmod gather: calls MPIR_Gather() with the same arguments between the
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT macros and returns its error code. */
+static inline int MPIDI_NM_gather(const void *sendbuf, int sendcount,
+                                  MPI_Datatype sendtype, void *recvbuf,
+                                  int recvcount, MPI_Datatype recvtype, int root,
+                                  MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_GATHER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_GATHER);
+    mpi_errno = MPIR_Gather(sendbuf, sendcount, sendtype,
+                            recvbuf, recvcount, recvtype, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_GATHER);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_gatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub netmod gatherv: calls MPIR_Gatherv() with the same arguments between the
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT macros and returns its error code. */
+static inline int MPIDI_NM_gatherv(const void *sendbuf, int sendcount,
+                                   MPI_Datatype sendtype, void *recvbuf,
+                                   const int *recvcounts, const int *displs,
+                                   MPI_Datatype recvtype, int root,
+                                   MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_GATHERV);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_GATHERV);
+    mpi_errno = MPIR_Gatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts,
+                             displs, recvtype, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_GATHERV);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub netmod scatter: calls MPIR_Scatter() with the same arguments between the
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT macros and returns its error code. */
+static inline int MPIDI_NM_scatter(const void *sendbuf, int sendcount,
+                                   MPI_Datatype sendtype, void *recvbuf,
+                                   int recvcount, MPI_Datatype recvtype, int root,
+                                   MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_SCATTER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_SCATTER);
+    mpi_errno = MPIR_Scatter(sendbuf, sendcount, sendtype, recvbuf, recvcount,
+                             recvtype, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_SCATTER);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_scatterv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub netmod scatterv: calls MPIR_Scatterv() with the same arguments between the
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT macros and returns its error code. */
+static inline int MPIDI_NM_scatterv(const void *sendbuf, const int *sendcounts,
+                                    const int *displs, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount,
+                                    MPI_Datatype recvtype, int root,
+                                    MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_SCATTERV);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_SCATTERV);
+    mpi_errno = MPIR_Scatterv(sendbuf, sendcounts, displs, sendtype, recvbuf,
+                              recvcount, recvtype, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_SCATTERV);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub netmod alltoall: calls MPIR_Alltoall() with the same arguments between the
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT macros and returns its error code. */
+static inline int MPIDI_NM_alltoall(const void *sendbuf, int sendcount,
+                                    MPI_Datatype sendtype, void *recvbuf,
+                                    int recvcount, MPI_Datatype recvtype,
+                                    MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLTOALL);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLTOALL);
+    mpi_errno = MPIR_Alltoall(sendbuf, sendcount, sendtype,
+                              recvbuf, recvcount, recvtype, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLTOALL);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub netmod alltoallv: calls MPIR_Alltoallv() with the same arguments between the
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT macros and returns its error code. */
+static inline int MPIDI_NM_alltoallv(const void *sendbuf, const int *sendcounts,
+                                     const int *sdispls, MPI_Datatype sendtype,
+                                     void *recvbuf, const int *recvcounts,
+                                     const int *rdispls, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLTOALLV);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLTOALLV);
+    mpi_errno = MPIR_Alltoallv(sendbuf, sendcounts, sdispls, sendtype,
+                               recvbuf, recvcounts, rdispls, recvtype, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLTOALLV);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub netmod alltoallw: calls MPIR_Alltoallw() with the same arguments between the
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT macros and returns its error code. */
+static inline int MPIDI_NM_alltoallw(const void *sendbuf, const int sendcounts[],
+                                     const int sdispls[], const MPI_Datatype sendtypes[],
+                                     void *recvbuf, const int recvcounts[],
+                                     const int rdispls[], const MPI_Datatype recvtypes[],
+                                     MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLTOALLW);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLTOALLW);
+    mpi_errno = MPIR_Alltoallw(sendbuf, sendcounts, sdispls, sendtypes,
+                               recvbuf, recvcounts, rdispls, recvtypes, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLTOALLW);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_reduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub netmod reduce: calls MPIR_Reduce() with the same arguments between the
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT macros and returns its error code. */
+static inline int MPIDI_NM_reduce(const void *sendbuf, void *recvbuf, int count,
+                                  MPI_Datatype datatype, MPI_Op op, int root,
+                                  MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_REDUCE);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_REDUCE);
+    mpi_errno = MPIR_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_REDUCE);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_reduce_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub netmod reduce_scatter: calls MPIR_Reduce_scatter() with the same arguments between
+ * the MPIR_FUNC_VERBOSE_ENTER/EXIT macros and returns its error code. */
+static inline int MPIDI_NM_reduce_scatter(const void *sendbuf, void *recvbuf,
+                                          const int recvcounts[], MPI_Datatype datatype,
+                                          MPI_Op op, MPIR_Comm * comm_ptr,
+                                          MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_REDUCE_SCATTER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_REDUCE_SCATTER);
+    mpi_errno = MPIR_Reduce_scatter(sendbuf, recvbuf, recvcounts,
+                                    datatype, op, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_REDUCE_SCATTER);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_reduce_scatter_block
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub netmod reduce_scatter_block: calls MPIR_Reduce_scatter_block() with the same
+ * arguments between the MPIR_FUNC_VERBOSE_ENTER/EXIT macros and returns its error code. */
+static inline int MPIDI_NM_reduce_scatter_block(const void *sendbuf, void *recvbuf,
+                                                int recvcount, MPI_Datatype datatype,
+                                                MPI_Op op, MPIR_Comm * comm_ptr,
+                                                MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_REDUCE_SCATTER_BLOCK);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_REDUCE_SCATTER_BLOCK);
+    mpi_errno = MPIR_Reduce_scatter_block(sendbuf, recvbuf, recvcount, datatype, op,
+                                          comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_REDUCE_SCATTER_BLOCK);
+
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_scan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Scan: arguments are forwarded unchanged and the
+ * callee's return code is returned as-is.  The call is bracketed by
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT on MPID_STATE_NM_SCAN. */
+static inline int MPIDI_NM_scan(const void *sendbuf, void *recvbuf, int count,
+                                MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_SCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_SCAN);
+
+    mpi_errno = MPIR_Scan(sendbuf, recvbuf, count, datatype, op, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_SCAN);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_exscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Exscan: arguments are forwarded unchanged and the
+ * callee's return code is returned as-is.  The call is bracketed by
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT on MPID_STATE_NM_EXSCAN. */
+static inline int MPIDI_NM_exscan(const void *sendbuf, void *recvbuf, int count,
+                                  MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                  MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_EXSCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_EXSCAN);
+
+    mpi_errno = MPIR_Exscan(sendbuf, recvbuf, count, datatype, op, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_EXSCAN);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Neighbor_allgather_impl: arguments are forwarded
+ * unchanged and the callee's return code is returned as-is.  The call is
+ * bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_NEIGHBOR_ALLGATHER. */
+static inline int MPIDI_NM_neighbor_allgather(const void *sendbuf, int sendcount,
+                                              MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                              MPI_Datatype recvtype, MPIR_Comm * comm_ptr)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLGATHER);
+
+    mpi_errno =
+        MPIR_Neighbor_allgather_impl(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
+                                     comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLGATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Neighbor_allgatherv_impl: arguments are forwarded
+ * unchanged and the callee's return code is returned as-is.  The call is
+ * bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_NEIGHBOR_ALLGATHERV. */
+static inline int MPIDI_NM_neighbor_allgatherv(const void *sendbuf, int sendcount,
+                                               MPI_Datatype sendtype, void *recvbuf,
+                                               const int recvcounts[], const int displs[],
+                                               MPI_Datatype recvtype, MPIR_Comm * comm_ptr)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLGATHERV);
+
+    mpi_errno = MPIR_Neighbor_allgatherv_impl(sendbuf, sendcount, sendtype,
+                                              recvbuf, recvcounts, displs, recvtype, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLGATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Neighbor_alltoall_impl: arguments are forwarded
+ * unchanged and the callee's return code is returned as-is.  The call is
+ * bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_NEIGHBOR_ALLTOALL. */
+static inline int MPIDI_NM_neighbor_alltoall(const void *sendbuf, int sendcount,
+                                             MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                             MPI_Datatype recvtype, MPIR_Comm * comm_ptr)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLTOALL);
+
+    mpi_errno = MPIR_Neighbor_alltoall_impl(sendbuf, sendcount, sendtype,
+                                            recvbuf, recvcount, recvtype, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLTOALL);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Neighbor_alltoallv_impl: arguments are forwarded
+ * unchanged and the callee's return code is returned as-is.  The call is
+ * bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_NEIGHBOR_ALLTOALLV. */
+static inline int MPIDI_NM_neighbor_alltoallv(const void *sendbuf, const int sendcounts[],
+                                              const int sdispls[], MPI_Datatype sendtype,
+                                              void *recvbuf, const int recvcounts[],
+                                              const int rdispls[], MPI_Datatype recvtype,
+                                              MPIR_Comm * comm_ptr)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLTOALLV);
+
+    mpi_errno = MPIR_Neighbor_alltoallv_impl(sendbuf, sendcounts, sdispls, sendtype,
+                                             recvbuf, recvcounts, rdispls, recvtype, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLTOALLV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Neighbor_alltoallw_impl: arguments are forwarded
+ * unchanged (note the displacement arrays are MPI_Aint here) and the
+ * callee's return code is returned as-is.  The call is bracketed by
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT on MPID_STATE_NM_NEIGHBOR_ALLTOALLW. */
+static inline int MPIDI_NM_neighbor_alltoallw(const void *sendbuf, const int sendcounts[],
+                                              const MPI_Aint sdispls[],
+                                              const MPI_Datatype sendtypes[], void *recvbuf,
+                                              const int recvcounts[], const MPI_Aint rdispls[],
+                                              const MPI_Datatype recvtypes[], MPIR_Comm * comm_ptr)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLTOALLW);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLTOALLW);
+
+    mpi_errno = MPIR_Neighbor_alltoallw_impl(sendbuf, sendcounts, sdispls, sendtypes,
+                                             recvbuf, recvcounts, rdispls, recvtypes, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLTOALLW);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Ineighbor_allgather_impl: arguments, including the
+ * req output pointer, are forwarded unchanged and the callee's return code
+ * is returned as-is.  The call is bracketed by
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT on MPID_STATE_NM_INEIGHBOR_ALLGATHER. */
+static inline int MPIDI_NM_ineighbor_allgather(const void *sendbuf, int sendcount,
+                                               MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                               MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                               MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLGATHER);
+
+    mpi_errno = MPIR_Ineighbor_allgather_impl(sendbuf, sendcount, sendtype,
+                                              recvbuf, recvcount, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLGATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Ineighbor_allgatherv_impl: arguments, including the
+ * req output pointer, are forwarded unchanged and the callee's return code
+ * is returned as-is.  The call is bracketed by
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT on MPID_STATE_NM_INEIGHBOR_ALLGATHERV. */
+static inline int MPIDI_NM_ineighbor_allgatherv(const void *sendbuf, int sendcount,
+                                                MPI_Datatype sendtype, void *recvbuf,
+                                                const int recvcounts[], const int displs[],
+                                                MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                                MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLGATHERV);
+
+    mpi_errno = MPIR_Ineighbor_allgatherv_impl(sendbuf, sendcount, sendtype,
+                                               recvbuf, recvcounts, displs, recvtype,
+                                               comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLGATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Ineighbor_alltoall_impl: arguments, including the
+ * req output pointer, are forwarded unchanged and the callee's return code
+ * is returned as-is.  The call is bracketed by
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT on MPID_STATE_NM_INEIGHBOR_ALLTOALL. */
+static inline int MPIDI_NM_ineighbor_alltoall(const void *sendbuf, int sendcount,
+                                              MPI_Datatype sendtype, void *recvbuf,
+                                              int recvcount, MPI_Datatype recvtype,
+                                              MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLTOALL);
+
+    mpi_errno = MPIR_Ineighbor_alltoall_impl(sendbuf, sendcount, sendtype,
+                                             recvbuf, recvcount, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLTOALL);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Ineighbor_alltoallv_impl: arguments, including the
+ * req output pointer, are forwarded unchanged and the callee's return code
+ * is returned as-is.  The call is bracketed by
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT on MPID_STATE_NM_INEIGHBOR_ALLTOALLV. */
+static inline int MPIDI_NM_ineighbor_alltoallv(const void *sendbuf, const int sendcounts[],
+                                               const int sdispls[], MPI_Datatype sendtype,
+                                               void *recvbuf, const int recvcounts[],
+                                               const int rdispls[], MPI_Datatype recvtype,
+                                               MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLTOALLV);
+
+    mpi_errno = MPIR_Ineighbor_alltoallv_impl(sendbuf, sendcounts, sdispls, sendtype,
+                                              recvbuf, recvcounts, rdispls, recvtype,
+                                              comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLTOALLV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Ineighbor_alltoallw_impl: arguments, including the
+ * MPI_Aint displacement arrays and the req output pointer, are forwarded
+ * unchanged and the callee's return code is returned as-is.  The call is
+ * bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_INEIGHBOR_ALLTOALLW. */
+static inline int MPIDI_NM_ineighbor_alltoallw(const void *sendbuf, const int sendcounts[],
+                                               const MPI_Aint sdispls[],
+                                               const MPI_Datatype sendtypes[], void *recvbuf,
+                                               const int recvcounts[], const MPI_Aint rdispls[],
+                                               const MPI_Datatype recvtypes[], MPIR_Comm * comm_ptr,
+                                               MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLTOALLW);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLTOALLW);
+
+    mpi_errno = MPIR_Ineighbor_alltoallw_impl(sendbuf, sendcounts, sdispls, sendtypes,
+                                              recvbuf, recvcounts, rdispls, recvtypes,
+                                              comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLTOALLW);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ibarrier
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Ibarrier_impl: comm_ptr and req are forwarded
+ * unchanged and the callee's return code is returned as-is.  The call is
+ * bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on MPID_STATE_NM_IBARRIER. */
+static inline int MPIDI_NM_ibarrier(MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IBARRIER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IBARRIER);
+
+    mpi_errno = MPIR_Ibarrier_impl(comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IBARRIER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ibcast
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Ibcast_impl: arguments, including the req output
+ * pointer, are forwarded unchanged and the callee's return code is returned
+ * as-is.  The call is bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_IBCAST. */
+static inline int MPIDI_NM_ibcast(void *buffer, int count, MPI_Datatype datatype,
+                                  int root, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IBCAST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IBCAST);
+
+    mpi_errno = MPIR_Ibcast_impl(buffer, count, datatype, root, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IBCAST);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iallgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Iallgather_impl: arguments, including the req output
+ * pointer, are forwarded unchanged and the callee's return code is returned
+ * as-is.  The call is bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_IALLGATHER. */
+static inline int MPIDI_NM_iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                      void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLGATHER);
+
+    mpi_errno = MPIR_Iallgather_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                     recvcount, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLGATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iallreduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Iallreduce_impl: arguments, including the req output
+ * pointer, are forwarded unchanged and the callee's return code is returned
+ * as-is.
+ *
+ * The call is bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on this function's
+ * own state, MPID_STATE_NM_IALLREDUCE.  It must not share
+ * MPID_STATE_NM_IREDUCE with MPIDI_NM_ireduce, otherwise the enter/exit
+ * events of the two functions are recorded under the same state. */
+static inline int MPIDI_NM_iallreduce(const void *sendbuf, void *recvbuf, int count,
+                                      MPI_Datatype datatype, MPI_Op op,
+                                      MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLREDUCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLREDUCE);
+
+    mpi_errno = MPIR_Iallreduce_impl(sendbuf, recvbuf, count, datatype, op, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLREDUCE);
+    return mpi_errno;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iallgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Iallgatherv_impl: arguments, including the req
+ * output pointer, are forwarded unchanged and the callee's return code is
+ * returned as-is.  The call is bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_IALLGATHERV. */
+static inline int MPIDI_NM_iallgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                       void *recvbuf, const int *recvcounts, const int *displs,
+                                       MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                       MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLGATHERV);
+
+    mpi_errno = MPIR_Iallgatherv_impl(sendbuf, sendcount, sendtype,
+                                      recvbuf, recvcounts, displs, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLGATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ialltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Ialltoall_impl: arguments, including the req output
+ * pointer, are forwarded unchanged and the callee's return code is returned
+ * as-is.  The call is bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_IALLTOALL. */
+static inline int MPIDI_NM_ialltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLTOALL);
+
+    mpi_errno = MPIR_Ialltoall_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                    recvcount, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLTOALL);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ialltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Ialltoallv_impl: arguments, including the req output
+ * pointer, are forwarded unchanged and the callee's return code is returned
+ * as-is.  The call is bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_IALLTOALLV. */
+static inline int MPIDI_NM_ialltoallv(const void *sendbuf, const int *sendcounts,
+                                      const int *sdispls, MPI_Datatype sendtype,
+                                      void *recvbuf, const int *recvcounts,
+                                      const int *rdispls, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLTOALLV);
+
+    mpi_errno = MPIR_Ialltoallv_impl(sendbuf, sendcounts, sdispls,
+                                     sendtype, recvbuf, recvcounts,
+                                     rdispls, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLTOALLV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ialltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Ialltoallw_impl: arguments, including the per-peer
+ * datatype arrays and the req output pointer, are forwarded unchanged and
+ * the callee's return code is returned as-is.  The call is bracketed by
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT on MPID_STATE_NM_IALLTOALLW. */
+static inline int MPIDI_NM_ialltoallw(const void *sendbuf, const int *sendcounts,
+                                      const int *sdispls, const MPI_Datatype sendtypes[],
+                                      void *recvbuf, const int *recvcounts,
+                                      const int *rdispls, const MPI_Datatype recvtypes[],
+                                      MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLTOALLW);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLTOALLW);
+
+    mpi_errno = MPIR_Ialltoallw_impl(sendbuf, sendcounts, sdispls,
+                                     sendtypes, recvbuf, recvcounts,
+                                     rdispls, recvtypes, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLTOALLW);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iexscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Iexscan_impl: arguments, including the req output
+ * pointer, are forwarded unchanged and the callee's return code is returned
+ * as-is.  The call is bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_IEXSCAN. */
+static inline int MPIDI_NM_iexscan(const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                   MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IEXSCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IEXSCAN);
+
+    mpi_errno = MPIR_Iexscan_impl(sendbuf, recvbuf, count, datatype, op, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IEXSCAN);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_igather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Igather_impl: arguments, including the req output
+ * pointer, are forwarded unchanged and the callee's return code is returned
+ * as-is.  The call is bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_IGATHER. */
+static inline int MPIDI_NM_igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                   int root, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IGATHER);
+
+    mpi_errno = MPIR_Igather_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                  recvcount, recvtype, root, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IGATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_igatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Igatherv_impl: arguments, including the req output
+ * pointer, are forwarded unchanged and the callee's return code is returned
+ * as-is.  The call is bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_IGATHERV. */
+static inline int MPIDI_NM_igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, const int *recvcounts, const int *displs,
+                                    MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr,
+                                    MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IGATHERV);
+
+    mpi_errno = MPIR_Igatherv_impl(sendbuf, sendcount, sendtype,
+                                   recvbuf, recvcounts, displs, recvtype, root, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IGATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ireduce_scatter_block
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Ireduce_scatter_block_impl: arguments, including the
+ * req output pointer, are forwarded unchanged and the callee's return code
+ * is returned as-is.  The call is bracketed by
+ * MPIR_FUNC_VERBOSE_ENTER/EXIT on MPID_STATE_NM_IREDUCE_SCATTER_BLOCK. */
+static inline int MPIDI_NM_ireduce_scatter_block(const void *sendbuf, void *recvbuf,
+                                                 int recvcount, MPI_Datatype datatype,
+                                                 MPI_Op op, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IREDUCE_SCATTER_BLOCK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IREDUCE_SCATTER_BLOCK);
+
+    mpi_errno = MPIR_Ireduce_scatter_block_impl(sendbuf, recvbuf, recvcount,
+                                                datatype, op, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IREDUCE_SCATTER_BLOCK);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ireduce_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Ireduce_scatter_impl: arguments, including the req
+ * output pointer, are forwarded unchanged and the callee's return code is
+ * returned as-is.  The call is bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_IREDUCE_SCATTER. */
+static inline int MPIDI_NM_ireduce_scatter(const void *sendbuf, void *recvbuf,
+                                           const int recvcounts[], MPI_Datatype datatype,
+                                           MPI_Op op, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IREDUCE_SCATTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IREDUCE_SCATTER);
+
+    mpi_errno = MPIR_Ireduce_scatter_impl(sendbuf, recvbuf, recvcounts, datatype, op,
+                                          comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IREDUCE_SCATTER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ireduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Ireduce_impl: arguments, including the req output
+ * pointer, are forwarded unchanged and the callee's return code is returned
+ * as-is.  The call is bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_IREDUCE. */
+static inline int MPIDI_NM_ireduce(const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, int root,
+                                   MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IREDUCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IREDUCE);
+
+    mpi_errno = MPIR_Ireduce_impl(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IREDUCE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Iscan_impl: arguments, including the req output
+ * pointer, are forwarded unchanged and the callee's return code is returned
+ * as-is.  The call is bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_ISCAN. */
+static inline int MPIDI_NM_iscan(const void *sendbuf, void *recvbuf, int count,
+                                 MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                 MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ISCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ISCAN);
+
+    mpi_errno = MPIR_Iscan_impl(sendbuf, recvbuf, count, datatype, op, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ISCAN);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iscatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Iscatter_impl: arguments, including the request
+ * output pointer, are forwarded unchanged and the callee's return code is
+ * returned as-is.  The call is bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_ISCATTER. */
+static inline int MPIDI_NM_iscatter(const void *sendbuf, int sendcount,
+                                    MPI_Datatype sendtype, void *recvbuf,
+                                    int recvcount, MPI_Datatype recvtype,
+                                    int root, MPIR_Comm * comm, MPI_Request * request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ISCATTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ISCATTER);
+
+    mpi_errno = MPIR_Iscatter_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                   recvcount, recvtype, root, comm, request);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ISCATTER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iscatterv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Pass-through to MPIR_Iscatterv_impl: arguments, including the request
+ * output pointer, are forwarded unchanged and the callee's return code is
+ * returned as-is.  The call is bracketed by MPIR_FUNC_VERBOSE_ENTER/EXIT on
+ * MPID_STATE_NM_ISCATTERV. */
+static inline int MPIDI_NM_iscatterv(const void *sendbuf, const int *sendcounts,
+                                     const int *displs, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount,
+                                     MPI_Datatype recvtype, int root,
+                                     MPIR_Comm * comm, MPI_Request * request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ISCATTERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ISCATTERV);
+
+    mpi_errno = MPIR_Iscatterv_impl(sendbuf, sendcounts, displs, sendtype,
+                                    recvbuf, recvcount, recvtype, root, comm, request);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ISCATTERV);
+    return mpi_errno;
+}
+
+#endif
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_comm.h b/src/mpid/ch4/netmod/stubnm/stubnm_comm.h
new file mode 100644
index 0000000..3188e94
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_comm.h
@@ -0,0 +1,39 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_STUBNM_COMM_H_INCLUDED
+#define NETMOD_STUBNM_COMM_H_INCLUDED
+
+#include "stubnm_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_comm_create
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub: trips MPIR_Assert(0) and, if execution continues past the
+ * assertion, reports MPI_SUCCESS.  The comm argument is not used. */
+static inline int MPIDI_NM_comm_create(MPIR_Comm * comm)
+{
+    MPIR_Assert(0);
+
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_comm_destroy
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Stub: trips MPIR_Assert(0) and, if execution continues past the
+ * assertion, reports MPI_SUCCESS.  The comm argument is not used. */
+static inline int MPIDI_NM_comm_destroy(MPIR_Comm * comm)
+{
+    MPIR_Assert(0);
+
+    return MPI_SUCCESS;
+}
+
+
+#endif /* NETMOD_STUBNM_COMM_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_datatype.h b/src/mpid/ch4/netmod/stubnm/stubnm_datatype.h
new file mode 100644
index 0000000..877009b
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_datatype.h
@@ -0,0 +1,35 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_STUBNM_DATATYPE_H_INCLUDED
+#define NETMOD_STUBNM_DATATYPE_H_INCLUDED
+
+#include "stubnm_impl.h"
+
+/* Stub: does nothing except trip MPIR_Assert(0); datatype_p is not used. */
+static inline void MPIDI_NM_datatype_destroy(MPIR_Datatype * datatype_p)
+{
+    MPIR_Assert(0);
+}
+
+/* Stub: does nothing except trip MPIR_Assert(0); datatype_p is not used. */
+static inline void MPIDI_NM_datatype_commit(MPIR_Datatype * datatype_p)
+{
+    MPIR_Assert(0);
+}
+
+/* Stub: does nothing except trip MPIR_Assert(0); neither datatype argument
+ * is read or written. */
+static inline void MPIDI_NM_datatype_dup(MPIR_Datatype * old_datatype_p,
+                                         MPIR_Datatype * new_datatype_p)
+{
+    MPIR_Assert(0);
+}
+
+#endif
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_impl.h b/src/mpid/ch4/netmod/stubnm/stubnm_impl.h
new file mode 100644
index 0000000..586c6d5
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_impl.h
@@ -0,0 +1,17 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_STUBNM_IMPL_H_INCLUDED
+#define NETMOD_STUBNM_IMPL_H_INCLUDED
+
+#include <mpidimpl.h>
+#include "mpidch4r.h"
+
+#endif /* NETMOD_STUBNM_IMPL_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_init.h b/src/mpid/ch4/netmod/stubnm/stubnm_init.h
new file mode 100644
index 0000000..5f458a6
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_init.h
@@ -0,0 +1,95 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_STUBNM_INIT_H_INCLUDED
+#define NETMOD_STUBNM_INIT_H_INCLUDED
+
+#include "stubnm_impl.h"
+
+/* Stub initializer: ignores every argument, trips MPIR_Assert(0) and, if
+ * execution continues past the assertion, returns MPI_SUCCESS. */
+static inline int MPIDI_NM_init(int rank, int size, int appnum, int *tag_ub,
+                                MPIR_Comm * comm_world, MPIR_Comm * comm_self,
+                                int spawned, int num_contexts, void **netmod_contexts)
+{
+    MPIR_Assert(0);
+
+    return MPI_SUCCESS;
+}
+
+/* Stub finalizer: trips MPIR_Assert(0) and, if execution continues past the
+ * assertion, returns MPI_SUCCESS. */
+static inline int MPIDI_NM_finalize(void)
+{
+    MPIR_Assert(0);
+
+    return MPI_SUCCESS;
+}
+
+
+/* Stub: trips MPIR_Assert(0) and otherwise returns MPI_SUCCESS.  None of the
+ * arguments are used; in particular *lpid_ptr is never written. */
+static inline int MPIDI_NM_comm_get_lpid(MPIR_Comm * comm_ptr, int idx,
+                                         int *lpid_ptr, MPL_bool is_remote)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Stub: trips MPIR_Assert(0) and otherwise returns MPI_SUCCESS.  None of the
+ * arguments are used; *gpid is never written. */
+static inline int MPIDI_NM_gpid_get(MPIR_Comm * comm_ptr, int rank, MPIR_Gpid * gpid)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Stub: trips MPIR_Assert(0) and otherwise returns MPI_SUCCESS.  None of the
+ * arguments are used; *id_p is never written. */
+static inline int MPIDI_NM_get_node_id(MPIR_Comm * comm, int rank, MPID_Node_id_t * id_p)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Stub: trips MPIR_Assert(0) and otherwise returns MPI_SUCCESS.  None of the
+ * arguments are used; *max_id_p is never written. */
+static inline int MPIDI_NM_get_max_node_id(MPIR_Comm * comm, MPID_Node_id_t * max_id_p)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Stub: trips MPIR_Assert(0) and otherwise returns MPI_SUCCESS.  None of the
+ * arguments are used; local_gpids and *singleAVT are never written. */
+static inline int MPIDI_NM_getallincomm(MPIR_Comm * comm_ptr, int local_size,
+                                        MPIR_Gpid local_gpids[], int *singleAVT)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Stub: trips MPIR_Assert(0) and otherwise returns MPI_SUCCESS.  None of the
+ * arguments are used; the lpid array is never written. */
+static inline int MPIDI_NM_gpid_tolpidarray(int size, MPIR_Gpid gpid[], int lpid[])
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Stub: trips MPIR_Assert(0) and otherwise returns MPI_SUCCESS.  None of the
+ * arguments are used; newcomm_ptr is left untouched. */
+static inline int MPIDI_NM_create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr,
+                                                       int size, const int lpids[])
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4U_free_mem(ptr) and returns its result unchanged. */
+static inline int MPIDI_NM_free_mem(void *ptr)
+{
+    int mpi_errno = MPIDI_CH4U_free_mem(ptr);
+
+    return mpi_errno;
+}
+
+/* Delegates to MPIDI_CH4U_alloc_mem(size, info_ptr) and returns the pointer
+ * it yields, unchanged. */
+static inline void *MPIDI_NM_alloc_mem(size_t size, MPIR_Info * info_ptr)
+{
+    void *mem = MPIDI_CH4U_alloc_mem(size, info_ptr);
+
+    return mem;
+}
+
+
+#endif /* NETMOD_STUBNM_INIT_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_op.h b/src/mpid/ch4/netmod/stubnm/stubnm_op.h
new file mode 100644
index 0000000..ae4eade
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_op.h
@@ -0,0 +1,29 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_STUBNM_OP_H_INCLUDED
+#define NETMOD_STUBNM_OP_H_INCLUDED
+
+#include "stubnm_impl.h"
+
+/* Stub: does nothing except trip MPIR_Assert(0); op_p is not used. */
+static inline void MPIDI_NM_op_destroy(MPIR_Op * op_p)
+{
+    MPIR_Assert(0);
+}
+
+/* Stub: does nothing except trip MPIR_Assert(0); op_p is not used. */
+static inline void MPIDI_NM_op_commit(MPIR_Op * op_p)
+{
+    MPIR_Assert(0);
+}
+
+
+#endif
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_pre.h b/src/mpid/ch4/netmod/stubnm/stubnm_pre.h
new file mode 100644
index 0000000..ad23b48
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_pre.h
@@ -0,0 +1,47 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#ifndef NETMOD_STUBNM_PRE_H_INCLUDED
+#define NETMOD_STUBNM_PRE_H_INCLUDED
+
+typedef struct {
+    int dummy;                  /* sole member, a placeholder; nothing in this header reads it */
+} MPIDI_STUBNM_am_request_t;
+
+typedef struct {
+    int dummy;                  /* sole member, a placeholder */
+} MPIDI_STUBNM_request_t;
+
+typedef struct {
+    int dummy;                  /* sole member, a placeholder */
+} MPIDI_STUBNM_comm_t;
+
+typedef struct {
+    int dummy;                  /* sole member, a placeholder */
+} MPIDI_STUBNM_dt_t;
+
+typedef struct {
+    int dummy;                  /* sole member, a placeholder */
+} MPIDI_STUBNM_op_t;
+
+typedef struct {
+    int dummy;                  /* sole member, a placeholder */
+} MPIDI_STUBNM_win_t;
+
+typedef struct {
+    int dummy;                  /* sole member, a placeholder */
+} MPIDI_STUBNM_gpid_t;
+
+typedef struct {
+    int dummy;                  /* sole member, a placeholder */
+} MPIDI_STUBNM_addr_t;
+
+#endif /* NETMOD_STUBNM_PRE_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_probe.h b/src/mpid/ch4/netmod/stubnm/stubnm_probe.h
new file mode 100644
index 0000000..6442fb6
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_probe.h
@@ -0,0 +1,39 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_STUBNM_PROBE_H_INCLUDED
+#define NETMOD_STUBNM_PROBE_H_INCLUDED
+
+#include "stubnm_impl.h"
+
+static inline int MPIDI_NM_probe(int source,
+                                 int tag, MPIR_Comm * comm, int context_offset, MPI_Status * status)
+{   /* no stubnm-specific handling: all five arguments go unchanged to MPIDI_CH4U_probe */
+    return MPIDI_CH4U_probe(source, tag, comm, context_offset, status);
+}
+
+static inline int MPIDI_NM_improbe(int source,
+                                   int tag,
+                                   MPIR_Comm * comm,
+                                   int context_offset,
+                                   int *flag, MPIR_Request ** message, MPI_Status * status)
+{   /* no stubnm-specific handling: flag, message and status are filled in (if at all) by MPIDI_CH4U_improbe */
+    return MPIDI_CH4U_improbe(source, tag, comm, context_offset, flag, message, status);
+}
+
+static inline int MPIDI_NM_iprobe(int source,
+                                  int tag,
+                                  MPIR_Comm * comm,
+                                  int context_offset, int *flag, MPI_Status * status)
+{   /* no stubnm-specific handling: arguments go unchanged to MPIDI_CH4U_iprobe */
+    return MPIDI_CH4U_iprobe(source, tag, comm, context_offset, flag, status);
+}
+
+#endif /* NETMOD_STUBNM_PROBE_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_proc.h b/src/mpid/ch4/netmod/stubnm/stubnm_proc.h
new file mode 100644
index 0000000..595aa6b
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_proc.h
@@ -0,0 +1,28 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_STUBNM_PROC_H_INCLUDED
+#define NETMOD_STUBNM_PROC_H_INCLUDED
+
+#include "stubnm_impl.h"
+
+static inline int MPIDI_NM_rank_is_local(int rank, MPIR_Comm * comm)
+{
+    int is_local = 0;           /* the only value this stub ever returns */
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPIDI_NETMOD_RANK_IS_LOCAL);
+    MPIR_FUNC_VERBOSE_ENTER(MPIDI_NETMOD_RANK_IS_LOCAL);
+
+    /* stubnm: locality lookup is unimplemented; rank and comm are ignored */
+    MPIR_Assert(0);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPIDI_NETMOD_RANK_IS_LOCAL);
+    return is_local;
+}
+#endif /* NETMOD_STUBNM_PROC_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_progress.h b/src/mpid/ch4/netmod/stubnm/stubnm_progress.h
new file mode 100644
index 0000000..d43cac5
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_progress.h
@@ -0,0 +1,76 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_STUBNM_PROGRESS_H_INCLUDED
+#define NETMOD_STUBNM_PROGRESS_H_INCLUDED
+
+#include "stubnm_impl.h"
+
+static inline int MPIDI_NM_progress(void *netmod_context, int blocking)
+{   /* stubnm: the progress engine is unimplemented; both arguments are ignored */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return MPI_SUCCESS; /* returned only if execution continues past the assertion */
+}
+
+static inline int MPIDI_NM_progress_test(void)
+{   /* stubnm: progress_test is unimplemented */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return MPI_SUCCESS; /* returned only if execution continues past the assertion */
+}
+
+static inline int MPIDI_NM_progress_poke(void)
+{   /* stubnm: progress_poke is unimplemented */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return MPI_SUCCESS; /* returned only if execution continues past the assertion */
+}
+
+static inline void MPIDI_NM_progress_start(MPID_Progress_state * state)
+{   /* stubnm: progress_start is unimplemented; *state is not touched */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return;             /* nothing to report to the caller */
+}
+
+static inline void MPIDI_NM_progress_end(MPID_Progress_state * state)
+{   /* stubnm: progress_end is unimplemented; *state is not touched */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return;             /* nothing to report to the caller */
+}
+
+static inline int MPIDI_NM_progress_wait(MPID_Progress_state * state)
+{   /* stubnm: progress_wait is unimplemented; *state is not touched */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return MPI_SUCCESS; /* returned only if execution continues past the assertion */
+}
+
+static inline int MPIDI_NM_progress_register(int (*progress_fn) (int *), int *id)
+{   /* stubnm: hook registration is unimplemented; progress_fn is not stored and *id is not written */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return MPI_SUCCESS; /* returned only if execution continues past the assertion */
+}
+
+static inline int MPIDI_NM_progress_deregister(int id)
+{   /* stubnm: hook deregistration is unimplemented; id is ignored */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return MPI_SUCCESS; /* returned only if execution continues past the assertion */
+}
+
+static inline int MPIDI_NM_progress_activate(int id)
+{   /* stubnm: hook activation is unimplemented; id is ignored */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return MPI_SUCCESS; /* returned only if execution continues past the assertion */
+}
+
+static inline int MPIDI_NM_progress_deactivate(int id)
+{   /* stubnm: hook deactivation is unimplemented; id is ignored */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return MPI_SUCCESS; /* returned only if execution continues past the assertion */
+}
+
+#endif /* NETMOD_STUBNM_PROGRESS_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_recv.h b/src/mpid/ch4/netmod/stubnm/stubnm_recv.h
new file mode 100644
index 0000000..18724cc
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_recv.h
@@ -0,0 +1,67 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_STUBNM_RECV_H_INCLUDED
+#define NETMOD_STUBNM_RECV_H_INCLUDED
+
+#include "stubnm_impl.h"
+
+static inline int MPIDI_NM_recv(void *buf,
+                                int count,
+                                MPI_Datatype datatype,
+                                int rank,
+                                int tag,
+                                MPIR_Comm * comm,
+                                int context_offset, MPI_Status * status, MPIR_Request ** request)
+{
+    /* stubnm: blocking receive is unimplemented; every argument is ignored */
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+static inline int MPIDI_NM_recv_init(void *buf,
+                                     int count,
+                                     MPI_Datatype datatype,
+                                     int rank,
+                                     int tag,
+                                     MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* stubnm: persistent-receive setup is unimplemented; *request is not written */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return MPI_SUCCESS; /* returned only if execution continues past the assertion */
+}
+
+static inline int MPIDI_NM_imrecv(void *buf,
+                                  int count,
+                                  MPI_Datatype datatype,
+                                  MPIR_Request * message, MPIR_Request ** rreqp)
+{   /* stubnm: matched receive is unimplemented; message is not consumed and *rreqp is not written */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return MPI_SUCCESS; /* returned only if execution continues past the assertion */
+}
+
+static inline int MPIDI_NM_irecv(void *buf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 int rank,
+                                 int tag,
+                                 MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    /* stubnm: nonblocking receive is unimplemented; *request is never written */
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+static inline int MPIDI_NM_cancel_recv(MPIR_Request * rreq)
+{   /* stubnm: receive cancellation is unimplemented; rreq is ignored */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return MPI_SUCCESS; /* returned only if execution continues past the assertion */
+}
+
+#endif /* NETMOD_STUBNM_RECV_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_request.h b/src/mpid/ch4/netmod/stubnm/stubnm_request.h
new file mode 100644
index 0000000..5909e53
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_request.h
@@ -0,0 +1,26 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_STUBNM_REQUEST_H_INCLUDED
+#define NETMOD_STUBNM_REQUEST_H_INCLUDED
+
+#include "stubnm_impl.h"
+
+static inline void MPIDI_NM_am_request_init(MPIR_Request * req)
+{   /* stubnm: per-request AM initialisation is unimplemented; req is not modified */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+}
+
+static inline void MPIDI_NM_am_request_finalize(MPIR_Request * req)
+{   /* stubnm: per-request AM teardown is unimplemented; req is not modified */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+}
+
+#endif /* NETMOD_STUBNM_REQUEST_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_rma.h b/src/mpid/ch4/netmod/stubnm/stubnm_rma.h
new file mode 100644
index 0000000..de8172f
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_rma.h
@@ -0,0 +1,148 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_STUBNM_RMA_H_INCLUDED
+#define NETMOD_STUBNM_RMA_H_INCLUDED
+
+#include "stubnm_impl.h"
+
+static inline int MPIDI_NM_put(const void *origin_addr,
+                               int origin_count,
+                               MPI_Datatype origin_datatype,
+                               int target_rank,
+                               MPI_Aint target_disp,
+                               int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_put */
+    return MPIDI_CH4U_put(origin_addr, origin_count, origin_datatype,
+                          target_rank, target_disp, target_count, target_datatype, win);
+}
+
+static inline int MPIDI_NM_get(void *origin_addr,
+                               int origin_count,
+                               MPI_Datatype origin_datatype,
+                               int target_rank,
+                               MPI_Aint target_disp,
+                               int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_get */
+    return MPIDI_CH4U_get(origin_addr, origin_count, origin_datatype,
+                          target_rank, target_disp, target_count, target_datatype, win);
+}
+
+static inline int MPIDI_NM_rput(const void *origin_addr,
+                                int origin_count,
+                                MPI_Datatype origin_datatype,
+                                int target_rank,
+                                MPI_Aint target_disp,
+                                int target_count,
+                                MPI_Datatype target_datatype,
+                                MPIR_Win * win, MPIR_Request ** request)
+{   /* no stubnm-specific handling: request is passed through for MPIDI_CH4U_rput to fill in */
+    return MPIDI_CH4U_rput(origin_addr, origin_count, origin_datatype,
+                           target_rank, target_disp, target_count, target_datatype, win, request);
+}
+
+
+static inline int MPIDI_NM_compare_and_swap(const void *origin_addr,
+                                            const void *compare_addr,
+                                            void *result_addr,
+                                            MPI_Datatype datatype,
+                                            int target_rank, MPI_Aint target_disp, MPIR_Win * win)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_compare_and_swap */
+    return MPIDI_CH4U_compare_and_swap(origin_addr, compare_addr, result_addr,
+                                       datatype, target_rank, target_disp, win);
+}
+
+static inline int MPIDI_NM_raccumulate(const void *origin_addr,
+                                       int origin_count,
+                                       MPI_Datatype origin_datatype,
+                                       int target_rank,
+                                       MPI_Aint target_disp,
+                                       int target_count,
+                                       MPI_Datatype target_datatype,
+                                       MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_raccumulate */
+    return MPIDI_CH4U_raccumulate(origin_addr, origin_count, origin_datatype,
+                                  target_rank, target_disp, target_count,
+                                  target_datatype, op, win, request);
+}
+
+static inline int MPIDI_NM_rget_accumulate(const void *origin_addr,
+                                           int origin_count,
+                                           MPI_Datatype origin_datatype,
+                                           void *result_addr,
+                                           int result_count,
+                                           MPI_Datatype result_datatype,
+                                           int target_rank,
+                                           MPI_Aint target_disp,
+                                           int target_count,
+                                           MPI_Datatype target_datatype,
+                                           MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_rget_accumulate */
+    return MPIDI_CH4U_rget_accumulate(origin_addr, origin_count, origin_datatype,
+                                      result_addr, result_count, result_datatype,
+                                      target_rank, target_disp, target_count,
+                                      target_datatype, op, win, request);
+}
+
+static inline int MPIDI_NM_fetch_and_op(const void *origin_addr,
+                                        void *result_addr,
+                                        MPI_Datatype datatype,
+                                        int target_rank,
+                                        MPI_Aint target_disp, MPI_Op op, MPIR_Win * win)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_fetch_and_op */
+    return MPIDI_CH4U_fetch_and_op(origin_addr, result_addr, datatype,
+                                   target_rank, target_disp, op, win);
+}
+
+
+static inline int MPIDI_NM_rget(void *origin_addr,
+                                int origin_count,
+                                MPI_Datatype origin_datatype,
+                                int target_rank,
+                                MPI_Aint target_disp,
+                                int target_count,
+                                MPI_Datatype target_datatype,
+                                MPIR_Win * win, MPIR_Request ** request)
+{   /* no stubnm-specific handling: request is passed through for MPIDI_CH4U_rget to fill in */
+    return MPIDI_CH4U_rget(origin_addr, origin_count, origin_datatype,
+                           target_rank, target_disp, target_count, target_datatype, win, request);
+}
+
+
+static inline int MPIDI_NM_get_accumulate(const void *origin_addr,
+                                          int origin_count,
+                                          MPI_Datatype origin_datatype,
+                                          void *result_addr,
+                                          int result_count,
+                                          MPI_Datatype result_datatype,
+                                          int target_rank,
+                                          MPI_Aint target_disp,
+                                          int target_count,
+                                          MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_get_accumulate */
+    return MPIDI_CH4U_get_accumulate(origin_addr, origin_count, origin_datatype,
+                                     result_addr, result_count, result_datatype,
+                                     target_rank, target_disp, target_count,
+                                     target_datatype, op, win);
+}
+
+static inline int MPIDI_NM_accumulate(const void *origin_addr,
+                                      int origin_count,
+                                      MPI_Datatype origin_datatype,
+                                      int target_rank,
+                                      MPI_Aint target_disp,
+                                      int target_count,
+                                      MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_accumulate */
+    return MPIDI_CH4U_accumulate(origin_addr, origin_count, origin_datatype,
+                                 target_rank, target_disp, target_count, target_datatype, op, win);
+}
+
+#endif /* NETMOD_STUBNM_RMA_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_send.h b/src/mpid/ch4/netmod/stubnm/stubnm_send.h
new file mode 100644
index 0000000..c83867d
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_send.h
@@ -0,0 +1,128 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_STUBNM_SEND_H_INCLUDED
+#define NETMOD_STUBNM_SEND_H_INCLUDED
+
+#include "stubnm_impl.h"
+
+static inline int MPIDI_NM_send(const void *buf,
+                                int count,
+                                MPI_Datatype datatype,
+                                int rank,
+                                int tag,
+                                MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_send */
+    return MPIDI_CH4U_send(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_rsend(const void *buf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 int rank,
+                                 int tag,
+                                 MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_rsend */
+    return MPIDI_CH4U_rsend(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+
+
+static inline int MPIDI_NM_irsend(const void *buf,
+                                  int count,
+                                  MPI_Datatype datatype,
+                                  int rank,
+                                  int tag,
+                                  MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_irsend */
+    return MPIDI_CH4U_irsend(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_ssend(const void *buf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 int rank,
+                                 int tag,
+                                 MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_ssend */
+    return MPIDI_CH4U_ssend(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_startall(int count, MPIR_Request * requests[])
+{   /* no stubnm-specific handling: count and requests go unchanged to MPIDI_CH4U_startall */
+    return MPIDI_CH4U_startall(count, requests);
+}
+
+static inline int MPIDI_NM_send_init(const void *buf,
+                                     int count,
+                                     MPI_Datatype datatype,
+                                     int rank,
+                                     int tag,
+                                     MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_send_init */
+    return MPIDI_CH4U_send_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_ssend_init(const void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      int rank,
+                                      int tag,
+                                      MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_ssend_init */
+    return MPIDI_CH4U_ssend_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_bsend_init(const void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      int rank,
+                                      int tag,
+                                      MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_bsend_init */
+    return MPIDI_CH4U_bsend_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_rsend_init(const void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      int rank,
+                                      int tag,
+                                      MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_rsend_init */
+    return MPIDI_CH4U_rsend_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_isend(const void *buf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 int rank,
+                                 int tag,
+                                 MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_isend */
+    return MPIDI_CH4U_isend(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_issend(const void *buf,
+                                  int count,
+                                  MPI_Datatype datatype,
+                                  int rank,
+                                  int tag,
+                                  MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* no stubnm-specific handling: every argument goes unchanged to MPIDI_CH4U_issend */
+    return MPIDI_CH4U_issend(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_cancel_send(MPIR_Request * sreq)
+{   /* no stubnm-specific handling: sreq goes unchanged to MPIDI_CH4U_cancel_send */
+    return MPIDI_CH4U_cancel_send(sreq);
+}
+
+#endif /* NETMOD_STUBNM_SEND_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_spawn.h b/src/mpid/ch4/netmod/stubnm/stubnm_spawn.h
new file mode 100644
index 0000000..2f70a59
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_spawn.h
@@ -0,0 +1,50 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_STUBNM_SPAWN_H_INCLUDED
+#define NETMOD_STUBNM_SPAWN_H_INCLUDED
+
+#include "stubnm_impl.h"
+
+static inline int MPIDI_NM_comm_connect(const char *port_name,
+                                        MPIR_Info * info,
+                                        int root, MPIR_Comm * comm, MPIR_Comm ** newcomm_ptr)
+{   /* stubnm: comm_connect is unimplemented; *newcomm_ptr is not written */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return MPI_SUCCESS; /* returned only if execution continues past the assertion */
+}
+
+static inline int MPIDI_NM_comm_disconnect(MPIR_Comm * comm_ptr)
+{   /* stubnm: comm_disconnect is unimplemented; comm_ptr is ignored */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return MPI_SUCCESS; /* returned only if execution continues past the assertion */
+}
+
+static inline int MPIDI_NM_open_port(MPIR_Info * info_ptr, char *port_name)
+{   /* stubnm: open_port is unimplemented; nothing is written to port_name */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return MPI_SUCCESS; /* returned only if execution continues past the assertion */
+}
+
+static inline int MPIDI_NM_close_port(const char *port_name)
+{   /* stubnm: close_port is unimplemented; port_name is ignored */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return MPI_SUCCESS; /* returned only if execution continues past the assertion */
+}
+
+static inline int MPIDI_NM_comm_accept(const char *port_name,
+                                       MPIR_Info * info,
+                                       int root, MPIR_Comm * comm, MPIR_Comm ** newcomm_ptr)
+{   /* stubnm: comm_accept is unimplemented; *newcomm_ptr is not written */
+    MPIR_Assert(0);     /* constant-false assertion; presumably aborts when assertions are enabled */
+    return MPI_SUCCESS; /* returned only if execution continues past the assertion */
+}
+
+#endif /* NETMOD_STUBNM_SPAWN_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_unimpl.h b/src/mpid/ch4/netmod/stubnm/stubnm_unimpl.h
new file mode 100644
index 0000000..db8e66b
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_unimpl.h
@@ -0,0 +1,19 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/mpid/ch4/netmod/stubnm/stubnm_win.h b/src/mpid/ch4/netmod/stubnm/stubnm_win.h
new file mode 100644
index 0000000..5c8a112
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/stubnm_win.h
@@ -0,0 +1,160 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_STUBNM_WIN_H_INCLUDED
+#define NETMOD_STUBNM_WIN_H_INCLUDED
+
+#include "stubnm_impl.h"
+
+static inline int MPIDI_NM_win_set_info(MPIR_Win * win, MPIR_Info * info)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_set_info and return its result */
+    return MPIDI_CH4R_win_set_info(win, info);
+}
+
+
+static inline int MPIDI_NM_win_start(MPIR_Group * group, int assert, MPIR_Win * win)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_start and return its result */
+    return MPIDI_CH4R_win_start(group, assert, win);
+}
+
+
+static inline int MPIDI_NM_win_complete(MPIR_Win * win)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_complete and return its result */
+    return MPIDI_CH4R_win_complete(win);
+}
+
+static inline int MPIDI_NM_win_post(MPIR_Group * group, int assert, MPIR_Win * win)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_post and return its result */
+    return MPIDI_CH4R_win_post(group, assert, win);
+}
+
+
+static inline int MPIDI_NM_win_wait(MPIR_Win * win)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_wait and return its result */
+    return MPIDI_CH4R_win_wait(win);
+}
+
+
+static inline int MPIDI_NM_win_test(MPIR_Win * win, int *flag)
+{   /* no stubnm-specific handling: flag is passed through for MPIDI_CH4R_win_test to set */
+    return MPIDI_CH4R_win_test(win, flag);
+}
+
+static inline int MPIDI_NM_win_lock(int lock_type, int rank, int assert, MPIR_Win * win)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_lock and return its result */
+    return MPIDI_CH4R_win_lock(lock_type, rank, assert, win);
+}
+
+
+static inline int MPIDI_NM_win_unlock(int rank, MPIR_Win * win)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_unlock and return its result */
+    return MPIDI_CH4R_win_unlock(rank, win);
+}
+
+static inline int MPIDI_NM_win_get_info(MPIR_Win * win, MPIR_Info ** info_p_p)
+{   /* no stubnm-specific handling: info_p_p is passed through for MPIDI_CH4R_win_get_info to fill in */
+    return MPIDI_CH4R_win_get_info(win, info_p_p);
+}
+
+
+static inline int MPIDI_NM_win_free(MPIR_Win ** win_ptr)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_free and return its result */
+    return MPIDI_CH4R_win_free(win_ptr);
+}
+
+static inline int MPIDI_NM_win_fence(int assert, MPIR_Win * win)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_fence and return its result */
+    return MPIDI_CH4R_win_fence(assert, win);
+}
+
+static inline int MPIDI_NM_win_create(void *base,
+                                      MPI_Aint length,
+                                      int disp_unit,
+                                      MPIR_Info * info, MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
+{   /* no stubnm-specific handling: win_ptr is passed through for MPIDI_CH4R_win_create to fill in */
+    return MPIDI_CH4R_win_create(base, length, disp_unit, info, comm_ptr, win_ptr);
+}
+
+static inline int MPIDI_NM_win_attach(MPIR_Win * win, void *base, MPI_Aint size)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_attach and return its result */
+    return MPIDI_CH4R_win_attach(win, base, size);
+}
+
+static inline int MPIDI_NM_win_allocate_shared(MPI_Aint size,
+                                               int disp_unit,
+                                               MPIR_Info * info_ptr,
+                                               MPIR_Comm * comm_ptr,
+                                               void **base_ptr, MPIR_Win ** win_ptr)
+{   /* no stubnm-specific handling: base_ptr and win_ptr are passed through to MPIDI_CH4R_win_allocate_shared */
+    return MPIDI_CH4R_win_allocate_shared(size, disp_unit, info_ptr, comm_ptr, base_ptr, win_ptr);
+}
+
+static inline int MPIDI_NM_win_detach(MPIR_Win * win, const void *base)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_detach and return its result */
+    return MPIDI_CH4R_win_detach(win, base);
+}
+
+static inline int MPIDI_NM_win_shared_query(MPIR_Win * win,
+                                            int rank,
+                                            MPI_Aint * size, int *disp_unit, void *baseptr)
+{   /* no stubnm-specific handling: the output pointers go unchanged to MPIDI_CH4R_win_shared_query */
+    return MPIDI_CH4R_win_shared_query(win, rank, size, disp_unit, baseptr);
+}
+
+static inline int MPIDI_NM_win_allocate(MPI_Aint size,
+                                        int disp_unit,
+                                        MPIR_Info * info,
+                                        MPIR_Comm * comm, void *baseptr, MPIR_Win ** win)
+{   /* no stubnm-specific handling: baseptr and win are passed through to MPIDI_CH4R_win_allocate */
+    return MPIDI_CH4R_win_allocate(size, disp_unit, info, comm, baseptr, win);
+}
+
+static inline int MPIDI_NM_win_flush(int rank, MPIR_Win * win)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_flush and return its result */
+    return MPIDI_CH4R_win_flush(rank, win);
+}
+
+static inline int MPIDI_NM_win_flush_local_all(MPIR_Win * win)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_flush_local_all and return its result */
+    return MPIDI_CH4R_win_flush_local_all(win);
+}
+
+static inline int MPIDI_NM_win_unlock_all(MPIR_Win * win)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_unlock_all and return its result */
+    return MPIDI_CH4R_win_unlock_all(win);
+}
+
+static inline int MPIDI_NM_win_create_dynamic(MPIR_Info * info, MPIR_Comm * comm, MPIR_Win ** win)
+{   /* no stubnm-specific handling: win is passed through for MPIDI_CH4R_win_create_dynamic to fill in */
+    return MPIDI_CH4R_win_create_dynamic(info, comm, win);
+}
+
+static inline int MPIDI_NM_win_flush_local(int rank, MPIR_Win * win)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_flush_local and return its result */
+    return MPIDI_CH4R_win_flush_local(rank, win);
+}
+
+static inline int MPIDI_NM_win_sync(MPIR_Win * win)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_sync and return its result */
+    return MPIDI_CH4R_win_sync(win);
+}
+
+static inline int MPIDI_NM_win_flush_all(MPIR_Win * win)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_flush_all and return its result */
+    return MPIDI_CH4R_win_flush_all(win);
+}
+
+static inline int MPIDI_NM_win_lock_all(int assert, MPIR_Win * win)
+{   /* no stubnm-specific handling: delegate to MPIDI_CH4R_win_lock_all and return its result */
+    return MPIDI_CH4R_win_lock_all(assert, win);
+}
+
+
+#endif /* NETMOD_STUBNM_WIN_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/stubnm/subconfigure.m4 b/src/mpid/ch4/netmod/stubnm/subconfigure.m4
new file mode 100644
index 0000000..4dce4f1
--- /dev/null
+++ b/src/mpid/ch4/netmod/stubnm/subconfigure.m4
@@ -0,0 +1,21 @@
+[#] start of __file__
+dnl MPICH_SUBCFG_AFTER=src/mpid/ch4
+
+AC_DEFUN([PAC_SUBCFG_PREREQ_]PAC_SUBCFG_AUTO_SUFFIX,[dnl sets build_ch4_netmod_stubnm=yes when stubnm appears in $ch4_netmods
+    AM_COND_IF([BUILD_CH4],[
+        for net in $ch4_netmods ; do
+            AS_CASE([$net],[stubnm],[build_ch4_netmod_stubnm=yes])
+        done
+    ])
+    AM_CONDITIONAL([BUILD_CH4_NETMOD_STUBNM],[test "X$build_ch4_netmod_stubnm" = "Xyes"])
+])dnl end _PREREQ
+
+AC_DEFUN([PAC_SUBCFG_BODY_]PAC_SUBCFG_AUTO_SUFFIX,[dnl runs only under BUILD_CH4_NETMOD_STUBNM: emits a notice and two config-header defines
+AM_COND_IF([BUILD_CH4_NETMOD_STUBNM],[
+    AC_MSG_NOTICE([RUNNING CONFIGURE FOR ch4:stubnm])
+    AC_DEFINE([ENABLE_COMM_OVERRIDES], 1, [define to add per-vc function pointers to override send and recv functions])
+    AC_DEFINE([MPIDI_BUILD_CH4_LOCALITY_INFO], 1, [CH4 should build locality info])
+])dnl end AM_COND_IF(BUILD_CH4_NETMOD_STUBNM,...)
+])dnl end _BODY
+
+[#] end of __file__
diff --git a/src/mpid/ch4/netmod/ucx/Makefile.mk b/src/mpid/ch4/netmod/ucx/Makefile.mk
new file mode 100644
index 0000000..ca88e1a
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/Makefile.mk
@@ -0,0 +1,15 @@
+## -*- Mode: Makefile; -*-
+## vim: set ft=automake :
+##
+## (C) 2016 by Argonne National Laboratory.
+##     See COPYRIGHT in top-level directory.
+##
+if BUILD_CH4_NETMOD_UCX
+## Everything up to the matching endif is appended only when BUILD_CH4_NETMOD_UCX is true.
+noinst_HEADERS     +=
+mpi_core_sources   += src/mpid/ch4/netmod/ucx/func_table.c\
+                      src/mpid/ch4/netmod/ucx/globals.c
+## Register this netmod's error-name table alongside the others.
+errnames_txt_files += src/mpid/ch4/netmod/ucx/errnames.txt
+
+endif
diff --git a/src/mpid/ch4/netmod/ucx/errnames.txt b/src/mpid/ch4/netmod/ucx/errnames.txt
new file mode 100644
index 0000000..e6058f2
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/errnames.txt
@@ -0,0 +1,16 @@
+**ucx_nm_read_config:ucx_read_config failed
+**ucx_nm_read_config %s %d %s %s: ucx_read_config failed (%s %d %s %s)
+**ucx_nm_init:ucx_init failed
+**ucx_nm_init %s %d %s %s: ucx_init failed (%s %d %s %s)
+**ucx_nm_worker_create:ucx_worker_create failed
+**ucx_nm_worker_create %s %d %s %s: ucx_worker_create failed (%s %d %s %s)
+**ucx_nm_ep_create:failed to create ucp_endpoint
+**ucx_nm_ep_create %s %d %s %s: failed to create ucp_endpoint (%s %d %s %s)
+**ucx_nm_get_worker_address:ucx failed to get worker address
+**ucx_nm_get_worker_address %s %d %s %s: ucx failed to get worker address (%s %d %s %s)
+**ucx_nm_tag_nb_send:Failed to start tag send in ucx
+**ucx_nm_tag_nb_send %s %d %s %s: Failed to start tag send in ucx (%s %d %s %s)
+**ucx_nm_tag_nb_recv:Failed to start tag recv in ucx
+**ucx_nm_tag_nb_recv %s %d %s %s: Failed to start tag recv in ucx (%s %d %s %s)
+**ucx_nm_other:Other error
+**ucx_nm_other %s %d %s %s: Other error (%s %d %s %s)
diff --git a/src/mpid/ch4/netmod/ucx/func_table.c b/src/mpid/ch4/netmod/ucx/func_table.c
new file mode 100644
index 0000000..7e99f6d
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/func_table.c
@@ -0,0 +1,155 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+
+#ifndef NETMOD_DIRECT
+#define NETMOD_DISABLE_INLINES
+#include <mpidimpl.h>
+#include "netmod_direct.h"
+/*
+ * Function table exposing the UCX netmod's init/finalize, progress,
+ * dynamic-process, communicator and active-message entry points.
+ *
+ * The initializer is positional: each entry must stay in the same order as
+ * the corresponding member of MPIDI_NM_funcs_t (declared elsewhere).
+ */
+MPIDI_NM_funcs_t MPIDI_NM_ucx_funcs = {
+    MPIDI_NM_init,
+    MPIDI_NM_finalize,
+    MPIDI_NM_progress,
+    MPIDI_NM_comm_connect,
+    MPIDI_NM_comm_disconnect,
+    MPIDI_NM_open_port,
+    MPIDI_NM_close_port,
+    MPIDI_NM_comm_accept,
+    MPIDI_NM_comm_get_lpid,
+    MPIDI_NM_gpid_get,
+    MPIDI_NM_get_node_id,
+    MPIDI_NM_get_max_node_id,
+    MPIDI_NM_getallincomm,
+    MPIDI_NM_gpid_tolpidarray,
+    MPIDI_NM_create_intercomm_from_lpids,
+    MPIDI_NM_comm_create,
+    MPIDI_NM_comm_destroy,
+    MPIDI_NM_am_request_init,
+    MPIDI_NM_am_request_finalize,
+    MPIDI_NM_reg_hdr_handler,
+    MPIDI_NM_send_am_hdr,
+    MPIDI_NM_inject_am_hdr,
+    MPIDI_NM_send_am,
+    MPIDI_NM_send_amv,
+    MPIDI_NM_send_amv_hdr,
+    MPIDI_NM_send_am_hdr_reply,
+    MPIDI_NM_inject_am_hdr_reply,
+    MPIDI_NM_send_am_reply,
+    MPIDI_NM_send_amv_reply,
+    MPIDI_NM_am_hdr_max_sz,
+    MPIDI_NM_am_inject_max_sz,
+    MPIDI_NM_am_recv
+};
+
+/*
+ * Function table for the "native" UCX netmod operations: point-to-point,
+ * memory allocation, probe, RMA/window, collective (blocking, neighbor and
+ * nonblocking), datatype and op hooks.
+ *
+ * The initializer is positional: each entry must stay in the same order as
+ * the corresponding member of MPIDI_NM_native_funcs_t (declared elsewhere).
+ */
+MPIDI_NM_native_funcs_t MPIDI_NM_native_ucx_funcs = {
+    MPIDI_NM_send,
+    MPIDI_NM_ssend,
+    MPIDI_NM_startall,
+    MPIDI_NM_send_init,
+    MPIDI_NM_ssend_init,
+    MPIDI_NM_rsend_init,
+    MPIDI_NM_bsend_init,
+    MPIDI_NM_isend,
+    MPIDI_NM_issend,
+    MPIDI_NM_cancel_send,
+    MPIDI_NM_recv_init,
+    MPIDI_NM_recv,
+    MPIDI_NM_irecv,
+    MPIDI_NM_imrecv,
+    MPIDI_NM_cancel_recv,
+    MPIDI_NM_alloc_mem,
+    MPIDI_NM_free_mem,
+    MPIDI_NM_improbe,
+    MPIDI_NM_iprobe,
+    MPIDI_NM_win_set_info,
+    MPIDI_NM_win_shared_query,
+    MPIDI_NM_put,
+    MPIDI_NM_win_start,
+    MPIDI_NM_win_complete,
+    MPIDI_NM_win_post,
+    MPIDI_NM_win_wait,
+    MPIDI_NM_win_test,
+    MPIDI_NM_win_lock,
+    MPIDI_NM_win_unlock,
+    MPIDI_NM_win_get_info,
+    MPIDI_NM_get,
+    MPIDI_NM_win_free,
+    MPIDI_NM_win_fence,
+    MPIDI_NM_win_create,
+    MPIDI_NM_accumulate,
+    MPIDI_NM_win_attach,
+    MPIDI_NM_win_allocate_shared,
+    MPIDI_NM_rput,
+    MPIDI_NM_win_flush_local,
+    MPIDI_NM_win_detach,
+    MPIDI_NM_compare_and_swap,
+    MPIDI_NM_raccumulate,
+    MPIDI_NM_rget_accumulate,
+    MPIDI_NM_fetch_and_op,
+    MPIDI_NM_win_allocate,
+    MPIDI_NM_win_flush,
+    MPIDI_NM_win_flush_local_all,
+    MPIDI_NM_win_unlock_all,
+    MPIDI_NM_win_create_dynamic,
+    MPIDI_NM_rget,
+    MPIDI_NM_win_sync,
+    MPIDI_NM_win_flush_all,
+    MPIDI_NM_get_accumulate,
+    MPIDI_NM_win_lock_all,
+    MPIDI_NM_rank_is_local,
+    MPIDI_NM_barrier,
+    MPIDI_NM_bcast,
+    MPIDI_NM_allreduce,
+    MPIDI_NM_allgather,
+    MPIDI_NM_allgatherv,
+    MPIDI_NM_scatter,
+    MPIDI_NM_scatterv,
+    MPIDI_NM_gather,
+    MPIDI_NM_gatherv,
+    MPIDI_NM_alltoall,
+    MPIDI_NM_alltoallv,
+    MPIDI_NM_alltoallw,
+    MPIDI_NM_reduce,
+    MPIDI_NM_reduce_scatter,
+    MPIDI_NM_reduce_scatter_block,
+    MPIDI_NM_scan,
+    MPIDI_NM_exscan,
+    MPIDI_NM_neighbor_allgather,
+    MPIDI_NM_neighbor_allgatherv,
+    MPIDI_NM_neighbor_alltoall,
+    MPIDI_NM_neighbor_alltoallv,
+    MPIDI_NM_neighbor_alltoallw,
+    MPIDI_NM_ineighbor_allgather,
+    MPIDI_NM_ineighbor_allgatherv,
+    MPIDI_NM_ineighbor_alltoall,
+    MPIDI_NM_ineighbor_alltoallv,
+    MPIDI_NM_ineighbor_alltoallw,
+    MPIDI_NM_ibarrier,
+    MPIDI_NM_ibcast,
+    MPIDI_NM_iallgather,
+    MPIDI_NM_iallgatherv,
+    MPIDI_NM_iallreduce,
+    MPIDI_NM_ialltoall,
+    MPIDI_NM_ialltoallv,
+    MPIDI_NM_ialltoallw,
+    MPIDI_NM_iexscan,
+    MPIDI_NM_igather,
+    MPIDI_NM_igatherv,
+    MPIDI_NM_ireduce_scatter_block,
+    MPIDI_NM_ireduce_scatter,
+    MPIDI_NM_ireduce,
+    MPIDI_NM_iscan,
+    MPIDI_NM_iscatter,
+    MPIDI_NM_iscatterv,
+    MPIDI_NM_datatype_commit,
+    MPIDI_NM_datatype_dup,
+    MPIDI_NM_datatype_destroy,
+    MPIDI_NM_op_commit,
+    MPIDI_NM_op_destroy
+};
+#endif
diff --git a/src/mpid/ch4/netmod/ucx/globals.c b/src/mpid/ch4/netmod/ucx/globals.c
new file mode 100644
index 0000000..d914681
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/globals.c
@@ -0,0 +1,13 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#include <mpidimpl.h>
+#include "ucx_impl.h"
+#include "ucx_types.h"
+
+/* The single global state object of the UCX netmod.  The { 0 } initializer
+ * zero-initializes every member. */
+MPIDI_UCX_global_t MPIDI_UCX_global = { 0 };
diff --git a/src/mpid/ch4/netmod/ucx/netmod_direct.h b/src/mpid/ch4/netmod/ucx/netmod_direct.h
new file mode 100644
index 0000000..0fd8bdf
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/netmod_direct.h
@@ -0,0 +1,34 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_DIRECT_H_INCLUDED
+#define NETMOD_DIRECT_H_INCLUDED
+
+#include "ucx_progress.h"
+#include "ucx_request.h"
+#include "ucx_probe.h"
+#include "ucx_init.h"
+#ifdef MPICH_UCX_AM_ONLY
+#include "ucx_am_send.h"
+#include "ucx_am_recv.h"
+#include "ucx_am_win.h"
+#include "ucx_am_rma.h"
+#else
+#include "ucx_send.h"
+#include "ucx_recv.h"
+#include "ucx_win.h"
+#include "ucx_rma.h"
+#endif
+#include "ucx_am.h"
+#include "ucx_spawn.h"
+#include "ucx_comm.h"
+#include "ucx_datatype.h"
+#include "ucx_op.h"
+#include "ucx_proc.h"
+#include "ucx_coll.h"
+#endif /* NETMOD_DIRECT_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/subconfigure.m4 b/src/mpid/ch4/netmod/ucx/subconfigure.m4
new file mode 100644
index 0000000..7dc5a21
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/subconfigure.m4
@@ -0,0 +1,61 @@
+[#] start of __file__
+dnl MPICH_SUBCFG_AFTER=src/mpid/ch4
+
+dnl Prerequisite stage for the ch4:ucx netmod.
+dnl
+dnl  * When BUILD_CH4 is set and ucx appears in $ch4_netmods, define
+dnl    HAVE_CH4_NETMOD_UCX and, unless another netmod already did so,
+dnl    MPIDI_BUILD_CH4_LOCALITY_INFO.
+dnl  * Export the BUILD_CH4_NETMOD_UCX automake conditional.
+dnl  * Parse the colon-separated --with-ch4-netmod-ucx-args option; the only
+dnl    recognized value is am-only, which adds -DMPICH_UCX_AM_ONLY to CPPFLAGS.
+AC_DEFUN([PAC_SUBCFG_PREREQ_]PAC_SUBCFG_AUTO_SUFFIX,[
+    AM_COND_IF([BUILD_CH4],[
+        for net in $ch4_netmods ; do
+            AS_CASE([$net],[ucx],[build_ch4_netmod_ucx=yes])
+            if test "$net" = "ucx" ; then
+                AC_DEFINE(HAVE_CH4_NETMOD_UCX,1,[UCX netmod is built])
+dnl build_ch4_locality_info guards against emitting the define twice.
+                if test "$build_ch4_locality_info" != "yes" ; then
+                    AC_DEFINE(MPIDI_BUILD_CH4_LOCALITY_INFO, 1, [CH4 should build locality info])
+                    build_ch4_locality_info="yes"
+                fi
+            fi
+        done
+    ])
+dnl Placed outside the AM_COND_IF block so the conditional is always defined.
+    AM_CONDITIONAL([BUILD_CH4_NETMOD_UCX],[test "X$build_ch4_netmod_ucx" = "Xyes"])
+
+    AC_ARG_WITH(ch4-netmod-ucx-args,
+    [  --with-ch4-netmod-ucx-args=arg1:arg2:arg3
+    CH4 UCX netmod arguments:
+            am-only          - Do not use UCX tagged or RMA communication.
+            ],
+            [ucx_netmod_args=$withval],
+            [ucx_netmod_args=])
+
+dnl Parse the device arguments; IFS is switched to a colon only for the
+dnl duration of the loop and restored afterwards.
+    SAVE_IFS=$IFS
+    IFS=':'
+    args_array=$ucx_netmod_args
+    do_am_only=false
+    echo "Parsing Arguments for UCX Netmod"
+    for arg in $args_array; do
+    case ${arg} in
+      am-only)
+              do_am_only=true
+              echo " ---> CH4::UCX Disable native tagged and RMA communication : $arg"
+              ;;
+    esac
+    done
+    IFS=$SAVE_IFS
+
+    if test "$do_am_only" = "true" ; then
+       AC_MSG_NOTICE([Disabling native UCX tagged and RMA communication])
+       PAC_APPEND_FLAG([-DMPICH_UCX_AM_ONLY], [CPPFLAGS])
+    fi
+])dnl
+
+dnl Body stage for the ch4:ucx netmod.  Runs only when the
+dnl BUILD_CH4_NETMOD_UCX conditional is true.
+AC_DEFUN([PAC_SUBCFG_BODY_]PAC_SUBCFG_AUTO_SUFFIX,[
+AM_COND_IF([BUILD_CH4_NETMOD_UCX],[
+    AC_MSG_NOTICE([RUNNING CONFIGURE FOR ch4:ucx])
+
+dnl Probe for ucp/api/ucp.h and ucp_config_read in the ucp library.  The
+dnl probe is bracketed by PAC_PUSH_FLAG and PAC_POP_FLAG on LIBS, and -lucp
+dnl is then appended to WRAPPER_LIBS.
+    PAC_SET_HEADER_LIB_PATH(ucx)
+    PAC_PUSH_FLAG(LIBS)
+    PAC_CHECK_HEADER_LIB_FATAL(ucx, ucp/api/ucp.h, ucp, ucp_config_read)
+    PAC_POP_FLAG(LIBS)
+    PAC_APPEND_FLAG([-lucp],[WRAPPER_LIBS])
+
+])dnl end AM_COND_IF(BUILD_CH4_NETMOD_UCX,...)
+])dnl end _BODY
+
+[#] end of __file__
diff --git a/src/mpid/ch4/netmod/ucx/ucx_am.h b/src/mpid/ch4/netmod/ucx/ucx_am.h
new file mode 100644
index 0000000..992729d
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_am.h
@@ -0,0 +1,615 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_AM_H_INCLUDED
+#define NETMOD_UCX_AM_H_INCLUDED
+
+#include "ucx_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_reg_hdr_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Register the callback pair for one active-message handler id.
+ *
+ * handler_id indexes both global tables directly: target_handler_fn is
+ * stored in am_handlers and origin_handler_fn in send_cmpl_handlers.
+ * No range check is performed on handler_id.  Always returns MPI_SUCCESS.
+ */
+static inline int MPIDI_NM_reg_hdr_handler(int handler_id,
+                                           MPIDI_NM_am_origin_handler_fn origin_handler_fn,
+                                           MPIDI_NM_am_target_handler_fn target_handler_fn)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_REG_HDR_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_REG_HDR_HANDLER);
+
+    /* origin-side (send completion) hook */
+    MPIDI_UCX_global.send_cmpl_handlers[handler_id] = origin_handler_fn;
+    /* target-side (message arrival) hook */
+    MPIDI_UCX_global.am_handlers[handler_id] = target_handler_fn;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_REG_HDR_HANDLER);
+    return mpi_errno;
+}
+
+
+/*
+ * Completion callback handed to ucp_tag_send_nb by the send_am paths.
+ *
+ * If the sender has already attached its MPIR_Request to the UCP request,
+ * free the packed buffer, run the origin completion handler registered for
+ * the stored handler id and clear the attachment.  Otherwise nothing has
+ * been attached yet, so mark the UCP request with TRUE; the sender checks
+ * this field after the send call returns.  status is not inspected.
+ */
+static inline void MPIDI_UCX_send_am_callback(void *request, ucs_status_t status)
+{
+    MPIDI_UCX_ucp_request_t *ucp_req = (MPIDI_UCX_ucp_request_t *) request;
+    MPIR_Request *sreq;
+    int id;
+
+    if (!ucp_req->req) {
+        /* completed before the sender attached anything */
+        ucp_req->req = (void *) TRUE;
+        return;
+    }
+
+    sreq = ucp_req->req;
+    id = sreq->dev.ch4.ch4u.netmod_am.ucx.handler_id;
+
+    MPL_free(sreq->dev.ch4.ch4u.netmod_am.ucx.pack_buffer);
+    sreq->dev.ch4.ch4u.netmod_am.ucx.pack_buffer = NULL;
+    MPIDI_UCX_global.send_cmpl_handlers[id] (sreq);
+    ucp_req->req = NULL;
+}
+
+/*
+ * Completion callback handed to ucp_tag_send_nb by the inject paths.
+ *
+ * For injects the req field carries the packed send buffer rather than an
+ * MPIR_Request.  If a buffer is attached, free it, clear the field and
+ * release the UCP request.  Otherwise mark the field with TRUE; the sender
+ * checks it after the send call returns.  status is not inspected.
+ */
+static inline void MPIDI_UCX_inject_am_callback(void *request, ucs_status_t status)
+{
+    MPIDI_UCX_ucp_request_t *ucp_req = (MPIDI_UCX_ucp_request_t *) request;
+
+    if (!ucp_req->req) {
+        /* completed before the sender attached the buffer */
+        ucp_req->req = (void *) TRUE;
+        return;
+    }
+
+    MPL_free(ucp_req->req);
+    ucp_req->req = NULL;
+    ucp_request_release(ucp_req);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_send_am_hdr
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Send a header-only active message to `rank` of `comm`.
+ *
+ * The UCX header (handler_id, data_sz = 0) followed by the caller's am_hdr
+ * is copied into one freshly allocated buffer and posted with
+ * ucp_tag_send_nb under the MPIDI_UCX_AM_TAG tag.  When the send is already
+ * complete the buffer is freed and the origin completion handler for
+ * handler_id is invoked here on sreq; otherwise the buffer and handler id
+ * are stored in sreq so that MPIDI_UCX_send_am_callback can finish the job.
+ *
+ * netmod_context is not used.  Returns mpi_errno (MPI_SUCCESS unless a
+ * macro below changes it).
+ *
+ * NOTE(review): the MPL_malloc result is not checked before use.
+ * NOTE(review): `c` is not referenced directly in this function; it may be
+ * needed by one of the macros -- confirm before removing.
+ */
+static inline int MPIDI_NM_send_am_hdr(int rank,
+                                       MPIR_Comm * comm,
+                                       int handler_id,
+                                       const void *am_hdr,
+                                       size_t am_hdr_sz, MPIR_Request * sreq, void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS, c;
+    MPIDI_UCX_ucp_request_t *ucp_request;
+    ucp_ep_h ep;
+    uint64_t ucx_tag;
+    char *send_buf;
+    MPIDI_UCX_am_header_t ucx_hdr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_SEND_AM_HDR);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_SEND_AM_HDR);
+
+    ep = MPIDI_UCX_COMM_TO_EP(comm, rank);
+    ucx_tag = MPIDI_UCX_init_tag(0, 0, MPIDI_UCX_AM_TAG);
+
+    /* initialize our portion of the hdr */
+    ucx_hdr.handler_id = handler_id;
+    ucx_hdr.data_sz = 0;
+
+    /* just pack and send for now */
+    /* wire layout: [ucx_hdr][am_hdr] */
+    send_buf = MPL_malloc(am_hdr_sz + sizeof(ucx_hdr));
+    MPIR_Memcpy(send_buf, &ucx_hdr, sizeof(ucx_hdr));
+    MPIR_Memcpy(send_buf + sizeof(ucx_hdr), am_hdr, am_hdr_sz);
+
+    ucp_request = (MPIDI_UCX_ucp_request_t *) ucp_tag_send_nb(ep, send_buf,
+                                                              am_hdr_sz + sizeof(ucx_hdr),
+                                                              ucp_dt_make_contig(1), ucx_tag,
+                                                              &MPIDI_UCX_send_am_callback);
+    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+
+    /* send is done. free all resources and complete the request */
+    if (ucp_request == NULL) {
+        MPL_free(send_buf);
+        MPIDI_UCX_global.send_cmpl_handlers[handler_id] (sreq);
+        goto fn_exit;
+    }
+
+    /* request completed between the UCP call and now. free resources
+     * and complete the send request */
+    /* (the callback stores TRUE in req when it ran before we attached sreq) */
+    if (ucp_request->req) {
+        MPL_free(send_buf);
+        MPIDI_UCX_global.send_cmpl_handlers[handler_id] (sreq);
+        ucp_request->req = NULL;
+        ucp_request_release(ucp_request);
+    }
+    else {
+        /* set the ch4r request inside the UCP request */
+        sreq->dev.ch4.ch4u.netmod_am.ucx.pack_buffer = send_buf;
+        sreq->dev.ch4.ch4u.netmod_am.ucx.handler_id = handler_id;
+        ucp_request->req = sreq;
+        ucp_request_release(ucp_request);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_SEND_AM_HDR);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_send_am
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Send an active message consisting of am_hdr plus `count` elements of
+ * `datatype` read from `data`, to `rank` of `comm`.
+ *
+ * Long-message path: when handler_id is MPIDI_CH4U_SEND and the total of
+ * UCX header + am_hdr + payload exceeds MPIDI_UCX_MAX_AM_EAGER_SZ, no
+ * payload is sent here.  The send parameters are recorded in sreq and only
+ * an MPIDI_CH4U_SEND_LONG_REQ header is injected.
+ *
+ * Eager path: the UCX header, am_hdr and the payload (copied directly for
+ * contiguous types, packed through an MPIDU_Segment otherwise) are placed
+ * in one allocated buffer and posted with ucp_tag_send_nb.  Completion is
+ * handled either inline or by MPIDI_UCX_send_am_callback.
+ *
+ * netmod_context is not used.  Returns mpi_errno.
+ *
+ * NOTE(review): the MPL_malloc results are not checked before use.
+ * NOTE(review): the long path copies am_hdr_sz bytes into lreq_hdr.hdr;
+ * this assumes am_hdr_sz <= sizeof(lreq_hdr.hdr) -- confirm with callers.
+ */
+static inline int MPIDI_NM_send_am(int rank,
+                                   MPIR_Comm * comm,
+                                   int handler_id,
+                                   const void *am_hdr,
+                                   size_t am_hdr_sz,
+                                   const void *data,
+                                   MPI_Count count,
+                                   MPI_Datatype datatype, MPIR_Request * sreq, void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS, c;
+    MPIDI_UCX_ucp_request_t *ucp_request;
+    ucp_ep_h ep;
+    uint64_t ucx_tag;
+    char *send_buf;
+    size_t data_sz;
+    MPI_Aint dt_true_lb, last;
+    MPIR_Datatype *dt_ptr;
+    int dt_contig;
+    MPIDI_UCX_am_header_t ucx_hdr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_SEND_AM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_SEND_AM);
+
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
+    /* too large for an eager AM: announce the send and return */
+    if (handler_id == MPIDI_CH4U_SEND &&
+        am_hdr_sz + sizeof(MPIDI_UCX_am_header_t) + data_sz > MPIDI_UCX_MAX_AM_EAGER_SZ) {
+        MPIDI_CH4U_send_long_req_msg_t lreq_hdr;
+
+        MPIR_Memcpy(&lreq_hdr.hdr, am_hdr, am_hdr_sz);
+        lreq_hdr.data_sz = data_sz;
+        lreq_hdr.sreq_ptr = (uint64_t) sreq;
+        MPIDI_CH4U_REQUEST(sreq, req->lreq).src_buf = data;
+        MPIDI_CH4U_REQUEST(sreq, req->lreq).count = count;
+        /* the datatype handle is kept in sreq, so take a reference */
+        dtype_add_ref_if_not_builtin(datatype);
+        MPIDI_CH4U_REQUEST(sreq, req->lreq).datatype = datatype;
+        MPIDI_CH4U_REQUEST(sreq, req->lreq).msg_tag = lreq_hdr.hdr.msg_tag;
+        MPIDI_CH4U_REQUEST(sreq, src_rank) = rank;
+        mpi_errno = MPIDI_NM_inject_am_hdr(rank, comm, MPIDI_CH4U_SEND_LONG_REQ,
+                                           &lreq_hdr, sizeof(lreq_hdr), NULL);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+        goto fn_exit;
+    }
+
+    ep = MPIDI_UCX_COMM_TO_EP(comm, rank);
+    ucx_tag = MPIDI_UCX_init_tag(0, 0, MPIDI_UCX_AM_TAG);
+
+    /* initialize our portion of the hdr */
+    ucx_hdr.handler_id = handler_id;
+    ucx_hdr.data_sz = data_sz;
+
+    /* wire layout in both branches: [ucx_hdr][am_hdr][payload] */
+    if (dt_contig) {
+        /* just pack and send for now */
+        send_buf = MPL_malloc(data_sz + am_hdr_sz + sizeof(ucx_hdr));
+        MPIR_Memcpy(send_buf, &ucx_hdr, sizeof(ucx_hdr));
+        MPIR_Memcpy(send_buf + sizeof(ucx_hdr), am_hdr, am_hdr_sz);
+        MPIR_Memcpy(send_buf + am_hdr_sz + sizeof(ucx_hdr), data + dt_true_lb, data_sz);
+    }
+    else {
+        /* non-contiguous data: pack it through a temporary segment */
+        size_t segment_first;
+        struct MPIDU_Segment *segment_ptr;
+        segment_ptr = MPIDU_Segment_alloc();
+        MPIR_ERR_CHKANDJUMP1(segment_ptr == NULL, mpi_errno,
+                             MPI_ERR_OTHER, "**nomem", "**nomem %s", "Send MPIDU_Segment_alloc");
+        MPIDU_Segment_init(data, count, datatype, segment_ptr, 0);
+        segment_first = 0;
+        last = data_sz;
+        send_buf = MPL_malloc(data_sz + am_hdr_sz + sizeof(ucx_hdr));
+
+        MPIR_Memcpy(send_buf, &ucx_hdr, sizeof(ucx_hdr));
+        MPIR_Memcpy(send_buf + sizeof(ucx_hdr), am_hdr, am_hdr_sz);
+        MPIDU_Segment_pack(segment_ptr, segment_first, &last,
+                           send_buf + am_hdr_sz + sizeof(ucx_hdr));
+        MPIDU_Segment_free(segment_ptr);
+    }
+
+    ucp_request = (MPIDI_UCX_ucp_request_t *) ucp_tag_send_nb(ep, send_buf,
+                                                              data_sz + am_hdr_sz + sizeof(ucx_hdr),
+                                                              ucp_dt_make_contig(1), ucx_tag,
+                                                              &MPIDI_UCX_send_am_callback);
+    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+    /* send is done. free all resources and complete the request */
+    if (ucp_request == NULL) {
+        MPL_free(send_buf);
+        MPIDI_UCX_global.send_cmpl_handlers[handler_id] (sreq);
+        goto fn_exit;
+    }
+
+    /* request completed between the UCP call and now. free resources
+     * and complete the send request */
+    /* (the callback stores TRUE in req when it ran before we attached sreq) */
+    if (ucp_request->req) {
+        MPL_free(send_buf);
+        MPIDI_UCX_global.send_cmpl_handlers[handler_id] (sreq);
+        ucp_request->req = NULL;
+        ucp_request_release(ucp_request);
+    }
+    else {
+        /* set the ch4r request inside the UCP request */
+        sreq->dev.ch4.ch4u.netmod_am.ucx.pack_buffer = send_buf;
+        sreq->dev.ch4.ch4u.netmod_am.ucx.handler_id = handler_id;
+        ucp_request->req = sreq;
+        ucp_request_release(ucp_request);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_SEND_AM);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_send_amv_hdr
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Vectored variant of MPIDI_NM_send_am_hdr.
+ *
+ * The iov_len entries of am_hdr are concatenated into one temporary buffer,
+ * which is passed to MPIDI_NM_send_am_hdr and freed as soon as that call
+ * returns.  Returns whatever MPIDI_NM_send_am_hdr returns.
+ */
+static inline int MPIDI_NM_send_amv_hdr(int rank,
+                                        MPIR_Comm * comm,
+                                        int handler_id,
+                                        struct iovec *am_hdr,
+                                        size_t iov_len, MPIR_Request * sreq, void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS;
+    size_t total_sz = 0, idx;
+    char *flat_hdr, *cursor;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_UCX_SEND_AMV_HDR);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_UCX_SEND_AMV_HDR);
+
+    /* first pass: size of the flattened header */
+    for (idx = 0; idx < iov_len; idx++)
+        total_sz += am_hdr[idx].iov_len;
+
+    flat_hdr = (char *) MPL_malloc(total_sz);
+    MPIR_Assert(flat_hdr);
+
+    /* second pass: copy the pieces back to back */
+    cursor = flat_hdr;
+    for (idx = 0; idx < iov_len; idx++) {
+        MPIR_Memcpy(cursor, am_hdr[idx].iov_base, am_hdr[idx].iov_len);
+        cursor += am_hdr[idx].iov_len;
+    }
+
+    mpi_errno = MPIDI_NM_send_am_hdr(rank, comm, handler_id, flat_hdr, total_sz,
+                                     sreq, netmod_context);
+
+    MPL_free(flat_hdr);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_UCX_SEND_AMV_HDR);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_send_amv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Vectored variant of MPIDI_NM_send_am.
+ *
+ * The iov_len entries of am_hdr are concatenated into one temporary buffer,
+ * which is passed together with the payload description to MPIDI_NM_send_am
+ * and freed as soon as that call returns.  Returns whatever
+ * MPIDI_NM_send_am returns.
+ */
+static inline int MPIDI_NM_send_amv(int rank,
+                                    MPIR_Comm * comm,
+                                    int handler_id,
+                                    struct iovec *am_hdr,
+                                    size_t iov_len,
+                                    const void *data,
+                                    MPI_Count count,
+                                    MPI_Datatype datatype,
+                                    MPIR_Request * sreq, void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS;
+    size_t total_sz = 0, idx;
+    char *flat_hdr, *cursor;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_UCX_SEND_AMV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_UCX_SEND_AMV);
+
+    /* first pass: size of the flattened header */
+    for (idx = 0; idx < iov_len; idx++)
+        total_sz += am_hdr[idx].iov_len;
+
+    flat_hdr = (char *) MPL_malloc(total_sz);
+    MPIR_Assert(flat_hdr);
+
+    /* second pass: copy the pieces back to back */
+    cursor = flat_hdr;
+    for (idx = 0; idx < iov_len; idx++) {
+        MPIR_Memcpy(cursor, am_hdr[idx].iov_base, am_hdr[idx].iov_len);
+        cursor += am_hdr[idx].iov_len;
+    }
+
+    mpi_errno = MPIDI_NM_send_am(rank, comm, handler_id, flat_hdr, total_sz,
+                                 data, count, datatype, sreq, netmod_context);
+
+    MPL_free(flat_hdr);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_UCX_SEND_AMV);
+    return mpi_errno;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_send_am_hdr_reply
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Reply form of MPIDI_NM_send_am_hdr: the communicator is looked up from
+ * context_id with MPIDI_CH4U_context_id_to_comm and the header is then sent
+ * to src_rank through MPIDI_NM_send_am_hdr with a NULL netmod context.
+ */
+static inline int MPIDI_NM_send_am_hdr_reply(MPIR_Context_id_t context_id,
+                                             int src_rank,
+                                             int handler_id,
+                                             const void *am_hdr,
+                                             size_t am_hdr_sz, MPIR_Request * sreq)
+{
+    MPIR_Comm *reply_comm = MPIDI_CH4U_context_id_to_comm(context_id);
+
+    return MPIDI_NM_send_am_hdr(src_rank, reply_comm, handler_id,
+                                am_hdr, am_hdr_sz, sreq, NULL);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_send_am_reply
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Reply form of MPIDI_NM_send_am: send am_hdr plus `count` elements of
+ * `datatype` from `data` to src_rank of the communicator identified by
+ * context_id.
+ *
+ * The UCX header, am_hdr and the payload are placed in one allocated buffer
+ * and posted with ucp_tag_send_nb.  Contiguous payloads are copied directly;
+ * non-contiguous payloads are packed through an MPIDU_Segment, exactly as
+ * MPIDI_NM_send_am does.  (Previously the non-contiguous case fell through
+ * with send_buf and ucp_request uninitialized.)
+ *
+ * Completion is handled inline when the send finishes immediately, or by
+ * MPIDI_UCX_send_am_callback otherwise.  Returns mpi_errno.
+ *
+ * NOTE(review): the MPL_malloc results are not checked, matching the
+ * sibling send functions in this file.
+ */
+static inline int MPIDI_NM_send_am_reply(MPIR_Context_id_t context_id,
+                                         int src_rank,
+                                         int handler_id,
+                                         const void *am_hdr,
+                                         size_t am_hdr_sz,
+                                         const void *data, MPI_Count count,
+                                         MPI_Datatype datatype, MPIR_Request * sreq)
+{
+    int mpi_errno = MPI_SUCCESS, c;
+    MPIDI_UCX_ucp_request_t *ucp_request;
+    ucp_ep_h ep;
+    uint64_t ucx_tag;
+    char *send_buf;
+    size_t data_sz;
+    MPI_Aint dt_true_lb, last;
+    MPIR_Datatype *dt_ptr;
+    int dt_contig;
+    MPIDI_UCX_am_header_t ucx_hdr;
+    MPIR_Comm *use_comm;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_SEND_AM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_SEND_AM);
+
+    use_comm = MPIDI_CH4U_context_id_to_comm(context_id);
+    ep = MPIDI_UCX_COMM_TO_EP(use_comm, src_rank);
+    ucx_tag = MPIDI_UCX_init_tag(0, 0, MPIDI_UCX_AM_TAG);
+
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
+
+    /* initialize our portion of the hdr */
+    ucx_hdr.handler_id = handler_id;
+    ucx_hdr.data_sz = data_sz;
+
+    /* wire layout in both branches: [ucx_hdr][am_hdr][payload] */
+    if (dt_contig) {
+        /* just pack and send for now */
+        send_buf = MPL_malloc(data_sz + am_hdr_sz + sizeof(ucx_hdr));
+        MPIR_Memcpy(send_buf, &ucx_hdr, sizeof(ucx_hdr));
+        MPIR_Memcpy(send_buf + sizeof(ucx_hdr), am_hdr, am_hdr_sz);
+        MPIR_Memcpy(send_buf + am_hdr_sz + sizeof(ucx_hdr),
+                    (const char *) data + dt_true_lb, data_sz);
+    }
+    else {
+        /* non-contiguous data: pack it through a temporary segment */
+        size_t segment_first;
+        struct MPIDU_Segment *segment_ptr;
+        segment_ptr = MPIDU_Segment_alloc();
+        MPIR_ERR_CHKANDJUMP1(segment_ptr == NULL, mpi_errno,
+                             MPI_ERR_OTHER, "**nomem", "**nomem %s", "Send MPIDU_Segment_alloc");
+        MPIDU_Segment_init(data, count, datatype, segment_ptr, 0);
+        segment_first = 0;
+        last = data_sz;
+        send_buf = MPL_malloc(data_sz + am_hdr_sz + sizeof(ucx_hdr));
+
+        MPIR_Memcpy(send_buf, &ucx_hdr, sizeof(ucx_hdr));
+        MPIR_Memcpy(send_buf + sizeof(ucx_hdr), am_hdr, am_hdr_sz);
+        MPIDU_Segment_pack(segment_ptr, segment_first, &last,
+                           send_buf + am_hdr_sz + sizeof(ucx_hdr));
+        MPIDU_Segment_free(segment_ptr);
+    }
+
+    /* post the send for both layouts */
+    ucp_request = (MPIDI_UCX_ucp_request_t *) ucp_tag_send_nb(ep, send_buf,
+                                                              data_sz + am_hdr_sz + sizeof(ucx_hdr),
+                                                              ucp_dt_make_contig(1), ucx_tag,
+                                                              &MPIDI_UCX_send_am_callback);
+    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+
+    /* send is done. free all resources and complete the request */
+    if (ucp_request == NULL) {
+        MPL_free(send_buf);
+        MPIDI_UCX_global.send_cmpl_handlers[handler_id] (sreq);
+        goto fn_exit;
+    }
+
+    /* request completed between the UCP call and now. free resources
+     * and complete the send request */
+    if (ucp_request->req) {
+        MPL_free(send_buf);
+        MPIDI_UCX_global.send_cmpl_handlers[handler_id] (sreq);
+        ucp_request->req = NULL;
+        ucp_request_release(ucp_request);
+    }
+    else {
+        /* set the ch4r request inside the UCP request */
+        sreq->dev.ch4.ch4u.netmod_am.ucx.pack_buffer = send_buf;
+        sreq->dev.ch4.ch4u.netmod_am.ucx.handler_id = handler_id;
+        ucp_request->req = sreq;
+        ucp_request_release(ucp_request);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_SEND_AM);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/*
+ * Vectored variant of MPIDI_NM_send_am_reply.
+ *
+ * The iov_len entries of am_hdr are concatenated into one temporary buffer,
+ * which is passed together with the payload description to
+ * MPIDI_NM_send_am_reply and freed as soon as that call returns.  Returns
+ * whatever MPIDI_NM_send_am_reply returns.
+ */
+static inline int MPIDI_NM_send_amv_reply(MPIR_Context_id_t context_id,
+                                          int src_rank,
+                                          int handler_id,
+                                          struct iovec *am_hdr,
+                                          size_t iov_len,
+                                          const void *data, MPI_Count count,
+                                          MPI_Datatype datatype, MPIR_Request * sreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    size_t total_sz = 0, idx;
+    char *flat_hdr, *cursor;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_UCX_SEND_AMV_REPLY);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_UCX_SEND_AMV_REPLY);
+
+    /* first pass: size of the flattened header */
+    for (idx = 0; idx < iov_len; idx++)
+        total_sz += am_hdr[idx].iov_len;
+
+    flat_hdr = (char *) MPL_malloc(total_sz);
+    MPIR_Assert(flat_hdr);
+
+    /* second pass: copy the pieces back to back */
+    cursor = flat_hdr;
+    for (idx = 0; idx < iov_len; idx++) {
+        MPIR_Memcpy(cursor, am_hdr[idx].iov_base, am_hdr[idx].iov_len);
+        cursor += am_hdr[idx].iov_len;
+    }
+
+    mpi_errno = MPIDI_NM_send_am_reply(context_id, src_rank, handler_id, flat_hdr, total_sz,
+                                       data, count, datatype, sreq);
+
+    MPL_free(flat_hdr);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_UCX_SEND_AMV_REPLY);
+    return mpi_errno;
+}
+
+/* Largest caller-supplied AM header accepted here: the eager limit
+ * MPIDI_UCX_MAX_AM_EAGER_SZ minus the size of the UCX header that this
+ * netmod prepends to every active message. */
+static inline size_t MPIDI_NM_am_hdr_max_sz(void)
+{
+    const size_t ucx_hdr_sz = sizeof(MPIDI_UCX_am_header_t);
+
+    return MPIDI_UCX_MAX_AM_EAGER_SZ - ucx_hdr_sz;
+}
+
+/*
+ * Send a header-only active message to `rank` of `comm` without an
+ * MPIR_Request: no origin completion handler is invoked.
+ *
+ * The UCX header (handler_id, data_sz = 0) followed by am_hdr is copied into
+ * one allocated buffer and posted with ucp_tag_send_nb.  If the send is
+ * already complete the buffer is freed here; otherwise the buffer pointer is
+ * parked in the UCP request so MPIDI_UCX_inject_am_callback can free it and
+ * release the request.
+ *
+ * netmod_context is not used.  Returns mpi_errno.
+ *
+ * NOTE(review): the MPL_malloc result is not checked before use.
+ */
+static inline int MPIDI_NM_inject_am_hdr(int rank,
+                                         MPIR_Comm * comm,
+                                         int handler_id,
+                                         const void *am_hdr, size_t am_hdr_sz, void *netmod_context)
+{
+    int mpi_errno = MPI_SUCCESS, c;
+    MPIDI_UCX_ucp_request_t *ucp_request;
+    ucp_ep_h ep;
+    uint64_t ucx_tag;
+    char *send_buf;
+    MPIDI_UCX_am_header_t ucx_hdr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_SEND_AM_HDR);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_SEND_AM_HDR);
+
+    ep = MPIDI_UCX_COMM_TO_EP(comm, rank);
+    ucx_tag = MPIDI_UCX_init_tag(0, 0, MPIDI_UCX_AM_TAG);
+
+    /* initialize our portion of the hdr */
+    ucx_hdr.handler_id = handler_id;
+    ucx_hdr.data_sz = 0;
+
+    /* just pack and send for now */
+    /* wire layout: [ucx_hdr][am_hdr] */
+    send_buf = MPL_malloc(am_hdr_sz + sizeof(ucx_hdr));
+    MPIR_Memcpy(send_buf, &ucx_hdr, sizeof(ucx_hdr));
+    MPIR_Memcpy(send_buf + sizeof(ucx_hdr), am_hdr, am_hdr_sz);
+
+    ucp_request = (MPIDI_UCX_ucp_request_t *) ucp_tag_send_nb(ep, send_buf,
+                                                              am_hdr_sz + sizeof(ucx_hdr),
+                                                              ucp_dt_make_contig(1), ucx_tag,
+                                                              &MPIDI_UCX_inject_am_callback);
+    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+
+    if (ucp_request == NULL) {
+        /* inject is done */
+        MPL_free(send_buf);
+    }
+    else if (ucp_request->req) {
+        /* the callback already ran and stored TRUE in req; clean up here */
+        MPL_free(send_buf);
+        ucp_request->req = NULL;
+        ucp_request_release(ucp_request);
+    }
+    else {
+        /* still in flight: the callback frees the buffer and releases the request */
+        ucp_request->req = send_buf;
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_SEND_AM_HDR);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/*
+ * Reply form of MPIDI_NM_inject_am_hdr: send a header-only active message
+ * to src_rank of the communicator identified by context_id, without an
+ * MPIR_Request.
+ *
+ * The UCX header (handler_id, data_sz = 0) followed by am_hdr is copied into
+ * one allocated buffer and posted with ucp_tag_send_nb.  If the send is
+ * already complete the buffer is freed here; otherwise the buffer pointer is
+ * parked in the UCP request so MPIDI_UCX_inject_am_callback can free it and
+ * release the request.  Returns mpi_errno.
+ *
+ * NOTE(review): the MPL_malloc result is not checked, matching
+ * MPIDI_NM_inject_am_hdr.
+ */
+static inline int MPIDI_NM_inject_am_hdr_reply(MPIR_Context_id_t context_id,
+                                               int src_rank,
+                                               int handler_id, const void *am_hdr, size_t am_hdr_sz)
+{
+    int mpi_errno = MPI_SUCCESS, c;
+    MPIDI_UCX_ucp_request_t *ucp_request;
+    ucp_ep_h ep;
+    uint64_t ucx_tag;
+    char *send_buf;
+    MPIDI_UCX_am_header_t ucx_hdr;
+    MPIR_Comm *use_comm;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_INJECT_AM_HDR_REPLY);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_INJECT_AM_HDR_REPLY);
+
+    use_comm = MPIDI_CH4U_context_id_to_comm(context_id);
+    ep = MPIDI_UCX_COMM_TO_EP(use_comm, src_rank);
+    ucx_tag = MPIDI_UCX_init_tag(0, 0, MPIDI_UCX_AM_TAG);
+
+    /* initialize our portion of the hdr */
+    ucx_hdr.handler_id = handler_id;
+    /* header-only message: no payload follows; the field must be set
+     * because the whole struct is copied onto the wire */
+    ucx_hdr.data_sz = 0;
+
+    /* just pack and send for now */
+    /* wire layout: [ucx_hdr][am_hdr] */
+    send_buf = MPL_malloc(am_hdr_sz + sizeof(ucx_hdr));
+    MPIR_Memcpy(send_buf, &ucx_hdr, sizeof(ucx_hdr));
+    MPIR_Memcpy(send_buf + sizeof(ucx_hdr), am_hdr, am_hdr_sz);
+    ucp_request = (MPIDI_UCX_ucp_request_t *) ucp_tag_send_nb(ep, send_buf,
+                                                              am_hdr_sz + sizeof(ucx_hdr),
+                                                              ucp_dt_make_contig(1), ucx_tag,
+                                                              &MPIDI_UCX_inject_am_callback);
+    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+
+    if (ucp_request == NULL) {
+        /* inject is done */
+        MPL_free(send_buf);
+    }
+    else if (ucp_request->req) {
+        /* the callback already ran and stored TRUE in req; clean up here */
+        MPL_free(send_buf);
+        ucp_request->req = NULL;
+        ucp_request_release(ucp_request);
+    }
+    else {
+        /* still in flight: the callback frees the buffer and releases the request */
+        ucp_request->req = send_buf;
+    }
+
+  fn_exit:
+    /* must name the same state as the DECL/ENTER pair above */
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_INJECT_AM_HDR_REPLY);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/* The inject size limit is the same value MPIDI_NM_am_hdr_max_sz reports. */
+static inline size_t MPIDI_NM_am_inject_max_sz(void)
+{
+    size_t limit = MPIDI_NM_am_hdr_max_sz();
+
+    return limit;
+}
+
+/*
+ * Acknowledge a long-message send request for receive request `req`.
+ *
+ * Builds an MPIDI_CH4U_send_long_ack_msg_t carrying the peer's send-request
+ * pointer (taken from req, asserted non-NULL) and this receive request's
+ * address, then injects it as an MPIDI_CH4U_SEND_LONG_ACK header reply to
+ * the rank recorded in req.  Returns mpi_errno.
+ */
+static inline int MPIDI_NM_am_recv(MPIR_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_CH4U_send_long_ack_msg_t ack_hdr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_UCX_AM_MATCHED);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_UCX_AM_MATCHED);
+
+    ack_hdr.rreq_ptr = (uint64_t) req;
+    ack_hdr.sreq_ptr = (MPIDI_CH4U_REQUEST(req, req->rreq.peer_req_ptr));
+    MPIR_Assert((void *) ack_hdr.sreq_ptr != NULL);
+
+    mpi_errno = MPIDI_NM_inject_am_hdr_reply(MPIDI_CH4U_get_context(MPIDI_CH4U_REQUEST(req, tag)),
+                                             MPIDI_CH4U_REQUEST(req, src_rank),
+                                             MPIDI_CH4U_SEND_LONG_ACK,
+                                             &ack_hdr, sizeof(ack_hdr));
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_UCX_AM_MATCHED);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#endif /* NETMOD_UCX_AM_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_am_recv.h b/src/mpid/ch4/netmod/ucx/ucx_am_recv.h
new file mode 100644
index 0000000..2f4f821
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_am_recv.h
@@ -0,0 +1,60 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_UCX_AM_RECV_H_INCLUDED
+#define NETMOD_UCX_AM_RECV_H_INCLUDED
+
+#include "ucx_impl.h"
+
+static inline int MPIDI_NM_recv(void *buf,
+                                int count,
+                                MPI_Datatype datatype,
+                                int rank,
+                                int tag,
+                                MPIR_Comm * comm,
+                                int context_offset, MPI_Status * status, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_recv() and returns its result. */
+    return MPIDI_CH4U_recv(buf, count, datatype, rank, tag, comm, context_offset, status, request);
+}
+
+static inline int MPIDI_NM_recv_init(void *buf,
+                                     int count,
+                                     MPI_Datatype datatype,
+                                     int rank,
+                                     int tag,
+                                     MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_recv_init() and returns its result. */
+    return MPIDI_CH4U_recv_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_imrecv(void *buf,
+                                  int count,
+                                  MPI_Datatype datatype,
+                                  MPIR_Request * message, MPIR_Request ** rreqp)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_imrecv() and returns its result. */
+    return MPIDI_CH4U_imrecv(buf, count, datatype, message, rreqp);
+}
+
+static inline int MPIDI_NM_irecv(void *buf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 int rank,
+                                 int tag,
+                                 MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_irecv() and returns its result. */
+    return MPIDI_CH4U_irecv(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_cancel_recv(MPIR_Request * rreq)
+{   /* Pass-through: hands rreq to MPIDI_CH4U_cancel_recv() and returns its result. */
+    return MPIDI_CH4U_cancel_recv(rreq);
+}
+
+#endif /* NETMOD_UCX_AM_RECV_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_am_rma.h b/src/mpid/ch4/netmod/ucx/ucx_am_rma.h
new file mode 100644
index 0000000..7863464
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_am_rma.h
@@ -0,0 +1,149 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#ifndef NETMOD_UCX_AM_RMA_H_INCLUDED
+#define NETMOD_UCX_AM_RMA_H_INCLUDED
+
+#include "ucx_impl.h"
+
+static inline int MPIDI_NM_put(const void *origin_addr,
+                               int origin_count,
+                               MPI_Datatype origin_datatype,
+                               int target_rank,
+                               MPI_Aint target_disp,
+                               int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_put() and returns its result. */
+    return MPIDI_CH4U_put(origin_addr, origin_count, origin_datatype,
+                          target_rank, target_disp, target_count, target_datatype, win);
+}
+
+static inline int MPIDI_NM_get(void *origin_addr,
+                               int origin_count,
+                               MPI_Datatype origin_datatype,
+                               int target_rank,
+                               MPI_Aint target_disp,
+                               int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_get() and returns its result. */
+    return MPIDI_CH4U_get(origin_addr, origin_count, origin_datatype,
+                          target_rank, target_disp, target_count, target_datatype, win);
+}
+
+static inline int MPIDI_NM_rput(const void *origin_addr,
+                                int origin_count,
+                                MPI_Datatype origin_datatype,
+                                int target_rank,
+                                MPI_Aint target_disp,
+                                int target_count,
+                                MPI_Datatype target_datatype,
+                                MPIR_Win * win, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_rput() and returns its result. */
+    return MPIDI_CH4U_rput(origin_addr, origin_count, origin_datatype,
+                           target_rank, target_disp, target_count, target_datatype, win, request);
+}
+
+
+static inline int MPIDI_NM_compare_and_swap(const void *origin_addr,
+                                            const void *compare_addr,
+                                            void *result_addr,
+                                            MPI_Datatype datatype,
+                                            int target_rank, MPI_Aint target_disp, MPIR_Win * win)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_compare_and_swap(). */
+    return MPIDI_CH4U_compare_and_swap(origin_addr, compare_addr, result_addr,
+                                       datatype, target_rank, target_disp, win);
+}
+
+static inline int MPIDI_NM_raccumulate(const void *origin_addr,
+                                       int origin_count,
+                                       MPI_Datatype origin_datatype,
+                                       int target_rank,
+                                       MPI_Aint target_disp,
+                                       int target_count,
+                                       MPI_Datatype target_datatype,
+                                       MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_raccumulate(). */
+    return MPIDI_CH4U_raccumulate(origin_addr, origin_count, origin_datatype,
+                                  target_rank, target_disp, target_count,
+                                  target_datatype, op, win, request);
+}
+
+static inline int MPIDI_NM_rget_accumulate(const void *origin_addr,
+                                           int origin_count,
+                                           MPI_Datatype origin_datatype,
+                                           void *result_addr,
+                                           int result_count,
+                                           MPI_Datatype result_datatype,
+                                           int target_rank,
+                                           MPI_Aint target_disp,
+                                           int target_count,
+                                           MPI_Datatype target_datatype,
+                                           MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_rget_accumulate(). */
+    return MPIDI_CH4U_rget_accumulate(origin_addr, origin_count, origin_datatype,
+                                      result_addr, result_count, result_datatype,
+                                      target_rank, target_disp, target_count,
+                                      target_datatype, op, win, request);
+}
+
+static inline int MPIDI_NM_fetch_and_op(const void *origin_addr,
+                                        void *result_addr,
+                                        MPI_Datatype datatype,
+                                        int target_rank,
+                                        MPI_Aint target_disp, MPI_Op op, MPIR_Win * win)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_fetch_and_op(). */
+    return MPIDI_CH4U_fetch_and_op(origin_addr, result_addr, datatype,
+                                   target_rank, target_disp, op, win);
+}
+
+
+static inline int MPIDI_NM_rget(void *origin_addr,
+                                int origin_count,
+                                MPI_Datatype origin_datatype,
+                                int target_rank,
+                                MPI_Aint target_disp,
+                                int target_count,
+                                MPI_Datatype target_datatype,
+                                MPIR_Win * win, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_rget() and returns its result. */
+    return MPIDI_CH4U_rget(origin_addr, origin_count, origin_datatype,
+                           target_rank, target_disp, target_count, target_datatype, win, request);
+}
+
+
+static inline int MPIDI_NM_get_accumulate(const void *origin_addr,
+                                          int origin_count,
+                                          MPI_Datatype origin_datatype,
+                                          void *result_addr,
+                                          int result_count,
+                                          MPI_Datatype result_datatype,
+                                          int target_rank,
+                                          MPI_Aint target_disp,
+                                          int target_count,
+                                          MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_get_accumulate(). */
+    return MPIDI_CH4U_get_accumulate(origin_addr, origin_count, origin_datatype,
+                                     result_addr, result_count, result_datatype,
+                                     target_rank, target_disp, target_count,
+                                     target_datatype, op, win);
+}
+
+static inline int MPIDI_NM_accumulate(const void *origin_addr,
+                                      int origin_count,
+                                      MPI_Datatype origin_datatype,
+                                      int target_rank,
+                                      MPI_Aint target_disp,
+                                      int target_count,
+                                      MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_accumulate(). */
+    return MPIDI_CH4U_accumulate(origin_addr, origin_count, origin_datatype,
+                                 target_rank, target_disp, target_count, target_datatype, op, win);
+}
+
+#endif /* NETMOD_UCX_AM_RMA_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_am_send.h b/src/mpid/ch4/netmod/ucx/ucx_am_send.h
new file mode 100644
index 0000000..9fd977b
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_am_send.h
@@ -0,0 +1,128 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_UCX_AM_SEND_H_INCLUDED
+#define NETMOD_UCX_AM_SEND_H_INCLUDED
+
+#include "ucx_impl.h"
+
+static inline int MPIDI_NM_send(const void *buf,
+                                int count,
+                                MPI_Datatype datatype,
+                                int rank,
+                                int tag,
+                                MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_send() and returns its result. */
+    return MPIDI_CH4U_send(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_rsend(const void *buf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 int rank,
+                                 int tag,
+                                 MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_rsend() and returns its result. */
+    return MPIDI_CH4U_rsend(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+
+
+static inline int MPIDI_NM_irsend(const void *buf,
+                                  int count,
+                                  MPI_Datatype datatype,
+                                  int rank,
+                                  int tag,
+                                  MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_irsend() and returns its result. */
+    return MPIDI_CH4U_irsend(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_ssend(const void *buf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 int rank,
+                                 int tag,
+                                 MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_ssend() and returns its result. */
+    return MPIDI_CH4U_ssend(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_startall(int count, MPIR_Request * requests[])
+{   /* Pass-through: forwards count and requests to MPIDI_CH4U_startall(). */
+    return MPIDI_CH4U_startall(count, requests);
+}
+
+static inline int MPIDI_NM_send_init(const void *buf,
+                                     int count,
+                                     MPI_Datatype datatype,
+                                     int rank,
+                                     int tag,
+                                     MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_send_init() and returns its result. */
+    return MPIDI_CH4U_send_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_ssend_init(const void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      int rank,
+                                      int tag,
+                                      MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_ssend_init(). */
+    return MPIDI_CH4U_ssend_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_bsend_init(const void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      int rank,
+                                      int tag,
+                                      MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_bsend_init(). */
+    return MPIDI_CH4U_bsend_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_rsend_init(const void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      int rank,
+                                      int tag,
+                                      MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_rsend_init(). */
+    return MPIDI_CH4U_rsend_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_isend(const void *buf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 int rank,
+                                 int tag,
+                                 MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_isend() and returns its result. */
+    return MPIDI_CH4U_isend(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_issend(const void *buf,
+                                  int count,
+                                  MPI_Datatype datatype,
+                                  int rank,
+                                  int tag,
+                                  MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{   /* Pass-through: forwards every argument to MPIDI_CH4U_issend() and returns its result. */
+    return MPIDI_CH4U_issend(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+static inline int MPIDI_NM_cancel_send(MPIR_Request * sreq)
+{   /* Pass-through: hands sreq to MPIDI_CH4U_cancel_send() and returns its result. */
+    return MPIDI_CH4U_cancel_send(sreq);
+}
+
+#endif /* NETMOD_UCX_AM_SEND_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_am_win.h b/src/mpid/ch4/netmod/ucx/ucx_am_win.h
new file mode 100644
index 0000000..2bf5da9
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_am_win.h
@@ -0,0 +1,160 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef NETMOD_UCX_AM_WIN_H_INCLUDED
+#define NETMOD_UCX_AM_WIN_H_INCLUDED
+
+#include "ucx_impl.h"
+
+static inline int MPIDI_NM_win_set_info(MPIR_Win * win, MPIR_Info * info)
+{   /* Pass-through: forwards both arguments to MPIDI_CH4R_win_set_info(). */
+    return MPIDI_CH4R_win_set_info(win, info);
+}
+
+
+static inline int MPIDI_NM_win_start(MPIR_Group * group, int assert, MPIR_Win * win)
+{   /* Pass-through: forwards every argument to MPIDI_CH4R_win_start(). */
+    return MPIDI_CH4R_win_start(group, assert, win);
+}
+
+
+static inline int MPIDI_NM_win_complete(MPIR_Win * win)
+{   /* Pass-through: hands win to MPIDI_CH4R_win_complete() and returns its result. */
+    return MPIDI_CH4R_win_complete(win);
+}
+
+static inline int MPIDI_NM_win_post(MPIR_Group * group, int assert, MPIR_Win * win)
+{   /* Pass-through: forwards every argument to MPIDI_CH4R_win_post(). */
+    return MPIDI_CH4R_win_post(group, assert, win);
+}
+
+
+static inline int MPIDI_NM_win_wait(MPIR_Win * win)
+{   /* Pass-through: hands win to MPIDI_CH4R_win_wait() and returns its result. */
+    return MPIDI_CH4R_win_wait(win);
+}
+
+
+static inline int MPIDI_NM_win_test(MPIR_Win * win, int *flag)
+{   /* Pass-through: forwards both arguments to MPIDI_CH4R_win_test(). */
+    return MPIDI_CH4R_win_test(win, flag);
+}
+
+static inline int MPIDI_NM_win_lock(int lock_type, int rank, int assert, MPIR_Win * win)
+{   /* Pass-through: forwards every argument to MPIDI_CH4R_win_lock(). */
+    return MPIDI_CH4R_win_lock(lock_type, rank, assert, win);
+}
+
+
+static inline int MPIDI_NM_win_unlock(int rank, MPIR_Win * win)
+{   /* Pass-through: forwards both arguments to MPIDI_CH4R_win_unlock(). */
+    return MPIDI_CH4R_win_unlock(rank, win);
+}
+
+static inline int MPIDI_NM_win_get_info(MPIR_Win * win, MPIR_Info ** info_p_p)
+{   /* Pass-through: forwards both arguments to MPIDI_CH4R_win_get_info(). */
+    return MPIDI_CH4R_win_get_info(win, info_p_p);
+}
+
+
+static inline int MPIDI_NM_win_free(MPIR_Win ** win_ptr)
+{   /* Pass-through: hands win_ptr to MPIDI_CH4R_win_free() and returns its result. */
+    return MPIDI_CH4R_win_free(win_ptr);
+}
+
+static inline int MPIDI_NM_win_fence(int assert, MPIR_Win * win)
+{   /* Pass-through: forwards both arguments to MPIDI_CH4R_win_fence(). */
+    return MPIDI_CH4R_win_fence(assert, win);
+}
+
+static inline int MPIDI_NM_win_create(void *base,
+                                      MPI_Aint length,
+                                      int disp_unit,
+                                      MPIR_Info * info, MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
+{   /* Pass-through: forwards every argument to MPIDI_CH4R_win_create(). */
+    return MPIDI_CH4R_win_create(base, length, disp_unit, info, comm_ptr, win_ptr);
+}
+
+static inline int MPIDI_NM_win_attach(MPIR_Win * win, void *base, MPI_Aint size)
+{   /* Pass-through: forwards every argument to MPIDI_CH4R_win_attach(). */
+    return MPIDI_CH4R_win_attach(win, base, size);
+}
+
+static inline int MPIDI_NM_win_allocate_shared(MPI_Aint size,
+                                               int disp_unit,
+                                               MPIR_Info * info_ptr,
+                                               MPIR_Comm * comm_ptr,
+                                               void **base_ptr, MPIR_Win ** win_ptr)
+{   /* Pass-through: forwards every argument to MPIDI_CH4R_win_allocate_shared(). */
+    return MPIDI_CH4R_win_allocate_shared(size, disp_unit, info_ptr, comm_ptr, base_ptr, win_ptr);
+}
+
+static inline int MPIDI_NM_win_detach(MPIR_Win * win, const void *base)
+{   /* Pass-through: forwards both arguments to MPIDI_CH4R_win_detach(). */
+    return MPIDI_CH4R_win_detach(win, base);
+}
+
+static inline int MPIDI_NM_win_shared_query(MPIR_Win * win,
+                                            int rank,
+                                            MPI_Aint * size, int *disp_unit, void *baseptr)
+{   /* Pass-through: forwards every argument to MPIDI_CH4R_win_shared_query(). */
+    return MPIDI_CH4R_win_shared_query(win, rank, size, disp_unit, baseptr);
+}
+
+static inline int MPIDI_NM_win_allocate(MPI_Aint size,
+                                        int disp_unit,
+                                        MPIR_Info * info,
+                                        MPIR_Comm * comm, void *baseptr, MPIR_Win ** win)
+{   /* Pass-through: forwards every argument to MPIDI_CH4R_win_allocate(). */
+    return MPIDI_CH4R_win_allocate(size, disp_unit, info, comm, baseptr, win);
+}
+
+static inline int MPIDI_NM_win_flush(int rank, MPIR_Win * win)
+{   /* Pass-through: forwards both arguments to MPIDI_CH4R_win_flush(). */
+    return MPIDI_CH4R_win_flush(rank, win);
+}
+
+static inline int MPIDI_NM_win_flush_local_all(MPIR_Win * win)
+{   /* Pass-through: hands win to MPIDI_CH4R_win_flush_local_all() and returns its result. */
+    return MPIDI_CH4R_win_flush_local_all(win);
+}
+
+static inline int MPIDI_NM_win_unlock_all(MPIR_Win * win)
+{   /* Pass-through: hands win to MPIDI_CH4R_win_unlock_all() and returns its result. */
+    return MPIDI_CH4R_win_unlock_all(win);
+}
+
+static inline int MPIDI_NM_win_create_dynamic(MPIR_Info * info, MPIR_Comm * comm, MPIR_Win ** win)
+{   /* Pass-through: forwards every argument to MPIDI_CH4R_win_create_dynamic(). */
+    return MPIDI_CH4R_win_create_dynamic(info, comm, win);
+}
+
+static inline int MPIDI_NM_win_flush_local(int rank, MPIR_Win * win)
+{   /* Pass-through: forwards both arguments to MPIDI_CH4R_win_flush_local(). */
+    return MPIDI_CH4R_win_flush_local(rank, win);
+}
+
+static inline int MPIDI_NM_win_sync(MPIR_Win * win)
+{   /* Pass-through: hands win to MPIDI_CH4R_win_sync() and returns its result. */
+    return MPIDI_CH4R_win_sync(win);
+}
+
+static inline int MPIDI_NM_win_flush_all(MPIR_Win * win)
+{   /* Pass-through: hands win to MPIDI_CH4R_win_flush_all() and returns its result. */
+    return MPIDI_CH4R_win_flush_all(win);
+}
+
+static inline int MPIDI_NM_win_lock_all(int assert, MPIR_Win * win)
+{   /* Pass-through: forwards both arguments to MPIDI_CH4R_win_lock_all(). */
+    return MPIDI_CH4R_win_lock_all(assert, win);
+}
+
+
+#endif /* NETMOD_UCX_AM_WIN_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_coll.h b/src/mpid/ch4/netmod/ucx/ucx_coll.h
new file mode 100644
index 0000000..185c5b9
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_coll.h
@@ -0,0 +1,867 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_COLL_H_INCLUDED
+#define NETMOD_UCX_COLL_H_INCLUDED
+
+#include "ucx_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_barrier
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_barrier(MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_BARRIER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_BARRIER);
+    /* No UCX-specific path here: call MPIR_Barrier with the caller's arguments, return its code. */
+    mpi_errno = MPIR_Barrier(comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_BARRIER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_bcast
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_bcast(void *buffer, int count, MPI_Datatype datatype,
+                                 int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_BCAST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_BCAST);
+    /* No UCX-specific path here: call MPIR_Bcast with the caller's arguments, return its code. */
+    mpi_errno = MPIR_Bcast(buffer, count, datatype, root, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_BCAST);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_allreduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_allreduce(const void *sendbuf, void *recvbuf, int count,
+                                     MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                     MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLREDUCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLREDUCE);
+    /* No UCX-specific path here: call MPIR_Allreduce with the caller's arguments, return its code. */
+    mpi_errno = MPIR_Allreduce(sendbuf, recvbuf, count, datatype, op, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLREDUCE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLGATHER);
+    /* No UCX-specific path here: call MPIR_Allgather with the caller's arguments, return its code. */
+    mpi_errno = MPIR_Allgather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
+                               comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLGATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                      void *recvbuf, const int *recvcounts, const int *displs,
+                                      MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                      MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLGATHERV);
+    /* No UCX-specific path here: call MPIR_Allgatherv with the caller's arguments, return its code. */
+    mpi_errno = MPIR_Allgatherv(sendbuf, sendcount, sendtype,
+                                recvbuf, recvcounts, displs, recvtype, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLGATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_gather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                  void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                  int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_GATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_GATHER);
+    /* No UCX-specific path here: call MPIR_Gather with the caller's arguments, return its code. */
+    mpi_errno = MPIR_Gather(sendbuf, sendcount, sendtype, recvbuf, recvcount,
+                            recvtype, root, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_GATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_gatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, const int *recvcounts, const int *displs,
+                                   MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr,
+                                   MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_GATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_GATHERV);
+    /* No UCX-specific path here: call MPIR_Gatherv with the caller's arguments, return its code. */
+    mpi_errno = MPIR_Gatherv(sendbuf, sendcount, sendtype,
+                             recvbuf, recvcounts, displs, recvtype, root, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_GATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                   int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_SCATTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_SCATTER);
+    /* No UCX-specific path here: call MPIR_Scatter with the caller's arguments, return its code. */
+    mpi_errno = MPIR_Scatter(sendbuf, sendcount, sendtype,
+                             recvbuf, recvcount, recvtype, root, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_SCATTER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_scatterv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_scatterv(const void *sendbuf, const int *sendcounts,
+                                    const int *displs, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                    int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_SCATTERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_SCATTERV);
+    /* No UCX-specific path here: call MPIR_Scatterv with the caller's arguments, return its code. */
+    mpi_errno = MPIR_Scatterv(sendbuf, sendcounts, displs,
+                              sendtype, recvbuf, recvcount, recvtype, root, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_SCATTERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                    MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLTOALL);
+    /* No UCX-specific path here: call MPIR_Alltoall with the caller's arguments, return its code. */
+    mpi_errno = MPIR_Alltoall(sendbuf, sendcount, sendtype, recvbuf, recvcount,
+                              recvtype, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLTOALL);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_alltoallv(const void *sendbuf, const int *sendcounts,
+                                     const int *sdispls, MPI_Datatype sendtype,
+                                     void *recvbuf, const int *recvcounts,
+                                     const int *rdispls, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLTOALLV);
+    /* No UCX-specific path here: call MPIR_Alltoallv with the caller's arguments, return its code. */
+    mpi_errno = MPIR_Alltoallv(sendbuf, sendcounts, sdispls,
+                               sendtype, recvbuf, recvcounts, rdispls, recvtype, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLTOALLV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_alltoallw(const void *sendbuf, const int sendcounts[],
+                                     const int sdispls[], const MPI_Datatype sendtypes[],
+                                     void *recvbuf, const int recvcounts[],
+                                     const int rdispls[], const MPI_Datatype recvtypes[],
+                                     MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ALLTOALLW);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ALLTOALLW);
+    /* No UCX-specific path here: call MPIR_Alltoallw with the caller's arguments, return its code. */
+    mpi_errno = MPIR_Alltoallw(sendbuf, sendcounts, sdispls,
+                               sendtypes, recvbuf, recvcounts,
+                               rdispls, recvtypes, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ALLTOALLW);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_reduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_reduce(const void *sendbuf, void *recvbuf, int count,
+                                  MPI_Datatype datatype, MPI_Op op, int root,
+                                  MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_REDUCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_REDUCE);
+    /* No UCX-specific path here: call MPIR_Reduce with the caller's arguments, return its code. */
+    mpi_errno = MPIR_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_REDUCE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_reduce_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_reduce_scatter(const void *sendbuf, void *recvbuf,
+                                          const int recvcounts[], MPI_Datatype datatype,
+                                          MPI_Op op, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_REDUCE_SCATTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_REDUCE_SCATTER);
+    /* No UCX-specific path here: call MPIR_Reduce_scatter with the caller's arguments. */
+    mpi_errno = MPIR_Reduce_scatter(sendbuf, recvbuf, recvcounts, datatype, op, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_REDUCE_SCATTER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_reduce_scatter_block
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_reduce_scatter_block(const void *sendbuf, void *recvbuf,
+                                                int recvcount, MPI_Datatype datatype,
+                                                MPI_Op op, MPIR_Comm * comm_ptr,
+                                                MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_REDUCE_SCATTER_BLOCK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_REDUCE_SCATTER_BLOCK);
+    /* No UCX-specific path here: call MPIR_Reduce_scatter_block with the caller's arguments. */
+    mpi_errno = MPIR_Reduce_scatter_block(sendbuf, recvbuf, recvcount,
+                                          datatype, op, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_REDUCE_SCATTER_BLOCK);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_scan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_scan(const void *sendbuf, void *recvbuf, int count,
+                                MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_SCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_SCAN);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Scan. */
+    mpi_errno = MPIR_Scan(sendbuf, recvbuf, count, datatype, op, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_SCAN);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_exscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_exscan(const void *sendbuf, void *recvbuf, int count,
+                                  MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                  MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_EXSCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_EXSCAN);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Exscan. */
+    mpi_errno = MPIR_Exscan(sendbuf, recvbuf, count, datatype, op, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_EXSCAN);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_neighbor_allgather(const void *sendbuf, int sendcount,
+                                              MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                              MPI_Datatype recvtype, MPIR_Comm * comm_ptr)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLGATHER);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Neighbor_allgather_impl. */
+    mpi_errno =
+        MPIR_Neighbor_allgather_impl(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
+                                     comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLGATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_neighbor_allgatherv(const void *sendbuf, int sendcount,
+                                               MPI_Datatype sendtype, void *recvbuf,
+                                               const int recvcounts[], const int displs[],
+                                               MPI_Datatype recvtype, MPIR_Comm * comm_ptr)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLGATHERV);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Neighbor_allgatherv_impl. */
+    mpi_errno = MPIR_Neighbor_allgatherv_impl(sendbuf, sendcount, sendtype,
+                                              recvbuf, recvcounts, displs, recvtype, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLGATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_neighbor_alltoall(const void *sendbuf, int sendcount,
+                                             MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                             MPI_Datatype recvtype, MPIR_Comm * comm_ptr)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLTOALL);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Neighbor_alltoall_impl. */
+    mpi_errno = MPIR_Neighbor_alltoall_impl(sendbuf, sendcount, sendtype,
+                                            recvbuf, recvcount, recvtype, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLTOALL);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_neighbor_alltoallv(const void *sendbuf, const int sendcounts[],
+                                              const int sdispls[], MPI_Datatype sendtype,
+                                              void *recvbuf, const int recvcounts[],
+                                              const int rdispls[], MPI_Datatype recvtype,
+                                              MPIR_Comm * comm_ptr)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLTOALLV);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Neighbor_alltoallv_impl. */
+    mpi_errno = MPIR_Neighbor_alltoallv_impl(sendbuf, sendcounts, sdispls, sendtype,
+                                             recvbuf, recvcounts, rdispls, recvtype, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLTOALLV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_neighbor_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_neighbor_alltoallw(const void *sendbuf, const int sendcounts[],
+                                              const MPI_Aint sdispls[],
+                                              const MPI_Datatype sendtypes[], void *recvbuf,
+                                              const int recvcounts[], const MPI_Aint rdispls[],
+                                              const MPI_Datatype recvtypes[], MPIR_Comm * comm_ptr)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_NEIGHBOR_ALLTOALLW);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_NEIGHBOR_ALLTOALLW);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Neighbor_alltoallw_impl. */
+    mpi_errno = MPIR_Neighbor_alltoallw_impl(sendbuf, sendcounts, sdispls, sendtypes,
+                                             recvbuf, recvcounts, rdispls, recvtypes, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_NEIGHBOR_ALLTOALLW);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ineighbor_allgather(const void *sendbuf, int sendcount,
+                                               MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                               MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                               MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLGATHER);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Ineighbor_allgather_impl. */
+    mpi_errno = MPIR_Ineighbor_allgather_impl(sendbuf, sendcount, sendtype,
+                                              recvbuf, recvcount, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLGATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ineighbor_allgatherv(const void *sendbuf, int sendcount,
+                                                MPI_Datatype sendtype, void *recvbuf,
+                                                const int recvcounts[], const int displs[],
+                                                MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                                MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLGATHERV);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Ineighbor_allgatherv_impl. */
+    mpi_errno = MPIR_Ineighbor_allgatherv_impl(sendbuf, sendcount, sendtype,
+                                               recvbuf, recvcounts, displs, recvtype,
+                                               comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLGATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ineighbor_alltoall(const void *sendbuf, int sendcount,
+                                              MPI_Datatype sendtype, void *recvbuf,
+                                              int recvcount, MPI_Datatype recvtype,
+                                              MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLTOALL);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Ineighbor_alltoall_impl. */
+    mpi_errno = MPIR_Ineighbor_alltoall_impl(sendbuf, sendcount, sendtype,
+                                             recvbuf, recvcount, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLTOALL);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ineighbor_alltoallv(const void *sendbuf, const int sendcounts[],
+                                               const int sdispls[], MPI_Datatype sendtype,
+                                               void *recvbuf, const int recvcounts[],
+                                               const int rdispls[], MPI_Datatype recvtype,
+                                               MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLTOALLV);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Ineighbor_alltoallv_impl. */
+    mpi_errno = MPIR_Ineighbor_alltoallv_impl(sendbuf, sendcounts, sdispls, sendtype,
+                                              recvbuf, recvcounts, rdispls, recvtype,
+                                              comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLTOALLV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ineighbor_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ineighbor_alltoallw(const void *sendbuf, const int sendcounts[],
+                                               const MPI_Aint sdispls[],
+                                               const MPI_Datatype sendtypes[], void *recvbuf,
+                                               const int recvcounts[], const MPI_Aint rdispls[],
+                                               const MPI_Datatype recvtypes[], MPIR_Comm * comm_ptr,
+                                               MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_INEIGHBOR_ALLTOALLW);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_INEIGHBOR_ALLTOALLW);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Ineighbor_alltoallw_impl. */
+    mpi_errno = MPIR_Ineighbor_alltoallw_impl(sendbuf, sendcounts, sdispls, sendtypes,
+                                              recvbuf, recvcounts, rdispls, recvtypes,
+                                              comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_INEIGHBOR_ALLTOALLW);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ibarrier
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ibarrier(MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IBARRIER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IBARRIER);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Ibarrier_impl. */
+    mpi_errno = MPIR_Ibarrier_impl(comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IBARRIER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ibcast
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ibcast(void *buffer, int count, MPI_Datatype datatype,
+                                  int root, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IBCAST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IBCAST);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Ibcast_impl. */
+    mpi_errno = MPIR_Ibcast_impl(buffer, count, datatype, root, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IBCAST);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iallgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                      void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLGATHER);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Iallgather_impl. */
+    mpi_errno = MPIR_Iallgather_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                     recvcount, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLGATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iallgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_iallgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                       void *recvbuf, const int *recvcounts, const int *displs,
+                                       MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                       MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLGATHERV);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Iallgatherv_impl. */
+    mpi_errno = MPIR_Iallgatherv_impl(sendbuf, sendcount, sendtype,
+                                      recvbuf, recvcounts, displs, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLGATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iallreduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_iallreduce(const void *sendbuf, void *recvbuf, int count,
+                                      MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                      MPI_Request * request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLREDUCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLREDUCE);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Iallreduce_impl. */
+    mpi_errno = MPIR_Iallreduce_impl(sendbuf, recvbuf, count, datatype, op, comm, request);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLREDUCE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ialltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ialltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLTOALL);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Ialltoall_impl. */
+    mpi_errno = MPIR_Ialltoall_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                    recvcount, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLTOALL);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ialltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ialltoallv(const void *sendbuf, const int *sendcounts,
+                                      const int *sdispls, MPI_Datatype sendtype,
+                                      void *recvbuf, const int *recvcounts,
+                                      const int *rdispls, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLTOALLV);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Ialltoallv_impl. */
+    mpi_errno = MPIR_Ialltoallv_impl(sendbuf, sendcounts, sdispls,
+                                     sendtype, recvbuf, recvcounts,
+                                     rdispls, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLTOALLV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ialltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ialltoallw(const void *sendbuf, const int *sendcounts,
+                                      const int *sdispls, const MPI_Datatype sendtypes[],
+                                      void *recvbuf, const int *recvcounts,
+                                      const int *rdispls, const MPI_Datatype recvtypes[],
+                                      MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IALLTOALLW);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IALLTOALLW);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Ialltoallw_impl. */
+    mpi_errno = MPIR_Ialltoallw_impl(sendbuf, sendcounts, sdispls,
+                                     sendtypes, recvbuf, recvcounts,
+                                     rdispls, recvtypes, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IALLTOALLW);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iexscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_iexscan(const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                   MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IEXSCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IEXSCAN);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Iexscan_impl. */
+    mpi_errno = MPIR_Iexscan_impl(sendbuf, recvbuf, count, datatype, op, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IEXSCAN);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_igather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                   int root, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IGATHER);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Igather_impl. */
+    mpi_errno = MPIR_Igather_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                  recvcount, recvtype, root, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IGATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_igatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, const int *recvcounts, const int *displs,
+                                    MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr,
+                                    MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IGATHERV);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Igatherv_impl. */
+    mpi_errno = MPIR_Igatherv_impl(sendbuf, sendcount, sendtype,
+                                   recvbuf, recvcounts, displs, recvtype, root, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IGATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ireduce_scatter_block
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ireduce_scatter_block(const void *sendbuf, void *recvbuf,
+                                                 int recvcount, MPI_Datatype datatype,
+                                                 MPI_Op op, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IREDUCE_SCATTER_BLOCK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IREDUCE_SCATTER_BLOCK);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Ireduce_scatter_block_impl. */
+    mpi_errno = MPIR_Ireduce_scatter_block_impl(sendbuf, recvbuf, recvcount,
+                                                datatype, op, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IREDUCE_SCATTER_BLOCK);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ireduce_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ireduce_scatter(const void *sendbuf, void *recvbuf,
+                                           const int recvcounts[], MPI_Datatype datatype,
+                                           MPI_Op op, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IREDUCE_SCATTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IREDUCE_SCATTER);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Ireduce_scatter_impl. */
+    mpi_errno = MPIR_Ireduce_scatter_impl(sendbuf, recvbuf, recvcounts, datatype, op,
+                                          comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IREDUCE_SCATTER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_ireduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ireduce(const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, int root,
+                                   MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_IREDUCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_IREDUCE);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Ireduce_impl. */
+    mpi_errno = MPIR_Ireduce_impl(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_IREDUCE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_iscan(const void *sendbuf, void *recvbuf, int count,
+                                 MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                 MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ISCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ISCAN);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Iscan_impl. */
+    mpi_errno = MPIR_Iscan_impl(sendbuf, recvbuf, count, datatype, op, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ISCAN);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iscatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_iscatter(const void *sendbuf, int sendcount,
+                                    MPI_Datatype sendtype, void *recvbuf,
+                                    int recvcount, MPI_Datatype recvtype,
+                                    int root, MPIR_Comm * comm, MPI_Request * request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ISCATTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ISCATTER);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Iscatter_impl. */
+    mpi_errno = MPIR_Iscatter_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                   recvcount, recvtype, root, comm, request);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ISCATTER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_iscatterv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_iscatterv(const void *sendbuf, const int *sendcounts,
+                                     const int *displs, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount,
+                                     MPI_Datatype recvtype, int root,
+                                     MPIR_Comm * comm, MPI_Request * request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_ISCATTERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_ISCATTERV);
+    /* Netmod pass-through: every argument goes unchanged to MPIR_Iscatterv_impl. */
+    mpi_errno = MPIR_Iscatterv_impl(sendbuf, sendcounts, displs, sendtype,
+                                    recvbuf, recvcount, recvtype, root, comm, request);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_ISCATTERV);
+    return mpi_errno;
+}
+
+#endif /* NETMOD_UCX_COLL_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_comm.h b/src/mpid/ch4/netmod/ucx/ucx_comm.h
new file mode 100644
index 0000000..b152c29
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_comm.h
@@ -0,0 +1,48 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_COMM_H_INCLUDED
+#define NETMOD_UCX_COMM_H_INCLUDED
+
+#include "ucx_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_comm_create
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_comm_create(MPIR_Comm * comm)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_UCX_COMM_CREATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_UCX_COMM_CREATE);
+    /* The only work done for a new communicator is MPIDI_CH4U_init_comm; its code is returned. */
+    mpi_errno = MPIDI_CH4U_init_comm(comm);
+    /* fn_exit has no goto in this function; the call above simply falls through to it. */
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_UCX_COMM_CREATE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_comm_destroy
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_comm_destroy(MPIR_Comm * comm)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_UCX_COMM_DESTROY);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_UCX_COMM_DESTROY);
+    /* The only teardown done for a communicator is MPIDI_CH4U_destroy_comm; its code is returned. */
+    mpi_errno = MPIDI_CH4U_destroy_comm(comm);
+    /* fn_exit has no goto in this function; the call above simply falls through to it. */
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_UCX_COMM_DESTROY);
+    return mpi_errno;
+}
+
+#endif /* NETMOD_UCX_COMM_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_datatype.h b/src/mpid/ch4/netmod/ucx/ucx_datatype.h
new file mode 100644
index 0000000..32f1df1
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_datatype.h
@@ -0,0 +1,150 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_DATATYPE_H_INCLUDED
+#define NETMOD_UCX_DATATYPE_H_INCLUDED
+
+#include "ucx_impl.h"
+#include "ucx_types.h"
+#include <ucp/api/ucp.h>
+struct MPIDI_UCX_pack_state {
+    /* State built by MPIDI_UCX_Start_pack/Start_unpack and freed by MPIDI_UCX_Finish_pack. */
+    MPID_Segment *segment_ptr;  /* from MPID_Segment_alloc(), initialised over the user buffer */
+    MPI_Aint packsize;          /* value produced by MPIR_Pack_size_impl(count, datatype) */
+};
+
+static inline void *MPIDI_UCX_Start_pack(void *context, const void *buffer, size_t count)
+{
+    /* UCX start_pack callback: build the per-operation pack state for count elements at buffer.
+     * context is the MPIR_Datatype * that MPIDI_NM_datatype_commit() handed to
+     * ucp_dt_create_generic(), so take the handle from the object rather than reading the
+     * pointer as an MPI_Datatype *.  Returns the state later passed to pack/finish. */
+    MPIR_Datatype *dt_ptr = (MPIR_Datatype *) context;
+    struct MPIDI_UCX_pack_state *state = MPL_malloc(sizeof(struct MPIDI_UCX_pack_state));
+    MPID_Segment *segment_ptr = MPID_Segment_alloc();
+    /* Only the state pointer is returned, so abort here rather than dereference NULL below. */
+    MPIR_Assertp(state != NULL && segment_ptr != NULL);
+    MPIR_Pack_size_impl(count, dt_ptr->handle, &state->packsize);
+    MPID_Segment_init(buffer, count, dt_ptr->handle, segment_ptr, 1);
+    state->segment_ptr = segment_ptr;
+    return (void *) state;
+}
+
+static inline void *MPIDI_UCX_Start_unpack(void *context, void *buffer, size_t count)
+{
+    /* UCX start_unpack callback: build the per-operation state used to unpack incoming
+     * packed bytes into count elements at buffer.
+     * context is the MPIR_Datatype * that MPIDI_NM_datatype_commit() handed to
+     * ucp_dt_create_generic(), so take the handle from the object rather than reading the
+     * pointer as an MPI_Datatype *.  Returns the state later passed to unpack/finish. */
+    MPIR_Datatype *dt_ptr = (MPIR_Datatype *) context;
+    struct MPIDI_UCX_pack_state *state = MPL_malloc(sizeof(struct MPIDI_UCX_pack_state));
+    MPID_Segment *segment_ptr = MPID_Segment_alloc();
+
+    /* Only the state pointer is returned, so abort here rather than dereference NULL below. */
+    MPIR_Assertp(state != NULL && segment_ptr != NULL);
+
+    MPIR_Pack_size_impl(count, dt_ptr->handle, &state->packsize);
+    MPID_Segment_init(buffer, count, dt_ptr->handle, segment_ptr, 1);
+    state->segment_ptr = segment_ptr;
+
+    return (void *) state;
+}
+
+static inline size_t MPIDI_UCX_Packed_size(void *state)
+{
+    /* UCX packed_size callback: report the byte count stored in the state when it was built. */
+    const struct MPIDI_UCX_pack_state *cached = (const struct MPIDI_UCX_pack_state *) state;
+
+    return (size_t) cached->packsize;
+}
+
+static inline size_t MPIDI_UCX_Pack(void *state, size_t offset, void *dest, size_t max_length)
+{
+    /* UCX pack callback: pack the stream bytes from offset up to offset + max_length (clamped
+     * to the packed size) into dest, and return the byte count derived from the end position. */
+    struct MPIDI_UCX_pack_state *ps = (struct MPIDI_UCX_pack_state *) state;
+    MPI_Aint last = MPL_MIN(ps->packsize, offset + max_length);
+
+    MPID_Segment_pack(ps->segment_ptr, offset, &last, dest);   /* last is passed by address */
+    return (size_t) last - offset;
+}
+
+static inline ucs_status_t MPIDI_UCX_Unpack(void *state, size_t offset, const void *src,
+                                            size_t count)
+{
+    /* UCX unpack callback: unpack count packed bytes from src, located at byte offset of the
+     * packed stream, through the segment bound to state by MPIDI_UCX_Start_unpack(). */
+    struct MPIDI_UCX_pack_state *pack_state = (struct MPIDI_UCX_pack_state *) state;
+    /* MPI_Aint, as in MPIDI_UCX_Pack(): the segment call takes the end position by address. */
+    MPI_Aint last = MPL_MIN(pack_state->packsize, offset + count);
+    MPID_Segment_unpack(pack_state->segment_ptr, offset, &last, (void *) src);
+    return UCS_OK;
+}
+
+static inline void MPIDI_UCX_Finish_pack(void *state)
+{
+    /* UCX finish callback: free the segment first, then the state object that points to it. */
+    struct MPIDI_UCX_pack_state *ps = (struct MPIDI_UCX_pack_state *) state;
+
+    MPID_Segment_free(ps->segment_ptr);
+    MPL_free(ps);
+}
+
+
+static ucp_generic_dt_ops_t MPIDI_UCX_datatype_ops = {  /* passed to ucp_dt_create_generic() */
+    .start_pack = MPIDI_UCX_Start_pack,         /* allocate state for packing a buffer */
+    .start_unpack = MPIDI_UCX_Start_unpack,     /* allocate state for unpacking into a buffer */
+    .packed_size = MPIDI_UCX_Packed_size,       /* byte count stored in the state */
+    .pack = MPIDI_UCX_Pack,                     /* pack bytes starting at a given offset */
+    .unpack = MPIDI_UCX_Unpack,                 /* unpack bytes starting at a given offset */
+    .finish = MPIDI_UCX_Finish_pack             /* free the segment and the state */
+};
+
+
+static inline void MPIDI_NM_datatype_destroy(MPIR_Datatype * datatype_p)
+{
+    /* Release the UCX generic datatype created by MPIDI_NM_datatype_commit(), if there is one.
+     * Compare against the -1 sentinel at full width: narrowing the handle to int and testing
+     * its sign skips (and so leaks) any valid handle whose bit 31 happens to be set. */
+    if (datatype_p->is_committed &&
+        datatype_p->dev.netmod.ucx.ucp_datatype != (ucp_datatype_t) -1) {
+        ucp_dt_destroy(datatype_p->dev.netmod.ucx.ucp_datatype);
+        datatype_p->dev.netmod.ucx.ucp_datatype = -1;   /* mark as released */
+    }
+}
+
+static inline void MPIDI_NM_datatype_commit(MPIR_Datatype * datatype_p)
+{
+    /* Commit hook: register a UCX generic datatype for every non-contiguous MPI datatype.
+     * datatype_p itself is the context given to ucp_dt_create_generic() together with the
+     * MPIDI_UCX_datatype_ops callback table.  Contiguous datatypes get no UCX datatype and
+     * keep the -1 value stored below. */
+    ucp_datatype_t ucp_datatype;
+    ucs_status_t status;
+    int is_contig;
+
+    /* Start from "none" so the contiguous early return leaves a well-defined value. */
+    datatype_p->dev.netmod.ucx.ucp_datatype = -1;
+    MPID_Datatype_is_contig(datatype_p->handle, &is_contig);
+    if (is_contig)
+        return;
+
+    /* The hook returns void, so a failed registration cannot be reported to the caller. */
+    status = ucp_dt_create_generic(&MPIDI_UCX_datatype_ops, datatype_p, &ucp_datatype);
+    MPIR_Assertp(status == UCS_OK);
+    datatype_p->dev.netmod.ucx.ucp_datatype = ucp_datatype;
+}
+
+static inline void MPIDI_NM_datatype_dup(MPIR_Datatype * old_datatype_p,
+                                         MPIR_Datatype * new_datatype_p)
+{       /* Intentionally empty: neither datatype is read or modified by this hook. */
+    return;
+}
+
+#endif /* NETMOD_UCX_DATATYPE_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_impl.h b/src/mpid/ch4/netmod/ucx/ucx_impl.h
new file mode 100644
index 0000000..6f7200d
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_impl.h
@@ -0,0 +1,151 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_IMPL_H_INCLUDED
+#define NETMOD_UCX_IMPL_H_INCLUDED
+
+#include <mpidimpl.h>
+#include "ucx_types.h"
+#include "mpidch4r.h"
+#include "ch4_impl.h"
+
+#include <ucs/type/status.h>
+
+#define MPIDI_UCX_COMM(comm)     ((comm)->dev.ch4.netmod.ucx)
+#define MPIDI_UCX_REQ(req)       ((req)->dev.ch4.netmod.ucx)
+#define COMM_TO_INDEX(comm,rank) MPIDIU_comm_rank_to_pid(comm, rank, NULL, NULL)
+#define MPIDI_UCX_COMM_TO_EP(comm,rank) \
+    MPIDI_UCX_AV(MPIDIU_comm_rank_to_av(comm, rank)).dest
+
+#define MPIDI_UCX_WIN(win) ((win)->dev.netmod.ucx)
+#define MPIDI_UCX_WIN_INFO(win, rank) MPIDI_UCX_WIN(win).info_table[rank]
+
+/*
+ * Builds the 64-bit UCX tag used on the send side. From most to least
+ * significant the fields are: context id, source rank, MPI tag (the latter
+ * masked with MPIDI_UCX_TAG_MASK).
+ */
+static inline uint64_t MPIDI_UCX_init_tag(MPIR_Context_id_t contextid, int source, uint64_t tag)
+{
+    uint64_t bits = contextid;
+
+    /* Make room for the source rank, then OR it in. */
+    bits = (bits << MPIDI_UCX_SOURCE_SHIFT);
+    bits |= source;
+
+    /* Make room for the tag field, then OR in the masked tag. */
+    bits = (bits << MPIDI_UCX_TAG_SHIFT);
+    bits |= (MPIDI_UCX_TAG_MASK & tag);
+
+    return bits;
+}
+
+#ifndef MPIR_TAG_ERROR_BIT
+#define MPIR_TAG_ERROR_BIT (1 << 30)
+#endif
+#ifndef  MPIR_TAG_PROC_FAILURE_BIT
+#define MPIR_TAG_PROC_FAILURE_BIT (1 << 29)
+#endif
+
+/*
+ * Builds the match mask that accompanies a receive/probe tag.
+ *
+ * Starts from "all bits significant" minus the error and process-failure tag
+ * bits, then additionally clears the tag field for MPI_ANY_TAG and the source
+ * field for MPI_ANY_SOURCE so those fields are ignored during matching.
+ */
+static inline uint64_t MPIDI_UCX_tag_mask(int mpi_tag, int src)
+{
+    uint64_t tag_mask;
+    /* The operand is an int expression; the assignment widens it to 64 bits. */
+    tag_mask = ~(MPIR_TAG_PROC_FAILURE_BIT | MPIR_TAG_ERROR_BIT);
+    if (mpi_tag == MPI_ANY_TAG)
+        tag_mask &= ~MPIDI_UCX_TAG_MASK;
+
+    if (src == MPI_ANY_SOURCE)
+        tag_mask &= ~(MPIDI_UCX_SOURCE_MASK);
+
+    return tag_mask;
+}
+
+/*
+ * Builds the 64-bit UCX tag for a receive or probe. The layout matches
+ * MPIDI_UCX_init_tag, except that the source field is left zero for
+ * MPI_ANY_SOURCE and the tag field is left zero for MPI_ANY_TAG.
+ */
+static inline uint64_t MPIDI_UCX_recv_tag(int mpi_tag, int src, MPIR_Context_id_t contextid)
+{
+    uint64_t bits = contextid;
+
+    bits = (bits << MPIDI_UCX_SOURCE_SHIFT);
+    if (src != MPI_ANY_SOURCE) {
+        /* Only the low MPIDI_UCX_CONTEXT_RANK_BITS bits of the rank are kept. */
+        bits |= (src & UCS_MASK(MPIDI_UCX_CONTEXT_RANK_BITS));
+    }
+
+    bits = bits << MPIDI_UCX_TAG_SHIFT;
+    if (mpi_tag != MPI_ANY_TAG) {
+        bits |= (MPIDI_UCX_TAG_MASK & mpi_tag);
+    }
+
+    return bits;
+}
+
+/* Extracts the MPI tag field from a UCX match-bits value. */
+static inline int MPIDI_UCX_get_tag(uint64_t match_bits)
+{
+    uint64_t tag_field = match_bits & MPIDI_UCX_TAG_MASK;
+
+    return (int) tag_field;
+}
+
+/* Extracts the source-rank field from a UCX match-bits value: isolate the
+ * field with MPIDI_UCX_SOURCE_MASK, then shift it down past the tag field. */
+static inline int MPIDI_UCX_get_source(uint64_t match_bits)
+{
+    uint64_t src_field = (match_bits & MPIDI_UCX_SOURCE_MASK) >> MPIDI_UCX_TAG_SHIFT;
+
+    return (int) src_field;
+}
+
+
+#define MPIDI_UCX_ERR  MPIR_ERR_CHKANDJUMP4
+
+#define MPIDI_UCX_CHK_STATUS(STATUS,STR)                \
+  do {								\
+    MPIDI_UCX_ERR((STATUS!=UCS_OK && STATUS!=UCS_INPROGRESS),\
+			  mpi_errno,				\
+			  MPI_ERR_OTHER,			\
+			  "**ch4_ucx_nm_"#STR,                  \
+			  "**ch4_ucx_nm_"#STR" %s %d %s %s",    \
+			  __SHORT_FILE__,			\
+			  __LINE__,				\
+			  FCNAME,				\
+			  ucs_status_string(STATUS));		\
+    } while (0)
+
+
+
+#define MPIDI_UCX_PMI_ERROR(_errno,STR)				\
+  do									\
+    {									\
+      MPIDI_UCX_ERR(_errno!=PMI_SUCCESS,			\
+			    mpi_errno,					\
+			    MPI_ERR_OTHER,				\
+			    "**ch4_ucx_nm_pmi"#STR,			\
+			    "**ch4_ucx_nm_mpi"#STR" %s %d %s %s",	\
+			    __SHORT_FILE__,				\
+			    __LINE__,					\
+			    FCNAME,					\
+			    #STR);					\
+    } while (0)
+
+#define MPIDI_CH4_UCX_MPI_ERROR(_errno)				     \
+  do								     \
+    {								     \
+      if (unlikely(_errno!=MPI_SUCCESS)) MPIR_ERR_POP(mpi_errno);    \
+    } while (0)
+
+#define MPIDI_CH4_UCX_STR_ERRCHK(_errno,STR)				\
+  do									\
+    {									\
+      MPIDI_UCX_ERR(_errno!=MPL_STR_SUCCESS,			\
+			    mpi_errno,					\
+			    MPI_ERR_OTHER,				\
+			    "**ch4_ucx_nm_"#STR,			\
+			    "**ch4_ucx_nm_"#STR" %s %d %s %s",		\
+			    __SHORT_FILE__,				\
+			    __LINE__,					\
+			    FCNAME,					\
+			    #STR);					\
+    } while (0)
+
+
+
+#define MPIDI_CH4_UCX_REQUEST(_req, STR)				\
+  do {									\
+    MPIDI_UCX_ERR(UCS_PTR_IS_ERR(_req),				\
+			  mpi_errno,					\
+			  MPI_ERR_OTHER,				\
+			  "**ch4_ucx_nm_"#STR,				\
+			  "**ch4_ucx_nm_"#STR" %s %d %s %s",		\
+			  __SHORT_FILE__,				\
+			  __LINE__,					\
+			  FCNAME,					\
+			  ucs_status_string(UCS_PTR_STATUS(_req)));	\
+  } while (0)
+
+extern int MPIR_Datatype_init_names(void);
+
+#endif /* NETMOD_UCX_IMPL_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_init.h b/src/mpid/ch4/netmod/ucx/ucx_init.h
new file mode 100644
index 0000000..06b9e7e
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_init.h
@@ -0,0 +1,330 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_INIT_H_INCLUDED
+#define NETMOD_UCX_INIT_H_INCLUDED
+
+#include "ucx_impl.h"
+#include "mpir_cvars.h"
+#include "ucx_types.h"
+#include "pmi.h"
+#include <ucp/api/ucp.h>
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_init - bring up the UCX netmod.
+ *
+ * Steps, in order:
+ *   1. read the UCX configuration and create the UCP context (tag + RMA);
+ *   2. create the worker and fetch its address;
+ *   3. publish the encoded address under the key "UCX-<rank>" in the PMI KVS,
+ *      commit, and barrier;
+ *   4. for every rank in [0, size) fetch and decode that rank's address and
+ *      create an endpoint to it in address-vector table 0;
+ *   5. run the generic CH4 initialisation and datatype-name initialisation.
+ *
+ * Returns MPI_SUCCESS or an MPI error code. On failure the worker and context
+ * are destroyed if they are non-NULL.
+ *
+ * rank, size:                 this process's rank and the number of ranks
+ * comm_world, comm_self,
+ * num_contexts, netmod_contexts: forwarded to MPIDI_CH4U_init
+ * appnum, tag_ub, spawned:    not used by this netmod
+ */
+static inline int MPIDI_NM_init(int rank,
+                                int size,
+                                int appnum,
+                                int *tag_ub,
+                                MPIR_Comm * comm_world,
+                                MPIR_Comm * comm_self,
+                                int spawned, int num_contexts, void **netmod_contexts)
+{
+    int mpi_errno = MPI_SUCCESS, pmi_errno;
+    int str_errno = MPL_STR_SUCCESS;
+    ucp_config_t *config;
+    ucs_status_t ucx_status;
+    char valS[MPIDI_UCX_KVSAPPSTRLEN], *val;
+    char keyS[MPIDI_UCX_KVSAPPSTRLEN];
+    char remote_addr[MPIDI_UCX_KVSAPPSTRLEN];
+    /* The MPL string helpers take int lengths; use real ints rather than
+     * casting the address of a size_t, which only works by accident on
+     * little-endian targets. */
+    int val_space = MPIDI_UCX_KVSAPPSTRLEN;
+    int addr_len;
+    int i;
+    ucp_params_t ucp_params;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_INIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_INIT);
+
+    ucx_status = ucp_config_read(NULL, NULL, &config);
+    MPIDI_UCX_CHK_STATUS(ucx_status, read_config);
+
+    /* Tag matching and RMA are the features this netmod uses. */
+    ucp_params.features = UCP_FEATURE_TAG | UCP_FEATURE_RMA;
+    ucp_params.request_size = sizeof(MPIDI_UCX_ucp_request_t);
+    ucp_params.request_init = MPIDI_UCX_Request_init_callback;
+    ucp_params.request_cleanup = NULL;
+    ucx_status = ucp_init(&ucp_params, config, &MPIDI_UCX_global.context);
+    /* Release the configuration before the status check so that it is not
+     * leaked when ucp_init fails and the check jumps to fn_fail. */
+    ucp_config_release(config);
+    MPIDI_UCX_CHK_STATUS(ucx_status, init);
+
+    ucx_status = ucp_worker_create(MPIDI_UCX_global.context, UCS_THREAD_MODE_SERIALIZED,
+                                   &MPIDI_UCX_global.worker);
+    MPIDI_UCX_CHK_STATUS(ucx_status, worker_create);
+    ucx_status =
+        ucp_worker_get_address(MPIDI_UCX_global.worker, &MPIDI_UCX_global.if_address,
+                               &MPIDI_UCX_global.addrname_len);
+    MPIDI_UCX_CHK_STATUS(ucx_status, get_worker_address);
+
+    /* Encode the binary worker address as a "UCX" argument string in valS. */
+    val = valS;
+    str_errno =
+        MPL_str_add_binary_arg(&val, &val_space, "UCX", (char *) MPIDI_UCX_global.if_address,
+                               (int) MPIDI_UCX_global.addrname_len);
+    MPIDI_UCX_global.max_addr_len = MPIDI_UCX_global.addrname_len;
+    /* MPIDI_CH4_UCX_STR_ERRCHK(str_errno, buscard_len); */
+    /* NOTE(review): the result of PMI_KVS_Get_my_name is not checked. */
+    pmi_errno = PMI_KVS_Get_my_name(MPIDI_UCX_global.kvsname, MPIDI_UCX_KVSAPPSTRLEN);
+
+    val = valS;
+    snprintf(keyS, sizeof(keyS), "UCX-%d", rank);
+    pmi_errno = PMI_KVS_Put(MPIDI_UCX_global.kvsname, keyS, val);
+    MPIDI_UCX_PMI_ERROR(pmi_errno, pmi_put_name);
+    pmi_errno = PMI_KVS_Commit(MPIDI_UCX_global.kvsname);
+    MPIDI_UCX_PMI_ERROR(pmi_errno, pmi_commit);
+    pmi_errno = PMI_Barrier();
+    MPIDI_UCX_PMI_ERROR(pmi_errno, pmi_barrier);
+
+    /* The decoded address table is built lazily by allocate_address_table(). */
+    MPIDI_UCX_global.pmi_addr_table = NULL;
+
+    for (i = 0; i < size; i++) {
+        snprintf(keyS, sizeof(keyS), "UCX-%d", i);
+        pmi_errno = PMI_KVS_Get(MPIDI_UCX_global.kvsname, keyS, valS, MPIDI_UCX_KVSAPPSTRLEN);
+        MPIDI_UCX_PMI_ERROR(pmi_errno, pmi_commit);
+        addr_len = 0;
+        str_errno = MPL_str_get_binary_arg(valS, "UCX", remote_addr,
+                                           (int) MPIDI_UCX_KVSAPPSTRLEN, &addr_len);
+        /* Track the longest address seen; it is the row size of the table. */
+        if (addr_len > MPIDI_UCX_global.max_addr_len)
+            MPIDI_UCX_global.max_addr_len = addr_len;
+        /* MPIDI_UCX_STR_ERRCHK(str_errno, buscard_len); */
+        ucx_status = ucp_ep_create(MPIDI_UCX_global.worker,
+                                   (ucp_address_t *) remote_addr,
+                                   &MPIDI_UCX_AV(&MPIDIU_get_av(0, i)).dest);
+
+        MPIDI_UCX_CHK_STATUS(ucx_status, ep_create);
+        /* Clear the bytes just decoded before the buffer is reused. */
+        if (addr_len > 0)
+            memset(remote_addr, 0x0, (size_t) addr_len);
+    }
+
+    /* NOTE(review): any value returned by MPIDI_CH4U_init is ignored here. */
+    MPIDI_CH4U_init(comm_world, comm_self, num_contexts, netmod_contexts);
+
+    mpi_errno = MPIR_Datatype_init_names();
+    MPIDI_CH4_UCX_MPI_ERROR(mpi_errno);
+
+  fn_exit:
+    /* Use the same state id as MPIR_FUNC_VERBOSE_ENTER above. */
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_INIT);
+    return mpi_errno;
+  fn_fail:
+    if (MPIDI_UCX_global.worker != NULL)
+        ucp_worker_destroy(MPIDI_UCX_global.worker);
+
+    if (MPIDI_UCX_global.context != NULL)
+        ucp_cleanup(MPIDI_UCX_global.context);
+
+    goto fn_exit;
+
+}
+
+/*
+ * MPIDI_NM_finalize - tear down the UCX netmod.
+ *
+ * Destroys every endpoint in every address-vector table, synchronises with a
+ * PMI barrier, releases the worker address, destroys the worker and context,
+ * releases comm_world and comm_self, frees the decoded address table, and
+ * finally runs the generic CH4 and PMI finalisation.
+ *
+ * Returns MPI_SUCCESS, or an MPI error code if the PMI barrier fails (in which
+ * case the remaining teardown is skipped).
+ */
+static inline int MPIDI_NM_finalize(void)
+{
+    int mpi_errno = MPI_SUCCESS, pmi_errno;
+    int i, j, max_n_avts;
+    MPIR_Comm *comm;
+
+    max_n_avts = MPIDIU_get_max_n_avts();
+
+    for (i = 0; i < max_n_avts; i++) {
+        /* Table slots may be empty (MPIDI_NM_gpid_tolpidarray checks for this
+         * too), so skip them instead of dereferencing NULL. */
+        if (MPIDIU_get_av_table(i) == NULL)
+            continue;
+        for (j = 0; j < MPIDIU_get_av_table(i)->size; j++)
+            ucp_ep_destroy(MPIDI_UCX_AV(&MPIDIU_get_av(i, j)).dest);
+    }
+    pmi_errno = PMI_Barrier();
+    MPIDI_UCX_PMI_ERROR(pmi_errno, pmi_barrier);
+
+
+    if (MPIDI_UCX_global.worker != NULL) {
+        /* The address obtained with ucp_worker_get_address in MPIDI_NM_init
+         * must be handed back before the worker goes away. */
+        if (MPIDI_UCX_global.if_address != NULL) {
+            ucp_worker_release_address(MPIDI_UCX_global.worker, MPIDI_UCX_global.if_address);
+            MPIDI_UCX_global.if_address = NULL;
+        }
+        ucp_worker_destroy(MPIDI_UCX_global.worker);
+    }
+
+    if (MPIDI_UCX_global.context != NULL)
+        ucp_cleanup(MPIDI_UCX_global.context);
+
+    comm = MPIR_Process.comm_world;
+    MPIR_Comm_release_always(comm);
+
+    comm = MPIR_Process.comm_self;
+    MPIR_Comm_release_always(comm);
+    if (MPIDI_UCX_global.pmi_addr_table)
+        MPL_free(MPIDI_UCX_global.pmi_addr_table);
+
+    MPIDI_CH4U_finalize();
+    PMI_Finalize();
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+/*
+ * Translates rank `idx` of `comm_ptr` into an lpid and stores it in *lpid_ptr.
+ *
+ * The local-group lookup is used only for the local side of a
+ * non-intracommunicator; intracommunicators and the remote side of other
+ * communicators use the regular lookup. Always returns MPI_SUCCESS.
+ */
+static inline int MPIDI_NM_comm_get_lpid(MPIR_Comm * comm_ptr,
+                                         int idx, int *lpid_ptr, MPL_bool is_remote)
+{
+    int avtid = 0, lpid = 0;
+    int use_local_group = (comm_ptr->comm_kind != MPIR_COMM_KIND__INTRACOMM) && !is_remote;
+
+    if (use_local_group) {
+        MPIDIU_comm_rank_to_pid_local(comm_ptr, idx, &lpid, &avtid);
+    }
+    else {
+        MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid);
+    }
+
+    /* Combine table id and per-table index into a single lpid value. */
+    *lpid_ptr = MPIDIU_LPID_CREATE(avtid, lpid);
+    return MPI_SUCCESS;
+}
+
+/*
+ * Builds MPIDI_UCX_global.pmi_addr_table: one row of max_addr_len bytes per
+ * rank of comm_world, each holding the decoded worker address that rank
+ * published under "UCX-<rank>" in the PMI KVS. Rows are zero-padded.
+ *
+ * Returns MPI_SUCCESS, or MPI_ERR_NO_MEM if the table cannot be allocated (the
+ * table pointer is then left NULL). The function previously fell off the end
+ * without returning a value.
+ */
+static inline int allocate_address_table(void)
+{
+
+    char keyS[MPIDI_UCX_KVSAPPSTRLEN];
+    char valS[MPIDI_UCX_KVSAPPSTRLEN];
+    int len = MPIDI_UCX_global.max_addr_len;
+    int i;
+    int size, out_len = 0;
+    size_t table_bytes;
+
+    size = MPIR_Process.comm_world->local_size;
+    /* Compute the size in size_t so large jobs cannot overflow int. */
+    table_bytes = (size_t) size * (size_t) len;
+    MPIDI_UCX_global.pmi_addr_table = MPL_malloc(table_bytes);
+    if (MPIDI_UCX_global.pmi_addr_table == NULL)
+        return MPI_ERR_NO_MEM;
+    memset(MPIDI_UCX_global.pmi_addr_table, 0x0, table_bytes);
+
+    for (i = 0; i < size; i++) {
+        snprintf(keyS, sizeof(keyS), "UCX-%d", i);
+        /* NOTE(review): PMI and decode errors are not checked here. */
+        PMI_KVS_Get(MPIDI_UCX_global.kvsname, keyS, valS, MPIDI_UCX_KVSAPPSTRLEN);
+        MPL_str_get_binary_arg(valS, "UCX",
+                               &MPIDI_UCX_global.pmi_addr_table[(size_t) len * (size_t) i],
+                               len, &out_len);
+    }
+
+    return MPI_SUCCESS;
+}
+
+/*
+ * Fills *gpid with the worker address of rank `rank` in `comm_ptr`, copied
+ * from the decoded PMI address table (built on first use).
+ *
+ * Returns MPI_SUCCESS, or MPI_ERR_NO_MEM if the address table is unavailable.
+ */
+static inline int MPIDI_NM_gpid_get(MPIR_Comm * comm_ptr, int rank, MPIR_Gpid * gpid)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int avtid = 0, lpid = 0;
+    int len = MPIDI_UCX_global.max_addr_len;
+
+    /* Validate the inputs before they are used rather than afterwards. */
+    MPIR_Assert(rank < comm_ptr->local_size);
+    MPIR_Assert(len <= sizeof(MPIDI_UCX_GPID(gpid).addr));
+
+    MPIDIU_comm_rank_to_pid(comm_ptr, rank, &lpid, &avtid);
+
+    if (MPIDI_UCX_global.pmi_addr_table == NULL) {
+        allocate_address_table();
+    }
+    /* Check the table itself so this works regardless of what the allocation
+     * helper reports. */
+    if (MPIDI_UCX_global.pmi_addr_table == NULL) {
+        mpi_errno = MPI_ERR_NO_MEM;
+        goto fn_fail;
+    }
+
+    /* A preceding memset of the same `len` bytes would be overwritten in full
+     * by this copy, so only the copy is needed. */
+    memcpy(MPIDI_UCX_GPID(gpid).addr,
+           &MPIDI_UCX_global.pmi_addr_table[(size_t) lpid * (size_t) len], len);
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/* Netmod entry point that forwards to MPIDI_CH4U_get_node_id with the same
+ * arguments; always returns MPI_SUCCESS. */
+static inline int MPIDI_NM_get_node_id(MPIR_Comm * comm, int rank, MPID_Node_id_t * id_p)
+{
+    MPIDI_CH4U_get_node_id(comm, rank, id_p);
+    return MPI_SUCCESS;
+}
+
+/* Netmod entry point that forwards to MPIDI_CH4U_get_max_node_id with the same
+ * arguments; always returns MPI_SUCCESS. */
+static inline int MPIDI_NM_get_max_node_id(MPIR_Comm * comm, MPID_Node_id_t * max_id_p)
+{
+    MPIDI_CH4U_get_max_node_id(comm, max_id_p);
+    return MPI_SUCCESS;
+}
+
+/*
+ * Collects the gpid of every local rank of `comm_ptr` into local_gpids[] and
+ * sets *singleAVT to 0. The number of entries written is
+ * comm_ptr->local_size; the `local_size` argument is not consulted.
+ * Always returns 0.
+ */
+static inline int MPIDI_NM_getallincomm(MPIR_Comm * comm_ptr,
+                                        int local_size, MPIR_Gpid local_gpids[], int *singleAVT)
+{
+    int rank = 0;
+
+    while (rank < comm_ptr->local_size) {
+        MPIDI_GPID_Get(comm_ptr, rank, &local_gpids[rank]);
+        rank++;
+    }
+
+    *singleAVT = 0;
+    return 0;
+}
+
+/*
+ * Maps each of the `size` gpids in gpid[] to an lpid in lpid[] by comparing
+ * the gpid's address with the rows of the decoded PMI address table.
+ *
+ * The search stops at the first match, so the lpid comes from the lowest
+ * matching address-vector table. Previously the inner `break` left only the
+ * inner loop and the scan went on over the remaining tables.
+ *
+ * Returns MPI_SUCCESS, or an error if any gpid is unknown (dynamic processes
+ * are not supported yet).
+ */
+static inline int MPIDI_NM_gpid_tolpidarray(int size, MPIR_Gpid gpid[], int lpid[])
+{
+
+    int i, mpi_errno = MPI_SUCCESS;
+    int n_new_procs = 0;
+    size_t sz;
+    int max_n_avts;
+
+    max_n_avts = MPIDIU_get_max_n_avts();
+    if (MPIDI_UCX_global.pmi_addr_table == NULL) {
+        allocate_address_table();
+    }
+
+    /* Row size of the address table; the same for every comparison. */
+    sz = MPIDI_UCX_global.max_addr_len;
+
+    for (i = 0; i < size; i++) {
+        int j, k;
+        int found = 0;
+
+        MPIR_Assert(sz <= sizeof(MPIDI_UCX_GPID(&gpid[i]).addr));
+
+        for (k = 0; k < max_n_avts && !found; k++) {
+            if (MPIDIU_get_av_table(k) == NULL) {
+                continue;
+            }
+            for (j = 0; j < MPIDIU_get_av_table(k)->size; j++) {
+                /* NOTE(review): the table row is selected by j alone, whatever
+                 * k is -- confirm this is right once more than one address
+                 * vector table can exist. */
+                if (!memcmp(&MPIDI_UCX_global.pmi_addr_table[j * sz],
+                            MPIDI_UCX_GPID(&gpid[i]).addr, sz)) {
+                    lpid[i] = MPIDIU_LPID_CREATE(k, j);
+                    found = 1;
+                    break;
+                }
+            }
+        }
+        if (!found) {
+            /* Only the count is needed until dynamic processes are handled. */
+            n_new_procs++;
+        }
+    }
+
+    /* FIXME: add support for dynamic processes */
+    if (n_new_procs > 0) {
+        mpi_errno = -1;
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/* Intercommunicator-creation hook for the UCX netmod. Currently a no-op that
+ * ignores its arguments and returns MPI_SUCCESS. */
+static inline int MPIDI_NM_create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr,
+                                                       int size, const int lpids[])
+{
+    return MPI_SUCCESS;
+}
+
+/* Forwards to MPIDI_CH4U_free_mem and returns its result unchanged. */
+static inline int MPIDI_NM_free_mem(void *ptr)
+{
+    return MPIDI_CH4U_free_mem(ptr);
+}
+
+/* Forwards to MPIDI_CH4U_alloc_mem and returns its result unchanged. */
+static inline void *MPIDI_NM_alloc_mem(size_t size, MPIR_Info * info_ptr)
+{
+    return MPIDI_CH4U_alloc_mem(size, info_ptr);
+}
+
+#endif /* NETMOD_UCX_INIT_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_op.h b/src/mpid/ch4/netmod/ucx/ucx_op.h
new file mode 100644
index 0000000..ac10797
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_op.h
@@ -0,0 +1,24 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_OP_H_INCLUDED
+#define NETMOD_UCX_OP_H_INCLUDED
+
+#include "ucx_impl.h"
+
+/* Op-destroy hook for the UCX netmod; currently a no-op. */
+static inline void MPIDI_NM_op_destroy(MPIR_Op * op_p)
+{
+    return;
+}
+
+/* Op-commit hook for the UCX netmod; currently a no-op. */
+static inline void MPIDI_NM_op_commit(MPIR_Op * op_p)
+{
+    return;
+}
+
+#endif /* NETMOD_UCX_OP_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_pre.h b/src/mpid/ch4/netmod/ucx/ucx_pre.h
new file mode 100644
index 0000000..9779a4d
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_pre.h
@@ -0,0 +1,75 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_PRE_H_INCLUDED
+#define NETMOD_UCX_PRE_H_INCLUDED
+
+#include <ucp/api/ucp.h>
+
+#define HAVE_MPIDI_NM_datatype_commit_hook
+#define HAVE_MPIDI_NM_datatype_destroy_hook
+
+#define MPIDI_UCX_KVSAPPSTRLEN 4096
+
+//#define MPIDI_UCX_NAME_LEN             (512)
+/* Private area inside each UCX request; MPIDI_NM_init passes its size to UCX
+ * as ucp_params.request_size. */
+typedef struct {
+    void *req;                  /* the receive path stores an MPIR_Request pointer or NULL here */
+} MPIDI_UCX_ucp_request_t;
+
+/* UCX datatype handle kept alongside an MPI datatype. */
+typedef struct {
+    ucp_datatype_t ucp_datatype;
+} MPIDI_UCX_dt_t;
+
+/* UCX-specific part of a request: either the message handle stored by
+ * MPIDI_NM_improbe, or the UCX request stored by the receive path. */
+typedef struct {
+    union {
+        ucp_tag_message_h message_handler;
+        MPIDI_UCX_ucp_request_t *ucp_request;
+    } a;
+} MPIDI_UCX_request_t;
+
+/* NOTE(review): presumably per-request state for active messages; its users
+ * are not visible in this header. */
+typedef struct {
+    int handler_id;
+    char *pack_buffer;
+} MPIDI_UCX_am_request_t;
+
+/* Header at the start of every active message. */
+typedef struct MPIDI_UCX_am_header_t {
+    uint64_t handler_id;        /* index into MPIDI_UCX_global.am_handlers */
+    uint64_t data_sz;           /* number of data bytes at the end of the message */
+    uint64_t payload[0];        /* zero-length array: bytes following the header */
+} MPIDI_UCX_am_header_t;
+
+/* Per-rank window information (packed layout). NOTE(review): field meanings
+ * are inferred from their names only. */
+typedef struct MPIDI_UCX_win_info {
+    ucp_rkey_h rkey;
+    uint64_t addr;
+    uint32_t disp;
+} __attribute__ ((packed)) MPIDI_UCX_win_info_t;
+
+
+/* UCX-specific part of a window. */
+typedef struct {
+    MPIDI_UCX_win_info_t *info_table;   /* indexed by rank via MPIDI_UCX_WIN_INFO */
+    ucp_mem_h mem_h;
+    int need_local_flush;
+} MPIDI_UCX_win_t;
+
+/* UCX-specific part of a gpid: the worker address copied from the PMI
+ * address table. */
+typedef struct {
+    char addr[MPIDI_UCX_KVSAPPSTRLEN];
+} MPIDI_UCX_gpid_t;
+
+/* UCX-specific part of an address-vector entry: the endpoint to that peer. */
+typedef struct {
+    ucp_ep_h dest;
+} MPIDI_UCX_addr_t;
+
+/* Placeholder: no UCX-specific communicator state yet. */
+typedef struct {
+    int dummy;
+} MPIDI_UCX_comm_t;
+
+/* Placeholder: no UCX-specific op state yet. */
+typedef struct {
+    int dummy;
+} MPIDI_UCX_op_t;
+
+#endif /* NETMOD_UCX_PRE_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_probe.h b/src/mpid/ch4/netmod/ucx/ucx_probe.h
new file mode 100644
index 0000000..8806146
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_probe.h
@@ -0,0 +1,99 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_PROBE_H_INCLUDED
+#define NETMOD_UCX_PROBE_H_INCLUDED
+
+#include "ucx_impl.h"
+#include "mpidch4.h"
+
+/*
+ * Non-blocking probe for a message matching (source, tag) on the receive
+ * context of `comm` offset by `context_offset`. The message is left in the
+ * UCX queue (the probe is called with remove == 0).
+ *
+ * *flag is set to 1 if a matching message exists and to 0 otherwise. When a
+ * message is found and `status` is not MPI_STATUS_IGNORE, the status receives
+ * the sender's rank and tag and the message length as count.
+ * Always returns MPI_SUCCESS.
+ */
+static inline int ucx_do_iprobe(int source,
+                                int tag,
+                                MPIR_Comm * comm, int context_offset, int *flag,
+                                MPI_Status * status)
+{
+    int mpi_errno = MPI_SUCCESS;
+    uint64_t ucp_tag, tag_mask;
+    MPI_Aint count;
+    ucp_tag_recv_info_t info;
+    ucp_tag_message_h message_handler;
+    tag_mask = MPIDI_UCX_tag_mask(tag, source);
+    ucp_tag = MPIDI_UCX_recv_tag(tag, source, comm->recvcontext_id + context_offset);
+    message_handler = ucp_tag_probe_nb(MPIDI_UCX_global.worker, ucp_tag, tag_mask, 0, &info);
+    if (message_handler == NULL) {
+        *flag = 0;
+        goto fn_exit;
+    }
+    /* Plain int 1, as in MPIDI_NM_improbe; `true` would need <stdbool.h>. */
+    *flag = 1;
+    if (status == MPI_STATUS_IGNORE)
+        goto fn_exit;
+
+    status->MPI_ERROR = MPI_SUCCESS;
+    status->MPI_SOURCE = MPIDI_UCX_get_source(info.sender_tag);
+    status->MPI_TAG = MPIDI_UCX_get_tag(info.sender_tag);
+    count = info.length;
+    MPIR_STATUS_SET_COUNT(*status, count);
+  fn_exit:
+    return mpi_errno;
+
+}
+
+/*
+ * Non-blocking matched probe. If a message matching (source, tag) exists it
+ * is removed from the UCX queue (remove == 1), wrapped in a new MPROBE
+ * request that stores the UCX message handle, and *flag is set to 1;
+ * otherwise *flag is 0.
+ *
+ * *message receives the new request, or NULL when nothing matched. It used to
+ * receive an uninitialized pointer in that case.
+ * When a message is found and `status` is not MPI_STATUS_IGNORE, the status
+ * receives the sender's rank and tag and the message length as count.
+ * Always returns MPI_SUCCESS.
+ */
+static inline int MPIDI_NM_improbe(int source,
+                                   int tag,
+                                   MPIR_Comm * comm,
+                                   int context_offset,
+                                   int *flag, MPIR_Request ** message, MPI_Status * status)
+{
+
+
+    int mpi_errno = MPI_SUCCESS;
+    uint64_t ucp_tag, tag_mask;
+    MPI_Aint count;
+    ucp_tag_recv_info_t info;
+    ucp_tag_message_h message_handler;
+    MPIR_Request *req = NULL;   /* stays NULL on the no-match path */
+
+    tag_mask = MPIDI_UCX_tag_mask(tag, source);
+    ucp_tag = MPIDI_UCX_recv_tag(tag, source, comm->recvcontext_id + context_offset);
+
+    message_handler = ucp_tag_probe_nb(MPIDI_UCX_global.worker, ucp_tag, tag_mask, 1, &info);
+    if (message_handler == NULL) {
+        *flag = 0;
+        goto fn_exit;
+    }
+    *flag = 1;
+    req = (MPIR_Request *) MPIR_Request_create(MPIR_REQUEST_KIND__MPROBE);
+    MPIR_Assert(req);
+    /* Remember the UCX handle so the matched receive can consume the message. */
+    MPIDI_UCX_REQ(req).a.message_handler = message_handler;
+    if (status == MPI_STATUS_IGNORE)
+        goto fn_exit;
+
+    status->MPI_SOURCE = MPIDI_UCX_get_source(info.sender_tag);
+    status->MPI_TAG = MPIDI_UCX_get_tag(info.sender_tag);
+    count = info.length;
+    MPIR_STATUS_SET_COUNT(*status, count);
+  fn_exit:
+    *message = req;
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+/* Netmod iprobe entry point: delegates to ucx_do_iprobe with the same
+ * arguments and returns its result. */
+static inline int MPIDI_NM_iprobe(int source,
+                                  int tag,
+                                  MPIR_Comm * comm,
+                                  int context_offset, int *flag, MPI_Status * status)
+{
+    return ucx_do_iprobe(source, tag, comm, context_offset, flag, status);
+}
+
+#endif /* NETMOD_UCX_PROBE_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_proc.h b/src/mpid/ch4/netmod/ucx/ucx_proc.h
new file mode 100644
index 0000000..ca43260
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_proc.h
@@ -0,0 +1,26 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_PROC_H_INCLUDED
+#define NETMOD_UCX_PROC_H_INCLUDED
+
+#include "ucx_impl.h"
+
+/* Returns whatever MPIDI_CH4U_rank_is_local reports for (rank, comm),
+ * bracketed by the function-trace enter/exit macros. */
+static inline int MPIDI_NM_rank_is_local(int rank, MPIR_Comm * comm)
+{
+    int is_local;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPIDI_NETMOD_RANK_IS_LOCAL);
+    MPIR_FUNC_VERBOSE_ENTER(MPIDI_NETMOD_RANK_IS_LOCAL);
+
+    is_local = MPIDI_CH4U_rank_is_local(rank, comm);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPIDI_NETMOD_RANK_IS_LOCAL);
+    return is_local;
+}
+
+#endif /* NETMOD_UCX_PROC_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_progress.h b/src/mpid/ch4/netmod/ucx/ucx_progress.h
new file mode 100644
index 0000000..c12801d
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_progress.h
@@ -0,0 +1,147 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_PROGRESS_H_INCLUDED
+#define NETMOD_UCX_PROGRESS_H_INCLUDED
+
+#include "ucx_impl.h"
+//#include "events.h"
+
+/*
+ * Dispatches one received active message.
+ *
+ * msg:    complete message; starts with an MPIDI_UCX_am_header_t
+ * msg_sz: total size of the message in bytes
+ *
+ * The registered handler for msg_hdr->handler_id is called first. It reports
+ * the destination (p_data / data_sz), whether that destination is one
+ * contiguous buffer or an iovec array, an optional completion function, and
+ * the request. The data bytes -- the last data_sz bytes of the message -- are
+ * then copied to the destination, the request status is filled in
+ * (MPI_ERR_TRUNCATE if not everything fit), and the completion function runs.
+ *
+ * Returns MPI_SUCCESS. mpi_errno used to be returned uninitialized.
+ */
+static inline int MPIDI_UCX_am_handler(void *msg, size_t msg_sz)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *rreq;
+    void *p_data;
+    void *in_data;
+    size_t data_sz, in_data_sz;
+    MPIDI_NM_am_completion_handler_fn cmpl_handler_fn;
+    struct iovec *iov;
+    int i, is_contig, iov_len;
+    size_t done, curr_len, rem;
+    MPIDI_UCX_am_header_t *msg_hdr = (MPIDI_UCX_am_header_t *) msg;
+
+    /* The data occupies the tail of the message: skip whatever lies between
+     * the fixed header and the final data_sz bytes. */
+    p_data = in_data = (char *) msg_hdr->payload + (msg_sz - msg_hdr->data_sz - sizeof(*msg_hdr));
+    in_data_sz = data_sz = msg_hdr->data_sz;
+
+    /* In: p_data/data_sz describe the incoming data. Out: the handler
+     * replaces them with the destination description. */
+    MPIDI_UCX_global.am_handlers[msg_hdr->handler_id] (msg_hdr->payload,
+                                                       &p_data, &data_sz,
+                                                       &is_contig, &cmpl_handler_fn, &rreq);
+
+    if (!rreq)
+        goto fn_exit;
+
+    /* No destination: nothing to copy, just complete the request. */
+    if ((!p_data || !data_sz) && cmpl_handler_fn) {
+        MPIR_STATUS_SET_COUNT(rreq->status, data_sz);
+        cmpl_handler_fn(rreq);
+        goto fn_exit;
+    }
+
+    if (is_contig) {
+        if (in_data_sz > data_sz) {
+            rreq->status.MPI_ERROR = MPI_ERR_TRUNCATE;
+        }
+        else {
+            rreq->status.MPI_ERROR = MPI_SUCCESS;
+        }
+
+        data_sz = MPL_MIN(data_sz, in_data_sz);
+        MPIR_Memcpy(p_data, in_data, data_sz);
+        MPIR_STATUS_SET_COUNT(rreq->status, data_sz);
+    }
+    else {
+        /* Non-contiguous: p_data is an iovec array with data_sz entries. */
+        done = 0;
+        rem = in_data_sz;
+        iov = (struct iovec *) p_data;
+        iov_len = (int) data_sz;
+
+        for (i = 0; i < iov_len && rem > 0; i++) {
+            curr_len = MPL_MIN(rem, iov[i].iov_len);
+            MPIR_Memcpy(iov[i].iov_base, (char *) in_data + done, curr_len);
+            rem -= curr_len;
+            done += curr_len;
+        }
+
+        /* Bytes left over mean the iovec was too small for the message. */
+        if (rem) {
+            rreq->status.MPI_ERROR = MPI_ERR_TRUNCATE;
+        }
+        else {
+            rreq->status.MPI_ERROR = MPI_SUCCESS;
+        }
+
+        MPIR_STATUS_SET_COUNT(rreq->status, done);
+    }
+
+    if (cmpl_handler_fn) {
+        cmpl_handler_fn(rreq);
+    }
+
+  fn_exit:
+    return mpi_errno;
+}
+
+/* Completion callback passed to ucp_tag_msg_recv_nb for active-message
+ * receives. It does nothing: MPIDI_NM_progress polls the request for
+ * completion itself. */
+static inline void MPIDI_UCX_Handle_am_recv(void *request, ucs_status_t status,
+                                            ucp_tag_recv_info_t * info)
+{
+    (void) request;
+    (void) status;
+    (void) info;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_progress
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Progress engine of the UCX netmod.
+ *
+ * Drains all pending active messages: each is probed with the AM tag, received
+ * into a temporary buffer, waited for, and handed to MPIDI_UCX_am_handler.
+ * The UCX worker is then progressed once more.
+ *
+ * Every probe uses MPIDI_UCX_AM_TAG as both tag and mask, so only the AM
+ * bit(s) take part in matching. The re-probe inside the loop used the
+ * inverted mask, which ignores the AM bit and can match ordinary messages
+ * whose other tag bits are zero.
+ *
+ * netmod_context and blocking are not used. Returns MPI_SUCCESS.
+ */
+static inline int MPIDI_NM_progress(void *netmod_context, int blocking)
+{
+    int mpi_errno = MPI_SUCCESS;
+    ucp_tag_recv_info_t info;
+    MPIDI_UCX_ucp_request_t *ucp_request;
+    void *am_buf;
+    ucp_tag_message_h message_handle;
+
+    /* check for active messages */
+    while ((message_handle =
+            ucp_tag_probe_nb(MPIDI_UCX_global.worker, MPIDI_UCX_AM_TAG,
+                             MPIDI_UCX_AM_TAG, 1, &info)) != NULL) {
+        am_buf = MPL_malloc(info.length);
+        ucp_request = (MPIDI_UCX_ucp_request_t *) ucp_tag_msg_recv_nb(MPIDI_UCX_global.worker,
+                                                                      am_buf,
+                                                                      info.length,
+                                                                      ucp_dt_make_contig(1),
+                                                                      message_handle,
+                                                                      &MPIDI_UCX_Handle_am_recv);
+        /* An error pointer must not be polled or released as a request. */
+        MPIR_Assert(!UCS_PTR_IS_ERR(ucp_request));
+
+        /* The handler needs the whole message, so wait for it here. */
+        while (!ucp_request_is_completed(ucp_request)) {
+            ucp_worker_progress(MPIDI_UCX_global.worker);
+        }
+
+        ucp_request_release(ucp_request);
+        MPIDI_UCX_am_handler(am_buf, info.length);
+        MPL_free(am_buf);
+    }
+
+    ucp_worker_progress(MPIDI_UCX_global.worker);
+
+    /* NOTE(review): there is no matching MPID_THREAD_CS_ENTER in this
+     * function -- confirm whether the caller holds this critical section. */
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_THREAD_WORKER_MUTEX);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#endif /* NETMOD_UCX_PROGRESS_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_recv.h b/src/mpid/ch4/netmod/ucx/ucx_recv.h
new file mode 100644
index 0000000..11eb7cf
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_recv.h
@@ -0,0 +1,244 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_RECV_H_INCLUDED
+#define NETMOD_UCX_RECV_H_INCLUDED
+
+#include "ucx_impl.h"
+
+/*
+ * Post a non-blocking tagged receive of data_sz contiguous bytes into buf.
+ *
+ * The UCX tag and mask are derived from (tag, rank) and from
+ * comm->recvcontext_id + context_offset.  On return *request holds the MPI
+ * request tracking the receive, or NULL if the UCX request check failed.
+ * Returns mpi_errno (MPI_SUCCESS unless the check macro changed it).
+ */
+__ALWAYS_INLINE__ int ucx_irecv_continous(void *buf,
+                                          size_t data_sz,
+                                          int rank,
+                                          int tag,
+                                          MPIR_Comm * comm,
+                                          int context_offset, MPIR_Request ** request)
+{
+
+    int mpi_errno = MPI_SUCCESS;
+    uint64_t ucp_tag, tag_mask;
+    /* Start from NULL: fn_exit stores req into *request unconditionally, and
+     * the request check below presumably jumps to fn_fail on error (TODO
+     * confirm), in which case req used to be indeterminate. */
+    MPIR_Request *req = NULL;
+    MPIDI_UCX_ucp_request_t *ucp_request;
+//    MPID_THREAD_CS_ENTER(POBJ,MPIDI_THREAD_WORKER_MUTEX);
+    tag_mask = MPIDI_UCX_tag_mask(tag, rank);
+    ucp_tag = MPIDI_UCX_recv_tag(tag, rank, comm->recvcontext_id + context_offset);
+
+    ucp_request = (MPIDI_UCX_ucp_request_t *) ucp_tag_recv_nb(MPIDI_UCX_global.worker,
+                                                              buf, data_sz, ucp_dt_make_contig(1),
+                                                              ucp_tag, tag_mask,
+                                                              &MPIDI_UCX_Handle_recv_callback);
+
+
+    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+
+
+    if (ucp_request->req == NULL) {
+        /* No MPI request attached yet: create one, take an extra reference
+         * and link it with the UCX request in both directions. */
+        req = MPIR_Request_create(MPIR_REQUEST_KIND__RECV);
+        MPIR_Request_add_ref(req);
+        MPIDI_UCX_REQ(req).a.ucp_request = ucp_request;
+        ucp_request->req = req;
+        ucp_request_release(ucp_request);
+    }
+    else {
+        /* An MPI request is already attached (MPIDI_UCX_Handle_recv_callback
+         * creates one when it finds none): adopt it and break the link. */
+        req = ucp_request->req;
+        ucp_request->req = NULL;
+        MPIDI_UCX_REQ(req).a.ucp_request = NULL;
+        ucp_request_release(ucp_request);
+    }
+  fn_exit:
+    *request = req;
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/*
+ * Post a non-blocking tagged receive of count elements described by the UCX
+ * datatype stored in datatype->dev.netmod.ucx.ucp_datatype.
+ *
+ * The UCX tag and mask are derived from (tag, rank) and from
+ * comm->recvcontext_id + context_offset.  On return *request holds the MPI
+ * request tracking the receive, or NULL if the UCX request check failed.
+ * Returns mpi_errno (MPI_SUCCESS unless the check macro changed it).
+ */
+__ALWAYS_INLINE__ int ucx_irecv_non_continous(void *buf,
+                                              size_t count,
+                                              int rank,
+                                              int tag,
+                                              MPIR_Comm * comm,
+                                              int context_offset, MPIR_Request ** request,
+                                              MPIR_Datatype * datatype)
+{
+
+    int mpi_errno = MPI_SUCCESS;
+    uint64_t ucp_tag, tag_mask;
+    /* Start from NULL: fn_exit stores req into *request unconditionally, and
+     * the request check below presumably jumps to fn_fail on error (TODO
+     * confirm), in which case req used to be indeterminate. */
+    MPIR_Request *req = NULL;
+    MPIDI_UCX_ucp_request_t *ucp_request;
+//    MPID_THREAD_CS_ENTER(POBJ,MPIDI_THREAD_WORKER_MUTEX);
+    tag_mask = MPIDI_UCX_tag_mask(tag, rank);
+    ucp_tag = MPIDI_UCX_recv_tag(tag, rank, comm->recvcontext_id + context_offset);
+
+    ucp_request = (MPIDI_UCX_ucp_request_t *) ucp_tag_recv_nb(MPIDI_UCX_global.worker,
+                                                              buf, count,
+                                                              datatype->dev.netmod.ucx.ucp_datatype,
+                                                              ucp_tag, tag_mask,
+                                                              &MPIDI_UCX_Handle_recv_callback);
+
+
+    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+
+
+    if (ucp_request->req == NULL) {
+        /* No MPI request attached yet: create one, take an extra reference
+         * and link it with the UCX request in both directions. */
+        req = MPIR_Request_create(MPIR_REQUEST_KIND__RECV);
+        MPIR_Request_add_ref(req);
+        ucp_request->req = req;
+        MPIDI_UCX_REQ(req).a.ucp_request = ucp_request;
+        ucp_request_release(ucp_request);
+    }
+    else {
+        /* An MPI request is already attached (MPIDI_UCX_Handle_recv_callback
+         * creates one when it finds none): adopt it and break the link. */
+        req = ucp_request->req;
+
+        MPIDI_UCX_REQ(req).a.ucp_request = NULL;
+        ucp_request->req = NULL;
+        ucp_request_release(ucp_request);
+    }
+    /* reached only on the success path, where req is always valid */
+    (req)->kind = MPIR_REQUEST_KIND__RECV;
+  fn_exit:
+    *request = req;
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/*
+ * Common receive path: inspects the datatype and dispatches to the
+ * contiguous or the non-contiguous UCX receive helper.
+ *
+ * For contiguous data the buffer is advanced by the datatype's true lower
+ * bound and the total byte size is passed on; otherwise the original buffer,
+ * the element count and the datatype object are handed to the helper.
+ * Returns the helper's error code.
+ */
+static inline int do_irecv(void *buf,
+                           int count,
+                           MPI_Datatype datatype,
+                           int rank,
+                           int tag, MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    size_t data_sz;
+    int dt_contig;
+    MPI_Aint dt_true_lb;
+
+    MPIR_Datatype *dt_ptr;
+
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
+    if (dt_contig)
+        /* byte-wise offset: arithmetic on void * is a compiler extension, so
+         * go through char * */
+        mpi_errno =
+            ucx_irecv_continous((char *) buf + dt_true_lb, data_sz, rank, tag, comm,
+                                context_offset, request);
+    else
+        mpi_errno =
+            ucx_irecv_non_continous(buf, count, rank, tag, comm, context_offset, request, dt_ptr);
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+/*
+ * Netmod receive entry point.  Delegates to do_irecv(); the status argument
+ * is not touched here.
+ */
+__ALWAYS_INLINE__ int MPIDI_NM_recv(void *buf,
+                                    int count,
+                                    MPI_Datatype datatype,
+                                    int rank,
+                                    int tag,
+                                    MPIR_Comm * comm,
+                                    int context_offset,
+                                    MPI_Status * status, MPIR_Request ** request)
+{
+    int rc;
+
+    rc = do_irecv(buf, count, datatype, rank, tag, comm, context_offset, request);
+    return rc;
+}
+
+/*
+ * Netmod persistent-receive initialisation.  No UCX-specific work is done:
+ * the call is forwarded unchanged to MPIDI_CH4U_recv_init().
+ */
+__ALWAYS_INLINE__ int MPIDI_NM_recv_init(void *buf,
+                                         int count,
+                                         MPI_Datatype datatype,
+                                         int rank,
+                                         int tag,
+                                         MPIR_Comm * comm,
+                                         int context_offset, MPIR_Request ** request)
+{
+    int rc;
+
+    rc = MPIDI_CH4U_recv_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+    return rc;
+}
+
+/*
+ * Non-blocking receive of a previously matched message.
+ *
+ * message carries the UCX message handle in MPIDI_UCX_REQ(message).a; the
+ * payload is received into buf as count elements of datatype.  On return
+ * *rreqp holds the MPI request tracking the receive, or NULL if the UCX
+ * request check failed.  Returns mpi_errno.
+ *
+ * A NULL message yields an already-completed request.
+ */
+__ALWAYS_INLINE__ int MPIDI_NM_imrecv(void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      MPIR_Request * message, MPIR_Request ** rreqp)
+{
+    ucp_tag_message_h message_handler;
+    int mpi_errno = MPI_SUCCESS;
+    size_t data_sz;
+    int dt_contig;
+    /* Start from NULL: fn_exit stores req into *rreqp unconditionally, also
+     * when the request check below bails out through fn_fail. */
+    MPIR_Request *req = NULL;
+    MPI_Aint dt_true_lb;
+    MPIDI_UCX_ucp_request_t *ucp_request;
+
+    MPIR_Datatype *dt_ptr;
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
+    if (message == NULL) {
+        /* The previous code completed and returned an uninitialised request
+         * pointer here.  Build a real, already-completed request instead.
+         * NOTE(review): a NULL message presumably stands for
+         * MPI_MESSAGE_NO_PROC, hence the PROC_NULL-style status - confirm
+         * against the caller. */
+        req = MPIR_Request_create(MPIR_REQUEST_KIND__RECV);
+        MPIR_cc_set(&req->cc, 0);
+        req->status.MPI_SOURCE = MPI_PROC_NULL;
+        req->status.MPI_TAG = MPI_ANY_TAG;
+        MPIR_STATUS_SET_COUNT(req->status, 0);
+        MPIDI_UCX_REQ(req).a.ucp_request = NULL;
+
+        goto fn_exit;
+    }
+
+    message_handler = MPIDI_UCX_REQ(message).a.message_handler;
+    if (dt_contig)
+        /* byte-wise offset: arithmetic on void * is a compiler extension, so
+         * go through char * */
+        ucp_request = (MPIDI_UCX_ucp_request_t *) ucp_tag_msg_recv_nb(MPIDI_UCX_global.worker,
+                                                                      (char *) buf + dt_true_lb,
+                                                                      data_sz,
+                                                                      ucp_dt_make_contig(1),
+                                                                      message_handler,
+                                                                      &MPIDI_UCX_Handle_recv_callback);
+    else
+        ucp_request = (MPIDI_UCX_ucp_request_t *) ucp_tag_msg_recv_nb(MPIDI_UCX_global.worker,
+                                                                      buf, count,
+                                                                      dt_ptr->dev.netmod.ucx.
+                                                                      ucp_datatype, message_handler,
+                                                                      &MPIDI_UCX_Handle_recv_callback);
+
+
+    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+
+    if (ucp_request->req == NULL) {
+        /* No MPI request attached yet: create one, take an extra reference
+         * and link it with the UCX request in both directions, exactly as the
+         * irecv helpers do, so MPIDI_NM_cancel_recv reads a defined value. */
+        req = MPIR_Request_create(MPIR_REQUEST_KIND__RECV);
+        MPIR_Request_add_ref(req);
+        MPIDI_UCX_REQ(req).a.ucp_request = ucp_request;
+        ucp_request->req = req;
+        ucp_request_release(ucp_request);
+    }
+    else {
+        /* An MPI request is already attached (MPIDI_UCX_Handle_recv_callback
+         * creates one when it finds none): adopt it and break the link. */
+        req = ucp_request->req;
+        ucp_request->req = NULL;
+        MPIDI_UCX_REQ(req).a.ucp_request = NULL;
+        ucp_request_release(ucp_request);
+    }
+
+
+  fn_exit:
+    *rreqp = req;
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/*
+ * Netmod non-blocking receive entry point.  Delegates to do_irecv() and
+ * returns its error code.
+ */
+__ALWAYS_INLINE__ int MPIDI_NM_irecv(void *buf,
+                                     int count,
+                                     MPI_Datatype datatype,
+                                     int rank,
+                                     int tag,
+                                     MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    int rc;
+
+    rc = do_irecv(buf, count, datatype, rank, tag, comm, context_offset, request);
+    return rc;
+}
+
+/*
+ * Request cancellation of a posted receive.
+ *
+ * If rreq still references a UCX request, ucp_request_cancel() is invoked on
+ * it; otherwise nothing is done.  Always returns MPI_SUCCESS (the function
+ * previously fell off its end without returning a value although it is
+ * declared to return int).
+ */
+static inline int MPIDI_NM_cancel_recv(MPIR_Request * rreq)
+{
+
+    if (MPIDI_UCX_REQ(rreq).a.ucp_request) {
+        ucp_request_cancel(MPIDI_UCX_global.worker, MPIDI_UCX_REQ(rreq).a.ucp_request);
+    }
+
+    return MPI_SUCCESS;
+}
+
+#endif /* NETMOD_UCX_RECV_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_request.h b/src/mpid/ch4/netmod/ucx/ucx_request.h
new file mode 100644
index 0000000..35779da
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_request.h
@@ -0,0 +1,116 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_REQUEST_H_INCLUDED
+#define NETMOD_UCX_REQUEST_H_INCLUDED
+
+#include "ucx_impl.h"
+#include "mpidch4.h"
+#include <ucp/api/ucp.h>
+#include "mpidch4r.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_request_release
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Initialise the UCX active-message part of req: no pack buffer is attached. */
+static inline void MPIDI_NM_am_request_init(MPIR_Request * req)
+{
+    req->dev.ch4.ch4u.netmod_am.ucx.pack_buffer = NULL;
+}
+
+/*
+ * Tear down the UCX active-message part of req: frees the pack buffer when
+ * one is attached.  The request itself is not released here.
+ */
+static inline void MPIDI_NM_am_request_finalize(MPIR_Request * req)
+{
+    if (req->dev.ch4.ch4u.netmod_am.ucx.pack_buffer == NULL)
+        return;
+
+    MPL_free(req->dev.ch4.ch4u.netmod_am.ucx.pack_buffer);
+    /* MPIDI_CH4U_request_release(req); */
+}
+
+/*
+ * Initialiser for a UCX request object: marks it as having no MPI request
+ * attached.  request is interpreted as an MPIDI_UCX_ucp_request_t.
+ */
+static inline void MPIDI_UCX_Request_init_callback(void *request)
+{
+    ((MPIDI_UCX_ucp_request_t *) request)->req = NULL;
+}
+
+/*
+ * Completion callback for UCX tagged sends.
+ *
+ * request is the UCX request (an MPIDI_UCX_ucp_request_t), status the UCX
+ * completion status.  Three cases are handled:
+ *   - cancelled: flag the attached MPI request as cancelled and complete it;
+ *   - MPI request attached: decrement its completion counter, free it when
+ *     the counter reaches zero, and detach it;
+ *   - nothing attached: create an already-completed send request and attach
+ *     it, so that the code that issued the send can pick it up.
+ *
+ * NOTE(review): UCX error statuses other than UCS_ERR_CANCELED are not
+ * inspected.
+ */
+static inline void MPIDI_UCX_Handle_send_callback(void *request, ucs_status_t status)
+{
+    int c;
+    MPIDI_UCX_ucp_request_t *ucp_request = (MPIDI_UCX_ucp_request_t *) request;
+    MPIR_Request *req = NULL;
+
+    if (unlikely(status == UCS_ERR_CANCELED)) {
+        req = ucp_request->req;
+        /* Publish the cancel bit BEFORE signalling completion: once the
+         * request is completed it may be observed (or released) elsewhere,
+         * so its status must not be modified afterwards. */
+        MPIR_STATUS_SET_CANCEL_BIT(req->status, TRUE);
+        MPIDI_CH4U_request_complete(req);
+        ucp_request->req = NULL;
+        return;
+    }
+
+    if (ucp_request->req) {
+        req = ucp_request->req;
+        MPIR_cc_decr(req->cc_ptr, &c);
+        MPIR_Assert(c >= 0);
+
+        if (c == 0) {
+            MPIR_Request_free(req);
+        }
+        ucp_request->req = NULL;
+    }
+    else {
+        req = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
+        MPIR_cc_set(&req->cc, 0);
+        ucp_request->req = req;
+    }
+    /* The former fn_fail block was unreachable and stored an uninitialised
+     * mpi_errno into the request status; it has been removed. */
+}
+
+/*
+ * Completion callback for UCX tagged receives.
+ *
+ * request is the UCX request (an MPIDI_UCX_ucp_request_t), status the UCX
+ * completion status and info the description of the received message.
+ * Three cases are handled:
+ *   - cancelled: flag the attached MPI request as cancelled and complete it;
+ *   - nothing attached: create an already-completed receive request, fill in
+ *     source, tag and count from info, and attach it so that the code that
+ *     posted the receive can pick it up;
+ *   - MPI request attached: fill in its status, complete it and detach it.
+ *
+ * NOTE(review): UCX error statuses other than UCS_ERR_CANCELED are not
+ * inspected.
+ */
+static inline void MPIDI_UCX_Handle_recv_callback(void *request, ucs_status_t status,
+                                                  ucp_tag_recv_info_t * info)
+{
+    MPI_Aint count;
+    MPIDI_UCX_ucp_request_t *ucp_request = (MPIDI_UCX_ucp_request_t *) request;
+    MPIR_Request *rreq = NULL;
+
+    if (unlikely(status == UCS_ERR_CANCELED)) {
+        rreq = ucp_request->req;
+        /* Publish the cancel bit BEFORE signalling completion: once the
+         * request is completed it may be observed (or released) elsewhere,
+         * so its status must not be modified afterwards. */
+        MPIR_STATUS_SET_CANCEL_BIT(rreq->status, TRUE);
+        MPIDI_CH4U_request_complete(rreq);
+        ucp_request->req = NULL;
+        return;
+    }
+
+    if (!ucp_request->req) {
+        rreq = MPIR_Request_create(MPIR_REQUEST_KIND__RECV);
+        MPIR_cc_set(&rreq->cc, 0);
+        rreq->status.MPI_SOURCE = MPIDI_UCX_get_source(info->sender_tag);
+        rreq->status.MPI_TAG = MPIDI_UCX_get_tag(info->sender_tag);
+        count = info->length;
+        MPIR_STATUS_SET_COUNT(rreq->status, count);
+        ucp_request->req = rreq;
+    }
+    else {
+        rreq = ucp_request->req;
+        rreq->status.MPI_ERROR = MPI_SUCCESS;
+        rreq->status.MPI_SOURCE = MPIDI_UCX_get_source(info->sender_tag);
+        rreq->status.MPI_TAG = MPIDI_UCX_get_tag(info->sender_tag);
+        count = info->length;
+        MPIR_STATUS_SET_COUNT(rreq->status, count);
+        MPIDI_CH4U_request_complete(rreq);
+        ucp_request->req = NULL;
+    }
+    /* The former fn_fail block was unreachable and stored an uninitialised
+     * mpi_errno into the request status; it has been removed. */
+}
+
+#endif /* NETMOD_UCX_REQUEST_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_rma.h b/src/mpid/ch4/netmod/ucx/ucx_rma.h
new file mode 100644
index 0000000..88aa9af
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_rma.h
@@ -0,0 +1,299 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_RMA_H_INCLUDED
+#define NETMOD_UCX_RMA_H_INCLUDED
+
+#include "ucx_impl.h"
+
+/*
+ * Issue a UCX put of size contiguous bytes from origin_addr to the window
+ * memory of target_rank.
+ *
+ * The remote address is the target's registered base address plus
+ * target_disp scaled by the target's displacement value plus true_lb.  When
+ * UCX reports the operation as still in progress the window is flagged as
+ * needing a local flush.  Returns mpi_errno.
+ */
+static inline int MPIDI_UCX_contig_put(const void *origin_addr,
+                                       size_t size,
+                                       int target_rank,
+                                       MPI_Aint target_disp, MPI_Aint true_lb, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    ucs_status_t status;
+    MPIDI_UCX_win_info_t *target_info;
+    MPIR_Comm *win_comm;
+    ucp_ep_h ep;
+    uint64_t remote_base;
+    size_t remote_off;
+
+    target_info = &(MPIDI_UCX_WIN_INFO(win, target_rank));
+    win_comm = win->comm_ptr;
+    ep = MPIDI_UCX_COMM_TO_EP(win_comm, target_rank);
+
+    MPIDI_CH4U_EPOCH_START_CHECK(win, mpi_errno, goto fn_fail);
+
+    remote_base = target_info->addr;
+    remote_off = target_disp * target_info->disp + true_lb;
+
+    status = ucp_put_nbi(ep, origin_addr, size, remote_base + remote_off, target_info->rkey);
+    if (status == UCS_INPROGRESS)
+        MPIDI_UCX_WIN(win).need_local_flush = 1;
+    else
+        MPIDI_UCX_CHK_STATUS(status, ucp_mem_map);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/*
+ * Issue a UCX get of size contiguous bytes from the window memory of
+ * target_rank into origin_addr.
+ *
+ * The remote address is the target's registered base address plus
+ * target_disp scaled by the target's displacement value plus true_lb.  When
+ * UCX reports the operation as still in progress the window is flagged as
+ * needing a local flush.  Returns mpi_errno.
+ */
+static inline int MPIDI_UCX_contig_get(void *origin_addr,
+                                       size_t size,
+                                       int target_rank,
+                                       MPI_Aint target_disp, MPI_Aint true_lb, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    ucs_status_t status;
+    MPIDI_UCX_win_info_t *target_info;
+    MPIR_Comm *win_comm;
+    ucp_ep_h ep;
+    uint64_t remote_base;
+    size_t remote_off;
+
+    target_info = &(MPIDI_UCX_WIN_INFO(win, target_rank));
+    win_comm = win->comm_ptr;
+    ep = MPIDI_UCX_COMM_TO_EP(win_comm, target_rank);
+
+    MPIDI_CH4U_EPOCH_START_CHECK(win, mpi_errno, goto fn_fail);
+
+    remote_base = target_info->addr;
+    remote_off = target_disp * target_info->disp + true_lb;
+
+    status = ucp_get_nbi(ep, origin_addr, size, remote_base + remote_off, target_info->rkey);
+    if (status == UCS_INPROGRESS)
+        MPIDI_UCX_WIN(win).need_local_flush = 1;
+    else
+        MPIDI_UCX_CHK_STATUS(status, ucp_mem_map);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/*
+ * Netmod MPI_Put.
+ *
+ * Falls back to MPIDI_CH4U_put() for dynamic and shared windows, for
+ * non-contiguous origin or target datatypes, and when no remote key is known
+ * for target_rank.  A put to the calling rank itself is served with
+ * MPIR_Localcopy(); everything else goes through MPIDI_UCX_contig_put().
+ * Zero-byte transfers and MPI_PROC_NULL targets return immediately.
+ *
+ * Returns MPI_SUCCESS or an MPI error code (MPI_ERR_SIZE class when origin
+ * and target byte counts differ).
+ */
+static inline int MPIDI_NM_put(const void *origin_addr,
+                               int origin_count,
+                               MPI_Datatype origin_datatype,
+                               int target_rank,
+                               MPI_Aint target_disp,
+                               int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    int target_contig, origin_contig, mpi_errno = MPI_SUCCESS;
+    size_t target_bytes, origin_bytes;
+    MPI_Aint origin_true_lb, target_true_lb;
+    size_t offset;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_UCX_PUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_UCX_PUT);
+
+    /* Every exit goes through fn_exit so that the ENTER above is always
+     * paired with an EXIT; the early returns used to skip it. */
+    if (win->create_flavor == MPI_WIN_FLAVOR_DYNAMIC || win->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+        mpi_errno = MPIDI_CH4U_put(origin_addr, origin_count, origin_datatype,
+                                   target_rank, target_disp, target_count, target_datatype, win);
+        goto fn_exit;
+    }
+
+    MPIDI_Datatype_check_contig_size_lb(target_datatype, target_count,
+                                        target_contig, target_bytes, target_true_lb);
+    MPIDI_Datatype_check_contig_size_lb(origin_datatype, origin_count,
+                                        origin_contig, origin_bytes, origin_true_lb);
+
+    MPIR_ERR_CHKANDJUMP((origin_bytes != target_bytes), mpi_errno, MPI_ERR_SIZE, "**rmasize");
+
+    if (unlikely((origin_bytes == 0) || (target_rank == MPI_PROC_NULL)))
+        goto fn_exit;
+    if (!target_contig || !origin_contig || MPIDI_UCX_WIN_INFO(win, target_rank).rkey == NULL) {
+        mpi_errno = MPIDI_CH4U_put(origin_addr, origin_count, origin_datatype,
+                                   target_rank, target_disp, target_count, target_datatype, win);
+        goto fn_exit;
+    }
+
+    MPIDI_CH4U_EPOCH_CHECK_SYNC(win, mpi_errno, goto fn_fail);
+
+    if (target_rank == win->comm_ptr->rank) {
+        /* target is the calling process: plain local copy into the window */
+        offset = win->disp_unit * target_disp;
+        mpi_errno = MPIR_Localcopy(origin_addr,
+                                   origin_count,
+                                   origin_datatype,
+                                   (char *) win->base + offset, target_count, target_datatype);
+        goto fn_exit;
+    }
+
+    /* byte-wise offset: arithmetic on void * is a compiler extension, so go
+     * through char * */
+    mpi_errno = MPIDI_UCX_contig_put((const char *) origin_addr + origin_true_lb, origin_bytes,
+                                     target_rank, target_disp, target_true_lb, win);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_UCX_PUT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+/*
+ * Netmod MPI_Get.
+ *
+ * Falls back to MPIDI_CH4U_get() for dynamic and shared windows, for
+ * non-contiguous origin or target datatypes, and when no remote key is known
+ * for target_rank.  A get from the calling rank itself is served with
+ * MPIR_Localcopy(); everything else goes through MPIDI_UCX_contig_get().
+ * Zero-byte transfers and MPI_PROC_NULL targets return immediately.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ * NOTE(review): unlike MPIDI_NM_put, origin and target byte counts are not
+ * compared here - confirm whether that is intended.
+ */
+static inline int MPIDI_NM_get(void *origin_addr,
+                               int origin_count,
+                               MPI_Datatype origin_datatype,
+                               int target_rank,
+                               MPI_Aint target_disp,
+                               int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{
+
+
+    int origin_contig, target_contig, mpi_errno = MPI_SUCCESS;
+    size_t origin_bytes, target_bytes;
+    size_t offset;
+
+    MPI_Aint origin_true_lb, target_true_lb;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_UCX_GET);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_UCX_GET);
+
+    /* Every exit goes through fn_exit so that the ENTER above is always
+     * paired with an EXIT; the early returns used to skip it. */
+    if (win->create_flavor == MPI_WIN_FLAVOR_DYNAMIC || win->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+        mpi_errno = MPIDI_CH4U_get(origin_addr, origin_count, origin_datatype,
+                                   target_rank, target_disp, target_count, target_datatype, win);
+        goto fn_exit;
+    }
+
+    MPIDI_Datatype_check_contig_size_lb(target_datatype, target_count,
+                                        target_contig, target_bytes, target_true_lb);
+    MPIDI_Datatype_check_contig_size_lb(origin_datatype, origin_count,
+                                        origin_contig, origin_bytes, origin_true_lb);
+
+    if (unlikely((origin_bytes == 0) || (target_rank == MPI_PROC_NULL)))
+        goto fn_exit;
+
+    if (!origin_contig || !target_contig || MPIDI_UCX_WIN_INFO(win, target_rank).rkey == NULL) {
+        mpi_errno = MPIDI_CH4U_get(origin_addr, origin_count, origin_datatype,
+                                   target_rank, target_disp, target_count, target_datatype, win);
+        goto fn_exit;
+    }
+
+    MPIDI_CH4U_EPOCH_CHECK_SYNC(win, mpi_errno, goto fn_fail);
+
+    if (target_rank == win->comm_ptr->rank) {
+        /* target is the calling process: plain local copy out of the window */
+        offset = target_disp * win->disp_unit;
+        mpi_errno = MPIR_Localcopy((char *) win->base + offset,
+                                   target_count,
+                                   target_datatype, origin_addr, origin_count, origin_datatype);
+        goto fn_exit;
+    }
+
+    /* byte-wise offset: arithmetic on void * is a compiler extension, so go
+     * through char * */
+    mpi_errno = MPIDI_UCX_contig_get((char *) origin_addr + origin_true_lb, origin_bytes,
+                                     target_rank, target_disp, target_true_lb, win);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_UCX_GET);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+/* Netmod request-based put: forwarded unchanged to MPIDI_CH4U_rput(). */
+static inline int MPIDI_NM_rput(const void *origin_addr,
+                                int origin_count,
+                                MPI_Datatype origin_datatype,
+                                int target_rank,
+                                MPI_Aint target_disp,
+                                int target_count,
+                                MPI_Datatype target_datatype,
+                                MPIR_Win * win, MPIR_Request ** request)
+{
+    int rc;
+
+    rc = MPIDI_CH4U_rput(origin_addr, origin_count, origin_datatype,
+                         target_rank, target_disp, target_count, target_datatype, win, request);
+    return rc;
+}
+
+
+/* Netmod compare-and-swap: forwarded unchanged to MPIDI_CH4U_compare_and_swap(). */
+static inline int MPIDI_NM_compare_and_swap(const void *origin_addr,
+                                            const void *compare_addr,
+                                            void *result_addr,
+                                            MPI_Datatype datatype,
+                                            int target_rank, MPI_Aint target_disp, MPIR_Win * win)
+{
+    int rc;
+
+    rc = MPIDI_CH4U_compare_and_swap(origin_addr, compare_addr, result_addr,
+                                     datatype, target_rank, target_disp, win);
+    return rc;
+}
+
+/* Netmod request-based accumulate: forwarded unchanged to MPIDI_CH4U_raccumulate(). */
+static inline int MPIDI_NM_raccumulate(const void *origin_addr,
+                                       int origin_count,
+                                       MPI_Datatype origin_datatype,
+                                       int target_rank,
+                                       MPI_Aint target_disp,
+                                       int target_count,
+                                       MPI_Datatype target_datatype,
+                                       MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{
+    int rc;
+
+    rc = MPIDI_CH4U_raccumulate(origin_addr, origin_count, origin_datatype,
+                                target_rank, target_disp, target_count,
+                                target_datatype, op, win, request);
+    return rc;
+}
+
+/* Netmod request-based get-accumulate: forwarded unchanged to
+ * MPIDI_CH4U_rget_accumulate(). */
+static inline int MPIDI_NM_rget_accumulate(const void *origin_addr,
+                                           int origin_count,
+                                           MPI_Datatype origin_datatype,
+                                           void *result_addr,
+                                           int result_count,
+                                           MPI_Datatype result_datatype,
+                                           int target_rank,
+                                           MPI_Aint target_disp,
+                                           int target_count,
+                                           MPI_Datatype target_datatype,
+                                           MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{
+    int rc;
+
+    rc = MPIDI_CH4U_rget_accumulate(origin_addr, origin_count, origin_datatype,
+                                    result_addr, result_count, result_datatype,
+                                    target_rank, target_disp, target_count,
+                                    target_datatype, op, win, request);
+    return rc;
+}
+
+/* Netmod fetch-and-op: forwarded unchanged to MPIDI_CH4U_fetch_and_op(). */
+static inline int MPIDI_NM_fetch_and_op(const void *origin_addr,
+                                        void *result_addr,
+                                        MPI_Datatype datatype,
+                                        int target_rank,
+                                        MPI_Aint target_disp, MPI_Op op, MPIR_Win * win)
+{
+    int rc;
+
+    rc = MPIDI_CH4U_fetch_and_op(origin_addr, result_addr, datatype,
+                                 target_rank, target_disp, op, win);
+    return rc;
+}
+
+
+/* Netmod request-based get: forwarded unchanged to MPIDI_CH4U_rget(). */
+static inline int MPIDI_NM_rget(void *origin_addr,
+                                int origin_count,
+                                MPI_Datatype origin_datatype,
+                                int target_rank,
+                                MPI_Aint target_disp,
+                                int target_count,
+                                MPI_Datatype target_datatype,
+                                MPIR_Win * win, MPIR_Request ** request)
+{
+    int rc;
+
+    rc = MPIDI_CH4U_rget(origin_addr, origin_count, origin_datatype,
+                         target_rank, target_disp, target_count, target_datatype, win, request);
+    return rc;
+}
+
+
+/* Netmod get-accumulate: forwarded unchanged to MPIDI_CH4U_get_accumulate(). */
+static inline int MPIDI_NM_get_accumulate(const void *origin_addr,
+                                          int origin_count,
+                                          MPI_Datatype origin_datatype,
+                                          void *result_addr,
+                                          int result_count,
+                                          MPI_Datatype result_datatype,
+                                          int target_rank,
+                                          MPI_Aint target_disp,
+                                          int target_count,
+                                          MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win)
+{
+    int rc;
+
+    rc = MPIDI_CH4U_get_accumulate(origin_addr, origin_count, origin_datatype,
+                                   result_addr, result_count, result_datatype,
+                                   target_rank, target_disp, target_count,
+                                   target_datatype, op, win);
+    return rc;
+}
+
+/* Netmod accumulate: forwarded unchanged to MPIDI_CH4U_accumulate(). */
+static inline int MPIDI_NM_accumulate(const void *origin_addr,
+                                      int origin_count,
+                                      MPI_Datatype origin_datatype,
+                                      int target_rank,
+                                      MPI_Aint target_disp,
+                                      int target_count,
+                                      MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win)
+{
+    int rc;
+
+    rc = MPIDI_CH4U_accumulate(origin_addr, origin_count, origin_datatype,
+                               target_rank, target_disp, target_count, target_datatype, op, win);
+    return rc;
+}
+
+#endif /* NETMOD_UCX_RMA_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_send.h b/src/mpid/ch4/netmod/ucx/ucx_send.h
new file mode 100644
index 0000000..6b2f41c
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_send.h
@@ -0,0 +1,488 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_SEND_H_INCLUDED
+#define NETMOD_UCX_SEND_H_INCLUDED
+#include <ucp/api/ucp.h>
+#include "ucx_impl.h"
+#include "ucx_types.h"
+
+#undef FUNCNAME
+#define FUNCNAME ucx_send_continous
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Start a non-blocking tagged send of data_sz contiguous bytes from buf to
+ * rank in comm.
+ *
+ * The UCX tag is built from comm->context_id + context_offset, the sender's
+ * rank and tag.  On return *request holds the MPI request tracking the send,
+ * or NULL if the UCX request check failed.  have_request is not used.
+ * Returns mpi_errno.
+ */
+__ALWAYS_INLINE__ int ucx_send_continous(const void *buf,
+                                         size_t data_sz,
+                                         int rank,
+                                         int tag,
+                                         MPIR_Comm * comm, int context_offset,
+                                         MPIR_Request ** request, int have_request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    /* Start from NULL: fn_exit stores req into *request unconditionally, and
+     * the request check below presumably jumps to fn_fail on error (TODO
+     * confirm), in which case req used to be indeterminate. */
+    MPIR_Request *req = NULL;
+    MPIDI_UCX_ucp_request_t *ucp_request;
+    ucp_ep_h ep;
+    uint64_t ucx_tag;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SEND_CONTINOUS);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SEND_CONTINOUS);
+
+    ep = MPIDI_UCX_COMM_TO_EP(comm, rank);
+    ucx_tag = MPIDI_UCX_init_tag(comm->context_id + context_offset, comm->rank, tag);
+
+    ucp_request =
+        (MPIDI_UCX_ucp_request_t *) ucp_tag_send_nb(ep, buf, data_sz, ucp_dt_make_contig(1),
+                                                    ucx_tag, &MPIDI_UCX_Handle_send_callback);
+
+    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+
+    if (ucp_request == NULL) {
+        /* UCX returned no request object: hand back an already-completed
+         * MPI request */
+        req = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
+        MPIR_cc_set(&req->cc, 0);
+        MPIDI_UCX_REQ(req).a.ucp_request = NULL;
+        goto fn_exit;
+    }
+
+    if (ucp_request->req) {
+        /* An MPI request is already attached (MPIDI_UCX_Handle_send_callback
+         * creates one when it finds none): adopt it and break the link. */
+        req = ucp_request->req;
+        ucp_request->req = NULL;
+        MPIDI_UCX_REQ(req).a.ucp_request = NULL;
+        ucp_request_release(ucp_request);
+    }
+    else {
+        /* Send still pending: create the MPI request, take an extra
+         * reference and link it with the UCX request in both directions. */
+        req = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
+        MPIR_Request_add_ref(req);
+        ucp_request->req = req;
+        MPIDI_UCX_REQ(req).a.ucp_request = ucp_request;
+        ucp_request_release(ucp_request);
+    }
+
+
+  fn_exit:
+    *request = req;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SEND_CONTINOUS);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+/*
+ * Start a non-blocking synchronous tagged send of data_sz contiguous bytes
+ * from buf to rank in comm.
+ *
+ * The UCX tag is built from comm->context_id + context_offset, the sender's
+ * rank and tag.  On return *request holds the MPI request tracking the send,
+ * or NULL if the UCX request check failed.  have_request is not used.
+ * Returns mpi_errno.
+ */
+__ALWAYS_INLINE__ int ucx_sync_send_continous(const void *buf,
+                                              size_t data_sz,
+                                              int rank,
+                                              int tag,
+                                              MPIR_Comm * comm, int context_offset,
+                                              MPIR_Request ** request, int have_request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    /* Start from NULL: fn_exit stores req into *request unconditionally, and
+     * the request check below presumably jumps to fn_fail on error (TODO
+     * confirm), in which case req used to be indeterminate. */
+    MPIR_Request *req = NULL;
+    MPIDI_UCX_ucp_request_t *ucp_request;
+    ucp_ep_h ep;
+    uint64_t ucx_tag;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SEND_CONTINOUS);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SEND_CONTINOUS);
+
+    ep = MPIDI_UCX_COMM_TO_EP(comm, rank);
+    ucx_tag = MPIDI_UCX_init_tag(comm->context_id + context_offset, comm->rank, tag);
+
+    ucp_request =
+        (MPIDI_UCX_ucp_request_t *) ucp_tag_send_sync_nb(ep, buf, data_sz, ucp_dt_make_contig(1),
+                                                         ucx_tag, &MPIDI_UCX_Handle_send_callback);
+
+    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+    if (ucp_request->req) {
+        /* An MPI request is already attached (MPIDI_UCX_Handle_send_callback
+         * creates one when it finds none): adopt it and break the link. */
+        req = ucp_request->req;
+        ucp_request->req = NULL;
+        MPIDI_UCX_REQ(req).a.ucp_request = NULL;
+        ucp_request_release(ucp_request);
+    }
+    else {
+        /* Send still pending: create the MPI request, take an extra
+         * reference and link it with the UCX request in both directions. */
+        req = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
+        MPIR_Request_add_ref(req);
+        ucp_request->req = req;
+        MPIDI_UCX_REQ(req).a.ucp_request = ucp_request;
+        ucp_request_release(ucp_request);
+    }
+
+
+  fn_exit:
+    *request = req;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SEND_CONTINOUS);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+/*
+ * Start a non-blocking synchronous tagged send of count elements described
+ * by the UCX datatype stored in datatype->dev.netmod.ucx.ucp_datatype.
+ *
+ * The UCX tag is built from comm->context_id + context_offset, the sender's
+ * rank and tag.  On return *request holds the MPI request tracking the send,
+ * or NULL if the UCX request check failed.  have_request is not used.
+ * Returns mpi_errno.
+ */
+__ALWAYS_INLINE__ int ucx_sync_send_non_continous(const void *buf,
+                                                  size_t count,
+                                                  int rank,
+                                                  int tag,
+                                                  MPIR_Comm * comm, int context_offset,
+                                                  MPIR_Request ** request, int have_request,
+                                                  MPIR_Datatype * datatype)
+{
+    int mpi_errno = MPI_SUCCESS;
+    /* Start from NULL: fn_exit stores req into *request unconditionally, and
+     * the request check below presumably jumps to fn_fail on error (TODO
+     * confirm), in which case req used to be indeterminate. */
+    MPIR_Request *req = NULL;
+    MPIDI_UCX_ucp_request_t *ucp_request;
+    ucp_ep_h ep;
+    uint64_t ucx_tag;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SEND_CONTINOUS);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SEND_CONTINOUS);
+
+    ep = MPIDI_UCX_COMM_TO_EP(comm, rank);
+    ucx_tag = MPIDI_UCX_init_tag(comm->context_id + context_offset, comm->rank, tag);
+
+    ucp_request =
+        (MPIDI_UCX_ucp_request_t *) ucp_tag_send_sync_nb(ep, buf, count,
+                                                         datatype->dev.netmod.ucx.ucp_datatype,
+                                                         ucx_tag, &MPIDI_UCX_Handle_send_callback);
+
+    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+
+    /* Bookkeeping is finished before the UCX request is released, matching
+     * the order used by the sibling send helpers. */
+    if (ucp_request->req) {
+        /* An MPI request is already attached (MPIDI_UCX_Handle_send_callback
+         * creates one when it finds none): adopt it and break the link. */
+        req = ucp_request->req;
+        ucp_request->req = NULL;
+        MPIDI_UCX_REQ(req).a.ucp_request = NULL;
+        ucp_request_release(ucp_request);
+    }
+    else {
+        /* Send still pending: create the MPI request, take an extra
+         * reference and link it with the UCX request in both directions. */
+        req = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
+        MPIR_Request_add_ref(req);
+        ucp_request->req = req;
+        MPIDI_UCX_REQ(req).a.ucp_request = ucp_request;
+        ucp_request_release(ucp_request);
+    }
+
+
+  fn_exit:
+    *request = req;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SEND_CONTINOUS);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+
+
+#undef FUNCNAME
+#define FUNCNAME ucx_send_non_continous
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+
+/*
+ * ucx_send_non_continous - post a standard tagged send whose layout is
+ * described by the UCP datatype stored in the MPIR datatype.
+ *
+ * buf/count are handed to ucp_tag_send_nb() together with
+ * datatype->dev.netmod.ucx.ucp_datatype.  The UCX tag is built by
+ * MPIDI_UCX_init_tag() from comm->context_id + context_offset, comm->rank and
+ * tag.  On return *request holds the MPI request associated with the send, or
+ * NULL when an error path was taken before a request was obtained.
+ * have_request is not used by this function.
+ *
+ * Returns mpi_errno (initialised to MPI_SUCCESS).
+ */
+__ALWAYS_INLINE__ int ucx_send_non_continous(const void *buf,
+                                             size_t count,
+                                             int rank,
+                                             int tag,
+                                             MPIR_Comm * comm, int context_offset,
+                                             MPIR_Request ** request, int have_request,
+                                             MPIR_Datatype * datatype)
+{
+    int mpi_errno = MPI_SUCCESS;
+    /* Start from NULL: fn_fail falls through to fn_exit, which stores req into
+     * *request, so it must never be read uninitialised. */
+    MPIR_Request *req = NULL;
+    MPIDI_UCX_ucp_request_t *ucp_request;
+    ucp_ep_h ep;
+    uint64_t ucx_tag;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SEND_CONTINOUS);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SEND_CONTINOUS);
+
+    ep = MPIDI_UCX_COMM_TO_EP(comm, rank);
+    ucx_tag = MPIDI_UCX_init_tag(comm->context_id + context_offset, comm->rank, tag);
+
+    ucp_request =
+        (MPIDI_UCX_ucp_request_t *) ucp_tag_send_nb(ep, buf, count,
+                                                    datatype->dev.netmod.ucx.ucp_datatype, ucx_tag,
+                                                    &MPIDI_UCX_Handle_send_callback);
+
+    /* NOTE(review): presumably jumps to fn_fail when UCX returned an error
+     * pointer - confirm against the macro definition. */
+    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+
+    if (ucp_request == NULL) {
+        /* No UCX request was returned: hand back an MPI request whose
+         * completion counter is already zero. */
+        req = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
+        MPIDI_UCX_REQ(req).a.ucp_request = NULL;
+        MPIR_cc_set(&req->cc, 0);
+        goto fn_exit;
+    }
+
+    if (ucp_request->req) {
+        /* An MPI request is already attached to the UCX request: adopt it and
+         * detach it from the UCX side. */
+        req = ucp_request->req;
+        ucp_request->req = NULL;
+        MPIDI_UCX_REQ(req).a.ucp_request = NULL;
+        ucp_request_release(ucp_request);
+    }
+    else {
+        /* Nothing attached yet: create the MPI request, take an extra
+         * reference and link both objects to each other. */
+        req = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
+        MPIR_Request_add_ref(req);
+        ucp_request->req = req;
+        MPIDI_UCX_REQ(req).a.ucp_request = ucp_request;
+        ucp_request_release(ucp_request);
+    }
+
+
+  fn_exit:
+    *request = req;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SEND_CONTINOUS);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+#undef FUNCNAME
+#define FUNCNAME ucx_send
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * ucx_send - common entry point for standard-mode sends.
+ *
+ * Queries the datatype with MPIDI_Datatype_get_info() and dispatches:
+ *   - contiguous data: ucx_send_continous() with the buffer advanced by the
+ *     datatype's true lower bound and the size in bytes;
+ *   - otherwise: ucx_send_non_continous() with the original buffer, the
+ *     element count and the datatype pointer.
+ *
+ * Returns the mpi_errno produced by the selected helper.
+ */
+static inline int ucx_send(const void *buf,
+                           int count,
+                           MPI_Datatype datatype,
+                           int rank,
+                           int tag,
+                           MPIR_Comm * comm, int context_offset, MPIR_Request ** request,
+                           int have_request)
+{
+
+    int dt_contig, mpi_errno;
+    size_t data_sz;
+    MPI_Aint dt_true_lb;
+    MPIR_Datatype *dt_ptr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SEND);
+
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
+
+    /* Arithmetic on void * is a GNU extension; offset through a byte pointer
+     * so the code is valid ISO C. */
+    if (dt_contig)
+        mpi_errno =
+            ucx_send_continous((const char *) buf + dt_true_lb, data_sz, rank, tag, comm,
+                               context_offset, request, have_request);
+    else
+        mpi_errno =
+            ucx_send_non_continous(buf, count, rank, tag, comm, context_offset, request,
+                                   have_request, dt_ptr);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SEND);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+static inline int ucx_sync_send(const void *buf,
+                                int count,
+                                MPI_Datatype datatype,
+                                int rank,
+                                int tag,
+                                MPIR_Comm * comm, int context_offset, MPIR_Request ** request,
+                                int have_request)
+{
+
+    int dt_contig, mpi_errno;
+    size_t data_sz;
+    MPI_Aint dt_true_lb;
+    MPIR_Datatype *dt_ptr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SEND);
+
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
+
+    if (dt_contig)
+        mpi_errno =
+            ucx_sync_send_continous(buf + dt_true_lb, data_sz, rank, tag, comm, context_offset,
+                                    request, have_request);
+    else
+        mpi_errno =
+            ucx_sync_send_non_continous(buf, count, rank, tag, comm, context_offset, request,
+                                        have_request, dt_ptr);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SEND);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_send
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_send - netmod entry point for a standard-mode send.
+ *
+ * Forwards all arguments to ucx_send() with have_request = 0 and returns its
+ * result; *request is filled in by the callee.
+ */
+static inline int MPIDI_NM_send(const void *buf,
+                                int count,
+                                MPI_Datatype datatype,
+                                int rank,
+                                int tag,
+                                MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SEND);
+    mpi_errno = ucx_send(buf, count, datatype, rank, tag, comm, context_offset, request, 0);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SEND);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_netmode_rsend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_rsend(const void *buf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 int rank,
+                                 int tag,
+                                 MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+
+    return ucx_send(buf, count, datatype, rank, tag, comm, context_offset, request, 0);
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_netmode_irsend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_netmod_irsend(const void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      int rank,
+                                      int tag,
+                                      MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    return ucx_send(buf, count, datatype, rank, tag, comm, context_offset, request, 1);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_netmode_ssend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ssend(const void *buf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 int rank,
+                                 int tag,
+                                 MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    return ucx_sync_send(buf, count, datatype, rank, tag, comm, context_offset, request, 0);
+}
+
+static inline int MPIDI_NM_startall(int count, MPIR_Request * requests[])
+{
+    return MPIDI_CH4U_startall(count, requests);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_netmode_send_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_send_init(const void *buf,
+                                     int count,
+                                     MPI_Datatype datatype,
+                                     int rank,
+                                     int tag,
+                                     MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    return MPIDI_CH4U_send_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_netmode_ssend_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_ssend_init(const void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      int rank,
+                                      int tag,
+                                      MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    return MPIDI_CH4U_ssend_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_netmode_bsend_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_bsend_init(const void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      int rank,
+                                      int tag,
+                                      MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    return MPIDI_CH4U_bsend_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_netmode_rsend_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_rsend_init(const void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      int rank,
+                                      int tag,
+                                      MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    return MPIDI_CH4U_rsend_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_netmode_isend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_isend(const void *buf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 int rank,
+                                 int tag,
+                                 MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+
+    return ucx_send(buf, count, datatype, rank, tag, comm, context_offset, request, 1);
+
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_netmode_issend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_issend(const void *buf,
+                                  int count,
+                                  MPI_Datatype datatype,
+                                  int rank,
+                                  int tag,
+                                  MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+
+    return ucx_sync_send(buf, count, datatype, rank, tag, comm, context_offset, request, 1);
+
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_cancel_send
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_cancel_send - try to cancel a pending send.
+ *
+ * If the MPI request still references a UCX request, that request is
+ * cancelled on the global worker and then released.  Requests without an
+ * attached UCX request are left untouched.
+ *
+ * Always returns MPI_SUCCESS (neither UCX call used here reports a status).
+ */
+static inline int MPIDI_NM_cancel_send(MPIR_Request * sreq)
+{
+    if (MPIDI_UCX_REQ(sreq).a.ucp_request) {
+        ucp_request_cancel(MPIDI_UCX_global.worker, MPIDI_UCX_REQ(sreq).a.ucp_request);
+        ucp_request_release(MPIDI_UCX_REQ(sreq).a.ucp_request);
+    }
+
+    /* The function is declared to return int; falling off the end would make
+     * any use of the result undefined behaviour. */
+    return MPI_SUCCESS;
+}
+
+#endif /* NETMOD_UCX_SEND_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_spawn.h b/src/mpid/ch4/netmod/ucx/ucx_spawn.h
new file mode 100644
index 0000000..037b348
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_spawn.h
@@ -0,0 +1,104 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_SPAWN_H_INCLUDED
+#define NETMOD_UCX_SPAWN_H_INCLUDED
+
+#include "ucx_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_netmod_comm_open_port
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_open_port(MPIR_Info * info_ptr, char *port_name)
+{
+    int mpi_errno = MPI_SUCCESS;
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_netmod_comm_close_port
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_close_port(const char *port_name)
+{
+    int mpi_errno = MPI_SUCCESS;
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_comm_connect
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_NM_comm_connect(const char *port_name,
+                                        MPIR_Info * info,
+                                        int root, MPIR_Comm * comm_ptr, MPIR_Comm ** newcomm)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+
+
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_comm_disconnect
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_comm_disconnect - netmod hook for disconnecting a communicator.
+ *
+ * Currently a stub: it only emits the function enter/exit trace and returns
+ * MPI_SUCCESS without touching comm_ptr.
+ */
+static inline int MPIDI_NM_comm_disconnect(MPIR_Comm * comm_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_COMM_DISCONNECT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_COMM_DISCONNECT);
+
+
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_COMM_DISCONNECT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_NM_comm_accept
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_NM_comm_accept - netmod hook for accepting a connection on a port.
+ *
+ * Currently a stub: it only emits the function enter/exit trace and returns
+ * MPI_SUCCESS.  None of the arguments is used and *newcomm is not written.
+ */
+static inline int MPIDI_NM_comm_accept(const char *port_name,
+                                       MPIR_Info * info,
+                                       int root, MPIR_Comm * comm_ptr, MPIR_Comm ** newcomm)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_COMM_ACCEPT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_COMM_ACCEPT);
+
+
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_COMM_ACCEPT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#endif /* NETMOD_UCX_SPAWN_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_types.h b/src/mpid/ch4/netmod/ucx/ucx_types.h
new file mode 100644
index 0000000..045f64e
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_types.h
@@ -0,0 +1,66 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_TYPES_H_INCLUDED
+#define NETMOD_UCX_TYPES_H_INCLUDED
+#include <ucp/api/ucp.h>
+#include <ucp/api/ucp_def.h>
+#include "mpiimpl.h"
+
+#define __SHORT_FILE__                          \
+  (strrchr(__FILE__,'/')                        \
+   ? strrchr(__FILE__,'/')+1                    \
+   : __FILE__                                   \
+)
+
+#define UCP_PEER_NAME_MAX         HOST_NAME_MAX
+
+#define MPIDI_MAP_NOT_FOUND      ((void*)(-1UL))
+
+/* Active Message Stuff */
+#define MPIDI_UCX_NUM_AM_BUFFERS       (64)
+#define MPIDI_UCX_MAX_AM_EAGER_SZ      (16*1024)
+#define MPIDI_UCX_AM_TAG               (1 << 28)
+#define MPIDI_UCX_MAX_AM_HANDLERS      (64)
+
+typedef struct {
+    int avtid;
+    ucp_context_h context;
+    ucp_worker_h worker;
+    char addrname[UCP_PEER_NAME_MAX];
+    char *pmi_addr_table;
+    size_t addrname_len;
+    ucp_address_t *if_address;
+    char kvsname[MPIDI_UCX_KVSAPPSTRLEN];
+    char pname[MPI_MAX_PROCESSOR_NAME];
+    int max_addr_len;
+    MPIDI_NM_am_target_handler_fn am_handlers[MPIDI_UCX_MAX_AM_HANDLERS];
+    MPIDI_NM_am_origin_handler_fn send_cmpl_handlers[MPIDI_UCX_MAX_AM_HANDLERS];
+} MPIDI_UCX_global_t;
+
+#define MPIDI_UCX_GPID(gpid) ((gpid)->dev.netmod.ucx)
+#define MPIDI_UCX_AV(av)     ((av)->netmod.ucx)
+
+extern MPIDI_UCX_global_t MPIDI_UCX_global;
+
+/* UCX TAG Layout */
+
+/* 01234567 01234567 01234567 01234567 01234567 01234567 01234567 01234567
+ *  context_id (16) |source rank (16) | Message Tag (32)+ERROR BITS
+ */
+
+#define MPIDI_UCX_CONTEXT_TAG_BITS 16
+#define MPIDI_UCX_CONTEXT_RANK_BITS 16
+#define UCX_TAG_BITS 32
+
+#define MPIDI_UCX_TAG_MASK      (0x00000000FFFFFFFFULL)
+#define MPIDI_UCX_SOURCE_MASK   (0x0000FFFF00000000ULL)
+#define MPIDI_UCX_TAG_SHIFT     (32)
+#define MPIDI_UCX_SOURCE_SHIFT  (16)
+
+#endif /* NETMOD_UCX_TYPES_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_win.h b/src/mpid/ch4/netmod/ucx/ucx_win.h
new file mode 100644
index 0000000..8ae3fad
--- /dev/null
+++ b/src/mpid/ch4/netmod/ucx/ucx_win.h
@@ -0,0 +1,503 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Mellanox Technologies Ltd.
+ *  Copyright (C) Mellanox Technologies Ltd. 2016. ALL RIGHTS RESERVED
+ */
+#ifndef NETMOD_UCX_WIN_H_INCLUDED
+#define NETMOD_UCX_WIN_H_INCLUDED
+
+#include "ucx_impl.h"
+struct _UCX_share {
+    int disp;
+    MPI_Aint addr;
+};
+
+char ucx_dummy_buffer[4096];
+
+/*
+ * MPIDI_UCX_Win_allgather - register the local window memory with UCX and
+ * exchange the information needed for RMA with every process of
+ * win->comm_ptr.
+ *
+ * Steps:
+ *   1. Map the memory with ucp_mem_map().  For length == 0 a 1024-byte piece
+ *      of ucx_dummy_buffer is mapped instead of *base_ptr; for length > 0 the
+ *      (possibly updated) address is written back to *base_ptr.
+ *   2. Pack the remote key and allgather first the key sizes, then the keys.
+ *   3. Unpack every peer's key; a peer reported as UCS_ERR_UNREACHABLE gets a
+ *      NULL rkey.
+ *   4. Allgather each process' displacement unit and base address and store
+ *      them in the per-rank window info table.
+ *
+ * Returns MPI_SUCCESS or the error set by a failing step.  All temporary
+ * buffers are released on every exit path.
+ */
+static inline int MPIDI_UCX_Win_allgather(MPIR_Win * win, size_t length,
+                                          uint32_t disp_unit, void **base_ptr)
+{
+
+    MPIR_Errflag_t err = MPIR_ERR_NONE;
+    int mpi_errno = MPI_SUCCESS;
+    ucs_status_t status;
+    ucp_mem_h mem_h;
+    int cntr = 0;
+    size_t rkey_size;
+    /* Everything released at fn_exit starts out NULL so that an early jump to
+     * fn_fail never tests or frees an uninitialised pointer. */
+    int *rkey_sizes = NULL, *recv_disps = NULL, i;
+    char *rkey_buffer = NULL, *rkey_recv_buff = NULL;
+    struct _UCX_share *share_data = NULL;
+    size_t size;
+    void *base;
+    if (length == 0)
+        size = 1024;
+    else
+        size = length;
+    MPIR_Comm *comm_ptr = win->comm_ptr;
+
+    ucp_context_h ucp_context = MPIDI_UCX_global.context;
+
+    MPIDI_UCX_WIN(win).info_table = MPL_malloc(sizeof(MPIDI_UCX_win_info_t) * comm_ptr->local_size);
+    MPIR_ERR_CHKANDSTMT(MPIDI_UCX_WIN(win).info_table == NULL,
+                        mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+    if (length == 0)
+        base = &ucx_dummy_buffer;
+    else
+        base = *base_ptr;
+
+    status = ucp_mem_map(MPIDI_UCX_global.context, &base, size, 0, &mem_h);
+    MPIDI_UCX_CHK_STATUS(status, ucp_mem_map);
+    if (length > 0)
+        *base_ptr = base;
+
+    MPIDI_UCX_WIN(win).mem_h = mem_h;
+
+    /* pack the key */
+    status = ucp_rkey_pack(ucp_context, mem_h, (void **) &rkey_buffer, &rkey_size);
+
+    MPIDI_UCX_CHK_STATUS(status, ucp_mem_map);
+
+    rkey_sizes = (int *) MPL_malloc(sizeof(int) * comm_ptr->local_size);
+    MPIR_ERR_CHKANDSTMT(rkey_sizes == NULL, mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+    rkey_sizes[comm_ptr->rank] = (int) rkey_size;
+    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 1, MPI_INT,
+                                    rkey_sizes, 1, MPI_INT, comm_ptr, &err);
+
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    recv_disps = (int *) MPL_malloc(sizeof(int) * comm_ptr->local_size);
+    MPIR_ERR_CHKANDSTMT(recv_disps == NULL, mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+
+
+    /* Prefix sum of the key sizes gives each rank's offset in the receive
+     * buffer; cntr ends up as the total number of bytes. */
+    for (i = 0; i < comm_ptr->local_size; i++) {
+        recv_disps[i] = cntr;
+        cntr += rkey_sizes[i];
+    }
+
+    rkey_recv_buff = MPL_malloc(cntr);
+
+    /* allgather */
+    mpi_errno = MPIR_Allgatherv_impl(rkey_buffer, rkey_size, MPI_BYTE,
+                                     rkey_recv_buff, rkey_sizes, recv_disps, MPI_BYTE,
+                                     comm_ptr, &err);
+
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* If we use the shared memory support in UCX, we have to distinguish
+     * between local and remote windows (at least for now).  If win_create is
+     * used, the key cannot be unpacked - then we need our fallback solution. */
+
+    for (i = 0; i < comm_ptr->local_size; i++) {
+        status = ucp_ep_rkey_unpack(MPIDI_UCX_COMM_TO_EP(comm_ptr, i),
+                                    &rkey_recv_buff[recv_disps[i]],
+                                    &(MPIDI_UCX_WIN_INFO(win, i).rkey));
+        if (status == UCS_ERR_UNREACHABLE) {
+            MPIDI_UCX_WIN_INFO(win, i).rkey = NULL;
+        }
+        else
+            MPIDI_UCX_CHK_STATUS(status, ucp_mem_map);
+    }
+    share_data = MPL_malloc(comm_ptr->local_size * sizeof(struct _UCX_share));
+    MPIR_ERR_CHKANDSTMT(share_data == NULL, mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+
+    share_data[comm_ptr->rank].disp = disp_unit;
+    share_data[comm_ptr->rank].addr = (MPI_Aint) base;
+
+    mpi_errno =
+        MPIR_Allgather(MPI_IN_PLACE, sizeof(struct _UCX_share), MPI_BYTE, share_data,
+                       sizeof(struct _UCX_share), MPI_BYTE, comm_ptr, &err);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    for (i = 0; i < comm_ptr->local_size; i++) {
+        MPIDI_UCX_WIN_INFO(win, i).disp = share_data[i].disp;
+        MPIDI_UCX_WIN_INFO(win, i).addr = share_data[i].addr;
+    }
+    MPIDI_UCX_WIN(win).need_local_flush = 0;
+  fn_exit:
+    /* buffer release */
+    if (rkey_buffer)
+        ucp_rkey_buffer_release(rkey_buffer);
+    /* free temps */
+    MPL_free(share_data);
+    MPL_free(rkey_sizes);
+    MPL_free(recv_disps);
+    MPL_free(rkey_recv_buff);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_UCX_Win_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_UCX_Win_init - create the generic window object and reset its UCX
+ * specific part.
+ *
+ * Delegates the allocation to MPIDI_CH4R_win_init().  On success the new
+ * window is stored in *win_ptr and its MPIDI_UCX_win_t area is zeroed.  On
+ * failure *win_ptr is not written and an MPI_ERR_NO_MEM ("**nomem") error is
+ * returned.
+ */
+static inline int MPIDI_UCX_Win_init(MPI_Aint length,
+                                     int disp_unit,
+                                     MPIR_Win ** win_ptr,
+                                     MPIR_Info * info,
+                                     MPIR_Comm * comm_ptr, int create_flavor, int model)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Win *win;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_UCX_WIN_INIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_UCX_WIN_INIT);
+
+    mpi_errno = MPIDI_CH4R_win_init(length, disp_unit, &win, info, comm_ptr, create_flavor, model);
+    MPIR_ERR_CHKANDSTMT(mpi_errno != MPI_SUCCESS,
+                        mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+    *win_ptr = win;
+
+    memset(&MPIDI_UCX_WIN(win), 0, sizeof(MPIDI_UCX_win_t));
+
+  fn_exit:
+    /* Must name the same state as the DECL/ENTER pair above. */
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_UCX_WIN_INIT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+static inline int MPIDI_NM_win_set_info(MPIR_Win * win, MPIR_Info * info)
+{
+    return MPIDI_CH4R_win_set_info(win, info);
+}
+
+
+static inline int MPIDI_NM_win_start(MPIR_Group * group, int assert, MPIR_Win * win)
+{
+    return MPIDI_CH4R_win_start(group, assert, win);
+}
+
+
+/*
+ * MPIDI_NM_win_complete - netmod part of completing an access epoch.
+ *
+ * Flushes the global UCX worker and then hands over to
+ * MPIDI_CH4R_win_complete().  A failing flush is reported to the caller
+ * instead of being dropped; in that case the CH4R call is skipped, matching
+ * the behaviour of MPIDI_NM_win_unlock_all().
+ */
+static inline int MPIDI_NM_win_complete(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    ucs_status_t ucp_status;
+
+    ucp_status = ucp_worker_flush(MPIDI_UCX_global.worker);
+    MPIDI_UCX_CHK_STATUS(ucp_status, ucp_worker_fence);
+    mpi_errno = MPIDI_CH4R_win_complete(win);
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+static inline int MPIDI_NM_win_post(MPIR_Group * group, int assert, MPIR_Win * win)
+{
+
+    return MPIDI_CH4R_win_post(group, assert, win);
+}
+
+
+static inline int MPIDI_NM_win_wait(MPIR_Win * win)
+{
+    return MPIDI_CH4R_win_wait(win);
+}
+
+
+static inline int MPIDI_NM_win_test(MPIR_Win * win, int *flag)
+{
+    return MPIDI_CH4R_win_test(win, flag);
+}
+
+static inline int MPIDI_NM_win_lock(int lock_type, int rank, int assert, MPIR_Win * win)
+{
+    return MPIDI_CH4R_win_lock(lock_type, rank, assert, win);
+}
+
+
+static inline int MPIDI_NM_win_unlock(int rank, MPIR_Win * win)
+{
+
+    int mpi_errno = MPI_SUCCESS;
+    ucs_status_t ucp_status;
+    ucp_ep_h ep = MPIDI_UCX_COMM_TO_EP(win->comm_ptr, rank);
+    /* make sure all operations are completed  */
+    ucp_status = ucp_ep_flush(ep);
+    MPIDI_UCX_CHK_STATUS(ucp_status, ucp_worker_fence);
+    mpi_errno = MPIDI_CH4R_win_unlock(rank, win);
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+static inline int MPIDI_NM_win_get_info(MPIR_Win * win, MPIR_Info ** info_p_p)
+{
+    return MPIDI_CH4R_win_get_info(win, info_p_p);
+}
+
+
+/*
+ * MPIDI_NM_win_free - release a window.
+ *
+ * After the epoch check and a barrier on the window communicator, windows
+ * that are neither SHARED nor DYNAMIC have their UCX memory mapping removed
+ * and their info table freed.  For ALLOCATE windows win->base is cleared
+ * before MPIDI_CH4R_win_finalize() is called.
+ *
+ * Returns MPI_SUCCESS, the error produced by the epoch check, or the error
+ * returned by the barrier.
+ */
+static inline int MPIDI_NM_win_free(MPIR_Win ** win_ptr)
+{
+
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    MPIR_Win *win = *win_ptr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_UCX_WIN_FREE);
+
+    MPIDI_CH4U_EPOCH_FREE_CHECK(win, mpi_errno, return mpi_errno);
+    /* Entered only after the early-return check above so that every ENTER is
+     * paired with the EXIT at fn_exit. */
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_UCX_WIN_FREE);
+    mpi_errno = MPIR_Barrier_impl(win->comm_ptr, &errflag);
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+    if (win->create_flavor != MPI_WIN_FLAVOR_SHARED && win->create_flavor != MPI_WIN_FLAVOR_DYNAMIC) {
+        ucp_mem_unmap(MPIDI_UCX_global.context, MPIDI_UCX_WIN(win).mem_h);
+        MPL_free(MPIDI_UCX_WIN(win).info_table);
+    }
+    if (win->create_flavor == MPI_WIN_FLAVOR_ALLOCATE)
+        win->base = NULL;
+    MPIDI_CH4R_win_finalize(win_ptr);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_UCX_WIN_FREE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+/*
+ * MPIDI_NM_win_fence - netmod part of a window fence.
+ *
+ * Flushes the global UCX worker, runs MPIDI_CH4R_win_fence() and only then
+ * evaluates the flush status.  An error from the CH4R fence therefore takes
+ * precedence over a failed flush.
+ */
+static inline int MPIDI_NM_win_fence(int assert, MPIR_Win * win)
+{
+    int mpi_errno;
+    ucs_status_t ucp_status;
+    /* keep this for now to fence all non-native operations */
+    /* make sure all local and remote operations are completed */
+    ucp_status = ucp_worker_flush(MPIDI_UCX_global.worker);
+
+
+    mpi_errno = MPIDI_CH4R_win_fence(assert, win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* The flush result is checked last; see the header comment. */
+    MPIDI_UCX_CHK_STATUS(ucp_status, ucp_worker_fence);
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/*
+ * MPIDI_NM_win_create - create a window over caller-provided memory.
+ *
+ * Initialises the window object, registers [base, base + length) with UCX and
+ * exchanges keys/addresses through MPIDI_UCX_Win_allgather(), records the
+ * base address in the window and finishes with a barrier on comm_ptr.
+ *
+ * Returns MPI_SUCCESS or the first error reported by one of these steps.
+ */
+static inline int MPIDI_NM_win_create(void *base,
+                                      MPI_Aint length,
+                                      int disp_unit,
+                                      MPIR_Info * info, MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
+{
+
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    MPIR_Win *win;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_UCX_WIN_CREATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_UCX_WIN_CREATE);
+
+    /* MPIDI_UCX_Win_init() only stores into *win_ptr on success, so its
+     * result has to be checked before *win_ptr is read. */
+    mpi_errno = MPIDI_UCX_Win_init(length, disp_unit, win_ptr, info,
+                                   comm_ptr, MPI_WIN_FLAVOR_CREATE, MPI_WIN_UNIFIED);
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+    win = *win_ptr;
+
+    mpi_errno = MPIDI_UCX_Win_allgather(win, length, disp_unit, &base);
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+    win->base = base;
+
+
+
+    mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);
+
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_UCX_WIN_CREATE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+
+}
+
+static inline int MPIDI_NM_win_attach(MPIR_Win * win, void *base, MPI_Aint size)
+{
+    return MPIDI_CH4R_win_attach(win, base, size);
+}
+
+static inline int MPIDI_NM_win_allocate_shared(MPI_Aint size,
+                                               int disp_unit,
+                                               MPIR_Info * info_ptr,
+                                               MPIR_Comm * comm_ptr,
+                                               void **base_ptr, MPIR_Win ** win_ptr)
+{
+    return MPIDI_CH4R_win_allocate_shared(size, disp_unit, info_ptr, comm_ptr, base_ptr, win_ptr);
+}
+
+static inline int MPIDI_NM_win_detach(MPIR_Win * win, const void *base)
+{
+    return MPIDI_CH4R_win_detach(win, base);
+}
+
+static inline int MPIDI_NM_win_shared_query(MPIR_Win * win,
+                                            int rank,
+                                            MPI_Aint * size, int *disp_unit, void *baseptr)
+{
+    return MPIDI_CH4R_win_shared_query(win, rank, size, disp_unit, baseptr);
+}
+
+/*
+ * MPIDI_NM_win_allocate - create a window whose memory is obtained through
+ * the UCX memory mapping call.
+ *
+ * Initialises the window object, calls MPIDI_UCX_Win_allgather() with a NULL
+ * starting address so that the address produced by the mapping becomes the
+ * window base, stores that address in win->base and in *(void **) baseptr,
+ * and finishes with a barrier on comm_ptr.
+ *
+ * Returns MPI_SUCCESS or the first error reported by one of these steps.
+ */
+static inline int MPIDI_NM_win_allocate(MPI_Aint length,
+                                        int disp_unit,
+                                        MPIR_Info * info,
+                                        MPIR_Comm * comm_ptr, void *baseptr, MPIR_Win ** win_ptr)
+{
+
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    MPIR_Win *win;
+    void *base = NULL;
+
+    /* DECL, ENTER and EXIT must all name the same state. */
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_UCX_WIN_ALLOCATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_UCX_WIN_ALLOCATE);
+
+    /* MPIDI_UCX_Win_init() only stores into *win_ptr on success, so its
+     * result has to be checked before *win_ptr is read. */
+    mpi_errno = MPIDI_UCX_Win_init(length, disp_unit, win_ptr, info,
+                                   comm_ptr, MPI_WIN_FLAVOR_ALLOCATE, MPI_WIN_UNIFIED);
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+    win = *win_ptr;
+    mpi_errno = MPIDI_UCX_Win_allgather(win, length, disp_unit, &base);
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+    win->base = base;
+
+
+    *(void **) baseptr = (void *) base;
+
+
+    mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);
+
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_UCX_WIN_ALLOCATE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+
+}
+
+static inline int MPIDI_NM_win_flush(int rank, MPIR_Win * win)
+{
+
+    int mpi_errno;
+    ucs_status_t ucp_status;
+
+    ucp_ep_h ep = MPIDI_UCX_COMM_TO_EP(win->comm_ptr, rank);
+
+    mpi_errno = MPIDI_CH4R_win_flush(rank, win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+/* only flush the endpoint */
+    ucp_status = ucp_ep_flush(ep);
+
+    MPIDI_UCX_CHK_STATUS(ucp_status, ucp_worker_fence);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+static inline int MPIDI_NM_win_flush_local_all(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    ucs_status_t ucp_status;
+    mpi_errno = MPIDI_CH4R_win_flush_local_all(win);
+
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+    /* currently, UCP does not support local flush, so we have to call
+     * a global flush. This is not good for performance - but OK for now */
+    if (MPIDI_UCX_WIN(win).need_local_flush == 1) {
+        ucp_status = ucp_worker_flush(MPIDI_UCX_global.worker);
+        MPIDI_UCX_CHK_STATUS(ucp_status, ucp_worker_fence);
+        MPIDI_UCX_WIN(win).need_local_flush = 0;
+    }
+
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+static inline int MPIDI_NM_win_unlock_all(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    ucs_status_t ucp_status;
+
+    /*first we have to make sure that all operations are completed */
+    ucp_status = ucp_worker_flush(MPIDI_UCX_global.worker);
+    MPIDI_UCX_CHK_STATUS(ucp_status, ucp_worker_fence);
+    mpi_errno = MPIDI_CH4R_win_unlock_all(win);
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+static inline int MPIDI_NM_win_create_dynamic(MPIR_Info * info, MPIR_Comm * comm, MPIR_Win ** win)
+{
+    return MPIDI_CH4R_win_create_dynamic(info, comm, win);
+}
+
+/*
+ * MPIDI_NM_win_flush_local - locally complete operations targeting one rank.
+ *
+ * Runs MPIDI_CH4R_win_flush_local() and, when the window is marked as needing
+ * a local flush, flushes the endpoint that belongs to rank.
+ *
+ * need_local_flush is a single flag for the whole window, so it is NOT
+ * cleared here: only one endpoint was flushed and operations to other ranks
+ * may still be outstanding.  MPIDI_NM_win_flush_local_all(), which flushes
+ * the whole worker, is the place that resets it.
+ *
+ * Returns MPI_SUCCESS or the error reported by the CH4R call / the flush.
+ */
+static inline int MPIDI_NM_win_flush_local(int rank, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    ucs_status_t ucp_status;
+    ucp_ep_h ep = MPIDI_UCX_COMM_TO_EP(win->comm_ptr, rank);
+
+    mpi_errno = MPIDI_CH4R_win_flush_local(rank, win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* currently, UCP does not support local flush, so the complete endpoint
+     * is flushed. This is not good for performance - but OK for now */
+    if (MPIDI_UCX_WIN(win).need_local_flush == 1) {
+        ucp_status = ucp_ep_flush(ep);
+        MPIDI_UCX_CHK_STATUS(ucp_status, ucp_worker_fence);
+    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+static inline int MPIDI_NM_win_sync(MPIR_Win * win)
+{
+    return MPIDI_CH4R_win_sync(win);
+}
+
+static inline int MPIDI_NM_win_flush_all(MPIR_Win * win)
+{
+
+/*maybe we just flush all eps here? More efficient for smaller communicators...*/
+    int mpi_errno = MPI_SUCCESS;
+    ucs_status_t ucp_status;
+    mpi_errno = MPIDI_CH4R_win_flush_all(win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    ucp_status = ucp_worker_flush(MPIDI_UCX_global.worker);
+
+    MPIDI_UCX_CHK_STATUS(ucp_status, ucp_worker_fence);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+static inline int MPIDI_NM_win_lock_all(int assert, MPIR_Win * win)
+{
+    return MPIDI_CH4R_win_lock_all(assert, win);
+}
+
+
+#endif /* NETMOD_UCX_WIN_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/Makefile.mk b/src/mpid/ch4/shm/Makefile.mk
new file mode 100644
index 0000000..6bf059b
--- /dev/null
+++ b/src/mpid/ch4/shm/Makefile.mk
@@ -0,0 +1,24 @@
+## -*- Mode: Makefile; -*-
+## vim: set ft=automake :
+##
+## (C) 2016 by Argonne National Laboratory.
+## (C) 2014 by Mellanox Technologies, Inc.
+##     See COPYRIGHT in top-level directory.
+##
+##  Portions of this code were written by Intel Corporation.
+##  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+##  to Argonne National Laboratory subject to Software Grant and Corporate
+##  Contributor License Agreement dated February 8, 2012.
+##
+
+if BUILD_CH4_SHM
+
+AM_CPPFLAGS += -I$(top_srcdir)/src/mpid/ch4/shm/include
+
+noinst_HEADERS += src/mpid/ch4/shm/include/shm.h
+noinst_HEADERS += src/mpid/ch4/shm/include/shm_impl.h
+
+include $(top_srcdir)/src/mpid/ch4/shm/stubshm/Makefile.mk
+include $(top_srcdir)/src/mpid/ch4/shm/posix/Makefile.mk
+
+endif
diff --git a/src/mpid/ch4/shm/include/shm.h b/src/mpid/ch4/shm/include/shm.h
new file mode 100644
index 0000000..1fa4138
--- /dev/null
+++ b/src/mpid/ch4/shm/include/shm.h
@@ -0,0 +1,1192 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+/* ch4 shm functions */
+#ifndef SHM_PROTOTYPES_H_INCLUDED
+#define SHM_PROTOTYPES_H_INCLUDED
+
+#include <mpidimpl.h>
+
+#define MPIDI_MAX_SHM_STRING_LEN 64
+
+typedef int (*MPIDI_SHM_am_completion_handler_fn) (MPIR_Request * req);        /* type of the cmpl_handler_fn output of the target handler below */
+typedef int (*MPIDI_SHM_am_origin_handler_fn) (MPIR_Request * req);    /* type of the origin_handler_fn argument of MPIDI_SHM_reg_hdr_handler_t */
+
+/* Target-side callback type; callbacks of this type are installed through the handler registration function. */
+/* In the short case the output arguments are NULL (NOTE(review): which arguments count as outputs is not spelled out here). */
+typedef int (*MPIDI_SHM_am_target_handler_fn)
+ (void *am_hdr, size_t am_hdr_sz, void **data,  /* CH4 manages this buffer - shm only fills it with data */
+  MPI_Datatype * datatype, MPI_Count * count, int *noncontig,   /* if TRUE: data/data_sz are actually iovec/count; NOTE(review): there is no `data_sz` parameter here - presumably `count`, confirm */
+  MPIDI_SHM_am_completion_handler_fn * cmpl_handler_fn, /* out: completion handler */
+  MPIR_Request ** req);         /* if a request is allocated, the caller needs the completion function pointer as well */
+
+typedef int (*MPIDI_SHM_init_t) (int rank, int size);  /* from here on: one function-pointer type per shm entry point, used as member types of the function-table structs */
+typedef int (*MPIDI_SHM_finalize_t) (void);
+typedef int (*MPIDI_SHM_progress_t) (int blocking);
+typedef int (*MPIDI_SHM_reg_hdr_handler_t) (int handler_id,
+                                            MPIDI_SHM_am_origin_handler_fn origin_handler_fn,
+                                            MPIDI_SHM_am_target_handler_fn target_handler_fn);   /* takes one origin and one target callback together with a handler_id */
+typedef int (*MPIDI_SHM_comm_connect_t) (const char *port_name, MPIR_Info * info, int root,
+                                         MPIR_Comm * comm, MPIR_Comm ** newcomm_ptr);
+typedef int (*MPIDI_SHM_comm_disconnect_t) (MPIR_Comm * comm_ptr);
+typedef int (*MPIDI_SHM_open_port_t) (MPIR_Info * info_ptr, char *port_name);  /* port_name is non-const only here - presumably written by the callee; confirm */
+typedef int (*MPIDI_SHM_close_port_t) (const char *port_name);
+typedef int (*MPIDI_SHM_comm_accept_t) (const char *port_name, MPIR_Info * info, int root,
+                                        MPIR_Comm * comm, MPIR_Comm ** newcomm_ptr);    /* same parameter list as MPIDI_SHM_comm_connect_t */
+typedef int (*MPIDI_SHM_send_am_hdr_t) (int rank, MPIR_Comm * comm, int handler_id,
+                                        const void *am_hdr, size_t am_hdr_sz, MPIR_Request * sreq,
+                                        void *shm_context);     /* send_* types take a request (sreq); the matching inject_* types do not */
+typedef int (*MPIDI_SHM_inject_am_hdr_t) (int rank, MPIR_Comm * comm, int handler_id,
+                                          const void *am_hdr, size_t am_hdr_sz, void *shm_context);
+typedef int (*MPIDI_SHM_send_am_t) (int rank, MPIR_Comm * comm, int handler_id, const void *am_hdr,
+                                    size_t am_hdr_sz, const void *data, MPI_Count count,
+                                    MPI_Datatype datatype, MPIR_Request * sreq, void *shm_context);
+typedef int (*MPIDI_SHM_inject_am_t) (int rank, MPIR_Comm * comm, int handler_id,
+                                      const void *am_hdr, size_t am_hdr_sz, const void *data,
+                                      MPI_Count count, MPI_Datatype datatype, void *shm_context);
+typedef int (*MPIDI_SHM_send_amv_t) (int rank, MPIR_Comm * comm, int handler_id,
+                                     struct iovec * am_hdrs, size_t iov_len, const void *data,
+                                     MPI_Count count, MPI_Datatype datatype, MPIR_Request * sreq,
+                                     void *shm_context);        /* *_amv types pass the header as struct iovec * am_hdrs plus iov_len */
+typedef int (*MPIDI_SHM_inject_amv_t) (int rank, MPIR_Comm * comm, int handler_id,
+                                       struct iovec * am_hdrs, size_t iov_len, const void *data,
+                                       MPI_Count count, MPI_Datatype datatype, void *shm_context);
+typedef int (*MPIDI_SHM_send_am_hdr_reply_t) (MPIR_Context_id_t context_id, int src_rank,
+                                              int handler_id, const void *am_hdr, size_t am_hdr_sz,
+                                              MPIR_Request * sreq);     /* *_reply types take (context_id, src_rank) instead of (rank, comm) and have no shm_context */
+typedef int (*MPIDI_SHM_inject_am_hdr_reply_t) (MPIR_Context_id_t context_id, int src_rank,
+                                                int handler_id, const void *am_hdr,
+                                                size_t am_hdr_sz);
+typedef int (*MPIDI_SHM_send_am_reply_t) (MPIR_Context_id_t context_id, int src_rank,
+                                          int handler_id, const void *am_hdr, size_t am_hdr_sz,
+                                          const void *data, MPI_Count count, MPI_Datatype datatype,
+                                          MPIR_Request * sreq);
+typedef int (*MPIDI_SHM_inject_am_reply_t) (MPIR_Context_id_t context_id, int src_rank,
+                                            int handler_id, const void *am_hdr, size_t am_hdr_sz,
+                                            const void *data, MPI_Count count,
+                                            MPI_Datatype datatype);
+typedef int (*MPIDI_SHM_send_amv_reply_t) (MPIR_Context_id_t context_id, int src_rank,
+                                           int handler_id, struct iovec * am_hdrs, size_t iov_len,
+                                           const void *data, MPI_Count count, MPI_Datatype datatype,
+                                           MPIR_Request * sreq);        /* reply form of send_amv; iovec parameter named am_hdrs as in the other *_amv types (iov_len presumably its entry count) */
+typedef int (*MPIDI_SHM_inject_amv_reply_t) (MPIR_Context_id_t context_id, int src_rank,
+                                             int handler_id, struct iovec * am_hdrs, size_t iov_len,
+                                             const void *data, MPI_Count count,
+                                             MPI_Datatype datatype);
+typedef size_t(*MPIDI_SHM_am_hdr_max_sz_t) (void);     /* the two *_max_sz types take no arguments and return size_t */
+typedef size_t(*MPIDI_SHM_am_inject_max_sz_t) (void);
+typedef int (*MPIDI_SHM_am_recv_t) (MPIR_Request * req);
+typedef int (*MPIDI_SHM_comm_get_lpid_t) (MPIR_Comm * comm_ptr, int idx, int *lpid_ptr,
+                                          MPL_bool is_remote);
+typedef int (*MPIDI_SHM_gpid_get_t) (MPIR_Comm * comm_ptr, int rank, MPIR_Gpid * gpid);
+typedef int (*MPIDI_SHM_get_node_id_t) (MPIR_Comm * comm, int rank, MPID_Node_id_t * id_p);
+typedef int (*MPIDI_SHM_get_max_node_id_t) (MPIR_Comm * comm, MPID_Node_id_t * max_id_p);
+typedef int (*MPIDI_SHM_getallincomm_t) (MPIR_Comm * comm_ptr, int local_size,
+                                         MPIR_Gpid local_gpid[], int *singleAVT);
+typedef int (*MPIDI_SHM_gpid_tolpidarray_t) (int size, MPIR_Gpid gpid[], int lpid[]);
+typedef int (*MPIDI_SHM_create_intercomm_from_lpids_t) (MPIR_Comm * newcomm_ptr, int size,
+                                                        const int lpids[]);
+typedef int (*MPIDI_SHM_comm_create_t) (MPIR_Comm * comm);
+typedef int (*MPIDI_SHM_comm_destroy_t) (MPIR_Comm * comm);
+typedef void (*MPIDI_SHM_am_request_init_t) (MPIR_Request * req);      /* the two request hooks return void, unlike the int-returning types around them */
+typedef void (*MPIDI_SHM_am_request_finalize_t) (MPIR_Request * req);
+typedef int (*MPIDI_SHM_send_t) (const void *buf, int count, MPI_Datatype datatype, int rank,
+                                 int tag, MPIR_Comm * comm, int context_offset,
+                                 MPIR_Request ** request);      /* NOTE(review): counts are plain int in this run, while the AM send types use MPI_Count - confirm this is intended */
+typedef int (*MPIDI_SHM_ssend_t) (const void *buf, int count, MPI_Datatype datatype, int rank,
+                                  int tag, MPIR_Comm * comm, int context_offset,
+                                  MPIR_Request ** request);
+typedef int (*MPIDI_SHM_startall_t) (int count, MPIR_Request * requests[]);
+typedef int (*MPIDI_SHM_send_init_t) (const void *buf, int count, MPI_Datatype datatype, int rank,
+                                      int tag, MPIR_Comm * comm, int context_offset,
+                                      MPIR_Request ** request);
+typedef int (*MPIDI_SHM_ssend_init_t) (const void *buf, int count, MPI_Datatype datatype, int rank,
+                                       int tag, MPIR_Comm * comm, int context_offset,
+                                       MPIR_Request ** request);
+typedef int (*MPIDI_SHM_rsend_init_t) (const void *buf, int count, MPI_Datatype datatype, int rank,
+                                       int tag, MPIR_Comm * comm, int context_offset,
+                                       MPIR_Request ** request);
+typedef int (*MPIDI_SHM_bsend_init_t) (const void *buf, int count, MPI_Datatype datatype, int rank,
+                                       int tag, MPIR_Comm * comm, int context_offset,
+                                       MPIR_Request ** request);
+typedef int (*MPIDI_SHM_isend_t) (const void *buf, int count, MPI_Datatype datatype, int rank,
+                                  int tag, MPIR_Comm * comm, int context_offset,
+                                  MPIR_Request ** request);
+typedef int (*MPIDI_SHM_issend_t) (const void *buf, int count, MPI_Datatype datatype, int rank,
+                                   int tag, MPIR_Comm * comm, int context_offset,
+                                   MPIR_Request ** request);
+typedef int (*MPIDI_SHM_cancel_send_t) (MPIR_Request * sreq);
+typedef int (*MPIDI_SHM_recv_init_t) (void *buf, int count, MPI_Datatype datatype, int rank,
+                                      int tag, MPIR_Comm * comm, int context_offset,
+                                      MPIR_Request ** request);
+typedef int (*MPIDI_SHM_recv_t) (void *buf, int count, MPI_Datatype datatype, int rank, int tag,
+                                 MPIR_Comm * comm, int context_offset, MPI_Status * status,
+                                 MPIR_Request ** request);      /* the only receive type here that takes both a status and a request */
+typedef int (*MPIDI_SHM_irecv_t) (void *buf, int count, MPI_Datatype datatype, int rank, int tag,
+                                  MPIR_Comm * comm, int context_offset, MPIR_Request ** request);
+typedef int (*MPIDI_SHM_imrecv_t) (void *buf, int count, MPI_Datatype datatype,
+                                   MPIR_Request * message, MPIR_Request ** rreqp);
+typedef int (*MPIDI_SHM_cancel_recv_t) (MPIR_Request * rreq);
+typedef void *(*MPIDI_SHM_alloc_mem_t) (size_t size, MPIR_Info * info_ptr);    /* returns a void * rather than an int status */
+typedef int (*MPIDI_SHM_free_mem_t) (void *ptr);
+typedef int (*MPIDI_SHM_improbe_t) (int source, int tag, MPIR_Comm * comm, int context_offset,
+                                    int *flag, MPIR_Request ** message, MPI_Status * status);
+typedef int (*MPIDI_SHM_iprobe_t) (int source, int tag, MPIR_Comm * comm, int context_offset,
+                                   int *flag, MPI_Status * status);
+typedef int (*MPIDI_SHM_win_set_info_t) (MPIR_Win * win, MPIR_Info * info);
+typedef int (*MPIDI_SHM_win_shared_query_t) (MPIR_Win * win, int rank, MPI_Aint * size,
+                                             int *disp_unit, void *baseptr);    /* NOTE(review): baseptr is void * here and in MPIDI_SHM_win_allocate_t, but MPIDI_SHM_win_allocate_shared_t uses void **base_ptr - confirm what callers pass */
+typedef int (*MPIDI_SHM_put_t) (const void *origin_addr, int origin_count,
+                                MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp,
+                                int target_count, MPI_Datatype target_datatype, MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_start_t) (MPIR_Group * group, int assert, MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_complete_t) (MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_post_t) (MPIR_Group * group, int assert, MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_wait_t) (MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_test_t) (MPIR_Win * win, int *flag);
+typedef int (*MPIDI_SHM_win_lock_t) (int lock_type, int rank, int assert, MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_unlock_t) (int rank, MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_get_info_t) (MPIR_Win * win, MPIR_Info ** info_p_p);
+typedef int (*MPIDI_SHM_get_t) (void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+                                int target_rank, MPI_Aint target_disp, int target_count,
+                                MPI_Datatype target_datatype, MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_free_t) (MPIR_Win ** win_ptr);
+typedef int (*MPIDI_SHM_win_fence_t) (int assert, MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_create_t) (void *base, MPI_Aint length, int disp_unit, MPIR_Info * info,
+                                       MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr);
+typedef int (*MPIDI_SHM_accumulate_t) (const void *origin_addr, int origin_count,
+                                       MPI_Datatype origin_datatype, int target_rank,
+                                       MPI_Aint target_disp, int target_count,
+                                       MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_attach_t) (MPIR_Win * win, void *base, MPI_Aint size);
+typedef int (*MPIDI_SHM_win_allocate_shared_t) (MPI_Aint size, int disp_unit, MPIR_Info * info_ptr,
+                                                MPIR_Comm * comm_ptr, void **base_ptr,
+                                                MPIR_Win ** win_ptr);
+typedef int (*MPIDI_SHM_rput_t) (const void *origin_addr, int origin_count,
+                                 MPI_Datatype origin_datatype, int target_rank,
+                                 MPI_Aint target_disp, int target_count,
+                                 MPI_Datatype target_datatype, MPIR_Win * win,
+                                 MPIR_Request ** request);      /* r* types have the put/get/accumulate parameter lists plus a trailing MPIR_Request ** request */
+typedef int (*MPIDI_SHM_win_flush_local_t) (int rank, MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_detach_t) (MPIR_Win * win, const void *base);
+typedef int (*MPIDI_SHM_compare_and_swap_t) (const void *origin_addr, const void *compare_addr,
+                                             void *result_addr, MPI_Datatype datatype,
+                                             int target_rank, MPI_Aint target_disp, MPIR_Win * win);
+typedef int (*MPIDI_SHM_raccumulate_t) (const void *origin_addr, int origin_count,
+                                        MPI_Datatype origin_datatype, int target_rank,
+                                        MPI_Aint target_disp, int target_count,
+                                        MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win,
+                                        MPIR_Request ** request);
+typedef int (*MPIDI_SHM_rget_accumulate_t) (const void *origin_addr, int origin_count,
+                                            MPI_Datatype origin_datatype, void *result_addr,
+                                            int result_count, MPI_Datatype result_datatype,
+                                            int target_rank, MPI_Aint target_disp, int target_count,
+                                            MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win,
+                                            MPIR_Request ** request);
+typedef int (*MPIDI_SHM_fetch_and_op_t) (const void *origin_addr, void *result_addr,
+                                         MPI_Datatype datatype, int target_rank,
+                                         MPI_Aint target_disp, MPI_Op op, MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_allocate_t) (MPI_Aint size, int disp_unit, MPIR_Info * info,
+                                         MPIR_Comm * comm, void *baseptr, MPIR_Win ** win);
+typedef int (*MPIDI_SHM_win_flush_t) (int rank, MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_flush_local_all_t) (MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_unlock_all_t) (MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_create_dynamic_t) (MPIR_Info * info, MPIR_Comm * comm, MPIR_Win ** win);
+typedef int (*MPIDI_SHM_rget_t) (void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+                                 int target_rank, MPI_Aint target_disp, int target_count,
+                                 MPI_Datatype target_datatype, MPIR_Win * win,
+                                 MPIR_Request ** request);
+typedef int (*MPIDI_SHM_win_sync_t) (MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_flush_all_t) (MPIR_Win * win);
+typedef int (*MPIDI_SHM_get_accumulate_t) (const void *origin_addr, int origin_count,
+                                           MPI_Datatype origin_datatype, void *result_addr,
+                                           int result_count, MPI_Datatype result_datatype,
+                                           int target_rank, MPI_Aint target_disp, int target_count,
+                                           MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win);
+typedef int (*MPIDI_SHM_win_lock_all_t) (int assert, MPIR_Win * win);
+typedef int (*MPIDI_SHM_barrier_t) (MPIR_Comm * comm, MPIR_Errflag_t * errflag);       /* every type from barrier through exscan ends with an MPIR_Errflag_t * errflag parameter */
+typedef int (*MPIDI_SHM_bcast_t) (void *buffer, int count, MPI_Datatype datatype, int root,
+                                  MPIR_Comm * comm, MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_allreduce_t) (const void *sendbuf, void *recvbuf, int count,
+                                      MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                      MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_allgather_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                      void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm, MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_allgatherv_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                       void *recvbuf, const int *recvcounts, const int *displs,
+                                       MPI_Datatype recvtype, MPIR_Comm * comm,
+                                       MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_scatter_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount, MPI_Datatype recvtype, int root,
+                                    MPIR_Comm * comm, MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_scatterv_t) (const void *sendbuf, const int *sendcounts, const int *displs,
+                                     MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                     MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr,
+                                     MPIR_Errflag_t * errflag); /* the communicator is named comm_ptr here and in the reduce* types, comm elsewhere; the type is the same */
+typedef int (*MPIDI_SHM_gather_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, int recvcount, MPI_Datatype recvtype, int root,
+                                   MPIR_Comm * comm, MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_gatherv_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, const int *recvcounts, const int *displs,
+                                    MPI_Datatype recvtype, int root, MPIR_Comm * comm,
+                                    MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_alltoall_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm, MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_alltoallv_t) (const void *sendbuf, const int *sendcounts,
+                                      const int *sdispls, MPI_Datatype sendtype, void *recvbuf,
+                                      const int *recvcounts, const int *rdispls,
+                                      MPI_Datatype recvtype, MPIR_Comm * comm,
+                                      MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_alltoallw_t) (const void *sendbuf, const int *sendcounts,
+                                      const int *sdispls, const MPI_Datatype sendtypes[],
+                                      void *recvbuf, const int *recvcounts, const int *rdispls,
+                                      const MPI_Datatype recvtypes[], MPIR_Comm * comm,
+                                      MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_reduce_t) (const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, int root, MPIR_Comm * comm_ptr,
+                                   MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_reduce_scatter_t) (const void *sendbuf, void *recvbuf,
+                                           const int *recvcounts, MPI_Datatype datatype, MPI_Op op,
+                                           MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_reduce_scatter_block_t) (const void *sendbuf, void *recvbuf, int recvcount,
+                                                 MPI_Datatype datatype, MPI_Op op,
+                                                 MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_scan_t) (const void *sendbuf, void *recvbuf, int count,
+                                 MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                 MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_exscan_t) (const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                   MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_neighbor_allgather_t) (const void *sendbuf, int sendcount,
+                                               MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                               MPI_Datatype recvtype, MPIR_Comm * comm,
+                                               MPIR_Errflag_t * errflag);       /* neighbor_* types end with errflag; the ineighbor_* types end with MPI_Request * req instead */
+typedef int (*MPIDI_SHM_neighbor_allgatherv_t) (const void *sendbuf, int sendcount,
+                                                MPI_Datatype sendtype, void *recvbuf,
+                                                const int *recvcounts, const int *displs,
+                                                MPI_Datatype recvtype, MPIR_Comm * comm,
+                                                MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_neighbor_alltoallv_t) (const void *sendbuf, const int *sendcounts,
+                                               const int *sdispls, MPI_Datatype sendtype,
+                                               void *recvbuf, const int *recvcounts,
+                                               const int *rdispls, MPI_Datatype recvtype,
+                                               MPIR_Comm * comm, MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_neighbor_alltoallw_t) (const void *sendbuf, const int *sendcounts,
+                                               const MPI_Aint * sdispls,
+                                               const MPI_Datatype * sendtypes, void *recvbuf,
+                                               const int *recvcounts, const MPI_Aint * rdispls,
+                                               const MPI_Datatype * recvtypes, MPIR_Comm * comm,
+                                               MPIR_Errflag_t * errflag);       /* displacements are MPI_Aint here, unlike the int displacements of MPIDI_SHM_alltoallw_t */
+typedef int (*MPIDI_SHM_neighbor_alltoall_t) (const void *sendbuf, int sendcount,
+                                              MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                              MPI_Datatype recvtype, MPIR_Comm * comm,
+                                              MPIR_Errflag_t * errflag);
+typedef int (*MPIDI_SHM_ineighbor_allgather_t) (const void *sendbuf, int sendcount,
+                                                MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                                MPI_Datatype recvtype, MPIR_Comm * comm,
+                                                MPI_Request * req);
+typedef int (*MPIDI_SHM_ineighbor_allgatherv_t) (const void *sendbuf, int sendcount,
+                                                 MPI_Datatype sendtype, void *recvbuf,
+                                                 const int *recvcounts, const int *displs,
+                                                 MPI_Datatype recvtype, MPIR_Comm * comm,
+                                                 MPI_Request * req);
+typedef int (*MPIDI_SHM_ineighbor_alltoall_t) (const void *sendbuf, int sendcount,
+                                               MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                               MPI_Datatype recvtype, MPIR_Comm * comm,
+                                               MPI_Request * req);
+typedef int (*MPIDI_SHM_ineighbor_alltoallv_t) (const void *sendbuf, const int *sendcounts,
+                                                const int *sdispls, MPI_Datatype sendtype,
+                                                void *recvbuf, const int *recvcounts,
+                                                const int *rdispls, MPI_Datatype recvtype,
+                                                MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_SHM_ineighbor_alltoallw_t) (const void *sendbuf, const int *sendcounts,
+                                                const MPI_Aint * sdispls,
+                                                const MPI_Datatype * sendtypes, void *recvbuf,
+                                                const int *recvcounts, const MPI_Aint * rdispls,
+                                                const MPI_Datatype * recvtypes, MPIR_Comm * comm,
+                                                MPI_Request * req);
+typedef int (*MPIDI_SHM_ibarrier_t) (MPIR_Comm * comm, MPI_Request * req);     /* i* types end with MPI_Request * req, not the MPIR_Request ** used by the point-to-point types */
+typedef int (*MPIDI_SHM_ibcast_t) (void *buffer, int count, MPI_Datatype datatype, int root,
+                                   MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_SHM_iallgather_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                       void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                       MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_SHM_iallgatherv_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                        void *recvbuf, const int *recvcounts, const int *displs,
+                                        MPI_Datatype recvtype, MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_SHM_iallreduce_t) (const void *sendbuf, void *recvbuf, int count,
+                                       MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                       MPI_Request * req);
+typedef int (*MPIDI_SHM_ialltoall_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                      void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_SHM_ialltoallv_t) (const void *sendbuf, const int *sendcounts,
+                                       const int *sdispls, MPI_Datatype sendtype, void *recvbuf,
+                                       const int *recvcounts, const int *rdispls,
+                                       MPI_Datatype recvtype, MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_SHM_ialltoallw_t) (const void *sendbuf, const int *sendcounts,
+                                       const int *sdispls, const MPI_Datatype sendtypes[],
+                                       void *recvbuf, const int *recvcounts, const int *rdispls,
+                                       const MPI_Datatype recvtypes[], MPIR_Comm * comm,
+                                       MPI_Request * req);
+typedef int (*MPIDI_SHM_iexscan_t) (const void *sendbuf, void *recvbuf, int count,
+                                    MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                    MPI_Request * req);
+typedef int (*MPIDI_SHM_igather_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount, MPI_Datatype recvtype, int root,
+                                    MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_SHM_igatherv_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, const int *recvcounts, const int *displs,
+                                     MPI_Datatype recvtype, int root, MPIR_Comm * comm,
+                                     MPI_Request * req);
+typedef int (*MPIDI_SHM_ireduce_scatter_block_t) (const void *sendbuf, void *recvbuf, int recvcount,
+                                                  MPI_Datatype datatype, MPI_Op op,
+                                                  MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_SHM_ireduce_scatter_t) (const void *sendbuf, void *recvbuf,
+                                            const int *recvcounts, MPI_Datatype datatype, MPI_Op op,
+                                            MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_SHM_ireduce_t) (const void *sendbuf, void *recvbuf, int count,
+                                    MPI_Datatype datatype, MPI_Op op, int root,
+                                    MPIR_Comm * comm_ptr, MPI_Request * req);
+typedef int (*MPIDI_SHM_iscan_t) (const void *sendbuf, void *recvbuf, int count,
+                                  MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                  MPI_Request * req);
+typedef int (*MPIDI_SHM_iscatter_t) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount, MPI_Datatype recvtype, int root,
+                                     MPIR_Comm * comm, MPI_Request * req);
+typedef int (*MPIDI_SHM_iscatterv_t) (const void *sendbuf, const int *sendcounts, const int *displs,
+                                      MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                      MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr,
+                                      MPI_Request * req);
+
+typedef struct MPIDI_SHM_funcs {       /* function table: one pointer member per MPIDI_SHM_*_t type; NOTE(review): keep member order stable in case a table is filled positionally */
+    MPIDI_SHM_init_t init;
+    MPIDI_SHM_finalize_t finalize;
+    MPIDI_SHM_progress_t progress;
+    MPIDI_SHM_reg_hdr_handler_t reg_hdr_handler;
+    MPIDI_SHM_comm_connect_t comm_connect;
+    MPIDI_SHM_comm_disconnect_t comm_disconnect;
+    MPIDI_SHM_open_port_t open_port;
+    MPIDI_SHM_close_port_t close_port;
+    MPIDI_SHM_comm_accept_t comm_accept;
+    MPIDI_SHM_send_am_hdr_t send_am_hdr;        /* send/inject members for the am_hdr, am and amv forms, followed by their *_reply counterparts */
+    MPIDI_SHM_inject_am_hdr_t inject_am_hdr;
+    MPIDI_SHM_send_am_t send_am;
+    MPIDI_SHM_inject_am_t inject_am;
+    MPIDI_SHM_send_amv_t send_amv;
+    MPIDI_SHM_inject_amv_t inject_amv;
+    MPIDI_SHM_send_am_hdr_reply_t send_am_hdr_reply;
+    MPIDI_SHM_inject_am_hdr_reply_t inject_am_hdr_reply;
+    MPIDI_SHM_send_am_reply_t send_am_reply;
+    MPIDI_SHM_inject_am_reply_t inject_am_reply;
+    MPIDI_SHM_send_amv_reply_t send_amv_reply;
+    MPIDI_SHM_inject_amv_reply_t inject_amv_reply;
+    MPIDI_SHM_am_hdr_max_sz_t am_hdr_max_sz;
+    MPIDI_SHM_am_inject_max_sz_t am_inject_max_sz;
+    MPIDI_SHM_am_recv_t am_recv;
+    /* Routines that handle addressing */
+    MPIDI_SHM_comm_get_lpid_t comm_get_lpid;
+    MPIDI_SHM_gpid_get_t gpid_get;
+    MPIDI_SHM_get_node_id_t get_node_id;
+    MPIDI_SHM_get_max_node_id_t get_max_node_id;
+    MPIDI_SHM_getallincomm_t getallincomm;
+    MPIDI_SHM_gpid_tolpidarray_t gpid_tolpidarray;
+    MPIDI_SHM_create_intercomm_from_lpids_t create_intercomm_from_lpids;
+    MPIDI_SHM_comm_create_t comm_create;
+    MPIDI_SHM_comm_destroy_t comm_destroy;
+    /* Request allocation routines */
+    MPIDI_SHM_am_request_init_t am_request_init;        /* the two request hooks are the only void-returning members */
+    MPIDI_SHM_am_request_finalize_t am_request_finalize;
+} MPIDI_SHM_funcs_t;
+
+typedef struct MPIDI_SHM_native_funcs { /* second SHM table: point-to-point, memory, probe, window and collective entries */
+    MPIDI_SHM_send_t send;      /* send side; leading members match the MPIDI_SHM_send(), MPIDI_SHM_ssend(), ... prototypes below, in order */
+    MPIDI_SHM_ssend_t ssend;
+    MPIDI_SHM_startall_t startall;
+    MPIDI_SHM_send_init_t send_init;
+    MPIDI_SHM_ssend_init_t ssend_init;
+    MPIDI_SHM_rsend_init_t rsend_init;
+    MPIDI_SHM_bsend_init_t bsend_init;
+    MPIDI_SHM_isend_t isend;
+    MPIDI_SHM_issend_t issend;
+    MPIDI_SHM_cancel_send_t cancel_send;
+    MPIDI_SHM_recv_init_t recv_init;    /* receive side */
+    MPIDI_SHM_recv_t recv;
+    MPIDI_SHM_irecv_t irecv;
+    MPIDI_SHM_imrecv_t imrecv;
+    MPIDI_SHM_cancel_recv_t cancel_recv;
+    MPIDI_SHM_alloc_mem_t alloc_mem;    /* memory allocate / free pair */
+    MPIDI_SHM_free_mem_t free_mem;
+    MPIDI_SHM_improbe_t improbe;        /* probe entries */
+    MPIDI_SHM_iprobe_t iprobe;
+    MPIDI_SHM_win_set_info_t win_set_info;      /* window (win_*) and put/get/accumulate entries run from here through win_lock_all */
+    MPIDI_SHM_win_shared_query_t win_shared_query;
+    MPIDI_SHM_put_t put;
+    MPIDI_SHM_win_start_t win_start;
+    MPIDI_SHM_win_complete_t win_complete;
+    MPIDI_SHM_win_post_t win_post;
+    MPIDI_SHM_win_wait_t win_wait;
+    MPIDI_SHM_win_test_t win_test;
+    MPIDI_SHM_win_lock_t win_lock;
+    MPIDI_SHM_win_unlock_t win_unlock;
+    MPIDI_SHM_win_get_info_t win_get_info;
+    MPIDI_SHM_get_t get;
+    MPIDI_SHM_win_free_t win_free;
+    MPIDI_SHM_win_fence_t win_fence;
+    MPIDI_SHM_win_create_t win_create;
+    MPIDI_SHM_accumulate_t accumulate;
+    MPIDI_SHM_win_attach_t win_attach;
+    MPIDI_SHM_win_allocate_shared_t win_allocate_shared;
+    MPIDI_SHM_rput_t rput;
+    MPIDI_SHM_win_flush_local_t win_flush_local;
+    MPIDI_SHM_win_detach_t win_detach;
+    MPIDI_SHM_compare_and_swap_t compare_and_swap;
+    MPIDI_SHM_raccumulate_t raccumulate;
+    MPIDI_SHM_rget_accumulate_t rget_accumulate;
+    MPIDI_SHM_fetch_and_op_t fetch_and_op;
+    MPIDI_SHM_win_allocate_t win_allocate;
+    MPIDI_SHM_win_flush_t win_flush;
+    MPIDI_SHM_win_flush_local_all_t win_flush_local_all;
+    MPIDI_SHM_win_unlock_all_t win_unlock_all;
+    MPIDI_SHM_win_create_dynamic_t win_create_dynamic;
+    MPIDI_SHM_rget_t rget;
+    MPIDI_SHM_win_sync_t win_sync;
+    MPIDI_SHM_win_flush_all_t win_flush_all;
+    MPIDI_SHM_get_accumulate_t get_accumulate;
+    MPIDI_SHM_win_lock_all_t win_lock_all;
+    /* Collectives: plain names first, then neighbor_* and ineighbor_*, then the remaining i-prefixed entries */
+    MPIDI_SHM_barrier_t barrier;
+    MPIDI_SHM_bcast_t bcast;
+    MPIDI_SHM_allreduce_t allreduce;
+    MPIDI_SHM_allgather_t allgather;
+    MPIDI_SHM_allgatherv_t allgatherv;
+    MPIDI_SHM_scatter_t scatter;
+    MPIDI_SHM_scatterv_t scatterv;
+    MPIDI_SHM_gather_t gather;
+    MPIDI_SHM_gatherv_t gatherv;
+    MPIDI_SHM_alltoall_t alltoall;
+    MPIDI_SHM_alltoallv_t alltoallv;
+    MPIDI_SHM_alltoallw_t alltoallw;
+    MPIDI_SHM_reduce_t reduce;
+    MPIDI_SHM_reduce_scatter_t reduce_scatter;
+    MPIDI_SHM_reduce_scatter_block_t reduce_scatter_block;
+    MPIDI_SHM_scan_t scan;
+    MPIDI_SHM_exscan_t exscan;
+    MPIDI_SHM_neighbor_allgather_t neighbor_allgather;  /* neighbor_* entries */
+    MPIDI_SHM_neighbor_allgatherv_t neighbor_allgatherv;
+    MPIDI_SHM_neighbor_alltoall_t neighbor_alltoall;
+    MPIDI_SHM_neighbor_alltoallv_t neighbor_alltoallv;
+    MPIDI_SHM_neighbor_alltoallw_t neighbor_alltoallw;
+    MPIDI_SHM_ineighbor_allgather_t ineighbor_allgather;        /* ineighbor_* entries, same five operations as neighbor_* */
+    MPIDI_SHM_ineighbor_allgatherv_t ineighbor_allgatherv;
+    MPIDI_SHM_ineighbor_alltoall_t ineighbor_alltoall;
+    MPIDI_SHM_ineighbor_alltoallv_t ineighbor_alltoallv;
+    MPIDI_SHM_ineighbor_alltoallw_t ineighbor_alltoallw;
+    MPIDI_SHM_ibarrier_t ibarrier;      /* i-prefixed counterparts of the plain collectives above (listed in a different order) */
+    MPIDI_SHM_ibcast_t ibcast;
+    MPIDI_SHM_iallgather_t iallgather;
+    MPIDI_SHM_iallgatherv_t iallgatherv;
+    MPIDI_SHM_iallreduce_t iallreduce;
+    MPIDI_SHM_ialltoall_t ialltoall;
+    MPIDI_SHM_ialltoallv_t ialltoallv;
+    MPIDI_SHM_ialltoallw_t ialltoallw;
+    MPIDI_SHM_iexscan_t iexscan;
+    MPIDI_SHM_igather_t igather;
+    MPIDI_SHM_igatherv_t igatherv;
+    MPIDI_SHM_ireduce_scatter_block_t ireduce_scatter_block;
+    MPIDI_SHM_ireduce_scatter_t ireduce_scatter;
+    MPIDI_SHM_ireduce_t ireduce;
+    MPIDI_SHM_iscan_t iscan;
+    MPIDI_SHM_iscatter_t iscatter;
+    MPIDI_SHM_iscatterv_t iscatterv;
+} MPIDI_SHM_native_funcs_t;     /* NOTE(review): member order matters to any positional initializer of this struct - do not reorder */
+
+extern MPIDI_SHM_funcs_t *MPIDI_SHM_funcs[];    /* array of pointers to MPIDI_SHM_funcs_t tables; defined in another file */
+extern MPIDI_SHM_funcs_t *MPIDI_SHM_func;       /* single table pointer; presumably the table selected at run time - TODO confirm */
+extern MPIDI_SHM_native_funcs_t *MPIDI_SHM_native_funcs[];      /* same arrangement for the native-function tables */
+extern MPIDI_SHM_native_funcs_t *MPIDI_SHM_native_func;
+extern int MPIDI_num_shms;      /* presumably the number of entries in the arrays above - TODO confirm */
+extern char MPIDI_SHM_strings[][MPIDI_MAX_SHM_STRING_LEN];      /* rows of MPIDI_MAX_SHM_STRING_LEN chars; presumably one name per table - TODO confirm */
+
+#ifndef MPIDI_SHM_STATIC_INLINE_PREFIX  /* default only: a definition made before this point takes precedence */
+#define MPIDI_SHM_STATIC_INLINE_PREFIX __attribute__((always_inline)) static inline
+#endif /* MPIDI_SHM_STATIC_INLINE_PREFIX; NOTE(review): __attribute__ is a GNU extension and is not guarded by a compiler check here */
+
+#ifndef MPIDI_SHM_STATIC_INLINE_SUFFIX  /* appended after the parameter list of every prototype below */
+#define MPIDI_SHM_STATIC_INLINE_SUFFIX __attribute__((always_inline))
+#endif /* MPIDI_SHM_STATIC_INLINE_SUFFIX; repeats the always_inline attribute already carried by the default PREFIX */
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_init(int rank,     /* setup, progress and port/connection prototypes; declarations only, no bodies here */
+                                                  int size) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_finalize(void) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_progress(int blocking) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_reg_hdr_handler(int handler_id,
+                                                             MPIDI_SHM_am_origin_handler_fn
+                                                             origin_handler_fn,
+                                                             MPIDI_SHM_am_target_handler_fn
+                                                             target_handler_fn)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* takes one origin-side and one target-side callback together with handler_id */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_comm_connect(const char *port_name, MPIR_Info * info,
+                                                          int root, MPIR_Comm * comm,
+                                                          MPIR_Comm **
+                                                          newcomm_ptr)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* same parameter list as MPIDI_SHM_comm_accept below */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_comm_disconnect(MPIR_Comm *
+                                                             comm_ptr)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_open_port(MPIR_Info * info_ptr,
+                                                       char *port_name)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* port_name is a writable char * here, const char * in close_port/connect/accept */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_close_port(const char *port_name)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_comm_accept(const char *port_name, MPIR_Info * info,
+                                                         int root, MPIR_Comm * comm,
+                                                         MPIR_Comm **
+                                                         newcomm_ptr)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* with the default PREFIX these are static inline, so a caller's translation unit must also see the body */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_send_am_hdr(int rank, MPIR_Comm * comm, int handler_id,
+                                                         const void *am_hdr, size_t am_hdr_sz,
+                                                         MPIR_Request * sreq,
+                                                         void *shm_context)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* active-message (AM) prototypes start here; this one has a header but no data/count/datatype parameters */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_inject_am_hdr(int rank, MPIR_Comm * comm,
+                                                           int handler_id, const void *am_hdr,
+                                                           size_t am_hdr_sz,
+                                                           void *shm_context)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* same as send_am_hdr minus the sreq parameter; every inject_* below drops sreq likewise */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_send_am(int rank, MPIR_Comm * comm, int handler_id,
+                                                     const void *am_hdr, size_t am_hdr_sz,
+                                                     const void *data, MPI_Count count,
+                                                     MPI_Datatype datatype, MPIR_Request * sreq,
+                                                     void *shm_context)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* adds a (data, count, datatype) triple to the header parameters */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_inject_am(int rank, MPIR_Comm * comm, int handler_id,
+                                                       const void *am_hdr, size_t am_hdr_sz,
+                                                       const void *data, MPI_Count count,
+                                                       MPI_Datatype datatype,
+                                                       void *shm_context)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_send_amv(int rank, MPIR_Comm * comm, int handler_id,
+                                                      struct iovec *am_hdrs, size_t iov_len,
+                                                      const void *data, MPI_Count count,
+                                                      MPI_Datatype datatype, MPIR_Request * sreq,
+                                                      void *shm_context)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* amv forms pass the header as struct iovec *am_hdrs plus iov_len instead of (am_hdr, am_hdr_sz) */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_inject_amv(int rank, MPIR_Comm * comm, int handler_id,
+                                                        struct iovec *am_hdrs, size_t iov_len,
+                                                        const void *data, MPI_Count count,
+                                                        MPI_Datatype datatype,
+                                                        void *shm_context)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_send_am_hdr_reply(MPIR_Context_id_t context_id,
+                                                               int src_rank, int handler_id,
+                                                               const void *am_hdr, size_t am_hdr_sz,
+                                                               MPIR_Request *
+                                                               sreq) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_inject_am_hdr_reply(MPIR_Context_id_t context_id,
+                                                                 int src_rank, int handler_id,
+                                                                 const void *am_hdr,
+                                                                 size_t am_hdr_sz)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* *_reply forms take (context_id, src_rank) instead of (rank, comm) and have no shm_context */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_send_am_reply(MPIR_Context_id_t context_id,
+                                                           int src_rank, int handler_id,
+                                                           const void *am_hdr, size_t am_hdr_sz,
+                                                           const void *data, MPI_Count count,
+                                                           MPI_Datatype datatype,
+                                                           MPIR_Request *
+                                                           sreq) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_inject_am_reply(MPIR_Context_id_t context_id,
+                                                             int src_rank, int handler_id,
+                                                             const void *am_hdr, size_t am_hdr_sz,
+                                                             const void *data, MPI_Count count,
+                                                             MPI_Datatype datatype)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_send_amv_reply(MPIR_Context_id_t context_id,
+                                                            int src_rank, int handler_id,
+                                                            struct iovec *am_hdr, size_t iov_len,
+                                                            const void *data, MPI_Count count,
+                                                            MPI_Datatype datatype,      /* NOTE(review): the iovec parameter is named am_hdr here but am_hdrs in the other amv prototypes */
+                                                            MPIR_Request *
+                                                            sreq) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_inject_amv_reply(MPIR_Context_id_t context_id,
+                                                              int src_rank, int handler_id,
+                                                              struct iovec *am_hdrs, size_t iov_len,
+                                                              const void *data, MPI_Count count,
+                                                              MPI_Datatype datatype)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX size_t MPIDI_SHM_am_hdr_max_sz(void) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX size_t MPIDI_SHM_am_inject_max_sz(void)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* both *_max_sz queries take no arguments and return size_t */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_am_recv(MPIR_Request *
+                                                     req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_comm_get_lpid(MPIR_Comm * comm_ptr, int idx,
+                                                           int *lpid_ptr,
+                                                           MPL_bool is_remote)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* addressing prototypes start here; lpid_ptr is presumably the output slot - TODO confirm */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_gpid_get(MPIR_Comm * comm_ptr, int rank,
+                                                      MPIR_Gpid *
+                                                      gpid) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_get_node_id(MPIR_Comm * comm, int rank,
+                                                         MPID_Node_id_t *
+                                                         id_p) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_get_max_node_id(MPIR_Comm * comm,
+                                                             MPID_Node_id_t *
+                                                             max_id_p)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* like get_node_id but without a rank argument */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_getallincomm(MPIR_Comm * comm_ptr, int local_size,
+                                                          MPIR_Gpid local_gpid[],
+                                                          int *singleAVT)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_gpid_tolpidarray(int size, MPIR_Gpid gpid[],
+                                                              int lpid[])
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* gpid[] and lpid[] are presumably both of length size - TODO confirm */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr,
+                                                                         int size,
+                                                                         const int lpids[])
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* lpids[] is const here, unlike the lpid[] array of gpid_tolpidarray */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_comm_create(MPIR_Comm *
+                                                         comm) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_comm_destroy(MPIR_Comm *
+                                                          comm) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX void MPIDI_SHM_am_request_init(MPIR_Request *
+                                                              req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX void MPIDI_SHM_am_request_finalize(MPIR_Request *
+                                                                  req)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* the two am_request_* hooks return void; every other prototype in this group returns int */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_send(const void *buf, int count, MPI_Datatype datatype,
+                                                  int rank, int tag, MPIR_Comm * comm,
+                                                  int context_offset,   /* point-to-point prototypes: send, ssend, the four *_init sends, isend and issend all share this parameter list */
+                                                  MPIR_Request **
+                                                  request) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ssend(const void *buf, int count,
+                                                   MPI_Datatype datatype, int rank, int tag,
+                                                   MPIR_Comm * comm, int context_offset,
+                                                   MPIR_Request **
+                                                   request) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_startall(int count,
+                                                      MPIR_Request *
+                                                      requests[]) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_send_init(const void *buf, int count,
+                                                       MPI_Datatype datatype, int rank, int tag,
+                                                       MPIR_Comm * comm, int context_offset,
+                                                       MPIR_Request **
+                                                       request) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ssend_init(const void *buf, int count,
+                                                        MPI_Datatype datatype, int rank, int tag,
+                                                        MPIR_Comm * comm, int context_offset,
+                                                        MPIR_Request **
+                                                        request) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_rsend_init(const void *buf, int count,
+                                                        MPI_Datatype datatype, int rank, int tag,
+                                                        MPIR_Comm * comm, int context_offset,
+                                                        MPIR_Request **
+                                                        request) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_bsend_init(const void *buf, int count,
+                                                        MPI_Datatype datatype, int rank, int tag,
+                                                        MPIR_Comm * comm, int context_offset,
+                                                        MPIR_Request **
+                                                        request) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_isend(const void *buf, int count,
+                                                   MPI_Datatype datatype, int rank, int tag,
+                                                   MPIR_Comm * comm, int context_offset,
+                                                   MPIR_Request **
+                                                   request) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_issend(const void *buf, int count,
+                                                    MPI_Datatype datatype, int rank, int tag,
+                                                    MPIR_Comm * comm, int context_offset,
+                                                    MPIR_Request **
+                                                    request) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_cancel_send(MPIR_Request *
+                                                         sreq) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_recv_init(void *buf, int count, MPI_Datatype datatype,
+                                                       int rank, int tag, MPIR_Comm * comm,
+                                                       int context_offset,      /* receive side: buf is non-const; recv_init and irecv share this list, recv adds MPI_Status *status */
+                                                       MPIR_Request **
+                                                       request) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_recv(void *buf, int count, MPI_Datatype datatype,
+                                                  int rank, int tag, MPIR_Comm * comm,
+                                                  int context_offset, MPI_Status * status,
+                                                  MPIR_Request **
+                                                  request) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_irecv(void *buf, int count, MPI_Datatype datatype,
+                                                   int rank, int tag, MPIR_Comm * comm,
+                                                   int context_offset,
+                                                   MPIR_Request **
+                                                   request) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_imrecv(void *buf, int count, MPI_Datatype datatype,
+                                                    MPIR_Request * message,     /* takes a message request instead of rank/tag/comm */
+                                                    MPIR_Request **
+                                                    rreqp) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_cancel_recv(MPIR_Request *
+                                                         rreq) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX void *MPIDI_SHM_alloc_mem(size_t size,   /* only prototype in this group that returns a pointer (void *) rather than int */
+                                                         MPIR_Info *
+                                                         info_ptr) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_free_mem(void *ptr) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_improbe(int source, int tag, MPIR_Comm * comm,
+                                                     int context_offset, int *flag,
+                                                     MPIR_Request ** message,   /* improbe has this extra message parameter; iprobe below does not */
+                                                     MPI_Status *
+                                                     status) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_iprobe(int source, int tag, MPIR_Comm * comm,
+                                                    int context_offset, int *flag,
+                                                    MPI_Status *
+                                                    status) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_set_info(MPIR_Win * win,      /* window (win_*) and put/get/accumulate prototypes follow */
+                                                          MPIR_Info *
+                                                          info) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_shared_query(MPIR_Win * win, int rank,
+                                                              MPI_Aint * size, int *disp_unit,
+                                                              void *baseptr)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* baseptr is declared void * here and in win_allocate; win_allocate_shared uses void **base_ptr */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_put(const void *origin_addr, int origin_count,
+                                                 MPI_Datatype origin_datatype, int target_rank,
+                                                 MPI_Aint target_disp, int target_count,
+                                                 MPI_Datatype target_datatype,  /* put and get share this list; only the constness of origin_addr differs */
+                                                 MPIR_Win * win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_start(MPIR_Group * group, int assert,
+                                                       MPIR_Win *
+                                                       win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_complete(MPIR_Win *
+                                                          win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_post(MPIR_Group * group, int assert,
+                                                      MPIR_Win *
+                                                      win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_wait(MPIR_Win *
+                                                      win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_test(MPIR_Win * win,
+                                                      int *flag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_lock(int lock_type, int rank, int assert,
+                                                      MPIR_Win *
+                                                      win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_unlock(int rank,
+                                                        MPIR_Win *
+                                                        win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_get_info(MPIR_Win * win,
+                                                          MPIR_Info **
+                                                          info_p_p) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_get(void *origin_addr, int origin_count,
+                                                 MPI_Datatype origin_datatype, int target_rank,
+                                                 MPI_Aint target_disp, int target_count,
+                                                 MPI_Datatype target_datatype,
+                                                 MPIR_Win * win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_free(MPIR_Win **
+                                                      win_ptr) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_fence(int assert,
+                                                       MPIR_Win *
+                                                       win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_create(void *base, MPI_Aint length, int disp_unit,
+                                                        MPIR_Info * info, MPIR_Comm * comm_ptr,
+                                                        MPIR_Win **
+                                                        win_ptr) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_accumulate(const void *origin_addr, int origin_count,
+                                                        MPI_Datatype origin_datatype,
+                                                        int target_rank, MPI_Aint target_disp,
+                                                        int target_count,
+                                                        MPI_Datatype target_datatype, MPI_Op op,
+                                                        MPIR_Win *
+                                                        win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_attach(MPIR_Win * win, void *base,
+                                                        MPI_Aint size)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* win_detach below takes the same win plus a const base and no size */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_allocate_shared(MPI_Aint size, int disp_unit,
+                                                                 MPIR_Info * info_ptr,
+                                                                 MPIR_Comm * comm_ptr,
+                                                                 void **base_ptr,
+                                                                 MPIR_Win **
+                                                                 win_ptr)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_rput(const void *origin_addr, int origin_count,
+                                                  MPI_Datatype origin_datatype, int target_rank,
+                                                  MPI_Aint target_disp, int target_count,
+                                                  MPI_Datatype target_datatype, MPIR_Win * win,
+                                                  MPIR_Request **
+                                                  request) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_flush_local(int rank,
+                                                             MPIR_Win *
+                                                             win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_detach(MPIR_Win * win,
+                                                        const void *base)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_compare_and_swap(const void *origin_addr,
+                                                              const void *compare_addr,
+                                                              void *result_addr,
+                                                              MPI_Datatype datatype,
+                                                              int target_rank, MPI_Aint target_disp,
+                                                              MPIR_Win *
+                                                              win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_raccumulate(const void *origin_addr, int origin_count,
+                                                         MPI_Datatype origin_datatype,
+                                                         int target_rank, MPI_Aint target_disp,
+                                                         int target_count,
+                                                         MPI_Datatype target_datatype, MPI_Op op,
+                                                         MPIR_Win * win,
+                                                         MPIR_Request **
+                                                         request) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_rget_accumulate(const void *origin_addr,
+                                                             int origin_count,
+                                                             MPI_Datatype origin_datatype,
+                                                             void *result_addr, int result_count,
+                                                             MPI_Datatype result_datatype,
+                                                             int target_rank, MPI_Aint target_disp,
+                                                             int target_count,
+                                                             MPI_Datatype target_datatype,
+                                                             MPI_Op op, MPIR_Win * win,
+                                                             MPIR_Request **
+                                                             request)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* each r-prefixed prototype is its non-r counterpart's list plus a trailing MPIR_Request **request */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_fetch_and_op(const void *origin_addr,
+                                                          void *result_addr, MPI_Datatype datatype,
+                                                          int target_rank, MPI_Aint target_disp,
+                                                          MPI_Op op,
+                                                          MPIR_Win *
+                                                          win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_allocate(MPI_Aint size, int disp_unit,
+                                                          MPIR_Info * info, MPIR_Comm * comm,
+                                                          void *baseptr,
+                                                          MPIR_Win **
+                                                          win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_flush(int rank,
+                                                       MPIR_Win *
+                                                       win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_flush_local_all(MPIR_Win *
+                                                                 win)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;     /* *_all forms drop the rank argument of win_flush_local / win_flush / win_unlock */
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_unlock_all(MPIR_Win *
+                                                            win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_create_dynamic(MPIR_Info * info, MPIR_Comm * comm,
+                                                                MPIR_Win **
+                                                                win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_rget(void *origin_addr, int origin_count,
+                                                  MPI_Datatype origin_datatype, int target_rank,
+                                                  MPI_Aint target_disp, int target_count,
+                                                  MPI_Datatype target_datatype, MPIR_Win * win,
+                                                  MPIR_Request **
+                                                  request) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_sync(MPIR_Win *
+                                                      win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_flush_all(MPIR_Win *
+                                                           win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_get_accumulate(const void *origin_addr,
+                                                            int origin_count,
+                                                            MPI_Datatype origin_datatype,
+                                                            void *result_addr, int result_count,
+                                                            MPI_Datatype result_datatype,
+                                                            int target_rank, MPI_Aint target_disp,
+                                                            int target_count,
+                                                            MPI_Datatype target_datatype, MPI_Op op,
+                                                            MPIR_Win *
+                                                            win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_lock_all(int assert,
+                                                          MPIR_Win *
+                                                          win) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_barrier(MPIR_Comm * comm,
+                                                     MPIR_Errflag_t *
+                                                     errflag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_bcast(void *buffer, int count, MPI_Datatype datatype,
+                                                   int root, MPIR_Comm * comm,
+                                                   MPIR_Errflag_t *
+                                                   errflag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_allreduce(const void *sendbuf, void *recvbuf,
+                                                       int count, MPI_Datatype datatype, MPI_Op op,
+                                                       MPIR_Comm * comm,
+                                                       MPIR_Errflag_t *
+                                                       errflag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_allgather(const void *sendbuf, int sendcount,
+                                                       MPI_Datatype sendtype, void *recvbuf,
+                                                       int recvcount, MPI_Datatype recvtype,
+                                                       MPIR_Comm * comm,
+                                                       MPIR_Errflag_t *
+                                                       errflag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_allgatherv(const void *sendbuf, int sendcount,
+                                                        MPI_Datatype sendtype, void *recvbuf,
+                                                        const int *recvcounts, const int *displs,
+                                                        MPI_Datatype recvtype, MPIR_Comm * comm,
+                                                        MPIR_Errflag_t *
+                                                        errflag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_scatter(const void *sendbuf, int sendcount,
+                                                     MPI_Datatype sendtype, void *recvbuf,
+                                                     int recvcount, MPI_Datatype recvtype, int root,
+                                                     MPIR_Comm * comm,
+                                                     MPIR_Errflag_t *
+                                                     errflag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_scatterv(const void *sendbuf, const int *sendcounts,
+                                                      const int *displs, MPI_Datatype sendtype,
+                                                      void *recvbuf, int recvcount,
+                                                      MPI_Datatype recvtype, int root,
+                                                      MPIR_Comm * comm_ptr,
+                                                      MPIR_Errflag_t *
+                                                      errflag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_gather(const void *sendbuf, int sendcount,
+                                                    MPI_Datatype sendtype, void *recvbuf,
+                                                    int recvcount, MPI_Datatype recvtype, int root,
+                                                    MPIR_Comm * comm,
+                                                    MPIR_Errflag_t *
+                                                    errflag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_gatherv(const void *sendbuf, int sendcount,
+                                                     MPI_Datatype sendtype, void *recvbuf,
+                                                     const int *recvcounts, const int *displs,
+                                                     MPI_Datatype recvtype, int root,
+                                                     MPIR_Comm * comm,
+                                                     MPIR_Errflag_t *
+                                                     errflag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_alltoall(const void *sendbuf, int sendcount,
+                                                      MPI_Datatype sendtype, void *recvbuf,
+                                                      int recvcount, MPI_Datatype recvtype,
+                                                      MPIR_Comm * comm,
+                                                      MPIR_Errflag_t *
+                                                      errflag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_alltoallv(const void *sendbuf, const int *sendcounts,
+                                                       const int *sdispls, MPI_Datatype sendtype,
+                                                       void *recvbuf, const int *recvcounts,
+                                                       const int *rdispls, MPI_Datatype recvtype,
+                                                       MPIR_Comm * comm,
+                                                       MPIR_Errflag_t *
+                                                       errflag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_alltoallw(const void *sendbuf, const int *sendcounts,
+                                                       const int *sdispls,
+                                                       const MPI_Datatype sendtypes[],
+                                                       void *recvbuf, const int *recvcounts,
+                                                       const int *rdispls,
+                                                       const MPI_Datatype recvtypes[],
+                                                       MPIR_Comm * comm,
+                                                       MPIR_Errflag_t *
+                                                       errflag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_reduce(const void *sendbuf, void *recvbuf, int count,
+                                                    MPI_Datatype datatype, MPI_Op op, int root,
+                                                    MPIR_Comm * comm_ptr,
+                                                    MPIR_Errflag_t *
+                                                    errflag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_reduce_scatter(const void *sendbuf, void *recvbuf,
+                                                            const int *recvcounts,
+                                                            MPI_Datatype datatype, MPI_Op op,
+                                                            MPIR_Comm * comm_ptr,
+                                                            MPIR_Errflag_t *
+                                                            errflag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_reduce_scatter_block(const void *sendbuf,
+                                                                  void *recvbuf, int recvcount,
+                                                                  MPI_Datatype datatype, MPI_Op op,
+                                                                  MPIR_Comm * comm_ptr,
+                                                                  MPIR_Errflag_t *
+                                                                  errflag)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_scan(const void *sendbuf, void *recvbuf, int count,
+                                                  MPI_Datatype datatype, MPI_Op op,
+                                                  MPIR_Comm * comm,
+                                                  MPIR_Errflag_t *
+                                                  errflag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_exscan(const void *sendbuf, void *recvbuf, int count,
+                                                    MPI_Datatype datatype, MPI_Op op,
+                                                    MPIR_Comm * comm,
+                                                    MPIR_Errflag_t *
+                                                    errflag) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_neighbor_allgather(const void *sendbuf, int sendcount,
+                                                                MPI_Datatype sendtype,
+                                                                void *recvbuf, int recvcount,
+                                                                MPI_Datatype recvtype,
+                                                                MPIR_Comm * comm,
+                                                                MPIR_Errflag_t *
+                                                                errflag)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_neighbor_allgatherv(const void *sendbuf, int sendcount,
+                                                                 MPI_Datatype sendtype,
+                                                                 void *recvbuf,
+                                                                 const int *recvcounts,
+                                                                 const int *displs,
+                                                                 MPI_Datatype recvtype,
+                                                                 MPIR_Comm * comm,
+                                                                 MPIR_Errflag_t *
+                                                                 errflag)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_neighbor_alltoallv(const void *sendbuf,
+                                                                const int *sendcounts,
+                                                                const int *sdispls,
+                                                                MPI_Datatype sendtype,
+                                                                void *recvbuf,
+                                                                const int *recvcounts,
+                                                                const int *rdispls,
+                                                                MPI_Datatype recvtype,
+                                                                MPIR_Comm * comm,
+                                                                MPIR_Errflag_t *
+                                                                errflag)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_neighbor_alltoallw(const void *sendbuf,
+                                                                const int *sendcounts,
+                                                                const MPI_Aint * sdispls,
+                                                                const MPI_Datatype * sendtypes,
+                                                                void *recvbuf,
+                                                                const int *recvcounts,
+                                                                const MPI_Aint * rdispls,
+                                                                const MPI_Datatype * recvtypes,
+                                                                MPIR_Comm * comm,
+                                                                MPIR_Errflag_t *
+                                                                errflag)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_neighbor_alltoall(const void *sendbuf, int sendcount,
+                                                               MPI_Datatype sendtype, void *recvbuf,
+                                                               int recvcount, MPI_Datatype recvtype,
+                                                               MPIR_Comm * comm,
+                                                               MPIR_Errflag_t *
+                                                               errflag)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ineighbor_allgather(const void *sendbuf, int sendcount,
+                                                                 MPI_Datatype sendtype,
+                                                                 void *recvbuf, int recvcount,
+                                                                 MPI_Datatype recvtype,
+                                                                 MPIR_Comm * comm,
+                                                                 MPI_Request *
+                                                                 req)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ineighbor_allgatherv(const void *sendbuf,
+                                                                  int sendcount,
+                                                                  MPI_Datatype sendtype,
+                                                                  void *recvbuf,
+                                                                  const int *recvcounts,
+                                                                  const int *displs,
+                                                                  MPI_Datatype recvtype,
+                                                                  MPIR_Comm * comm,
+                                                                  MPI_Request *
+                                                                  req)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ineighbor_alltoall(const void *sendbuf, int sendcount,
+                                                                MPI_Datatype sendtype,
+                                                                void *recvbuf, int recvcount,
+                                                                MPI_Datatype recvtype,
+                                                                MPIR_Comm * comm,
+                                                                MPI_Request *
+                                                                req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ineighbor_alltoallv(const void *sendbuf,
+                                                                 const int *sendcounts,
+                                                                 const int *sdispls,
+                                                                 MPI_Datatype sendtype,
+                                                                 void *recvbuf,
+                                                                 const int *recvcounts,
+                                                                 const int *rdispls,
+                                                                 MPI_Datatype recvtype,
+                                                                 MPIR_Comm * comm,
+                                                                 MPI_Request *
+                                                                 req)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ineighbor_alltoallw(const void *sendbuf,
+                                                                 const int *sendcounts,
+                                                                 const MPI_Aint * sdispls,
+                                                                 const MPI_Datatype * sendtypes,
+                                                                 void *recvbuf,
+                                                                 const int *recvcounts,
+                                                                 const MPI_Aint * rdispls,
+                                                                 const MPI_Datatype * recvtypes,
+                                                                 MPIR_Comm * comm,
+                                                                 MPI_Request *
+                                                                 req)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ibarrier(MPIR_Comm * comm,
+                                                      MPI_Request *
+                                                      req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ibcast(void *buffer, int count, MPI_Datatype datatype,
+                                                    int root, MPIR_Comm * comm,
+                                                    MPI_Request *
+                                                    req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_iallgather(const void *sendbuf, int sendcount,
+                                                        MPI_Datatype sendtype, void *recvbuf,
+                                                        int recvcount, MPI_Datatype recvtype,
+                                                        MPIR_Comm * comm,
+                                                        MPI_Request *
+                                                        req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_iallgatherv(const void *sendbuf, int sendcount,
+                                                         MPI_Datatype sendtype, void *recvbuf,
+                                                         const int *recvcounts, const int *displs,
+                                                         MPI_Datatype recvtype, MPIR_Comm * comm,
+                                                         MPI_Request *
+                                                         req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_iallreduce(const void *sendbuf, void *recvbuf,
+                                                        int count, MPI_Datatype datatype, MPI_Op op,
+                                                        MPIR_Comm * comm,
+                                                        MPI_Request *
+                                                        req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ialltoall(const void *sendbuf, int sendcount,
+                                                       MPI_Datatype sendtype, void *recvbuf,
+                                                       int recvcount, MPI_Datatype recvtype,
+                                                       MPIR_Comm * comm,
+                                                       MPI_Request *
+                                                       req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ialltoallv(const void *sendbuf, const int *sendcounts,
+                                                        const int *sdispls, MPI_Datatype sendtype,
+                                                        void *recvbuf, const int *recvcounts,
+                                                        const int *rdispls, MPI_Datatype recvtype,
+                                                        MPIR_Comm * comm,
+                                                        MPI_Request *
+                                                        req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ialltoallw(const void *sendbuf, const int *sendcounts,
+                                                        const int *sdispls,
+                                                        const MPI_Datatype sendtypes[],
+                                                        void *recvbuf, const int *recvcounts,
+                                                        const int *rdispls,
+                                                        const MPI_Datatype recvtypes[],
+                                                        MPIR_Comm * comm,
+                                                        MPI_Request *
+                                                        req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_iexscan(const void *sendbuf, void *recvbuf, int count,
+                                                     MPI_Datatype datatype, MPI_Op op,
+                                                     MPIR_Comm * comm,
+                                                     MPI_Request *
+                                                     req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_igather(const void *sendbuf, int sendcount,
+                                                     MPI_Datatype sendtype, void *recvbuf,
+                                                     int recvcount, MPI_Datatype recvtype, int root,
+                                                     MPIR_Comm * comm,
+                                                     MPI_Request *
+                                                     req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_igatherv(const void *sendbuf, int sendcount,
+                                                      MPI_Datatype sendtype, void *recvbuf,
+                                                      const int *recvcounts, const int *displs,
+                                                      MPI_Datatype recvtype, int root,
+                                                      MPIR_Comm * comm,
+                                                      MPI_Request *
+                                                      req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ireduce_scatter_block(const void *sendbuf,
+                                                                   void *recvbuf, int recvcount,
+                                                                   MPI_Datatype datatype, MPI_Op op,
+                                                                   MPIR_Comm * comm,
+                                                                   MPI_Request *
+                                                                   req)
+    MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ireduce_scatter(const void *sendbuf, void *recvbuf,
+                                                             const int *recvcounts,
+                                                             MPI_Datatype datatype, MPI_Op op,
+                                                             MPIR_Comm * comm,
+                                                             MPI_Request *
+                                                             req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ireduce(const void *sendbuf, void *recvbuf, int count,
+                                                     MPI_Datatype datatype, MPI_Op op, int root,
+                                                     MPIR_Comm * comm_ptr,
+                                                     MPI_Request *
+                                                     req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_iscan(const void *sendbuf, void *recvbuf, int count,
+                                                   MPI_Datatype datatype, MPI_Op op,
+                                                   MPIR_Comm * comm,
+                                                   MPI_Request *
+                                                   req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_iscatter(const void *sendbuf, int sendcount,
+                                                      MPI_Datatype sendtype, void *recvbuf,
+                                                      int recvcount, MPI_Datatype recvtype,
+                                                      int root, MPIR_Comm * comm,
+                                                      MPI_Request *
+                                                      req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_iscatterv(const void *sendbuf, const int *sendcounts,
+                                                       const int *displs, MPI_Datatype sendtype,
+                                                       void *recvbuf, int recvcount,
+                                                       MPI_Datatype recvtype, int root,
+                                                       MPIR_Comm * comm_ptr,
+                                                       MPI_Request *
+                                                       req) MPIDI_SHM_STATIC_INLINE_SUFFIX;
+
+#endif /* SHM_PROTOTYPES_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/include/shm_impl.h b/src/mpid/ch4/shm/include/shm_impl.h
new file mode 100644
index 0000000..a6d0baa
--- /dev/null
+++ b/src/mpid/ch4/shm/include/shm_impl.h
@@ -0,0 +1,1109 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+/* ch4 shm functions */
+#ifndef SHM_IMPL_PROTOTYPES_H_INCLUDED
+#define SHM_IMPL_PROTOTYPES_H_INCLUDED
+
+#ifndef SHM_DIRECT
+#ifndef SHM_DISABLE_INLINES
+
+#ifndef MPIDI_SHM_STATIC_INLINE_PREFIX
+#define MPIDI_SHM_STATIC_INLINE_PREFIX __attribute__((always_inline)) static inline
+#endif
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_init(int rank, int size)
+{
+    return MPIDI_SHM_func->init(rank, size);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_finalize(void)
+{
+    return MPIDI_SHM_func->finalize();
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_progress(int blocking)
+{
+    return MPIDI_SHM_func->progress(blocking);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_reg_hdr_handler(int handler_id,
+                                                             MPIDI_SHM_am_origin_handler_fn
+                                                             origin_handler_fn,
+                                                             MPIDI_SHM_am_target_handler_fn
+                                                             target_handler_fn)
+{
+    return MPIDI_SHM_func->reg_hdr_handler(handler_id, origin_handler_fn, target_handler_fn);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_comm_connect(const char *port_name, MPIR_Info * info,
+                                                          int root, MPIR_Comm * comm,
+                                                          MPIR_Comm ** newcomm_ptr)
+{
+    return MPIDI_SHM_func->comm_connect(port_name, info, root, comm, newcomm_ptr);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_comm_disconnect(MPIR_Comm * comm_ptr)
+{
+    return MPIDI_SHM_func->comm_disconnect(comm_ptr);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_open_port(MPIR_Info * info_ptr, char *port_name)
+{
+    return MPIDI_SHM_func->open_port(info_ptr, port_name);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_close_port(const char *port_name)
+{
+    return MPIDI_SHM_func->close_port(port_name);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_comm_accept(const char *port_name, MPIR_Info * info,
+                                                         int root, MPIR_Comm * comm,
+                                                         MPIR_Comm ** newcomm_ptr)
+{
+    return MPIDI_SHM_func->comm_accept(port_name, info, root, comm, newcomm_ptr);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_send_am_hdr(int rank, MPIR_Comm * comm, int handler_id,
+                                                         const void *am_hdr, size_t am_hdr_sz,
+                                                         MPIR_Request * sreq, void *shm_context)
+{
+    return MPIDI_SHM_func->send_am_hdr(rank, comm, handler_id, am_hdr, am_hdr_sz, sreq,
+                                       shm_context);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_inject_am_hdr(int rank, MPIR_Comm * comm,
+                                                           int handler_id, const void *am_hdr,
+                                                           size_t am_hdr_sz, void *shm_context)
+{
+    return MPIDI_SHM_func->inject_am_hdr(rank, comm, handler_id, am_hdr, am_hdr_sz, shm_context);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_send_am(int rank, MPIR_Comm * comm, int handler_id,
+                                                     const void *am_hdr, size_t am_hdr_sz,
+                                                     const void *data, MPI_Count count,
+                                                     MPI_Datatype datatype, MPIR_Request * sreq,
+                                                     void *shm_context)
+{
+    return MPIDI_SHM_func->send_am(rank, comm, handler_id, am_hdr, am_hdr_sz, data, count, datatype,
+                                   sreq, shm_context);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_inject_am(int rank, MPIR_Comm * comm, int handler_id,
+                                                       const void *am_hdr, size_t am_hdr_sz,
+                                                       const void *data, MPI_Count count,
+                                                       MPI_Datatype datatype, void *shm_context)
+{
+    return MPIDI_SHM_func->inject_am(rank, comm, handler_id, am_hdr, am_hdr_sz, data, count,
+                                     datatype, shm_context);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_send_amv(int rank, MPIR_Comm * comm, int handler_id,
+                                                      struct iovec *am_hdrs, size_t iov_len,
+                                                      const void *data, MPI_Count count,
+                                                      MPI_Datatype datatype, MPIR_Request * sreq,
+                                                      void *shm_context)
+{
+    return MPIDI_SHM_func->send_amv(rank, comm, handler_id, am_hdrs, iov_len, data, count, datatype,
+                                    sreq, shm_context);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_inject_amv(int rank, MPIR_Comm * comm, int handler_id,
+                                                        struct iovec *am_hdrs, size_t iov_len,
+                                                        const void *data, MPI_Count count,
+                                                        MPI_Datatype datatype, void *shm_context)
+{
+    return MPIDI_SHM_func->inject_amv(rank, comm, handler_id, am_hdrs, iov_len, data, count,
+                                      datatype, shm_context);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_send_am_hdr_reply(MPIR_Context_id_t context_id,
+                                                               int src_rank, int handler_id,
+                                                               const void *am_hdr, size_t am_hdr_sz,
+                                                               MPIR_Request * sreq)
+{
+    return MPIDI_SHM_func->send_am_hdr_reply(context_id, src_rank, handler_id, am_hdr, am_hdr_sz,
+                                             sreq);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_inject_am_hdr_reply(MPIR_Context_id_t context_id,
+                                                                 int src_rank, int handler_id,
+                                                                 const void *am_hdr,
+                                                                 size_t am_hdr_sz)
+{
+    return MPIDI_SHM_func->inject_am_hdr_reply(context_id, src_rank, handler_id, am_hdr, am_hdr_sz);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_send_am_reply(MPIR_Context_id_t context_id,
+                                                           int src_rank, int handler_id,
+                                                           const void *am_hdr, size_t am_hdr_sz,
+                                                           const void *data, MPI_Count count,
+                                                           MPI_Datatype datatype,
+                                                           MPIR_Request * sreq)
+{
+    return MPIDI_SHM_func->send_am_reply(context_id, src_rank, handler_id, am_hdr, am_hdr_sz, data,
+                                         count, datatype, sreq);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_inject_am_reply(MPIR_Context_id_t context_id,
+                                                             int src_rank, int handler_id,
+                                                             const void *am_hdr, size_t am_hdr_sz,
+                                                             const void *data, MPI_Count count,
+                                                             MPI_Datatype datatype)
+{
+    return MPIDI_SHM_func->inject_am_reply(context_id, src_rank, handler_id, am_hdr, am_hdr_sz,
+                                           data, count, datatype);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_send_amv_reply(MPIR_Context_id_t context_id,
+                                                            int src_rank, int handler_id,
+                                                            struct iovec *am_hdr, size_t iov_len,
+                                                            const void *data, MPI_Count count,
+                                                            MPI_Datatype datatype,
+                                                            MPIR_Request * sreq)
+{
+    return MPIDI_SHM_func->send_amv_reply(context_id, src_rank, handler_id, am_hdr, iov_len, data,
+                                          count, datatype, sreq);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_inject_amv_reply(MPIR_Context_id_t context_id,
+                                                              int src_rank, int handler_id,
+                                                              struct iovec *am_hdrs, size_t iov_len,
+                                                              const void *data, MPI_Count count,
+                                                              MPI_Datatype datatype)
+{
+    return MPIDI_SHM_func->inject_amv_reply(context_id, src_rank, handler_id, am_hdrs, iov_len,
+                                            data, count, datatype);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX size_t MPIDI_SHM_am_hdr_max_sz(void)
+{
+    return MPIDI_SHM_func->am_hdr_max_sz();
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX size_t MPIDI_SHM_am_inject_max_sz(void)
+{
+    return MPIDI_SHM_func->am_inject_max_sz();
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_am_recv(MPIR_Request * req)
+{
+    return MPIDI_SHM_func->am_recv(req);        /* req must reach the table entry */
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_comm_get_lpid(MPIR_Comm * comm_ptr, int idx,
+                                                           int *lpid_ptr, MPL_bool is_remote)
+{
+    return MPIDI_SHM_func->comm_get_lpid(comm_ptr, idx, lpid_ptr, is_remote);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_gpid_get(MPIR_Comm * comm_ptr, int rank,
+                                                      MPIR_Gpid * gpid)
+{
+    return MPIDI_SHM_func->gpid_get(comm_ptr, rank, gpid);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_get_node_id(MPIR_Comm * comm, int rank,
+                                                         MPID_Node_id_t * id_p)
+{
+    return MPIDI_SHM_func->get_node_id(comm, rank, id_p);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_get_max_node_id(MPIR_Comm * comm,
+                                                             MPID_Node_id_t * max_id_p)
+{
+    return MPIDI_SHM_func->get_max_node_id(comm, max_id_p);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_getallincomm(MPIR_Comm * comm_ptr, int local_size,
+                                                          MPIR_Gpid local_gpid[], int *singleAVT)
+{
+    return MPIDI_SHM_func->getallincomm(comm_ptr, local_size, local_gpid, singleAVT);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_gpid_tolpidarray(int size, MPIR_Gpid gpid[],
+                                                              int lpid[])
+{
+    return MPIDI_SHM_func->gpid_tolpidarray(size, gpid, lpid);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr,
+                                                                         int size,
+                                                                         const int lpids[])
+{
+    return MPIDI_SHM_func->create_intercomm_from_lpids(newcomm_ptr, size, lpids);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_comm_create(MPIR_Comm * comm)
+{
+    return MPIDI_SHM_func->comm_create(comm);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_comm_destroy(MPIR_Comm * comm)
+{
+    return MPIDI_SHM_func->comm_destroy(comm);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX void MPIDI_SHM_am_request_init(MPIR_Request * req)
+{
+    MPIDI_SHM_func->am_request_init(req);       /* void wrapper: call only, no value to return */
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX void MPIDI_SHM_am_request_finalize(MPIR_Request * req)
+{
+    MPIDI_SHM_func->am_request_finalize(req);   /* void wrapper: call only, no value to return */
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_send(const void *buf, int count, MPI_Datatype datatype,
+                                                  int rank, int tag, MPIR_Comm * comm,
+                                                  int context_offset, MPIR_Request ** request)
+{
+    return MPIDI_SHM_native_func->send(buf, count, datatype, rank, tag, comm, context_offset,
+                                       request);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ssend(const void *buf, int count,
+                                                   MPI_Datatype datatype, int rank, int tag,
+                                                   MPIR_Comm * comm, int context_offset,
+                                                   MPIR_Request ** request)
+{
+    return MPIDI_SHM_native_func->ssend(buf, count, datatype, rank, tag, comm, context_offset,
+                                        request);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_startall(int count, MPIR_Request * requests[])
+{
+    return MPIDI_SHM_native_func->startall(count, requests);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_send_init(const void *buf, int count,
+                                                       MPI_Datatype datatype, int rank, int tag,
+                                                       MPIR_Comm * comm, int context_offset,
+                                                       MPIR_Request ** request)
+{
+    return MPIDI_SHM_native_func->send_init(buf, count, datatype, rank, tag, comm, context_offset,
+                                            request);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ssend_init(const void *buf, int count,
+                                                        MPI_Datatype datatype, int rank, int tag,
+                                                        MPIR_Comm * comm, int context_offset,
+                                                        MPIR_Request ** request)
+{
+    return MPIDI_SHM_native_func->ssend_init(buf, count, datatype, rank, tag, comm, context_offset,
+                                             request);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_rsend_init(const void *buf, int count,
+                                                        MPI_Datatype datatype, int rank, int tag,
+                                                        MPIR_Comm * comm, int context_offset,
+                                                        MPIR_Request ** request)
+{
+    return MPIDI_SHM_native_func->rsend_init(buf, count, datatype, rank, tag, comm, context_offset,
+                                             request);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_bsend_init(const void *buf, int count,
+                                                        MPI_Datatype datatype, int rank, int tag,
+                                                        MPIR_Comm * comm, int context_offset,
+                                                        MPIR_Request ** request)
+{
+    return MPIDI_SHM_native_func->bsend_init(buf, count, datatype, rank, tag, comm, context_offset,
+                                             request);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_isend(const void *buf, int count,
+                                                   MPI_Datatype datatype, int rank, int tag,
+                                                   MPIR_Comm * comm, int context_offset,
+                                                   MPIR_Request ** request)
+{
+    return MPIDI_SHM_native_func->isend(buf, count, datatype, rank, tag, comm, context_offset,
+                                        request);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_issend(const void *buf, int count,
+                                                    MPI_Datatype datatype, int rank, int tag,
+                                                    MPIR_Comm * comm, int context_offset,
+                                                    MPIR_Request ** request)
+{
+    return MPIDI_SHM_native_func->issend(buf, count, datatype, rank, tag, comm, context_offset,
+                                         request);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_cancel_send(MPIR_Request * sreq)
+{
+    return MPIDI_SHM_native_func->cancel_send(sreq);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_recv_init(void *buf, int count, MPI_Datatype datatype,
+                                                       int rank, int tag, MPIR_Comm * comm,
+                                                       int context_offset, MPIR_Request ** request)
+{
+    return MPIDI_SHM_native_func->recv_init(buf, count, datatype, rank, tag, comm, context_offset,
+                                            request);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_recv(void *buf, int count, MPI_Datatype datatype,
+                                                  int rank, int tag, MPIR_Comm * comm,
+                                                  int context_offset, MPI_Status * status,
+                                                  MPIR_Request ** request)
+{
+    return MPIDI_SHM_native_func->recv(buf, count, datatype, rank, tag, comm, context_offset,
+                                       status, request);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_irecv(void *buf, int count, MPI_Datatype datatype,
+                                                   int rank, int tag, MPIR_Comm * comm,
+                                                   int context_offset, MPIR_Request ** request)
+{
+    return MPIDI_SHM_native_func->irecv(buf, count, datatype, rank, tag, comm, context_offset,
+                                        request);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_imrecv(void *buf, int count, MPI_Datatype datatype,
+                                                    MPIR_Request * message, MPIR_Request ** rreqp)
+{
+    return MPIDI_SHM_native_func->imrecv(buf, count, datatype, message, rreqp);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_cancel_recv(MPIR_Request * rreq)
+{
+    return MPIDI_SHM_native_func->cancel_recv(rreq);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX void *MPIDI_SHM_alloc_mem(size_t size, MPIR_Info * info_ptr)
+{
+    return MPIDI_SHM_native_func->alloc_mem(size, info_ptr);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_free_mem(void *ptr)
+{
+    return MPIDI_SHM_native_func->free_mem(ptr);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_improbe(int source, int tag, MPIR_Comm * comm,
+                                                     int context_offset, int *flag,
+                                                     MPIR_Request ** message, MPI_Status * status)
+{
+    return MPIDI_SHM_native_func->improbe(source, tag, comm, context_offset, flag, message, status);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_iprobe(int source, int tag, MPIR_Comm * comm,
+                                                    int context_offset, int *flag,
+                                                    MPI_Status * status)
+{
+    return MPIDI_SHM_native_func->iprobe(source, tag, comm, context_offset, flag, status);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_set_info(MPIR_Win * win, MPIR_Info * info)
+{
+    return MPIDI_SHM_native_func->win_set_info(win, info);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_shared_query(MPIR_Win * win, int rank,
+                                                              MPI_Aint * size, int *disp_unit,
+                                                              void *baseptr)
+{
+    return MPIDI_SHM_native_func->win_shared_query(win, rank, size, disp_unit, baseptr);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_put(const void *origin_addr, int origin_count,
+                                                 MPI_Datatype origin_datatype, int target_rank,
+                                                 MPI_Aint target_disp, int target_count,
+                                                 MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->put(origin_addr, origin_count, origin_datatype, target_rank,
+                                      target_disp, target_count, target_datatype, win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_start(MPIR_Group * group, int assert,
+                                                       MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->win_start(group, assert, win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_complete(MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->win_complete(win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_post(MPIR_Group * group, int assert,
+                                                      MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->win_post(group, assert, win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_wait(MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->win_wait(win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_test(MPIR_Win * win, int *flag)
+{
+    return MPIDI_SHM_native_func->win_test(win, flag);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_lock(int lock_type, int rank, int assert,
+                                                      MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->win_lock(lock_type, rank, assert, win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_unlock(int rank, MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->win_unlock(rank, win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_get_info(MPIR_Win * win, MPIR_Info ** info_p_p)
+{
+    return MPIDI_SHM_native_func->win_get_info(win, info_p_p);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_get(void *origin_addr, int origin_count,
+                                                 MPI_Datatype origin_datatype, int target_rank,
+                                                 MPI_Aint target_disp, int target_count,
+                                                 MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->get(origin_addr, origin_count, origin_datatype, target_rank,
+                                      target_disp, target_count, target_datatype, win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_free(MPIR_Win ** win_ptr)
+{
+    return MPIDI_SHM_native_func->win_free(win_ptr);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_fence(int assert, MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->win_fence(assert, win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_create(void *base, MPI_Aint length, int disp_unit,
+                                                        MPIR_Info * info, MPIR_Comm * comm_ptr,
+                                                        MPIR_Win ** win_ptr)
+{
+    return MPIDI_SHM_native_func->win_create(base, length, disp_unit, info, comm_ptr, win_ptr);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_accumulate(const void *origin_addr, int origin_count,
+                                                        MPI_Datatype origin_datatype,
+                                                        int target_rank, MPI_Aint target_disp,
+                                                        int target_count,
+                                                        MPI_Datatype target_datatype, MPI_Op op,
+                                                        MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->accumulate(origin_addr, origin_count, origin_datatype,
+                                             target_rank, target_disp, target_count,
+                                             target_datatype, op, win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_attach(MPIR_Win * win, void *base, MPI_Aint size)
+{
+    return MPIDI_SHM_native_func->win_attach(win, base, size);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_allocate_shared(MPI_Aint size, int disp_unit,
+                                                                 MPIR_Info * info_ptr,
+                                                                 MPIR_Comm * comm_ptr,
+                                                                 void **base_ptr,
+                                                                 MPIR_Win ** win_ptr)
+{
+    return MPIDI_SHM_native_func->win_allocate_shared(size, disp_unit, info_ptr, comm_ptr, base_ptr,
+                                                      win_ptr);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_rput(const void *origin_addr, int origin_count,
+                                                  MPI_Datatype origin_datatype, int target_rank,
+                                                  MPI_Aint target_disp, int target_count,
+                                                  MPI_Datatype target_datatype, MPIR_Win * win,
+                                                  MPIR_Request ** request)
+{
+    return MPIDI_SHM_native_func->rput(origin_addr, origin_count, origin_datatype, target_rank,
+                                       target_disp, target_count, target_datatype, win, request);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_flush_local(int rank, MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->win_flush_local(rank, win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_detach(MPIR_Win * win, const void *base)
+{
+    return MPIDI_SHM_native_func->win_detach(win, base);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_compare_and_swap(const void *origin_addr,
+                                                              const void *compare_addr,
+                                                              void *result_addr,
+                                                              MPI_Datatype datatype,
+                                                              int target_rank, MPI_Aint target_disp,
+                                                              MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->compare_and_swap(origin_addr, compare_addr, result_addr, datatype,
+                                                   target_rank, target_disp, win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_raccumulate(const void *origin_addr, int origin_count,
+                                                         MPI_Datatype origin_datatype,
+                                                         int target_rank, MPI_Aint target_disp,
+                                                         int target_count,
+                                                         MPI_Datatype target_datatype, MPI_Op op,
+                                                         MPIR_Win * win, MPIR_Request ** request)
+{
+    return MPIDI_SHM_native_func->raccumulate(origin_addr, origin_count, origin_datatype,
+                                              target_rank, target_disp, target_count,
+                                              target_datatype, op, win, request);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_rget_accumulate(const void *origin_addr,
+                                                             int origin_count,
+                                                             MPI_Datatype origin_datatype,
+                                                             void *result_addr, int result_count,
+                                                             MPI_Datatype result_datatype,
+                                                             int target_rank, MPI_Aint target_disp,
+                                                             int target_count,
+                                                             MPI_Datatype target_datatype,
+                                                             MPI_Op op, MPIR_Win * win,
+                                                             MPIR_Request ** request)
+{
+    return MPIDI_SHM_native_func->rget_accumulate(origin_addr, origin_count, origin_datatype,
+                                                  result_addr, result_count, result_datatype,
+                                                  target_rank, target_disp, target_count,
+                                                  target_datatype, op, win, request);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_fetch_and_op(const void *origin_addr,
+                                                          void *result_addr, MPI_Datatype datatype,
+                                                          int target_rank, MPI_Aint target_disp,
+                                                          MPI_Op op, MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->fetch_and_op(origin_addr, result_addr, datatype, target_rank,
+                                               target_disp, op, win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_allocate(MPI_Aint size, int disp_unit,
+                                                          MPIR_Info * info, MPIR_Comm * comm,
+                                                          void *baseptr, MPIR_Win ** win)
+{
+    return MPIDI_SHM_native_func->win_allocate(size, disp_unit, info, comm, baseptr, win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_flush(int rank, MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->win_flush(rank, win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_flush_local_all(MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->win_flush_local_all(win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_unlock_all(MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->win_unlock_all(win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_create_dynamic(MPIR_Info * info, MPIR_Comm * comm,
+                                                                MPIR_Win ** win)
+{
+    return MPIDI_SHM_native_func->win_create_dynamic(info, comm, win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_rget(void *origin_addr, int origin_count,
+                                                  MPI_Datatype origin_datatype, int target_rank,
+                                                  MPI_Aint target_disp, int target_count,
+                                                  MPI_Datatype target_datatype, MPIR_Win * win,
+                                                  MPIR_Request ** request)
+{
+    return MPIDI_SHM_native_func->rget(origin_addr, origin_count, origin_datatype, target_rank,
+                                       target_disp, target_count, target_datatype, win, request);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_sync(MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->win_sync(win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_flush_all(MPIR_Win * win)
+{
+    return MPIDI_SHM_native_func->win_flush_all(win);
+};
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_get_accumulate(const void *origin_addr,
+                                                            int origin_count,
+                                                            MPI_Datatype origin_datatype,
+                                                            void *result_addr, int result_count,
+                                                            MPI_Datatype result_datatype,
+                                                            int target_rank, MPI_Aint target_disp,
+                                                            int target_count,
+                                                            MPI_Datatype target_datatype, MPI_Op op,
+                                                            MPIR_Win * win)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->get_accumulate. */
+    return MPIDI_SHM_native_func->get_accumulate(origin_addr, origin_count, origin_datatype,
+                                                 result_addr, result_count, result_datatype,
+                                                 target_rank, target_disp, target_count,
+                                                 target_datatype, op, win);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_win_lock_all(int assert, MPIR_Win * win)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->win_lock_all. */
+    return MPIDI_SHM_native_func->win_lock_all(assert, win);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_barrier(MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->barrier. */
+    return MPIDI_SHM_native_func->barrier(comm, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_bcast(void *buffer, int count, MPI_Datatype datatype,
+                                                   int root, MPIR_Comm * comm,
+                                                   MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->bcast. */
+    return MPIDI_SHM_native_func->bcast(buffer, count, datatype, root, comm, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_allreduce(const void *sendbuf, void *recvbuf,
+                                                       int count, MPI_Datatype datatype, MPI_Op op,
+                                                       MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->allreduce. */
+    return MPIDI_SHM_native_func->allreduce(sendbuf, recvbuf, count, datatype, op, comm, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_allgather(const void *sendbuf, int sendcount,
+                                                       MPI_Datatype sendtype, void *recvbuf,
+                                                       int recvcount, MPI_Datatype recvtype,
+                                                       MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->allgather. */
+    return MPIDI_SHM_native_func->allgather(sendbuf, sendcount, sendtype, recvbuf, recvcount,
+                                            recvtype, comm, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_allgatherv(const void *sendbuf, int sendcount,
+                                                        MPI_Datatype sendtype, void *recvbuf,
+                                                        const int *recvcounts, const int *displs,
+                                                        MPI_Datatype recvtype, MPIR_Comm * comm,
+                                                        MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->allgatherv. */
+    return MPIDI_SHM_native_func->allgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts,
+                                             displs, recvtype, comm, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_scatter(const void *sendbuf, int sendcount,
+                                                     MPI_Datatype sendtype, void *recvbuf,
+                                                     int recvcount, MPI_Datatype recvtype, int root,
+                                                     MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->scatter. */
+    return MPIDI_SHM_native_func->scatter(sendbuf, sendcount, sendtype, recvbuf, recvcount,
+                                          recvtype, root, comm, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_scatterv(const void *sendbuf, const int *sendcounts,
+                                                      const int *displs, MPI_Datatype sendtype,
+                                                      void *recvbuf, int recvcount,
+                                                      MPI_Datatype recvtype, int root,
+                                                      MPIR_Comm * comm_ptr,
+                                                      MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->scatterv. */
+    return MPIDI_SHM_native_func->scatterv(sendbuf, sendcounts, displs, sendtype, recvbuf,
+                                           recvcount, recvtype, root, comm_ptr, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_gather(const void *sendbuf, int sendcount,
+                                                    MPI_Datatype sendtype, void *recvbuf,
+                                                    int recvcount, MPI_Datatype recvtype, int root,
+                                                    MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->gather. */
+    return MPIDI_SHM_native_func->gather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
+                                         root, comm, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_gatherv(const void *sendbuf, int sendcount,
+                                                     MPI_Datatype sendtype, void *recvbuf,
+                                                     const int *recvcounts, const int *displs,
+                                                     MPI_Datatype recvtype, int root,
+                                                     MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->gatherv. */
+    return MPIDI_SHM_native_func->gatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs,
+                                          recvtype, root, comm, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_alltoall(const void *sendbuf, int sendcount,
+                                                      MPI_Datatype sendtype, void *recvbuf,
+                                                      int recvcount, MPI_Datatype recvtype,
+                                                      MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->alltoall. */
+    return MPIDI_SHM_native_func->alltoall(sendbuf, sendcount, sendtype, recvbuf, recvcount,
+                                           recvtype, comm, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_alltoallv(const void *sendbuf, const int *sendcounts,
+                                                       const int *sdispls, MPI_Datatype sendtype,
+                                                       void *recvbuf, const int *recvcounts,
+                                                       const int *rdispls, MPI_Datatype recvtype,
+                                                       MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->alltoallv. */
+    return MPIDI_SHM_native_func->alltoallv(sendbuf, sendcounts, sdispls, sendtype, recvbuf,
+                                            recvcounts, rdispls, recvtype, comm, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_alltoallw(const void *sendbuf, const int *sendcounts,
+                                                       const int *sdispls,
+                                                       const MPI_Datatype sendtypes[],
+                                                       void *recvbuf, const int *recvcounts,
+                                                       const int *rdispls,
+                                                       const MPI_Datatype recvtypes[],
+                                                       MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->alltoallw. */
+    return MPIDI_SHM_native_func->alltoallw(sendbuf, sendcounts, sdispls, sendtypes, recvbuf,
+                                            recvcounts, rdispls, recvtypes, comm, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_reduce(const void *sendbuf, void *recvbuf, int count,
+                                                    MPI_Datatype datatype, MPI_Op op, int root,
+                                                    MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->reduce. */
+    return MPIDI_SHM_native_func->reduce(sendbuf, recvbuf, count, datatype, op, root, comm_ptr,
+                                         errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_reduce_scatter(const void *sendbuf, void *recvbuf,
+                                                            const int *recvcounts,
+                                                            MPI_Datatype datatype, MPI_Op op,
+                                                            MPIR_Comm * comm_ptr,
+                                                            MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->reduce_scatter. */
+    return MPIDI_SHM_native_func->reduce_scatter(sendbuf, recvbuf, recvcounts, datatype, op,
+                                                 comm_ptr, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_reduce_scatter_block(const void *sendbuf,
+                                                                  void *recvbuf, int recvcount,
+                                                                  MPI_Datatype datatype, MPI_Op op,
+                                                                  MPIR_Comm * comm_ptr,
+                                                                  MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->reduce_scatter_block. */
+    return MPIDI_SHM_native_func->reduce_scatter_block(sendbuf, recvbuf, recvcount, datatype, op,
+                                                       comm_ptr, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_scan(const void *sendbuf, void *recvbuf, int count,
+                                                  MPI_Datatype datatype, MPI_Op op,
+                                                  MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->scan. */
+    return MPIDI_SHM_native_func->scan(sendbuf, recvbuf, count, datatype, op, comm, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_exscan(const void *sendbuf, void *recvbuf, int count,
+                                                    MPI_Datatype datatype, MPI_Op op,
+                                                    MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->exscan. */
+    return MPIDI_SHM_native_func->exscan(sendbuf, recvbuf, count, datatype, op, comm, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_neighbor_allgather(const void *sendbuf, int sendcount,
+                                                                MPI_Datatype sendtype,
+                                                                void *recvbuf, int recvcount,
+                                                                MPI_Datatype recvtype,
+                                                                MPIR_Comm * comm,
+                                                                MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->neighbor_allgather. */
+    return MPIDI_SHM_native_func->neighbor_allgather(sendbuf, sendcount, sendtype, recvbuf,
+                                                     recvcount, recvtype, comm, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_neighbor_allgatherv(const void *sendbuf, int sendcount,
+                                                                 MPI_Datatype sendtype,
+                                                                 void *recvbuf,
+                                                                 const int *recvcounts,
+                                                                 const int *displs,
+                                                                 MPI_Datatype recvtype,
+                                                                 MPIR_Comm * comm,
+                                                                 MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->neighbor_allgatherv. */
+    return MPIDI_SHM_native_func->neighbor_allgatherv(sendbuf, sendcount, sendtype, recvbuf,
+                                                      recvcounts, displs, recvtype, comm, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_neighbor_alltoallv(const void *sendbuf,
+                                                                const int *sendcounts,
+                                                                const int *sdispls,
+                                                                MPI_Datatype sendtype,
+                                                                void *recvbuf,
+                                                                const int *recvcounts,
+                                                                const int *rdispls,
+                                                                MPI_Datatype recvtype,
+                                                                MPIR_Comm * comm,
+                                                                MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->neighbor_alltoallv. */
+    return MPIDI_SHM_native_func->neighbor_alltoallv(sendbuf, sendcounts, sdispls, sendtype,
+                                                     recvbuf, recvcounts, rdispls, recvtype, comm,
+                                                     errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_neighbor_alltoallw(const void *sendbuf,
+                                                                const int *sendcounts,
+                                                                const MPI_Aint * sdispls,
+                                                                const MPI_Datatype * sendtypes,
+                                                                void *recvbuf,
+                                                                const int *recvcounts,
+                                                                const MPI_Aint * rdispls,
+                                                                const MPI_Datatype * recvtypes,
+                                                                MPIR_Comm * comm,
+                                                                MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->neighbor_alltoallw. */
+    return MPIDI_SHM_native_func->neighbor_alltoallw(sendbuf, sendcounts, sdispls, sendtypes,
+                                                     recvbuf, recvcounts, rdispls, recvtypes, comm,
+                                                     errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_neighbor_alltoall(const void *sendbuf, int sendcount,
+                                                               MPI_Datatype sendtype, void *recvbuf,
+                                                               int recvcount, MPI_Datatype recvtype,
+                                                               MPIR_Comm * comm,
+                                                               MPIR_Errflag_t * errflag)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->neighbor_alltoall. */
+    return MPIDI_SHM_native_func->neighbor_alltoall(sendbuf, sendcount, sendtype, recvbuf,
+                                                    recvcount, recvtype, comm, errflag);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ineighbor_allgather(const void *sendbuf, int sendcount,
+                                                                 MPI_Datatype sendtype,
+                                                                 void *recvbuf, int recvcount,
+                                                                 MPI_Datatype recvtype,
+                                                                 MPIR_Comm * comm,
+                                                                 MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->ineighbor_allgather. */
+    return MPIDI_SHM_native_func->ineighbor_allgather(sendbuf, sendcount, sendtype, recvbuf,
+                                                      recvcount, recvtype, comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ineighbor_allgatherv(const void *sendbuf,
+                                                                  int sendcount,
+                                                                  MPI_Datatype sendtype,
+                                                                  void *recvbuf,
+                                                                  const int *recvcounts,
+                                                                  const int *displs,
+                                                                  MPI_Datatype recvtype,
+                                                                  MPIR_Comm * comm,
+                                                                  MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->ineighbor_allgatherv. */
+    return MPIDI_SHM_native_func->ineighbor_allgatherv(sendbuf, sendcount, sendtype, recvbuf,
+                                                       recvcounts, displs, recvtype, comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ineighbor_alltoall(const void *sendbuf, int sendcount,
+                                                                MPI_Datatype sendtype,
+                                                                void *recvbuf, int recvcount,
+                                                                MPI_Datatype recvtype,
+                                                                MPIR_Comm * comm, MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->ineighbor_alltoall. */
+    return MPIDI_SHM_native_func->ineighbor_alltoall(sendbuf, sendcount, sendtype, recvbuf,
+                                                     recvcount, recvtype, comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ineighbor_alltoallv(const void *sendbuf,
+                                                                 const int *sendcounts,
+                                                                 const int *sdispls,
+                                                                 MPI_Datatype sendtype,
+                                                                 void *recvbuf,
+                                                                 const int *recvcounts,
+                                                                 const int *rdispls,
+                                                                 MPI_Datatype recvtype,
+                                                                 MPIR_Comm * comm,
+                                                                 MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->ineighbor_alltoallv. */
+    return MPIDI_SHM_native_func->ineighbor_alltoallv(sendbuf, sendcounts, sdispls, sendtype,
+                                                      recvbuf, recvcounts, rdispls, recvtype, comm,
+                                                      req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ineighbor_alltoallw(const void *sendbuf,
+                                                                 const int *sendcounts,
+                                                                 const MPI_Aint * sdispls,
+                                                                 const MPI_Datatype * sendtypes,
+                                                                 void *recvbuf,
+                                                                 const int *recvcounts,
+                                                                 const MPI_Aint * rdispls,
+                                                                 const MPI_Datatype * recvtypes,
+                                                                 MPIR_Comm * comm,
+                                                                 MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->ineighbor_alltoallw. */
+    return MPIDI_SHM_native_func->ineighbor_alltoallw(sendbuf, sendcounts, sdispls, sendtypes,
+                                                      recvbuf, recvcounts, rdispls, recvtypes, comm,
+                                                      req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ibarrier(MPIR_Comm * comm, MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->ibarrier. */
+    return MPIDI_SHM_native_func->ibarrier(comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ibcast(void *buffer, int count, MPI_Datatype datatype,
+                                                    int root, MPIR_Comm * comm, MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->ibcast. */
+    return MPIDI_SHM_native_func->ibcast(buffer, count, datatype, root, comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_iallgather(const void *sendbuf, int sendcount,
+                                                        MPI_Datatype sendtype, void *recvbuf,
+                                                        int recvcount, MPI_Datatype recvtype,
+                                                        MPIR_Comm * comm, MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->iallgather. */
+    return MPIDI_SHM_native_func->iallgather(sendbuf, sendcount, sendtype, recvbuf, recvcount,
+                                             recvtype, comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_iallgatherv(const void *sendbuf, int sendcount,
+                                                         MPI_Datatype sendtype, void *recvbuf,
+                                                         const int *recvcounts, const int *displs,
+                                                         MPI_Datatype recvtype, MPIR_Comm * comm,
+                                                         MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->iallgatherv. */
+    return MPIDI_SHM_native_func->iallgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts,
+                                              displs, recvtype, comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_iallreduce(const void *sendbuf, void *recvbuf,
+                                                        int count, MPI_Datatype datatype, MPI_Op op,
+                                                        MPIR_Comm * comm, MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->iallreduce. */
+    return MPIDI_SHM_native_func->iallreduce(sendbuf, recvbuf, count, datatype, op, comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ialltoall(const void *sendbuf, int sendcount,
+                                                       MPI_Datatype sendtype, void *recvbuf,
+                                                       int recvcount, MPI_Datatype recvtype,
+                                                       MPIR_Comm * comm, MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->ialltoall. */
+    return MPIDI_SHM_native_func->ialltoall(sendbuf, sendcount, sendtype, recvbuf, recvcount,
+                                            recvtype, comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ialltoallv(const void *sendbuf, const int *sendcounts,
+                                                        const int *sdispls, MPI_Datatype sendtype,
+                                                        void *recvbuf, const int *recvcounts,
+                                                        const int *rdispls, MPI_Datatype recvtype,
+                                                        MPIR_Comm * comm, MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->ialltoallv. */
+    return MPIDI_SHM_native_func->ialltoallv(sendbuf, sendcounts, sdispls, sendtype, recvbuf,
+                                             recvcounts, rdispls, recvtype, comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ialltoallw(const void *sendbuf, const int *sendcounts,
+                                                        const int *sdispls,
+                                                        const MPI_Datatype sendtypes[],
+                                                        void *recvbuf, const int *recvcounts,
+                                                        const int *rdispls,
+                                                        const MPI_Datatype recvtypes[],
+                                                        MPIR_Comm * comm, MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->ialltoallw. */
+    return MPIDI_SHM_native_func->ialltoallw(sendbuf, sendcounts, sdispls, sendtypes, recvbuf,
+                                             recvcounts, rdispls, recvtypes, comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_iexscan(const void *sendbuf, void *recvbuf, int count,
+                                                     MPI_Datatype datatype, MPI_Op op,
+                                                     MPIR_Comm * comm, MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->iexscan. */
+    return MPIDI_SHM_native_func->iexscan(sendbuf, recvbuf, count, datatype, op, comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_igather(const void *sendbuf, int sendcount,
+                                                     MPI_Datatype sendtype, void *recvbuf,
+                                                     int recvcount, MPI_Datatype recvtype, int root,
+                                                     MPIR_Comm * comm, MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->igather. */
+    return MPIDI_SHM_native_func->igather(sendbuf, sendcount, sendtype, recvbuf, recvcount,
+                                          recvtype, root, comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_igatherv(const void *sendbuf, int sendcount,
+                                                      MPI_Datatype sendtype, void *recvbuf,
+                                                      const int *recvcounts, const int *displs,
+                                                      MPI_Datatype recvtype, int root,
+                                                      MPIR_Comm * comm, MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->igatherv. */
+    return MPIDI_SHM_native_func->igatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts,
+                                           displs, recvtype, root, comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ireduce_scatter_block(const void *sendbuf,
+                                                                   void *recvbuf, int recvcount,
+                                                                   MPI_Datatype datatype, MPI_Op op,
+                                                                   MPIR_Comm * comm,
+                                                                   MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->ireduce_scatter_block. */
+    return MPIDI_SHM_native_func->ireduce_scatter_block(sendbuf, recvbuf, recvcount, datatype, op,
+                                                        comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ireduce_scatter(const void *sendbuf, void *recvbuf,
+                                                             const int *recvcounts,
+                                                             MPI_Datatype datatype, MPI_Op op,
+                                                             MPIR_Comm * comm, MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->ireduce_scatter. */
+    return MPIDI_SHM_native_func->ireduce_scatter(sendbuf, recvbuf, recvcounts, datatype, op, comm,
+                                                  req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_ireduce(const void *sendbuf, void *recvbuf, int count,
+                                                     MPI_Datatype datatype, MPI_Op op, int root,
+                                                     MPIR_Comm * comm_ptr, MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->ireduce. */
+    return MPIDI_SHM_native_func->ireduce(sendbuf, recvbuf, count, datatype, op, root, comm_ptr,
+                                          req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_iscan(const void *sendbuf, void *recvbuf, int count,
+                                                   MPI_Datatype datatype, MPI_Op op,
+                                                   MPIR_Comm * comm, MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->iscan. */
+    return MPIDI_SHM_native_func->iscan(sendbuf, recvbuf, count, datatype, op, comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_iscatter(const void *sendbuf, int sendcount,
+                                                      MPI_Datatype sendtype, void *recvbuf,
+                                                      int recvcount, MPI_Datatype recvtype,
+                                                      int root, MPIR_Comm * comm, MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->iscatter. */
+    return MPIDI_SHM_native_func->iscatter(sendbuf, sendcount, sendtype, recvbuf, recvcount,
+                                           recvtype, root, comm, req);
+}
+
+MPIDI_SHM_STATIC_INLINE_PREFIX int MPIDI_SHM_iscatterv(const void *sendbuf, const int *sendcounts,
+                                                       const int *displs, MPI_Datatype sendtype,
+                                                       void *recvbuf, int recvcount,
+                                                       MPI_Datatype recvtype, int root,
+                                                       MPIR_Comm * comm_ptr, MPI_Request * req)
+{   /* Pass-through: dispatches to MPIDI_SHM_native_func->iscatterv. */
+    return MPIDI_SHM_native_func->iscatterv(sendbuf, sendcounts, displs, sendtype, recvbuf,
+                                            recvcount, recvtype, root, comm_ptr, req);
+}
+
+#endif /* SHM_DISABLE_INLINES  */
+
+#else
+
+#define __shm_direct_stubshm__     0    /* SHM_DIRECT value that selects ../stubshm */
+#define __shm_direct_posix__    1       /* SHM_DIRECT value that selects ../posix */
+
+#if SHM_DIRECT==__shm_direct_stubshm__  /* pull in exactly one module's shm_direct.h */
+#include "../stubshm/shm_direct.h"
+#elif SHM_DIRECT==__shm_direct_posix__
+#include "../posix/shm_direct.h"
+#else                                   /* SHM_DIRECT matched neither selector: stop the build */
+#error "No direct shm included"
+#endif
+
+
+#endif /* SHM_DIRECT           */
+
+#endif
diff --git a/src/mpid/ch4/shm/posix/Makefile.mk b/src/mpid/ch4/shm/posix/Makefile.mk
new file mode 100644
index 0000000..2a0fcb3
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/Makefile.mk
@@ -0,0 +1,38 @@
+## -*- Mode: Makefile; -*-
+## vim: set ft=automake :
+##
+## (C) 2016 by Argonne National Laboratory.
+##     See COPYRIGHT in top-level directory.
+##
+##  Portions of this code were written by Intel Corporation.
+##  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+##  to Argonne National Laboratory subject to Software Grant and Corporate
+##  Contributor License Agreement dated February 8, 2012.
+##
+
+if BUILD_SHM_POSIX
+## POSIX shm module: headers listed alphabetically, sources in their original order.
+noinst_HEADERS += src/mpid/ch4/shm/posix/posix_am.h        \
+                  src/mpid/ch4/shm/posix/posix_coll.h      \
+                  src/mpid/ch4/shm/posix/posix_comm.h      \
+                  src/mpid/ch4/shm/posix/posix_datatypes.h \
+                  src/mpid/ch4/shm/posix/posix_defs.h      \
+                  src/mpid/ch4/shm/posix/posix_impl.h      \
+                  src/mpid/ch4/shm/posix/posix_init.h      \
+                  src/mpid/ch4/shm/posix/posix_probe.h     \
+                  src/mpid/ch4/shm/posix/posix_progress.h  \
+                  src/mpid/ch4/shm/posix/posix_queue.h     \
+                  src/mpid/ch4/shm/posix/posix_recv.h      \
+                  src/mpid/ch4/shm/posix/posix_request.h   \
+                  src/mpid/ch4/shm/posix/posix_rma.h       \
+                  src/mpid/ch4/shm/posix/posix_send.h      \
+                  src/mpid/ch4/shm/posix/posix_spawn.h     \
+                  src/mpid/ch4/shm/posix/posix_unimpl.h    \
+                  src/mpid/ch4/shm/posix/posix_win.h       \
+                  src/mpid/ch4/shm/posix/shm_direct.h
+
+mpi_core_sources += src/mpid/ch4/shm/posix/globals.c    \
+                    src/mpid/ch4/shm/posix/func_table.c \
+                    src/mpid/ch4/shm/posix/barrier.c
+
+endif
diff --git a/src/mpid/ch4/shm/posix/barrier.c b/src/mpid/ch4/shm/posix/barrier.c
new file mode 100644
index 0000000..151a50b
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/barrier.c
@@ -0,0 +1,42 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#include <mpidimpl.h>
+#include "posix_impl.h"
+
+/* ------------------------------------------------------- */
+/* from mpid/ch3/channels/nemesis/src/ch3i_comm.c          */
+/* ------------------------------------------------------- */
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_POSIX_barrier_vars_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+int MPIDI_POSIX_barrier_vars_init(MPIDI_POSIX_barrier_vars_t * barrier_region)
+{   /* Reset every barrier slot (done by local rank 0 only); always returns MPI_SUCCESS. */
+    int mpi_errno = MPI_SUCCESS, slot_idx;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_BARRIER_VARS_INIT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_BARRIER_VARS_INIT);
+    if (MPIDI_POSIX_mem_region.local_rank == 0) {
+        for (slot_idx = 0; slot_idx < MPIDI_POSIX_NUM_BARRIER_VARS; slot_idx++) {
+            MPIDI_POSIX_barrier_vars_t *slot = &barrier_region[slot_idx];
+            OPA_store_int(&slot->context_id, -1);
+            OPA_store_int(&slot->usage_cnt, 0);
+            OPA_store_int(&slot->cnt, 0);
+            OPA_store_int(&slot->sig0, 0);
+            OPA_store_int(&slot->sig, 0);
+        }
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_BARRIER_VARS_INIT);
+    return mpi_errno;
+}
diff --git a/src/mpid/ch4/shm/posix/func_table.c b/src/mpid/ch4/shm/posix/func_table.c
new file mode 100644
index 0000000..9b8a049
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/func_table.c
@@ -0,0 +1,152 @@
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#ifndef SHM_DIRECT
+#define SHM_DISABLE_INLINES
+#include <mpidimpl.h>
+#include "shm_direct.h"
+MPIDI_SHM_funcs_t MPIDI_SHM_posix_funcs = {     /* positional initializer: entry order is significant */
+    MPIDI_SHM_init,
+    MPIDI_SHM_finalize,
+    MPIDI_SHM_progress,
+    MPIDI_SHM_reg_hdr_handler,  /* this and the *_am* names below are defined in posix_am.h */
+    MPIDI_SHM_comm_connect,
+    MPIDI_SHM_comm_disconnect,
+    MPIDI_SHM_open_port,
+    MPIDI_SHM_close_port,
+    MPIDI_SHM_comm_accept,
+    MPIDI_SHM_send_am_hdr,
+    MPIDI_SHM_inject_am_hdr,
+    MPIDI_SHM_send_am,
+    MPIDI_SHM_inject_am,
+    MPIDI_SHM_send_amv,
+    MPIDI_SHM_inject_amv,
+    MPIDI_SHM_send_am_hdr_reply,
+    MPIDI_SHM_inject_am_hdr_reply,
+    MPIDI_SHM_send_am_reply,
+    MPIDI_SHM_inject_am_reply,
+    MPIDI_SHM_send_amv_reply,
+    MPIDI_SHM_inject_amv_reply,
+    MPIDI_SHM_am_hdr_max_sz,
+    MPIDI_SHM_am_inject_max_sz,
+    MPIDI_SHM_am_recv,
+    MPIDI_SHM_comm_get_lpid,
+    MPIDI_SHM_gpid_get,
+    MPIDI_SHM_get_node_id,
+    MPIDI_SHM_get_max_node_id,
+    MPIDI_SHM_getallincomm,
+    MPIDI_SHM_gpid_tolpidarray,
+    MPIDI_SHM_create_intercomm_from_lpids,
+    MPIDI_SHM_comm_create,
+    MPIDI_SHM_comm_destroy,
+    MPIDI_SHM_am_request_init,
+};                              /* NOTE(review): MPIDI_SHM_funcs_t is declared elsewhere; verify the order against it */
+
+MPIDI_SHM_native_funcs_t MPIDI_SHM_native_posix_funcs = {       /* positional initializer: entry order is significant */
+    MPIDI_SHM_send,
+    MPIDI_SHM_ssend,
+    MPIDI_SHM_startall,
+    MPIDI_SHM_send_init,
+    MPIDI_SHM_ssend_init,
+    MPIDI_SHM_rsend_init,
+    MPIDI_SHM_bsend_init,
+    MPIDI_SHM_isend,
+    MPIDI_SHM_issend,
+    MPIDI_SHM_cancel_send,
+    MPIDI_SHM_recv_init,
+    MPIDI_SHM_recv,
+    MPIDI_SHM_irecv,
+    MPIDI_SHM_imrecv,
+    MPIDI_SHM_cancel_recv,
+    MPIDI_SHM_alloc_mem,
+    MPIDI_SHM_free_mem,
+    MPIDI_SHM_improbe,
+    MPIDI_SHM_iprobe,
+    MPIDI_SHM_win_set_info,
+    MPIDI_SHM_win_shared_query,
+    MPIDI_SHM_put,
+    MPIDI_SHM_win_start,
+    MPIDI_SHM_win_complete,
+    MPIDI_SHM_win_post,
+    MPIDI_SHM_win_wait,
+    MPIDI_SHM_win_test,
+    MPIDI_SHM_win_lock,
+    MPIDI_SHM_win_unlock,
+    MPIDI_SHM_win_get_info,
+    MPIDI_SHM_get,
+    MPIDI_SHM_win_free,
+    MPIDI_SHM_win_fence,
+    MPIDI_SHM_win_create,
+    MPIDI_SHM_accumulate,
+    MPIDI_SHM_win_attach,
+    MPIDI_SHM_win_allocate_shared,
+    MPIDI_SHM_rput,
+    MPIDI_SHM_win_flush_local,
+    MPIDI_SHM_win_detach,
+    MPIDI_SHM_compare_and_swap,
+    MPIDI_SHM_raccumulate,
+    MPIDI_SHM_rget_accumulate,
+    MPIDI_SHM_fetch_and_op,
+    MPIDI_SHM_win_allocate,
+    MPIDI_SHM_win_flush,
+    MPIDI_SHM_win_flush_local_all,
+    MPIDI_SHM_win_unlock_all,
+    MPIDI_SHM_win_create_dynamic,
+    MPIDI_SHM_rget,
+    MPIDI_SHM_win_sync,
+    MPIDI_SHM_win_flush_all,
+    MPIDI_SHM_get_accumulate,
+    MPIDI_SHM_win_lock_all,
+    MPIDI_SHM_barrier,          /* posix_coll.h defines this name as a wrapper over MPIR_Barrier */
+    MPIDI_SHM_bcast,
+    MPIDI_SHM_allreduce,
+    MPIDI_SHM_allgather,
+    MPIDI_SHM_allgatherv,
+    MPIDI_SHM_scatter,
+    MPIDI_SHM_scatterv,
+    MPIDI_SHM_gather,
+    MPIDI_SHM_gatherv,
+    MPIDI_SHM_alltoall,
+    MPIDI_SHM_alltoallv,
+    MPIDI_SHM_alltoallw,
+    MPIDI_SHM_reduce,
+    MPIDI_SHM_reduce_scatter,
+    MPIDI_SHM_reduce_scatter_block,
+    MPIDI_SHM_scan,
+    MPIDI_SHM_exscan,
+    MPIDI_SHM_neighbor_allgather,
+    MPIDI_SHM_neighbor_allgatherv,
+    MPIDI_SHM_neighbor_alltoall,
+    MPIDI_SHM_neighbor_alltoallv,
+    MPIDI_SHM_neighbor_alltoallw,
+    MPIDI_SHM_ineighbor_allgather,
+    MPIDI_SHM_ineighbor_allgatherv,
+    MPIDI_SHM_ineighbor_alltoall,
+    MPIDI_SHM_ineighbor_alltoallv,
+    MPIDI_SHM_ineighbor_alltoallw,
+    MPIDI_SHM_ibarrier,
+    MPIDI_SHM_ibcast,
+    MPIDI_SHM_iallgather,
+    MPIDI_SHM_iallgatherv,
+    MPIDI_SHM_iallreduce,
+    MPIDI_SHM_ialltoall,
+    MPIDI_SHM_ialltoallv,
+    MPIDI_SHM_ialltoallw,
+    MPIDI_SHM_iexscan,
+    MPIDI_SHM_igather,
+    MPIDI_SHM_igatherv,
+    MPIDI_SHM_ireduce_scatter_block,
+    MPIDI_SHM_ireduce_scatter,
+    MPIDI_SHM_ireduce,
+    MPIDI_SHM_iscan,
+    MPIDI_SHM_iscatter,
+    MPIDI_SHM_iscatterv,
+};                              /* NOTE(review): MPIDI_SHM_native_funcs_t is declared elsewhere; verify the order against it */
+#endif
diff --git a/src/mpid/ch4/shm/posix/globals.c b/src/mpid/ch4/shm/posix/globals.c
new file mode 100644
index 0000000..84f538a
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/globals.c
@@ -0,0 +1,22 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#include <mpidimpl.h>
+#include "posix_impl.h"
+
+MPIDI_POSIX_request_queue_t MPIDI_POSIX_sendq = { NULL, NULL }; /* send request queue */
+MPIDI_POSIX_request_queue_t MPIDI_POSIX_recvq_posted = { NULL, NULL };  /* posted receives */
+MPIDI_POSIX_request_queue_t MPIDI_POSIX_recvq_unexpected = { NULL, NULL };      /* unexpected receives */
+MPIDI_POSIX_mem_region_t MPIDI_POSIX_mem_region = { {0}        /* remaining members are zero-initialized by C rules */
+};
+
+char *MPIDI_POSIX_asym_base_addr = NULL;        /* starts out as a null pointer */
+MPID_Thread_mutex_t MPID_shm_mutex;     /* defined without an explicit initializer */
diff --git a/src/mpid/ch4/shm/posix/posix_am.h b/src/mpid/ch4/shm/posix/posix_am.h
new file mode 100644
index 0000000..06b2351
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_am.h
@@ -0,0 +1,173 @@
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_POSIX_AM_H_INCLUDED
+#define SHM_POSIX_AM_H_INCLUDED
+
+#include "posix_impl.h"
+
+static inline int MPIDI_SHM_reg_hdr_handler(int handler_id,
+                                            MPIDI_SHM_am_origin_handler_fn origin_handler_fn,
+                                            MPIDI_SHM_am_target_handler_fn target_handler_fn)
+{
+    MPIR_Assert(0);             /* unconditional assert; none of the arguments is read */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert(0) does not stop execution */
+}
+
+static inline int MPIDI_SHM_send_am_hdr(int rank,
+                                        MPIR_Comm * comm,
+                                        int handler_id,
+                                        const void *am_hdr,
+                                        size_t am_hdr_sz, MPIR_Request * sreq, void *shm_context)
+{
+    MPIR_Assert(0);             /* unconditional assert; none of the arguments is read */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert(0) does not stop execution */
+}
+
+static inline int MPIDI_SHM_send_am(int rank,
+                                    MPIR_Comm * comm,
+                                    int handler_id,
+                                    const void *am_hdr,
+                                    size_t am_hdr_sz,
+                                    const void *data,
+                                    MPI_Count count,
+                                    MPI_Datatype datatype, MPIR_Request * sreq, void *shm_context)
+{
+    MPIR_Assert(0);             /* unconditional assert; none of the arguments is read */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert(0) does not stop execution */
+}
+
+static inline int MPIDI_SHM_send_amv(int rank,
+                                     MPIR_Comm * comm,
+                                     int handler_id,
+                                     struct iovec *am_hdr,
+                                     size_t iov_len,
+                                     const void *data,
+                                     MPI_Count count,
+                                     MPI_Datatype datatype, MPIR_Request * sreq, void *shm_context)
+{
+    MPIR_Assert(0);             /* unconditional assert; none of the arguments is read */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert(0) does not stop execution */
+}
+
+static inline int MPIDI_SHM_send_am_hdr_reply(MPIR_Context_id_t context_id, int src_rank,
+                                              int handler_id,
+                                              const void *am_hdr,
+                                              size_t am_hdr_sz, MPIR_Request * sreq)
+{
+    MPIR_Assert(0);             /* unconditional assert; none of the arguments is read */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert(0) does not stop execution */
+}
+
+static inline int MPIDI_SHM_send_am_reply(MPIR_Context_id_t context_id, int src_rank,
+                                          int handler_id,
+                                          const void *am_hdr,
+                                          size_t am_hdr_sz,
+                                          const void *data,
+                                          MPI_Count count,
+                                          MPI_Datatype datatype, MPIR_Request * sreq)
+{
+    MPIR_Assert(0);             /* unconditional assert; none of the arguments is read */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert(0) does not stop execution */
+}
+
+static inline int MPIDI_SHM_send_amv_reply(MPIR_Context_id_t context_id, int src_rank,
+                                           int handler_id,
+                                           struct iovec *am_hdr,
+                                           size_t iov_len,
+                                           const void *data,
+                                           MPI_Count count,
+                                           MPI_Datatype datatype, MPIR_Request * sreq)
+{
+    MPIR_Assert(0);             /* unconditional assert; none of the arguments is read */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert(0) does not stop execution */
+}
+
+static inline size_t MPIDI_SHM_am_hdr_max_sz(void)
+{
+    MPIR_Assert(0);             /* unconditional assert; no size is computed */
+    return 0;                   /* reached only if MPIR_Assert(0) does not stop execution */
+}
+
+static inline int MPIDI_SHM_inject_am_hdr(int rank,
+                                          MPIR_Comm * comm,
+                                          int handler_id,
+                                          const void *am_hdr, size_t am_hdr_sz, void *shm_context)
+{
+    MPIR_Assert(0);             /* unconditional assert; none of the arguments is read */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert(0) does not stop execution */
+}
+
+static inline int MPIDI_SHM_inject_am(int rank,
+                                      MPIR_Comm * comm,
+                                      int handler_id,
+                                      const void *am_hdr,
+                                      size_t am_hdr_sz,
+                                      const void *data,
+                                      MPI_Count count, MPI_Datatype datatype, void *shm_context)
+{
+    MPIR_Assert(0);             /* unconditional assert; none of the arguments is read */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert(0) does not stop execution */
+}
+
+static inline int MPIDI_SHM_inject_amv(int rank,
+                                       MPIR_Comm * comm,
+                                       int handler_id,
+                                       struct iovec *am_hdr,
+                                       size_t iov_len,
+                                       const void *data,
+                                       MPI_Count count, MPI_Datatype datatype, void *shm_context)
+{
+    MPIR_Assert(0);             /* unconditional assert; none of the arguments is read */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert(0) does not stop execution */
+}
+
+static inline int MPIDI_SHM_inject_am_hdr_reply(MPIR_Context_id_t context_id, int src_rank,
+                                                int handler_id,
+                                                const void *am_hdr, size_t am_hdr_sz)
+{
+    MPIR_Assert(0);             /* unconditional assert; none of the arguments is read */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert(0) does not stop execution */
+}
+
+static inline int MPIDI_SHM_inject_am_reply(MPIR_Context_id_t context_id, int src_rank,
+                                            int handler_id,
+                                            const void *am_hdr,
+                                            size_t am_hdr_sz,
+                                            const void *data,
+                                            MPI_Count count, MPI_Datatype datatype)
+{
+    MPIR_Assert(0);             /* unconditional assert; none of the arguments is read */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert(0) does not stop execution */
+}
+
+static inline int MPIDI_SHM_inject_amv_reply(MPIR_Context_id_t context_id, int src_rank,
+                                             int handler_id,
+                                             struct iovec *am_hdr,
+                                             size_t iov_len,
+                                             const void *data,
+                                             MPI_Count count, MPI_Datatype datatype)
+{
+    MPIR_Assert(0);             /* unconditional assert; none of the arguments is read */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert(0) does not stop execution */
+}
+
+static inline size_t MPIDI_SHM_am_inject_max_sz(void)
+{
+    MPIR_Assert(0);             /* unconditional assert; no size is computed */
+    return 0;                   /* reached only if MPIR_Assert(0) does not stop execution */
+}
+
+static inline int MPIDI_SHM_am_recv(MPIR_Request * req)
+{
+    MPIR_Assert(0);             /* unconditional assert; req is never read */
+    return MPI_SUCCESS;         /* named code, like the other int-returning functions in this header */
+}
+
+#endif /* SHM_POSIX_AM_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/posix/posix_coll.h b/src/mpid/ch4/shm/posix/posix_coll.h
new file mode 100644
index 0000000..52e3228
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_coll.h
@@ -0,0 +1,876 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_POSIX_COLL_H_INCLUDED
+#define SHM_POSIX_COLL_H_INCLUDED
+
+#include "posix_impl.h"
+#include "ch4_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_barrier
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_barrier(MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_BARRIER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_BARRIER);
+    /* Pure pass-through: both arguments go unchanged to MPIR_Barrier. */
+    rc = MPIR_Barrier(comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_BARRIER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_bcast
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_bcast(void *buffer, int count, MPI_Datatype datatype,
+                                  int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_BCAST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_BCAST);
+    /* Pure pass-through: every argument goes unchanged to MPIR_Bcast. */
+    rc = MPIR_Bcast(buffer, count, datatype, root, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_BCAST);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_allreduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_allreduce(const void *sendbuf, void *recvbuf, int count,
+                                      MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                      MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_ALLREDUCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_ALLREDUCE);
+    /* Pure pass-through: every argument goes unchanged to MPIR_Allreduce. */
+    rc = MPIR_Allreduce(sendbuf, recvbuf, count, datatype, op, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_ALLREDUCE);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                      void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_ALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_ALLGATHER);
+    /* Pure pass-through: every argument goes unchanged to MPIR_Allgather. */
+    rc = MPIR_Allgather(sendbuf, sendcount, sendtype,
+                        recvbuf, recvcount, recvtype, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_ALLGATHER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                       void *recvbuf, const int *recvcounts, const int *displs,
+                                       MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                       MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_ALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_ALLGATHERV);
+    /* Pure pass-through: every argument goes unchanged to MPIR_Allgatherv. */
+    rc = MPIR_Allgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs,
+                         recvtype, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_ALLGATHERV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_gather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                   int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_GATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_GATHER);
+    /* Pure pass-through: every argument goes unchanged to MPIR_Gather. */
+    rc = MPIR_Gather(sendbuf, sendcount, sendtype,
+                     recvbuf, recvcount, recvtype, root, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_GATHER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_gatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, const int *recvcounts, const int *displs,
+                                    MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr,
+                                    MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_GATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_GATHERV);
+    /* Pure pass-through: every argument goes unchanged to MPIR_Gatherv. */
+    rc = MPIR_Gatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs,
+                      recvtype, root, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_GATHERV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                    int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_SCATTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_SCATTER);
+    /* Pure pass-through: every argument goes unchanged to MPIR_Scatter. */
+    rc = MPIR_Scatter(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
+                      root, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_SCATTER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_scatterv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_scatterv(const void *sendbuf, const int *sendcounts,
+                                     const int *displs, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                     int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_SCATTERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_SCATTERV);
+    /* Pure pass-through: every argument goes unchanged to MPIR_Scatterv. */
+    rc = MPIR_Scatterv(sendbuf, sendcounts, displs, sendtype,
+                       recvbuf, recvcount, recvtype, root, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_SCATTERV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_ALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_ALLTOALL);
+    /* Pure pass-through: every argument goes unchanged to MPIR_Alltoall. */
+    rc = MPIR_Alltoall(sendbuf, sendcount, sendtype,
+                       recvbuf, recvcount, recvtype, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_ALLTOALL);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_alltoallv(const void *sendbuf, const int *sendcounts,
+                                      const int *sdispls, MPI_Datatype sendtype,
+                                      void *recvbuf, const int *recvcounts,
+                                      const int *rdispls, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_ALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_ALLTOALLV);
+    /* Pure pass-through: every argument goes unchanged to MPIR_Alltoallv. */
+    rc = MPIR_Alltoallv(sendbuf, sendcounts, sdispls, sendtype,
+                        recvbuf, recvcounts, rdispls, recvtype, comm_ptr, errflag);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_ALLTOALLV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_alltoallw(const void *sendbuf, const int sendcounts[],
+                                      const int sdispls[], const MPI_Datatype sendtypes[],
+                                      void *recvbuf, const int recvcounts[],
+                                      const int rdispls[], const MPI_Datatype recvtypes[],
+                                      MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_ALLTOALLW);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_ALLTOALLW);
+    /* Pure pass-through: every argument goes unchanged to MPIR_Alltoallw. */
+    rc = MPIR_Alltoallw(sendbuf, sendcounts, sdispls, sendtypes,
+                        recvbuf, recvcounts, rdispls, recvtypes, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_ALLTOALLW);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_reduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_reduce(const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, int root,
+                                   MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_REDUCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_REDUCE);
+    /* Pure pass-through: every argument goes unchanged to MPIR_Reduce. */
+    rc = MPIR_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_REDUCE);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_reduce_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_reduce_scatter(const void *sendbuf, void *recvbuf,
+                                           const int recvcounts[], MPI_Datatype datatype,
+                                           MPI_Op op, MPIR_Comm * comm_ptr,
+                                           MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_REDUCE_SCATTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_REDUCE_SCATTER);
+    /* Pure pass-through: every argument goes unchanged to MPIR_Reduce_scatter. */
+    rc = MPIR_Reduce_scatter(sendbuf, recvbuf, recvcounts, datatype, op, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_REDUCE_SCATTER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_reduce_scatter_block
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_reduce_scatter_block(const void *sendbuf, void *recvbuf,
+                                                 int recvcount, MPI_Datatype datatype,
+                                                 MPI_Op op, MPIR_Comm * comm_ptr,
+                                                 MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_REDUCE_SCATTER_BLOCK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_REDUCE_SCATTER_BLOCK);
+    /* Pure pass-through: every argument goes unchanged to MPIR_Reduce_scatter_block. */
+    rc = MPIR_Reduce_scatter_block(sendbuf, recvbuf, recvcount, datatype, op,
+                                   comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_REDUCE_SCATTER_BLOCK);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_scan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_scan(const void *sendbuf, void *recvbuf, int count,
+                                 MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                 MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_SCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_SCAN);
+    /* Pure pass-through: every argument goes unchanged to MPIR_Scan. */
+    rc = MPIR_Scan(sendbuf, recvbuf, count, datatype, op, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_SCAN);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_exscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_exscan(const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                   MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_EXSCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_EXSCAN);
+    /* Pure pass-through: every argument goes unchanged to MPIR_Exscan. */
+    rc = MPIR_Exscan(sendbuf, recvbuf, count, datatype, op, comm_ptr, errflag);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_EXSCAN);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_neighbor_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_neighbor_allgather(const void *sendbuf, int sendcount,
+                                               MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                               MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                               MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_NEIGHBOR_ALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_NEIGHBOR_ALLGATHER);
+    /* Pass-through to MPIR_Neighbor_allgather_impl; note that errflag is
+     * not forwarded to the callee. */
+    rc = MPIR_Neighbor_allgather_impl(sendbuf, sendcount, sendtype,
+                                      recvbuf, recvcount, recvtype, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_NEIGHBOR_ALLGATHER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_neighbor_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_neighbor_allgatherv(const void *sendbuf, int sendcount,
+                                                MPI_Datatype sendtype, void *recvbuf,
+                                                const int recvcounts[], const int displs[],
+                                                MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                                MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_NEIGHBOR_ALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_NEIGHBOR_ALLGATHERV);
+    /* Pass-through to MPIR_Neighbor_allgatherv_impl; errflag is not forwarded. */
+    rc = MPIR_Neighbor_allgatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                       recvcounts, displs, recvtype, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_NEIGHBOR_ALLGATHERV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_neighbor_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_neighbor_alltoall(const void *sendbuf, int sendcount,
+                                              MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                              MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                              MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_NEIGHBOR_ALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_NEIGHBOR_ALLTOALL);
+    /* Pass-through to MPIR_Neighbor_alltoall_impl; errflag is not forwarded. */
+    rc = MPIR_Neighbor_alltoall_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                     recvcount, recvtype, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_NEIGHBOR_ALLTOALL);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_neighbor_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_neighbor_alltoallv(const void *sendbuf, const int sendcounts[],
+                                               const int sdispls[], MPI_Datatype sendtype,
+                                               void *recvbuf, const int recvcounts[],
+                                               const int rdispls[], MPI_Datatype recvtype,
+                                               MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_NEIGHBOR_ALLTOALLV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_NEIGHBOR_ALLTOALLV);
+    /* Pass-through to MPIR_Neighbor_alltoallv_impl; errflag is not forwarded. */
+    rc = MPIR_Neighbor_alltoallv_impl(sendbuf, sendcounts, sdispls, sendtype, recvbuf,
+                                      recvcounts, rdispls, recvtype, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_NEIGHBOR_ALLTOALLV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_neighbor_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_neighbor_alltoallw(const void *sendbuf, const int sendcounts[],
+                                               const MPI_Aint sdispls[],
+                                               const MPI_Datatype sendtypes[], void *recvbuf,
+                                               const int recvcounts[], const MPI_Aint rdispls[],
+                                               const MPI_Datatype recvtypes[], MPIR_Comm * comm_ptr,
+                                               MPIR_Errflag_t * errflag)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_NEIGHBOR_ALLTOALLW);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_NEIGHBOR_ALLTOALLW);
+    /* Pass-through to MPIR_Neighbor_alltoallw_impl; errflag is not forwarded. */
+    rc = MPIR_Neighbor_alltoallw_impl(sendbuf, sendcounts, sdispls, sendtypes, recvbuf,
+                                      recvcounts, rdispls, recvtypes, comm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_NEIGHBOR_ALLTOALLW);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ineighbor_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ineighbor_allgather(const void *sendbuf, int sendcount,
+                                                MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                                MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                                MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_INEIGHBOR_ALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_INEIGHBOR_ALLGATHER);
+    /* Pure pass-through: every argument, req included, goes to the _impl routine. */
+    rc = MPIR_Ineighbor_allgather_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                       recvcount, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_INEIGHBOR_ALLGATHER);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ineighbor_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ineighbor_allgatherv(const void *sendbuf, int sendcount,
+                                                 MPI_Datatype sendtype, void *recvbuf,
+                                                 const int recvcounts[], const int displs[],
+                                                 MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                                 MPI_Request * req)
+{
+    int rc;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_INEIGHBOR_ALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_INEIGHBOR_ALLGATHERV);
+    /* Pure pass-through: every argument, req included, is handed unchanged
+     * to MPIR_Ineighbor_allgatherv_impl. */
+    rc = MPIR_Ineighbor_allgatherv_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                        recvcounts, displs, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_INEIGHBOR_ALLGATHERV);
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ineighbor_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ineighbor_alltoall(const void *sendbuf, int sendcount,
+                                               MPI_Datatype sendtype, void *recvbuf,
+                                               int recvcount, MPI_Datatype recvtype,
+                                               MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_INEIGHBOR_ALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_INEIGHBOR_ALLTOALL);
+
+    mpi_errno = MPIR_Ineighbor_alltoall_impl(sendbuf, sendcount, sendtype,
+                                             recvbuf, recvcount, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_INEIGHBOR_ALLTOALL);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ineighbor_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* ineighbor_alltoallv has no POSIX-shm-specific path: the count/displacement arrays and all
+ * other arguments go unchanged to MPIR_Ineighbor_alltoallv_impl, whose result is returned. */
+static inline int MPIDI_SHM_ineighbor_alltoallv(const void *sendbuf, const int sendcounts[],
+                                                const int sdispls[], MPI_Datatype sendtype,
+                                                void *recvbuf, const int recvcounts[],
+                                                const int rdispls[], MPI_Datatype recvtype,
+                                                MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_INEIGHBOR_ALLTOALLV);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_INEIGHBOR_ALLTOALLV);
+    mpi_errno = MPIR_Ineighbor_alltoallv_impl(sendbuf, sendcounts, sdispls, sendtype, recvbuf,
+                                              recvcounts, rdispls, recvtype, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_INEIGHBOR_ALLTOALLV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ineighbor_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* ineighbor_alltoallw has no POSIX-shm-specific path: per-neighbor counts, MPI_Aint displacements
+ * and datatype arrays are forwarded as-is to MPIR_Ineighbor_alltoallw_impl. */
+static inline int MPIDI_SHM_ineighbor_alltoallw(const void *sendbuf, const int sendcounts[],
+                                                const MPI_Aint sdispls[],
+                                                const MPI_Datatype sendtypes[], void *recvbuf,
+                                                const int recvcounts[], const MPI_Aint rdispls[],
+                                                const MPI_Datatype recvtypes[],
+                                                MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_INEIGHBOR_ALLTOALLW);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_INEIGHBOR_ALLTOALLW);
+    mpi_errno = MPIR_Ineighbor_alltoallw_impl(sendbuf, sendcounts, sdispls, sendtypes, recvbuf,
+                                              recvcounts, rdispls, recvtypes, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_INEIGHBOR_ALLTOALLW);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ibarrier
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* ibarrier: nothing shm-specific is done; the request comes from MPIR_Ibarrier_impl. */
+static inline int MPIDI_SHM_ibarrier(MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_IBARRIER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_IBARRIER);
+    mpi_errno = MPIR_Ibarrier_impl(comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_IBARRIER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ibcast
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ibcast(void *buffer, int count, MPI_Datatype datatype,
+                                   int root, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_IBCAST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_IBCAST);
+
+    mpi_errno = MPIR_Ibcast_impl(buffer, count, datatype, root, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_IBCAST);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_iallgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                       void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                       MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_IALLGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_IALLGATHER);
+
+    mpi_errno = MPIR_Iallgather_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                     recvcount, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_IALLGATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_iallgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_iallgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                        void *recvbuf, const int *recvcounts, const int *displs,
+                                        MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                        MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_IALLGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_IALLGATHERV);
+
+    mpi_errno = MPIR_Iallgatherv_impl(sendbuf, sendcount, sendtype,
+                                      recvbuf, recvcounts, displs, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_IALLGATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ialltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ialltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                      void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_IALLTOALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_IALLTOALL);
+
+    mpi_errno = MPIR_Ialltoall_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                    recvcount, recvtype, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_IALLTOALL);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ialltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* ialltoallv has no POSIX-shm-specific path: counts, displacements and everything else are
+ * forwarded unchanged to MPIR_Ialltoallv_impl, whose error code is returned. */
+static inline int MPIDI_SHM_ialltoallv(const void *sendbuf, const int *sendcounts,
+                                       const int *sdispls, MPI_Datatype sendtype,
+                                       void *recvbuf, const int *recvcounts,
+                                       const int *rdispls, MPI_Datatype recvtype,
+                                       MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_IALLTOALLV);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_IALLTOALLV);
+    mpi_errno = MPIR_Ialltoallv_impl(sendbuf, sendcounts, sdispls, sendtype,
+                                     recvbuf, recvcounts, rdispls, recvtype, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_IALLTOALLV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ialltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* ialltoallw has no POSIX-shm-specific path: counts, displacements and the per-peer datatype
+ * arrays are forwarded unchanged to MPIR_Ialltoallw_impl, whose error code is returned. */
+static inline int MPIDI_SHM_ialltoallw(const void *sendbuf, const int *sendcounts,
+                                       const int *sdispls, const MPI_Datatype sendtypes[],
+                                       void *recvbuf, const int *recvcounts,
+                                       const int *rdispls, const MPI_Datatype recvtypes[],
+                                       MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_IALLTOALLW);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_IALLTOALLW);
+    mpi_errno = MPIR_Ialltoallw_impl(sendbuf, sendcounts, sdispls, sendtypes,
+                                     recvbuf, recvcounts, rdispls, recvtypes, comm_ptr, req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_IALLTOALLW);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_iexscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_iexscan(const void *sendbuf, void *recvbuf, int count,
+                                    MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                    MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_IEXSCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_IEXSCAN);
+
+    mpi_errno = MPIR_Iexscan_impl(sendbuf, recvbuf, count, datatype, op, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_IEXSCAN);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_igather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                    int root, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_IGATHER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_IGATHER);
+
+    mpi_errno = MPIR_Igather_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                  recvcount, recvtype, root, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_IGATHER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_igatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, const int *recvcounts, const int *displs,
+                                     MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr,
+                                     MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_IGATHERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_IGATHERV);
+
+    mpi_errno = MPIR_Igatherv_impl(sendbuf, sendcount, sendtype,
+                                   recvbuf, recvcounts, displs, recvtype, root, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_IGATHERV);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ireduce_scatter_block
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ireduce_scatter_block(const void *sendbuf, void *recvbuf,
+                                                  int recvcount, MPI_Datatype datatype,
+                                                  MPI_Op op, MPIR_Comm * comm_ptr,
+                                                  MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_IREDUCE_SCATTER_BLOCK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_IREDUCE_SCATTER_BLOCK);
+
+    mpi_errno = MPIR_Ireduce_scatter_block_impl(sendbuf, recvbuf, recvcount,
+                                                datatype, op, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_IREDUCE_SCATTER_BLOCK);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ireduce_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ireduce_scatter(const void *sendbuf, void *recvbuf,
+                                            const int recvcounts[], MPI_Datatype datatype,
+                                            MPI_Op op, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_IREDUCE_SCATTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_IREDUCE_SCATTER);
+
+    mpi_errno = MPIR_Ireduce_scatter_impl(sendbuf, recvbuf, recvcounts, datatype, op,
+                                          comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_IREDUCE_SCATTER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ireduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ireduce(const void *sendbuf, void *recvbuf, int count,
+                                    MPI_Datatype datatype, MPI_Op op, int root,
+                                    MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_IREDUCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_IREDUCE);
+
+    mpi_errno = MPIR_Ireduce_impl(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_IREDUCE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_iallreduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_iallreduce(const void *sendbuf, void *recvbuf, int count,
+                                       MPI_Datatype datatype, MPI_Op op,
+                                       MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_IALLREDUCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_IALLREDUCE);
+
+    mpi_errno = MPIR_Iallreduce_impl(sendbuf, recvbuf, count, datatype, op, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_IALLREDUCE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_iscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_iscan(const void *sendbuf, void *recvbuf, int count,
+                                  MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                  MPI_Request * req)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_ISCAN);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_ISCAN);
+
+    mpi_errno = MPIR_Iscan_impl(sendbuf, recvbuf, count, datatype, op, comm_ptr, req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_ISCAN);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_iscatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_iscatter(const void *sendbuf, int sendcount,
+                                     MPI_Datatype sendtype, void *recvbuf,
+                                     int recvcount, MPI_Datatype recvtype,
+                                     int root, MPIR_Comm * comm, MPI_Request * request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_ISCATTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_ISCATTER);
+
+    mpi_errno = MPIR_Iscatter_impl(sendbuf, sendcount, sendtype, recvbuf,
+                                   recvcount, recvtype, root, comm, request);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_ISCATTER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_iscatterv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_iscatterv(const void *sendbuf, const int *sendcounts,
+                                      const int *displs, MPI_Datatype sendtype,
+                                      void *recvbuf, int recvcount,
+                                      MPI_Datatype recvtype, int root,
+                                      MPIR_Comm * comm, MPI_Request * request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_ISCATTERV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_ISCATTERV);
+
+    mpi_errno = MPIR_Iscatterv_impl(sendbuf, sendcounts, displs, sendtype,
+                                    recvbuf, recvcount, recvtype, root, comm, request);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_ISCATTERV);
+    return mpi_errno;
+}
+
+#endif
diff --git a/src/mpid/ch4/shm/posix/posix_comm.h b/src/mpid/ch4/shm/posix/posix_comm.h
new file mode 100644
index 0000000..336cd74
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_comm.h
@@ -0,0 +1,45 @@
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_POSIX_COMM_H_INCLUDED
+#define SHM_POSIX_COMM_H_INCLUDED
+
+#include "posix_impl.h"
+#include "mpl_utlist.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_comm_create
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Communicator-creation hook: `comm` is not touched, only the enter/exit trace runs. */
+static inline int MPIDI_SHM_comm_create(MPIR_Comm * comm)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_COMM_CREATE);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_COMM_CREATE);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_COMM_CREATE);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_comm_destroy
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Communicator-destruction hook: `comm` is not touched, only the enter/exit trace runs. */
+static inline int MPIDI_SHM_comm_destroy(MPIR_Comm * comm)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_COMM_DESTROY);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_COMM_DESTROY);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_COMM_DESTROY);
+    return MPI_SUCCESS;
+}
+
+
+#endif /* SHM_POSIX_COMM_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/posix/posix_datatypes.h b/src/mpid/ch4/shm/posix/posix_datatypes.h
new file mode 100644
index 0000000..e99e853
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_datatypes.h
@@ -0,0 +1,185 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_DATATYPES_H
+#define SHM_DATATYPES_H
+
+/* ************************************************************************** */
+/* from mpid/ch3/channels/nemesis/include/mpid_nem_datatypes.h                */
+/* ************************************************************************** */
+
+#define MPIDI_POSIX_OFFSETOF(struc, field) ((int)(uintptr_t)(&((struc *)0)->field))     /* byte offset of field in struc; goes through uintptr_t so the pointer is not cast directly to a narrower int */
+#define MPIDI_POSIX_CACHE_LINE_LEN (64)
+#define MPIDI_POSIX_NUM_CELLS      (64)
+#define MPIDI_POSIX_CELL_LEN       (64*1024)
+
+#if (SIZEOF_OPA_PTR_T > 8)
+#if (SIZEOF_OPA_PTR_T > 16)
+#error unexpected size for OPA_ptr_t
+#endif
+#define MPIDI_POSIX_CELL_HEAD_LEN  16   /* We use this to keep elements 64-bit aligned */
+#else /* (SIZEOF_OPA_PTR_T <= 8) */
+#define MPIDI_POSIX_CELL_HEAD_LEN  8    /* We use this to keep elements 64-bit aligned */
+#endif
+
+/* the cell header, the matching info and the packet header together now fill one cache line */
+#define MPIDI_POSIX_CELL_PAYLOAD_LEN (MPIDI_POSIX_CELL_LEN - MPIDI_POSIX_CACHE_LINE_LEN + MPIDI_POSIX_MPICH_HEAD_LEN)
+
+#define MPIDI_POSIX_CALC_CELL_LEN(cellp) (MPIDI_POSIX_CELL_HEAD_LEN + MPIDI_POSIX_MPICH_HEAD_LEN + MPIDI_POSIX_CELL_DLEN (cellp))      /* cell head + packet header + datalen of the cell passed in */
+
+#define MPIDI_POSIX_ALIGNED(addr, bytes) ((((uintptr_t)(addr)) & (((uintptr_t)(bytes))-1)) == 0)       /* true when addr is a multiple of bytes; the mask test requires bytes to be a power of two */
+
+#define MPIDI_POSIX_PKT_UNKNOWN     0
+#define MPIDI_POSIX_PKT_MPICH      1
+#define MPIDI_POSIX_PKT_MPICH_HEAD 2
+
+#define MPIDI_POSIX_FBOX_SOURCE(cell) (MPIDI_POSIX_mem_region.local_procs[(cell)->pkt.mpich.source])
+#define MPIDI_POSIX_CELL_SOURCE(cell) ((cell)->pkt.mpich.source)
+#define MPIDI_POSIX_CELL_DEST(cell)   ((cell)->pkt.mpich.dest)
+#define MPIDI_POSIX_CELL_DLEN(cell)   ((cell)->pkt.mpich.datalen)
+#define MPIDI_POSIX_CELL_SEQN(cell)   ((cell)->pkt.mpich.seqno)
+
+#define MPIDI_POSIX_MPICH_HEAD_LEN sizeof(MPIDI_POSIX_pkt_header_t)
+#define MPIDI_POSIX_DATA_LEN (MPIDI_POSIX_CELL_PAYLOAD_LEN - MPIDI_POSIX_MPICH_HEAD_LEN)
+
+#define MPIDI_POSIX_PKT_HEADER_FIELDS          \
+    int source;                             \
+    int dest;                               \
+    uintptr_t datalen;                      \
+    unsigned short seqno;                   \
+    unsigned short type;        /* currently used only with checkpointing */
+
+typedef struct MPIDI_POSIX_pkt_header {        /* the common header fields on their own */
+    MPIDI_POSIX_PKT_HEADER_FIELDS;
+} MPIDI_POSIX_pkt_header_t;
+
+typedef struct MPIDI_POSIX_pkt_mpich { /* the same header fields followed by the data area */
+    MPIDI_POSIX_PKT_HEADER_FIELDS;
+    union {
+        char payload[MPIDI_POSIX_DATA_LEN];
+        double dummy;           /* align payload to double */
+    } p;                        /* data bytes are reached as .p.payload */
+} MPIDI_POSIX_pkt_mpich_t;
+
+typedef union {                 /* one packet, viewed as header only or as header + data */
+    MPIDI_POSIX_pkt_header_t header;
+    MPIDI_POSIX_pkt_mpich_t mpich;
+} MPIDI_POSIX_pkt_t;
+
+/* Nemesis cells which are to be used in shared memory need to use
+ * "relative pointers" because the absolute pointers to a cell from
+ * different processes may be different.  Relative pointers are
+ * offsets from the beginning of the mmapped region where they live.
+ * We use different types for relative and absolute pointers to help
+ * catch errors.  Use MPIDI_POSIX_REL_TO_ABS and MPIDI_POSIX_ABS_TO_REL to
+ * convert between relative and absolute pointers. */
+
+/* Distinct wrapper type for relative pointers; this should always be exactly the size of a pointer */
+typedef struct MPIDI_POSIX_cell_rel_ptr {
+    OPA_ptr_t p;                /* read and written through OPA_load_ptr / OPA_store_ptr */
+} MPIDI_POSIX_cell_rel_ptr_t;
+
+/* MPIDI_POSIX_cell and MPIDI_POSIX_abs_cell must be kept in sync so that we
+ * can cast between them.  MPIDI_POSIX_abs_cell should only be used when
+ * a cell is enqueued on a queue local to a single process (e.g., a
+ * queue in a network module) where relative pointers are not
+ * needed. */
+
+typedef struct MPIDI_POSIX_cell {      /* link, matching fields and padding, then the packet itself */
+    MPIDI_POSIX_cell_rel_ptr_t next;
+#if (MPIDI_POSIX_CELL_HEAD_LEN > SIZEOF_OPA_PTR_T)      /* grow `next` out to CELL_HEAD_LEN bytes */
+    char head_padding[MPIDI_POSIX_CELL_HEAD_LEN - sizeof(MPIDI_POSIX_cell_rel_ptr_t)];
+#endif /* named head_padding so it cannot collide with the `padding` member declared below */
+    int my_rank;
+    int rank;
+    int tag;
+    int context_id;
+    MPIR_Request *pending;
+#if MPIDI_POSIX_CACHE_LINE_LEN != 0
+    char padding[MPIDI_POSIX_CACHE_LINE_LEN - MPIDI_POSIX_CELL_HEAD_LEN - MPIDI_POSIX_MPICH_HEAD_LEN - 4 * sizeof(int) - sizeof(MPIR_Request *)];       /* should be 64-16-16-16-8 = 8 */
+#endif
+    volatile MPIDI_POSIX_pkt_t pkt;
+} MPIDI_POSIX_cell_t;
+typedef MPIDI_POSIX_cell_t *MPIDI_POSIX_cell_ptr_t;
+
+typedef struct MPIDI_POSIX_abs_cell {  /* cell variant linked by an ordinary pointer */
+    struct MPIDI_POSIX_abs_cell *next;
+#if (MPIDI_POSIX_CELL_HEAD_LEN > SIZEOF_VOID_P)
+    char padding[MPIDI_POSIX_CELL_HEAD_LEN - sizeof(struct MPIDI_POSIX_abs_cell *)];    /* grow `next` out to CELL_HEAD_LEN bytes */
+#endif
+    volatile MPIDI_POSIX_pkt_t pkt;     /* NOTE(review): MPIDI_POSIX_cell_t declares extra members before its pkt, so the two layouts differ - confirm casts between them are still valid */
+} MPIDI_POSIX_abs_cell_t;
+typedef MPIDI_POSIX_abs_cell_t *MPIDI_POSIX_abs_cell_ptr_t;
+
+#define MPIDI_POSIX_CELL_TO_PACKET(cellp) (&(cellp)->pkt)
+#define MPIDI_POSIX_PACKET_TO_CELL(packetp) \
+    ((MPIDI_POSIX_cell_ptr_t) ((char*)(packetp) - (char *)MPIDI_POSIX_CELL_TO_PACKET((MPIDI_POSIX_cell_ptr_t)0)))
+#define MPIDI_POSIX_MIN_PACKET_LEN (sizeof (MPIDI_POSIX_pkt_header_t))
+#define MPIDI_POSIX_MAX_PACKET_LEN (sizeof (MPIDI_POSIX_pkt_t))
+#define MPIDI_POSIX_PACKET_LEN(pkt) ((pkt)->mpich.datalen + MPIDI_POSIX_MPICH_HEAD_LEN)
+
+#define MPIDI_POSIX_OPT_LOAD     16
+#define MPIDI_POSIX_OPT_SIZE     ((sizeof(MPIDI_CH3_Pkt_t)) + (MPIDI_POSIX_OPT_LOAD))  /* NOTE(review): uses a CH3 packet type inside CH4 code - confirm it is declared wherever this macro is expanded */
+#define MPIDI_POSIX_OPT_HEAD_LEN ((MPIDI_POSIX_MPICH_HEAD_LEN) + (MPIDI_POSIX_OPT_SIZE))       /* packet header plus OPT_SIZE */
+
+#define MPIDI_POSIX_PACKET_OPT_LEN(pkt) \
+    (((pkt)->mpich.datalen < MPIDI_POSIX_OPT_SIZE) ? (MPIDI_POSIX_OPT_HEAD_LEN) : (MPIDI_POSIX_PACKET_LEN(pkt)))
+
+#define MPIDI_POSIX_PACKET_PAYLOAD(pkt) ((pkt)->mpich.p.payload)       /* payload is a member of the union `p` inside MPIDI_POSIX_pkt_mpich_t */
+
+typedef struct MPIDI_POSIX_queue {     /* queue of cells linked by relative pointers */
+    MPIDI_POSIX_cell_rel_ptr_t head;
+    MPIDI_POSIX_cell_rel_ptr_t tail;
+#if (MPIDI_POSIX_CACHE_LINE_LEN > (2 * SIZEOF_OPA_PTR_T))
+    char padding1[MPIDI_POSIX_CACHE_LINE_LEN - 2 * sizeof(MPIDI_POSIX_cell_rel_ptr_t)];        /* rounds head + tail up to CACHE_LINE_LEN bytes */
+#endif
+    MPIDI_POSIX_cell_rel_ptr_t my_head; /* separated from head/tail by padding1 */
+#if (MPIDI_POSIX_CACHE_LINE_LEN > SIZEOF_OPA_PTR_T)
+    char padding2[MPIDI_POSIX_CACHE_LINE_LEN - sizeof(MPIDI_POSIX_cell_rel_ptr_t)];    /* rounds my_head up to CACHE_LINE_LEN bytes */
+#endif
+#if !defined(MPIDI_POSIX_USE_LOCK_FREE_QUEUES)
+    /* see FIXME in mpid_nem_queue.h */
+#define MPIDI_POSIX_queue_mutex_t MPID_Thread_mutex_t
+    MPIDI_POSIX_queue_mutex_t lock;     /* present only when lock-free queues are not selected */
+    char padding3[MPIDI_POSIX_CACHE_LINE_LEN - sizeof(MPID_Thread_mutex_t)];   /* assumes sizeof(MPID_Thread_mutex_t) <= MPIDI_POSIX_CACHE_LINE_LEN */
+#endif
+}
+MPIDI_POSIX_queue_t, *MPIDI_POSIX_queue_ptr_t;
+
+/* Fast Boxes */
+typedef union {                 /* an OPA_int_t padded out to at least CACHE_LINE_LEN bytes */
+    OPA_int_t value;
+#if MPIDI_POSIX_CACHE_LINE_LEN != 0
+    char padding[MPIDI_POSIX_CACHE_LINE_LEN];
+#endif
+} MPIDI_POSIX_opt_volint_t;
+
+typedef struct MPIDI_POSIX_fbox_common {       /* flag-only view of a fast box */
+    MPIDI_POSIX_opt_volint_t flag;
+} MPIDI_POSIX_fbox_common_t, *MPIDI_POSIX_fbox_common_ptr_t;
+
+typedef struct MPIDI_POSIX_fbox_mpich {        /* flag followed by one full cell */
+    MPIDI_POSIX_opt_volint_t flag;      /* same first member as MPIDI_POSIX_fbox_common_t */
+    MPIDI_POSIX_cell_t cell;
+} MPIDI_POSIX_fbox_mpich_t;
+
+#define MPIDI_POSIX_FBOX_DATALEN MPIDI_POSIX_DATA_LEN
+
+typedef union {                 /* a fast box, viewed as flag only or as flag + cell */
+    MPIDI_POSIX_fbox_common_t common;
+    MPIDI_POSIX_fbox_mpich_t mpich;
+} MPIDI_POSIX_fastbox_t;
+
+typedef struct MPIDI_POSIX_fbox_arrays {       /* pointer tables for the two fast-box directions */
+    MPIDI_POSIX_fastbox_t **in;
+    MPIDI_POSIX_fastbox_t **out;
+} MPIDI_POSIX_fbox_arrays_t;
+
+#endif /* ifndef SHM_DATATYPES_H */
diff --git a/src/mpid/ch4/shm/posix/posix_defs.h b/src/mpid/ch4/shm/posix/posix_defs.h
new file mode 100644
index 0000000..459666d
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_defs.h
@@ -0,0 +1,115 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_DEFS_H
+#define SHM_DEFS_H
+
+/* ************************************************************************** */
+/* from mpid/ch3/channels/nemesis/include/mpid_nem_defs.h                     */
+/* ************************************************************************** */
+
+#include "mpidu_shm.h"
+#define MPIDI_POSIX_MAX_FNAME_LEN 256
+
+/* FIXME: This definition should be gotten from mpidi_ch3_impl.h */
+#ifndef MPIDI_POSIX_MAX_HOSTNAME_LEN
+#define MPIDI_POSIX_MAX_HOSTNAME_LEN 256
+#endif /* MPIDI_POSIX_MAX_HOSTNAME_LEN */
+
+extern char MPIDI_POSIX_hostname[MPIDI_POSIX_MAX_HOSTNAME_LEN];
+
+#define MPIDI_POSIX_RET_OK       1
+#define MPIDI_POSIX_RET_NG      -1
+#define MPIDI_POSIX_KEY          632236
+#define MPIDI_POSIX_ANY_SOURCE  -1
+#define MPIDI_POSIX_IN           1
+#define MPIDI_POSIX_OUT          0
+
+#define MPIDI_POSIX_POLL_IN      0
+#define MPIDI_POSIX_POLL_OUT     1
+
+#define MPIDI_POSIX_ASYMM_NULL_VAL    64
+typedef MPI_Aint MPIDI_POSIX_addr_t;
+extern char *MPIDI_POSIX_asym_base_addr;
+
+#define MPIDI_POSIX_REL_NULL (0x0)
+#define MPIDI_POSIX_IS_REL_NULL(rel_ptr) (OPA_load_ptr(&(rel_ptr).p) == MPIDI_POSIX_REL_NULL)
+#define MPIDI_POSIX_SET_REL_NULL(rel_ptr) (OPA_store_ptr(&((rel_ptr).p), MPIDI_POSIX_REL_NULL))
+#define MPIDI_POSIX_REL_ARE_EQUAL(rel_ptr1, rel_ptr2) \
+    (OPA_load_ptr(&(rel_ptr1).p) == OPA_load_ptr(&(rel_ptr2).p))
+
+#ifndef MPIDI_POSIX_SYMMETRIC_QUEUES
+
+static inline MPIDI_POSIX_cell_ptr_t MPIDI_POSIX_REL_TO_ABS(MPIDI_POSIX_cell_rel_ptr_t r)
+{
+    char *stored = (char *) OPA_load_ptr(&r.p);        /* relative value, as a byte pointer for the addition */
+    return (MPIDI_POSIX_cell_ptr_t) (stored + (MPIDI_POSIX_addr_t) MPIDI_POSIX_asym_base_addr);
+}
+
+static inline MPIDI_POSIX_cell_rel_ptr_t MPIDI_POSIX_ABS_TO_REL(MPIDI_POSIX_cell_ptr_t a)
+{
+    MPIDI_POSIX_cell_rel_ptr_t rel;     /* receives `a` minus MPIDI_POSIX_asym_base_addr */
+    OPA_store_ptr(&rel.p, (char *) a - (MPIDI_POSIX_addr_t) MPIDI_POSIX_asym_base_addr);
+    return rel;
+}
+
+#else /*MPIDI_POSIX_SYMMETRIC_QUEUES */
+#define MPIDI_POSIX_REL_TO_ABS(ptr) (ptr)
+#define MPIDI_POSIX_ABS_TO_REL(ptr) (ptr)
+#endif /*MPIDI_POSIX_SYMMETRIC_QUEUES */
+
+/* NOTE: MPIDI_POSIX_IS_LOCAL should only be used when the process is known to be
+   in your comm_world (such as at init time).  This will generally not work for
+   dynamic processes.  Check vc_ch->is_local instead.  If that is true, then
+   it's safe to use MPIDI_POSIX_LOCAL_RANK. */
+#define MPIDI_POSIX_NON_LOCAL -1
+#define MPIDI_POSIX_IS_LOCAL(grank) (MPIDI_POSIX_mem_region.local_ranks[grank] != MPIDI_POSIX_NON_LOCAL)
+#define MPIDI_POSIX_LOCAL_RANK(grank) (MPIDI_POSIX_mem_region.local_ranks[grank])
+#define MPIDI_POSIX_NUM_BARRIER_VARS 16
+#define MPIDI_POSIX_SHM_MUTEX        MPID_shm_mutex
+typedef struct MPIDI_POSIX_barrier_vars {      /* two groups of OPA counters separated by padding */
+    OPA_int_t context_id;
+    OPA_int_t usage_cnt;
+    OPA_int_t cnt;
+#if MPIDI_POSIX_CACHE_LINE_LEN != SIZEOF_INT
+    char padding0[MPIDI_POSIX_CACHE_LINE_LEN - sizeof(int)];   /* NOTE(review): sized as if one int preceded it, but three OPA_int_t do - confirm the intended layout */
+#endif
+    OPA_int_t sig0;
+    OPA_int_t sig;
+    char padding1[MPIDI_POSIX_CACHE_LINE_LEN - 2 * sizeof(int)];       /* pads sig0 + sig to CACHE_LINE_LEN bytes, assuming OPA_int_t is int-sized - TODO confirm */
+} MPIDI_POSIX_barrier_vars_t;
+
+typedef struct MPIDI_POSIX_mem_region {        /* state of the POSIX shm region; a global instance named MPIDI_POSIX_mem_region is declared extern after this struct */
+    MPIDU_shm_seg_t memory;
+    MPIDU_shm_seg_info_t *seg;
+    int num_seg;
+    int map_lock;
+    int num_local;
+    int num_procs;
+    int *local_procs;           /* local_procs[lrank] gives the global rank of proc with local rank lrank */
+    int local_rank;
+    int *local_ranks;           /* local_ranks[grank] gives the local rank of proc with global rank grank or MPIDI_POSIX_NON_LOCAL */
+    int ext_procs;              /* Number of non-local processes */
+    int *ext_ranks;             /* Ranks of non-local processes */
+    MPIDI_POSIX_fbox_arrays_t mailboxes;        /* the in/out fast-box pointer tables */
+    MPIDI_POSIX_cell_ptr_t Elements;
+    MPIDI_POSIX_queue_ptr_t *FreeQ;     /* array of queue pointers; presumably one per local process - confirm */
+    MPIDI_POSIX_queue_ptr_t *RecvQ;     /* array of queue pointers; presumably one per local process - confirm */
+    MPIDU_shm_barrier_t *barrier;
+    MPIDI_POSIX_queue_ptr_t my_freeQ;
+    MPIDI_POSIX_queue_ptr_t my_recvQ;
+    MPIDI_POSIX_barrier_vars_t *barrier_vars;
+    int rank;
+    struct MPIDI_POSIX_mem_region *next;        /* link to another region record; how the list is used is not visible here */
+} MPIDI_POSIX_mem_region_t, *MPIDI_POSIX_mem_region_ptr_t;
+extern MPIDI_POSIX_mem_region_t MPIDI_POSIX_mem_region;
+extern MPID_Thread_mutex_t MPID_shm_mutex;
+
+#endif /* ifndef SHM_DEFS_H */
diff --git a/src/mpid/ch4/shm/posix/posix_impl.h b/src/mpid/ch4/shm/posix/posix_impl.h
new file mode 100644
index 0000000..97f3e68
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_impl.h
@@ -0,0 +1,174 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_POSIX_IMPL_H_INCLUDED
+#define SHM_POSIX_IMPL_H_INCLUDED
+
+#include <mpidimpl.h>
+#include "mpidch4r.h"
+
+#include "pmi.h"
+
+#include "mpidu_shm.h"
+
+/* ---------------------------------------------------- */
+/* temp headers                                         */
+/* ---------------------------------------------------- */
+#include "posix_datatypes.h"    /* MPID_nem datatypes like cell, fastbox defined here */
+#include "posix_defs.h" /* MPID_nem objects like shared memory region defined here */
+#include "posix_queue.h"        /* MPIDI_POSIX_queue functions defined here */
+
+/* ---------------------------------------------------- */
+/* constants                                            */
+/* ---------------------------------------------------- */
+#define MPIDI_POSIX_EAGER_THRESHOLD MPIDI_POSIX_DATA_LEN        /* alias for MPIDI_POSIX_DATA_LEN, which is defined elsewhere */
+#define MPIDI_POSIX_TYPESTANDARD    0   /* the MPIDI_POSIX_TYPE* codes are compared against a request's `type` field */
+#define MPIDI_POSIX_TYPEEAGER       1   /* the probe routines stop scanning at a matching request of this type */
+#define MPIDI_POSIX_TYPELMT         2
+#define MPIDI_POSIX_TYPESYNC        3
+#define MPIDI_POSIX_TYPEBUFFERED    4
+#define MPIDI_POSIX_TYPEREADY       5
+#define MPIDI_POSIX_TYPEACK         6
+#define MPIDI_POSIX_REQUEST(req)    (&(req)->dev.ch4.shm.posix) /* address of the POSIX shm part of an MPIR_Request */
+
+/* ---------------------------------------------------- */
+/* shm specific object data                             */
+/* ---------------------------------------------------- */
+/* VCR Table Data */
+typedef struct {                /* one entry of MPIDI_POSIX_vcrt_t.vcr_table */
+    unsigned int avt_rank;
+} MPIDI_POSIX_vcr_t;
+
+struct MPIDI_POSIX_vcrt_t {     /* table of virtual connection references; presumably allocated with room for `size` trailing entries - TODO confirm at the allocation site */
+    MPIR_OBJECT_HEADER;
+    unsigned size;                            /**< Number of entries in the table */
+    MPIDI_POSIX_vcr_t vcr_table[0]; /**< Array of virtual connection references (zero-length array: a GNU extension, the ISO C99 spelling is []) */
+};
+/* ---------------------------------------------------- */
+/* general send/recv queue types, macros and objects    */
+/* ---------------------------------------------------- */
+typedef struct {                /* singly linked request queue, chained through MPIDI_POSIX_REQUEST(req)->next */
+    MPIR_Request *head;         /* first request, NULL when the queue is empty */
+    MPIR_Request *tail;         /* last request, NULL when the queue is empty */
+} MPIDI_POSIX_request_queue_t;
+
+#define MPIDI_POSIX_REQUEST_COMPLETE(req_)    /* MPIR_cc_decr on req_'s cc_ptr, then release; expands to a bare { } block, not do/while(0) */ \
+{ \
+    int incomplete__; \
+    MPIR_cc_decr((req_)->cc_ptr, &incomplete__); \
+    dtype_release_if_not_builtin(MPIDI_POSIX_REQUEST(req_)->datatype); /* executed on every invocation, before the release below */ \
+    if (!incomplete__) /* presumably zero once nothing is outstanding - confirm against MPIR_cc_decr */ \
+        MPIDI_CH4U_request_release(req_);    \
+}
+
+#define MPIDI_POSIX_REQUEST_ENQUEUE(req,queue) /* append req to queue; req's own next link is not touched here */ \
+{ \
+    if ((queue).tail != NULL) \
+        MPIDI_POSIX_REQUEST((queue).tail)->next = req; \
+    else /* empty queue: req becomes the head as well */ \
+        (queue).head = req; \
+    (queue).tail = req; \
+}
+
+#define MPIDI_POSIX_REQUEST_DEQUEUE(req_p,prev_req,queue) /* unlink *(req_p) from queue; prev_req must be its predecessor unless it is the head */ \
+{ \
+    MPIR_Request *next = MPIDI_POSIX_REQUEST(*(req_p))->next; /* block-local name: shadows any caller variable called `next` */ \
+    if ((queue).head == *(req_p)) \
+        (queue).head = next; \
+    else \
+        MPIDI_POSIX_REQUEST(prev_req)->next = next; \
+    if ((queue).tail == *(req_p)) \
+        (queue).tail = prev_req; \
+    MPIDI_POSIX_REQUEST(*(req_p))->next = NULL; /* leave the removed request unlinked; *(req_p) itself is unchanged */ \
+}
+
+#define MPIDI_POSIX_REQUEST_DEQUEUE_AND_SET_ERROR(req_p,prev_req,queue,err) /* unlink *(req_p), store err in its status, complete it, advance *(req_p) */ \
+{ \
+    MPIR_Request *next = MPIDI_POSIX_REQUEST(*(req_p))->next; /* block-local name: shadows any caller variable called `next` */ \
+    if ((queue).head == *(req_p)) \
+        (queue).head = next; \
+    else \
+        MPIDI_POSIX_REQUEST(prev_req)->next = next; \
+    if ((queue).tail == *(req_p)) \
+        (queue).tail = prev_req; \
+    (*(req_p))->status.MPI_ERROR = err; \
+    MPIDI_POSIX_REQUEST_COMPLETE(*(req_p)); \
+    *(req_p) = next; /* caller's cursor now points at the successor (possibly NULL) */ \
+}
+
+#define MPIDI_POSIX_REQUEST_CREATE_SREQ(sreq_) /* create a SEND-kind request in sreq_ and take one extra reference */ \
+{                                                               \
+    (sreq_) = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);     /* result is used below without a NULL check */ \
+    MPIR_Request_add_ref((sreq_));                                      \
+    (sreq_)->u.persist.real_request   = NULL;                          \
+}
+
+#define MPIDI_POSIX_REQUEST_CREATE_RREQ(rreq_) /* create a RECV-kind request in rreq_ and take one extra reference */ \
+{                                                               \
+    (rreq_) = MPIR_Request_create(MPIR_REQUEST_KIND__RECV);     /* result is used below without a NULL check */ \
+    MPIR_Request_add_ref((rreq_));                                      \
+    (rreq_)->u.persist.real_request   = NULL;                          \
+}
+
+/* ---------------------------------------------------- */
+/* matching macros                                      */
+/* ---------------------------------------------------- */
+#define MPIDI_POSIX_ENVELOPE_SET(ptr_,rank_,tag_,context_id_) /* store the (rank, tag, context_id) triple into *ptr_; value arguments are not parenthesized */ \
+{ \
+    (ptr_)->rank = rank_; \
+    (ptr_)->tag = tag_; \
+    (ptr_)->context_id = context_id_; \
+}
+
+#define MPIDI_POSIX_ENVELOPE_GET(ptr_,rank_,tag_,context_id_) /* copy the triple out of *ptr_; rank_, tag_ and context_id_ must be assignable lvalues */ \
+{ \
+    rank_ = (ptr_)->rank; \
+    tag_ = (ptr_)->tag; \
+    context_id_ = (ptr_)->context_id; \
+}
+
+#define MPIDI_POSIX_ENVELOPE_MATCH(ptr_,rank_,tag_,context_id_) /* nonzero when *ptr_ matches; rank_/tag_ may be wildcards, context_id_ must be equal */ \
+    (((ptr_)->rank == (rank_) || (rank_) == MPI_ANY_SOURCE) && \
+     ((ptr_)->tag == (tag_) || (tag_) == MPI_ANY_TAG) && \
+     (ptr_)->context_id == (context_id_))
+
+/*
+ * Helper macros for function-name strings and function entry/exit tracing
+ */
+#define DECL_FUNC(FUNCNAME)  MPL_QUOTE(FUNCNAME)        /* this header's files use it as the value of FCNAME */
+
+#undef FUNCNAME
+#define FUNCNAME nothing        /* placeholder; the macros below declare their own FUNCNAME parameter, which takes precedence inside them */
+#define BEGIN_FUNC(FUNCNAME)                    /* emits the verbose-trace state declaration and the ENTER call */ \
+  MPIR_FUNC_VERBOSE_STATE_DECL(FUNCNAME);                   \
+  MPIR_FUNC_VERBOSE_ENTER(FUNCNAME);
+#define END_FUNC(FUNCNAME)                      /* emits the EXIT call only */ \
+  MPIR_FUNC_VERBOSE_EXIT(FUNCNAME);
+#define END_FUNC_RC(FUNCNAME) /* emits the fn_exit / fn_fail labels and returns mpi_errno, which the enclosing function must declare */ \
+  fn_exit:                    \
+  MPIR_FUNC_VERBOSE_EXIT(FUNCNAME);  \
+  return mpi_errno;           \
+fn_fail:                      \
+  goto fn_exit;
+
+#define __SHORT_FILE__                          /* __FILE__ without its directory part; note that names starting with __ are reserved in ISO C */ \
+  (strrchr(__FILE__,'/')                        \
+   ? strrchr(__FILE__,'/')+1                    /* text after the last '/' */ \
+   : __FILE__                                   /* no '/' present: use __FILE__ unchanged */ \
+)
+
+int MPIDI_POSIX_barrier_vars_init(MPIDI_POSIX_barrier_vars_t * barrier_region);        /* called from MPIDI_SHM_init on mem_region.barrier_vars; returns an MPI error code */
+extern MPIDI_POSIX_request_queue_t MPIDI_POSIX_sendq;  /* presumably requests waiting to be sent - confirm against the send path */
+extern MPIDI_POSIX_request_queue_t MPIDI_POSIX_recvq_unexpected;       /* scanned by MPIDI_SHM_iprobe / MPIDI_SHM_improbe */
+extern MPIDI_POSIX_request_queue_t MPIDI_POSIX_recvq_posted;   /* traversed by MPIDI_POSIX_progress_recv when matching */
+
+
+
+#endif /* SHM_POSIX_IMPL_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/posix/posix_init.h b/src/mpid/ch4/shm/posix/posix_init.h
new file mode 100644
index 0000000..30723a6
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_init.h
@@ -0,0 +1,324 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_POSIX_INIT_H_INCLUDED
+#define SHM_POSIX_INIT_H_INCLUDED
+
+#include "posix_impl.h"
+#include "ch4_types.h"
+#include "mpidu_shm.h"
+
+/* ------------------------------------------------------- */
+/* from mpid/ch3/channels/nemesis/src/mpid_nem_init.c */
+/* ------------------------------------------------------- */
+extern MPIDI_POSIX_mem_region_t MPIDI_POSIX_mem_region;
+extern char *MPIDI_POSIX_asym_base_addr;
+
+/*
+ * MPIDI_SHM_init - set up the node-local POSIX shared-memory transport for the
+ * process with global rank `rank` out of `size`.  Builds the local <-> global
+ * rank maps, requests and commits the shared segment, then fills in the queue
+ * and fastbox tables.  Returns MPI_SUCCESS or an MPI error code.
+ */
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_SHM_init)
+static inline int MPIDI_SHM_init(int rank, int size)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int num_local = 0;
+    int local_rank = -1;
+    int *local_procs = NULL;
+    int *local_ranks = NULL;
+    int i;
+    int grank;
+    MPIDI_POSIX_fastbox_t *fastboxes_p = NULL;
+    MPIDI_POSIX_cell_t(*cells_p)[MPIDI_POSIX_NUM_CELLS] = NULL;
+    MPIDI_POSIX_queue_t *recv_queues_p = NULL;
+    MPIDI_POSIX_queue_t *free_queues_p = NULL;
+    MPIR_CHKPMEM_DECL(9);
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_INIT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_INIT);
+
+    MPIDI_POSIX_mem_region.num_seg = 1;
+    MPIR_CHKPMEM_MALLOC(MPIDI_POSIX_mem_region.seg, MPIDU_shm_seg_info_ptr_t,
+                        MPIDI_POSIX_mem_region.num_seg * sizeof(MPIDU_shm_seg_info_t), mpi_errno,
+                        "mem_region segments");
+    MPIR_CHKPMEM_MALLOC(local_procs, int *, size * sizeof(int), mpi_errno,
+                        "local process index array");
+    MPIR_CHKPMEM_MALLOC(local_ranks, int *, size * sizeof(int), mpi_errno,
+                        "mem_region local ranks");
+
+    for (i = 0; i < size; i++) {
+        /* Default to "not on this node" so MPIDI_POSIX_IS_LOCAL() never reads an
+         * uninitialized entry; overwritten below for ranks that are local. */
+        local_ranks[i] = MPIDI_POSIX_NON_LOCAL;
+        if (MPIDI_CH4_rank_is_local(i, MPIR_Process.comm_world)) {
+            if (i == rank)
+                local_rank = num_local;
+            local_procs[num_local] = i;
+            local_ranks[i] = num_local;
+            num_local++;
+        }
+    }
+
+    /* The caller itself must have been counted: everything below indexes by local_rank */
+    MPIR_Assert(num_local > 0);
+    MPIR_Assert(local_rank >= 0);
+
+    MPIDI_POSIX_mem_region.rank = rank;
+    MPIDI_POSIX_mem_region.num_local = num_local;
+    MPIDI_POSIX_mem_region.num_procs = size;
+    MPIDI_POSIX_mem_region.local_procs = local_procs;
+    MPIDI_POSIX_mem_region.local_ranks = local_ranks;
+    MPIDI_POSIX_mem_region.local_rank = local_rank;
+    MPIDI_POSIX_mem_region.next = NULL;
+
+    /* Request fastboxes region */
+    mpi_errno =
+        MPIDU_shm_seg_alloc(MAX
+                            ((num_local * ((num_local - 1) * sizeof(MPIDI_POSIX_fastbox_t))),
+                             MPIDI_POSIX_ASYMM_NULL_VAL), (void **) &fastboxes_p);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* Request data cells region */
+    mpi_errno =
+        MPIDU_shm_seg_alloc(num_local * MPIDI_POSIX_NUM_CELLS * sizeof(MPIDI_POSIX_cell_t),
+                            (void **) &cells_p);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* Request free q region */
+    mpi_errno =
+        MPIDU_shm_seg_alloc(num_local * sizeof(MPIDI_POSIX_queue_t), (void **) &free_queues_p);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* Request recv q region */
+    mpi_errno =
+        MPIDU_shm_seg_alloc(num_local * sizeof(MPIDI_POSIX_queue_t), (void **) &recv_queues_p);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* Request shared collectives barrier vars region */
+    mpi_errno =
+        MPIDU_shm_seg_alloc(MPIDI_POSIX_NUM_BARRIER_VARS * sizeof(MPIDI_POSIX_barrier_vars_t),
+                            (void **) &MPIDI_POSIX_mem_region.barrier_vars);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* Actually allocate the segment and assign regions to the pointers */
+    mpi_errno =
+        MPIDU_shm_seg_commit(&MPIDI_POSIX_mem_region.memory, &MPIDI_POSIX_mem_region.barrier,
+                             num_local, local_rank, MPIDI_POSIX_mem_region.local_procs[0],
+                             MPIDI_POSIX_mem_region.rank);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* post check_alloc steps */
+    if (MPIDI_POSIX_mem_region.memory.symmetrical == 1) {
+        MPIDI_POSIX_asym_base_addr = NULL;
+    }
+    else {
+        MPIDI_POSIX_asym_base_addr = MPIDI_POSIX_mem_region.memory.base_addr;
+#ifdef MPIDI_POSIX_SYMMETRIC_QUEUES
+        MPIR_ERR_INTERNALANDJUMP(mpi_errno, "queues are not symmetrically allocated as expected");
+#endif
+    }
+
+    /* init shared collectives barrier region */
+    mpi_errno = MPIDI_POSIX_barrier_vars_init(MPIDI_POSIX_mem_region.barrier_vars);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* local procs barrier */
+    mpi_errno = MPIDU_shm_barrier(MPIDI_POSIX_mem_region.barrier, num_local);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* find our cell region */
+    MPIDI_POSIX_mem_region.Elements = cells_p[local_rank];
+
+    /* Tables of pointers to shared memory Qs */
+    MPIR_CHKPMEM_MALLOC(MPIDI_POSIX_mem_region.FreeQ, MPIDI_POSIX_queue_ptr_t *,
+                        size * sizeof(MPIDI_POSIX_queue_ptr_t), mpi_errno, "FreeQ");
+    MPIR_CHKPMEM_MALLOC(MPIDI_POSIX_mem_region.RecvQ, MPIDI_POSIX_queue_ptr_t *,
+                        size * sizeof(MPIDI_POSIX_queue_ptr_t), mpi_errno, "RecvQ");
+
+    /* Init table entry for our Qs */
+    MPIDI_POSIX_mem_region.FreeQ[rank] = &free_queues_p[local_rank];
+    MPIDI_POSIX_mem_region.RecvQ[rank] = &recv_queues_p[local_rank];
+
+    /* Init our queues */
+    MPIDI_POSIX_queue_init(MPIDI_POSIX_mem_region.RecvQ[rank]);
+    MPIDI_POSIX_queue_init(MPIDI_POSIX_mem_region.FreeQ[rank]);
+
+    /* Init and enqueue our free cells */
+    for (i = 0; i < MPIDI_POSIX_NUM_CELLS; ++i) {
+        MPIDI_POSIX_cell_init(&(MPIDI_POSIX_mem_region.Elements[i]), rank);
+        MPIDI_POSIX_queue_enqueue(MPIDI_POSIX_mem_region.FreeQ[rank],
+                                  &(MPIDI_POSIX_mem_region.Elements[i]));
+    }
+
+    /* set route for local procs through shmem */
+    for (i = 0; i < num_local; i++) {
+        grank = local_procs[i];
+        MPIDI_POSIX_mem_region.FreeQ[grank] = &free_queues_p[i];
+        MPIDI_POSIX_mem_region.RecvQ[grank] = &recv_queues_p[i];
+
+        MPIR_Assert(MPIDI_POSIX_ALIGNED
+                    (MPIDI_POSIX_mem_region.FreeQ[grank], MPIDI_POSIX_CACHE_LINE_LEN));
+        MPIR_Assert(MPIDI_POSIX_ALIGNED
+                    (MPIDI_POSIX_mem_region.RecvQ[grank], MPIDI_POSIX_CACHE_LINE_LEN));
+    }
+
+    /* make pointers to our queues global so we don't have to dereference the array */
+    MPIDI_POSIX_mem_region.my_freeQ = MPIDI_POSIX_mem_region.FreeQ[rank];
+    MPIDI_POSIX_mem_region.my_recvQ = MPIDI_POSIX_mem_region.RecvQ[rank];
+
+    /* local barrier */
+    mpi_errno = MPIDU_shm_barrier(MPIDI_POSIX_mem_region.barrier, num_local);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* Allocate table of pointers to fastboxes */
+    MPIR_CHKPMEM_MALLOC(MPIDI_POSIX_mem_region.mailboxes.in, MPIDI_POSIX_fastbox_t **,
+                        num_local * sizeof(MPIDI_POSIX_fastbox_t *), mpi_errno, "fastboxes");
+    MPIR_CHKPMEM_MALLOC(MPIDI_POSIX_mem_region.mailboxes.out, MPIDI_POSIX_fastbox_t **,
+                        num_local * sizeof(MPIDI_POSIX_fastbox_t *), mpi_errno, "fastboxes");
+
+#define MPIDI_POSIX_MAILBOX_INDEX(sender, receiver) (((sender) > (receiver)) ? ((num_local-1) * (sender) + (receiver)) :    \
+                                          (((sender) < (receiver)) ? ((num_local-1) * (sender) + ((receiver)-1)) : 0))
+
+    /* fill in tables */
+    for (i = 0; i < num_local; ++i) {
+        if (i == local_rank) {
+            /* No fastboxes to myself */
+            MPIDI_POSIX_mem_region.mailboxes.in[i] = NULL;
+            MPIDI_POSIX_mem_region.mailboxes.out[i] = NULL;
+        }
+        else {
+            MPIDI_POSIX_mem_region.mailboxes.in[i] =
+                &fastboxes_p[MPIDI_POSIX_MAILBOX_INDEX(i, local_rank)];
+            MPIDI_POSIX_mem_region.mailboxes.out[i] =
+                &fastboxes_p[MPIDI_POSIX_MAILBOX_INDEX(local_rank, i)];
+            OPA_store_int(&MPIDI_POSIX_mem_region.mailboxes.in[i]->common.flag.value, 0);
+            OPA_store_int(&MPIDI_POSIX_mem_region.mailboxes.out[i]->common.flag.value, 0);
+        }
+    }
+
+#undef MPIDI_POSIX_MAILBOX_INDEX
+
+    MPIR_CHKPMEM_COMMIT();
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_INIT);
+    return mpi_errno;
+  fn_fail:
+    /* --BEGIN ERROR HANDLING-- */
+    MPIR_CHKPMEM_REAP();
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+/* Barrier with the local peers, free the tables built by MPIDI_SHM_init, destroy the segment. */
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_SHM_finalize)
+static inline int MPIDI_SHM_finalize(void)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_POSIX_mem_region_t *const region = &MPIDI_POSIX_mem_region;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_FINALIZE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_FINALIZE);
+
+    /* barrier across the num_local processes of this node */
+    mpi_errno = MPIDU_shm_barrier(region->barrier, region->num_local);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* process-private tables allocated in MPIDI_SHM_init */
+    MPL_free(region->seg);
+    MPL_free(region->local_procs);
+    MPL_free(region->local_ranks);
+    MPL_free(region->FreeQ);
+    MPL_free(region->RecvQ);
+    MPL_free(region->mailboxes.in);
+    MPL_free(region->mailboxes.out);
+
+    /* last step: hand the shared segment itself back */
+    mpi_errno = MPIDU_shm_seg_destroy(&region->memory, region->num_local);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_FINALIZE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+static inline void *MPIDI_SHM_alloc_mem(size_t size, MPIR_Info * info_ptr)      /* stub: both arguments are ignored */
+{
+    MPIR_Assert(0);             /* unconditional assertion failure: this entry point is not implemented */
+    return NULL;                /* reached only if MPIR_Assert does not abort - presumably when assertions are compiled out */
+}
+
+static inline int MPIDI_SHM_free_mem(void *ptr) /* stub: ptr is ignored and nothing is freed */
+{
+    MPIR_Assert(0);             /* unconditional assertion failure: this entry point is not implemented */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert does not abort */
+}
+
+static inline int MPIDI_SHM_comm_get_lpid(MPIR_Comm * comm_ptr, /* stub: *lpid_ptr is never written */
+                                          int idx, int *lpid_ptr, MPL_bool is_remote)
+{
+    MPIR_Assert(0);             /* unconditional assertion failure: this entry point is not implemented */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert does not abort */
+}
+
+static inline int MPIDI_SHM_gpid_get(MPIR_Comm * comm_ptr, int rank, MPIR_Gpid * gpid)  /* stub: *gpid is never written */
+{
+    MPIR_Assert(0);             /* unconditional assertion failure: this entry point is not implemented */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert does not abort */
+}
+
+static inline int MPIDI_SHM_get_node_id(MPIR_Comm * comm, int rank, MPID_Node_id_t * id_p)      /* comm and rank are ignored */
+{
+    *id_p = (MPID_Node_id_t) 0; /* every rank is reported as node 0 */
+    return MPI_SUCCESS;
+}
+
+static inline int MPIDI_SHM_get_max_node_id(MPIR_Comm * comm, MPID_Node_id_t * max_id_p)        /* comm is ignored */
+{
+    *max_id_p = (MPID_Node_id_t) 1;     /* NOTE(review): MPIDI_SHM_get_node_id only ever reports 0 - confirm whether the maximum is meant to be 0 */
+    return MPI_SUCCESS;
+}
+
+static inline int MPIDI_SHM_getallincomm(MPIR_Comm * comm_ptr,  /* stub: local_gpids[] and *singleAVT are never written */
+                                         int local_size, MPIR_Gpid local_gpids[], int *singleAVT)
+{
+    MPIR_Assert(0);             /* unconditional assertion failure: this entry point is not implemented */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert does not abort */
+}
+
+static inline int MPIDI_SHM_gpid_tolpidarray(int size, MPIR_Gpid gpid[], int lpid[])    /* stub: lpid[] is never written */
+{
+    MPIR_Assert(0);             /* unconditional assertion failure: this entry point is not implemented */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert does not abort */
+}
+
+static inline int MPIDI_SHM_create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr,        /* stub: newcomm_ptr is not modified */
+                                                        int size, const int lpids[])
+{
+    MPIR_Assert(0);             /* unconditional assertion failure: this entry point is not implemented */
+    return MPI_SUCCESS;         /* reached only if MPIR_Assert does not abort */
+}
+
+#endif /* SHM_POSIX_INIT_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/posix/posix_pre.h b/src/mpid/ch4/shm/posix/posix_pre.h
new file mode 100644
index 0000000..cd4cec1
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_pre.h
@@ -0,0 +1,41 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#ifndef SHM_POSIX_PRE_H_INCLUDED
+#define SHM_POSIX_PRE_H_INCLUDED
+
+#include <mpi.h>
+
+struct MPIR_Request;
+struct MPIDU_Segment;
+
+typedef struct {                /* POSIX shm per-request state; presumably the object MPIDI_POSIX_REQUEST(req) points at - TODO confirm */
+    struct MPIR_Request *next;  /* queue link maintained by MPIDI_POSIX_REQUEST_ENQUEUE / _DEQUEUE */
+    struct MPIR_Request *pending;
+    int dest;
+    int rank;                   /* rank, tag and context_id form the envelope compared by MPIDI_POSIX_ENVELOPE_MATCH */
+    int tag;
+    int context_id;
+    char *user_buf;
+    size_t data_sz;
+    int type;                   /* compared against the MPIDI_POSIX_TYPE* codes */
+    int user_count;
+    MPI_Datatype datatype;      /* handed to dtype_release_if_not_builtin by MPIDI_POSIX_REQUEST_COMPLETE */
+    struct MPIDU_Segment *segment_ptr;
+    size_t segment_first;
+    size_t segment_size;
+} MPIDI_POSIX_request_t;
+
+typedef struct {                /* POSIX shm per-communicator state */
+    int dummy;                  /* presumably a placeholder so the struct is not empty - nothing in view reads it */
+} MPIDI_POSIX_comm_t;
+
+#endif /* SHM_POSIX_PRE_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/posix/posix_probe.h b/src/mpid/ch4/shm/posix/posix_probe.h
new file mode 100644
index 0000000..5fe8e11
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_probe.h
@@ -0,0 +1,146 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_POSIX_PROBE_H_INCLUDED
+#define SHM_POSIX_PROBE_H_INCLUDED
+
+#include "posix_impl.h"
+
+
+/* Matched probe: on a match, move the matching unexpected requests onto a private
+ * chain starting at *message and fill in status with *flag = 1; otherwise set
+ * *flag = 0 and call MPIDI_Progress_test().  Always returns MPI_SUCCESS. */
+static inline int MPIDI_SHM_improbe(int source,
+                                    int tag,
+                                    MPIR_Comm * comm,
+                                    int context_offset,
+                                    int *flag, MPIR_Request ** message, MPI_Status * status)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *req, *matched_req = NULL;
+    int count = 0;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_IMPROBE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_IMPROBE);
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    *message = NULL;
+
+    if (unlikely(source == MPI_PROC_NULL)) {
+        MPIR_Status_set_procnull(status);
+        *flag = true;
+        goto fn_exit;
+    }
+
+    /* Remember the first match; it is only reported once an eager request matches too */
+    for (req = MPIDI_POSIX_recvq_unexpected.head; req; req = MPIDI_POSIX_REQUEST(req)->next) {
+        if (MPIDI_POSIX_ENVELOPE_MATCH
+            (MPIDI_POSIX_REQUEST(req), source, tag, comm->recvcontext_id + context_offset)) {
+            if (!matched_req)
+                matched_req = req;
+            if (MPIDI_POSIX_REQUEST(req)->type == MPIDI_POSIX_TYPEEAGER) {
+                *message = matched_req;
+                break;
+            }
+        }
+    }
+
+    if (*message) {
+        MPIDI_POSIX_request_queue_t mqueue = { NULL, NULL };
+        MPIR_Request *prev_req = NULL, *next_req = NULL;
+        req = MPIDI_POSIX_recvq_unexpected.head;
+        /* Move every matching request, up to and including the eager one, onto mqueue */
+        while (req) {
+            next_req = MPIDI_POSIX_REQUEST(req)->next;
+            if (MPIDI_POSIX_ENVELOPE_MATCH
+                (MPIDI_POSIX_REQUEST(req), source, tag, comm->recvcontext_id + context_offset)) {
+                if (mqueue.head == NULL)
+                    MPIR_Assert(req == matched_req);
+                count += MPIR_STATUS_GET_COUNT(req->status);
+                MPIDI_POSIX_REQUEST_DEQUEUE(&req, prev_req, MPIDI_POSIX_recvq_unexpected);
+                MPIDI_POSIX_REQUEST_ENQUEUE(req, mqueue);
+                if (MPIDI_POSIX_REQUEST(req)->type == MPIDI_POSIX_TYPEEAGER)
+                    break;
+            }
+            else
+                prev_req = req;
+            req = next_req;
+        }
+
+        *flag = 1;
+        matched_req->kind = MPIR_REQUEST_KIND__MPROBE;
+        matched_req->comm = comm;
+        MPIR_Comm_add_ref(comm);
+        status->MPI_TAG = matched_req->status.MPI_TAG;
+        status->MPI_SOURCE = matched_req->status.MPI_SOURCE;
+        MPIR_STATUS_SET_COUNT(*status, count);
+    }
+    else {
+        *flag = 0;
+        /* Release the mutex around the progress call, exactly as MPIDI_SHM_iprobe does */
+        MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+        MPIDI_Progress_test();
+        MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    }
+
+  fn_exit:
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_IMPROBE);
+    return mpi_errno;
+}
+
+/* Non-blocking probe of the unexpected queue; reports a match only once an eager request matches. */
+static inline int MPIDI_SHM_iprobe(int source,
+                                   int tag,
+                                   MPIR_Comm * comm,
+                                   int context_offset, int *flag, MPI_Status * status)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *cur, *eager_req = NULL;
+    int total_count = 0;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_IPROBE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_IPROBE);
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+
+    if (unlikely(source == MPI_PROC_NULL)) {
+        MPIR_Status_set_procnull(status);
+        *flag = true;
+        goto fn_exit;
+    }
+
+    cur = MPIDI_POSIX_recvq_unexpected.head;
+    while (cur != NULL && eager_req == NULL) {
+        if (MPIDI_POSIX_ENVELOPE_MATCH
+            (MPIDI_POSIX_REQUEST(cur), source, tag, comm->recvcontext_id + context_offset)) {
+            total_count += MPIR_STATUS_GET_COUNT(cur->status);
+            if (MPIDI_POSIX_REQUEST(cur)->type == MPIDI_POSIX_TYPEEAGER)
+                eager_req = cur;
+        }
+        cur = MPIDI_POSIX_REQUEST(cur)->next;
+    }
+
+    if (eager_req == NULL) {
+        *flag = 0;
+        MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+        MPIDI_Progress_test();
+        MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    }
+    else {
+        *flag = 1;
+        status->MPI_TAG = eager_req->status.MPI_TAG;
+        status->MPI_SOURCE = eager_req->status.MPI_SOURCE;
+        MPIR_STATUS_SET_COUNT(*status, total_count);
+    }
+
+  fn_exit:
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_IPROBE);
+    return mpi_errno;
+}
+
+#endif /* SHM_POSIX_PROBE_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/posix/posix_progress.h b/src/mpid/ch4/shm/posix/posix_progress.h
new file mode 100644
index 0000000..4732f1b
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_progress.h
@@ -0,0 +1,454 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_POSIX_PROGRESS_H_INCLUDED
+#define SHM_POSIX_PROGRESS_H_INCLUDED
+
+#include "posix_impl.h"
+
+/* MPIDI_POSIX_progress_recv: handle at most one incoming item per call.  Entries of   */
+/* the unexpected queue are tried first, then one cell dequeued from my_recvQ; each is */
+/* matched against the posted-receive queue.  Returns mpi_errno (never modified here). */
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_POSIX_progress_recv)
+static inline int MPIDI_POSIX_progress_recv(int blocking, int *completion_count)
+{       /* 'blocking' is not referenced anywhere in this body */
+    int mpi_errno = MPI_SUCCESS;
+    size_t data_sz;
+    int in_cell = 0;            /* 1 once the current message comes from a dequeued cell */
+    MPIDI_POSIX_cell_ptr_t cell = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_DO_PROGRESS_RECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_DO_PROGRESS_RECV);
+    /* try to match with unexpected */
+    MPIR_Request *sreq = MPIDI_POSIX_recvq_unexpected.head;
+    MPIR_Request *prev_sreq = NULL;
+  unexpected_l:
+    /* sreq walks the unexpected queue; once it is NULL the receive queue is tried instead */
+    if (sreq != NULL) {
+        goto match_l;
+    }
+
+    /* try to receive from recvq */
+    if (MPIDI_POSIX_mem_region.my_recvQ &&
+        !MPIDI_POSIX_queue_empty(MPIDI_POSIX_mem_region.my_recvQ)) {
+        MPIDI_POSIX_queue_dequeue(MPIDI_POSIX_mem_region.my_recvQ, &cell);
+        in_cell = 1;
+        goto match_l;
+    }
+    /* nothing to process: leave without touching *completion_count */
+    goto fn_exit;
+  match_l:{
+        /* traverse posted receive queue */
+        MPIR_Request *req = MPIDI_POSIX_recvq_posted.head;
+        MPIR_Request *prev_req = NULL;
+        int continue_matching = 1;
+        char *send_buffer =
+            in_cell ? (char *) cell->pkt.mpich.p.payload : (char *) MPIDI_POSIX_REQUEST(sreq)->
+            user_buf;
+        int type = in_cell ? cell->pkt.mpich.type : MPIDI_POSIX_REQUEST(sreq)->type;
+        MPIR_Request *pending = in_cell ? cell->pending : MPIDI_POSIX_REQUEST(sreq)->pending;
+        /* a non-NULL pending means an ACK has to be sent back (see "we must send ACK" below) */
+        if (type == MPIDI_POSIX_TYPEACK) {
+            /* ACK message doesn't have a matching receive! */
+            int c;
+            MPIR_Assert(in_cell);
+            MPIR_Assert(pending);
+            MPIR_cc_decr(pending->cc_ptr, &c);
+            MPIDI_CH4U_request_release(pending);
+            goto release_cell_l;
+        }
+
+        while (req) {
+            int sender_rank, tag, context_id;
+            MPI_Count count;
+            MPIDI_POSIX_ENVELOPE_GET(MPIDI_POSIX_REQUEST(req), sender_rank, tag, context_id);
+            MPL_DBG_MSG_FMT(MPIR_DBG_HANDLE, TYPICAL,
+                            (MPL_DBG_FDEST, "Posted from grank %d to %d in progress %d,%d,%d\n",
+                             MPIDI_CH4U_rank_to_lpid(sender_rank, req->comm),
+                             MPIDI_POSIX_mem_region.rank, sender_rank, tag, context_id));
+            /* compare the posted envelope with the cell's or the unexpected request's envelope */
+            if ((in_cell && MPIDI_POSIX_ENVELOPE_MATCH(cell, sender_rank, tag, context_id)) ||
+                (sreq &&
+                 MPIDI_POSIX_ENVELOPE_MATCH(MPIDI_POSIX_REQUEST(sreq), sender_rank, tag,
+                                            context_id))) {
+
+                /* Request matched */
+
+                continue_matching = 1;
+
+                if (MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(req)) {
+                    MPIDI_CH4R_anysource_matched(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(req),
+                                                 MPIDI_CH4R_SHM, &continue_matching);
+                    MPIDI_CH4U_request_release(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(req));
+                    /* NOTE(review): the partner is dereferenced below after the release above - confirm it is still referenced */
+                    /* Decouple requests */
+                    MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(req))
+                        = NULL;
+                    MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(req) = NULL;
+                    /* a set continue_matching leaves the loop; the message is then treated as unmatched below */
+                    if (continue_matching)
+                        break;
+                }
+
+                char *recv_buffer = (char *) MPIDI_POSIX_REQUEST(req)->user_buf;
+
+                if (pending) {
+                    /* we must send ACK */
+                    int srank = in_cell ? cell->rank : sreq->status.MPI_SOURCE;
+                    MPIR_Request *req_ack = NULL;
+                    MPIDI_POSIX_REQUEST_CREATE_SREQ(req_ack);
+                    MPIR_Object_set_ref(req_ack, 1);
+                    req_ack->comm = req->comm;
+                    MPIR_Comm_add_ref(req->comm);
+
+                    MPIDI_POSIX_ENVELOPE_SET(MPIDI_POSIX_REQUEST(req_ack), req->comm->rank, tag,
+                                             context_id);
+                    MPIDI_POSIX_REQUEST(req_ack)->user_buf = NULL;
+                    MPIDI_POSIX_REQUEST(req_ack)->user_count = 0;
+                    MPIDI_POSIX_REQUEST(req_ack)->datatype = MPI_BYTE;
+                    MPIDI_POSIX_REQUEST(req_ack)->data_sz = 0;
+                    MPIDI_POSIX_REQUEST(req_ack)->type = MPIDI_POSIX_TYPEACK;
+                    MPIDI_POSIX_REQUEST(req_ack)->dest = srank;
+                    MPIDI_POSIX_REQUEST(req_ack)->next = NULL;
+                    MPIDI_POSIX_REQUEST(req_ack)->segment_ptr = NULL;
+                    MPIDI_POSIX_REQUEST(req_ack)->pending = pending;
+                    /* enqueue req_ack */
+                    MPIDI_POSIX_REQUEST_ENQUEUE(req_ack, MPIDI_POSIX_sendq);
+                }
+                /* bytes carried by this message: the eager length, or one full chunk for LMT */
+                if (type == MPIDI_POSIX_TYPEEAGER)
+                    /* eager message */
+                    data_sz =
+                        in_cell ? cell->pkt.mpich.datalen : MPIDI_POSIX_REQUEST(sreq)->data_sz;
+                else if (type == MPIDI_POSIX_TYPELMT)
+                    data_sz = MPIDI_POSIX_EAGER_THRESHOLD;
+                else {
+                    data_sz = 0;        /*  unused warning */
+                    MPIR_Assert(0);
+                }
+                /* check for user buffer overflow */
+                size_t user_data_sz = MPIDI_POSIX_REQUEST(req)->data_sz;
+                if (user_data_sz < data_sz) {
+                    req->status.MPI_ERROR = MPI_ERR_TRUNCATE;
+                    data_sz = user_data_sz;
+                }
+
+                /* copy to user buffer */
+                if (MPIDI_POSIX_REQUEST(req)->segment_ptr) {
+                    /* non-contig */
+                    size_t last = MPIDI_POSIX_REQUEST(req)->segment_first + data_sz;
+                    MPID_Segment_unpack(MPIDI_POSIX_REQUEST(req)->segment_ptr,
+                                        MPIDI_POSIX_REQUEST(req)->segment_first,
+                                        (MPI_Aint *) & last, send_buffer);
+                    if (last != MPIDI_POSIX_REQUEST(req)->segment_first + data_sz)
+                        req->status.MPI_ERROR = MPI_ERR_TYPE;
+                    if (type == MPIDI_POSIX_TYPEEAGER)
+                        MPID_Segment_free(MPIDI_POSIX_REQUEST(req)->segment_ptr);
+                    else
+                        MPIDI_POSIX_REQUEST(req)->segment_first = last;
+                }
+                else
+                    /* contig; NOTE: only the guarded memcpy belongs to this else - the two updates after it run on both paths */
+                if (send_buffer)
+                    MPIR_Memcpy(recv_buffer, (void *) send_buffer, data_sz);
+                MPIDI_POSIX_REQUEST(req)->data_sz -= data_sz;
+                MPIDI_POSIX_REQUEST(req)->user_buf += data_sz;
+
+                /* set status and dequeue receive request if done */
+                count = MPIR_STATUS_GET_COUNT(req->status) + (MPI_Count) data_sz;
+                MPIR_STATUS_SET_COUNT(req->status, count);
+                if (type == MPIDI_POSIX_TYPEEAGER) {
+                    if (in_cell) {
+                        req->status.MPI_SOURCE = cell->rank;
+                        req->status.MPI_TAG = cell->tag;
+                    }
+                    else {
+                        req->status.MPI_SOURCE = sreq->status.MPI_SOURCE;
+                        req->status.MPI_TAG = sreq->status.MPI_TAG;
+                    }
+                    MPIDI_POSIX_REQUEST_DEQUEUE_AND_SET_ERROR(&req, prev_req,
+                                                              MPIDI_POSIX_recvq_posted,
+                                                              req->status.MPI_ERROR);
+                }
+                /* a non-eager (LMT) chunk leaves the receive on the posted queue */
+                goto release_cell_l;
+            }   /* if matched  */
+
+            prev_req = req;
+            req = MPIDI_POSIX_REQUEST(req)->next;
+        }
+
+        /* unexpected message, no posted matching req */
+        if (in_cell) {
+            /* free the cell, move to unexpected queue */
+            MPIR_Request *rreq;
+            MPIDI_POSIX_REQUEST_CREATE_RREQ(rreq);
+            MPIR_Object_set_ref(rreq, 1);
+            /* set status */
+            rreq->status.MPI_SOURCE = cell->rank;
+            rreq->status.MPI_TAG = cell->tag;
+            MPIR_STATUS_SET_COUNT(rreq->status, cell->pkt.mpich.datalen);
+            MPIDI_POSIX_ENVELOPE_SET(MPIDI_POSIX_REQUEST(rreq), cell->rank, cell->tag,
+                                     cell->context_id);
+            data_sz = cell->pkt.mpich.datalen;
+            MPIDI_POSIX_REQUEST(rreq)->data_sz = data_sz;
+            MPIDI_POSIX_REQUEST(rreq)->type = cell->pkt.mpich.type;
+            /* NOTE(review): the MPL_malloc result below is used without a NULL check */
+            if (data_sz > 0) {
+                MPIDI_POSIX_REQUEST(rreq)->user_buf = (char *) MPL_malloc(data_sz);
+                MPIR_Memcpy(MPIDI_POSIX_REQUEST(rreq)->user_buf, (void *) cell->pkt.mpich.p.payload,
+                            data_sz);
+            }
+            else {
+                MPIDI_POSIX_REQUEST(rreq)->user_buf = NULL;
+            }
+
+            MPIDI_POSIX_REQUEST(rreq)->datatype = MPI_BYTE;
+            MPIDI_POSIX_REQUEST(rreq)->next = NULL;
+            MPIDI_POSIX_REQUEST(rreq)->pending = cell->pending;
+            /* enqueue rreq */
+            MPIDI_POSIX_REQUEST_ENQUEUE(rreq, MPIDI_POSIX_recvq_unexpected);
+            MPL_DBG_MSG_FMT(MPIR_DBG_HANDLE, TYPICAL,
+                            (MPL_DBG_FDEST, "Unexpected from grank %d to %d in progress %d,%d,%d\n",
+                             cell->my_rank, MPIDI_POSIX_mem_region.rank,
+                             cell->rank, cell->tag, cell->context_id));
+        }
+        else {
+            /* examine another message in unexpected queue */
+            prev_sreq = sreq;
+            sreq = MPIDI_POSIX_REQUEST(sreq)->next;
+            goto unexpected_l;
+        }
+    }
+  release_cell_l:
+    /* the message is finished with: recycle its cell, or drop its unexpected-queue entry */
+    if (in_cell) {
+        /* release cell */
+        MPL_DBG_MSG_FMT(MPIR_DBG_HANDLE, TYPICAL,
+                        (MPL_DBG_FDEST, "Received from grank %d to %d in progress %d,%d,%d\n",
+                         cell->my_rank, MPIDI_POSIX_mem_region.rank, cell->rank, cell->tag,
+                         cell->context_id));
+        cell->pending = NULL;
+        {
+            MPIDI_POSIX_queue_enqueue(MPIDI_POSIX_mem_region.FreeQ[cell->my_rank], cell);
+        }
+    }
+    else {
+        /* destroy unexpected req */
+        MPIDI_POSIX_REQUEST(sreq)->pending = NULL;
+        MPL_free(MPIDI_POSIX_REQUEST(sreq)->user_buf);
+        MPIDI_POSIX_REQUEST_DEQUEUE_AND_SET_ERROR(&sreq, prev_sreq, MPIDI_POSIX_recvq_unexpected,
+                                                  mpi_errno);
+    }
+    /* counted for every message consumed or newly queued as unexpected (not for the empty case) */
+    (*completion_count)++;
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_DO_PROGRESS_RECV);
+    return mpi_errno;
+}
+
+/* MPIDI_POSIX_progress_send: if the send queue is non-empty and a free cell is      */
+/* available, fill one cell from the request at the head of MPIDI_POSIX_sendq and    */
+/* enqueue it on the destination's RecvQ, incrementing *completion_count.            */
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_POSIX_progress_send)
+static inline int MPIDI_POSIX_progress_send(int blocking, int *completion_count)
+{       /* 'blocking' is not referenced anywhere in this body */
+    int mpi_errno = MPI_SUCCESS;
+    int dest;
+    MPIDI_POSIX_cell_ptr_t cell = NULL;
+    MPIR_Request *sreq = MPIDI_POSIX_sendq.head;
+    MPIR_Request *prev_sreq = NULL;     /* stays NULL: only the queue head is ever handled */
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_DO_PROGRESS_SEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_DO_PROGRESS_SEND);
+    /* nothing queued for sending */
+    if (sreq == NULL)
+        goto fn_exit;
+
+    /* try to send via freeq */
+    if (!MPIDI_POSIX_queue_empty(MPIDI_POSIX_mem_region.my_freeQ)) {
+        MPIDI_POSIX_queue_dequeue(MPIDI_POSIX_mem_region.my_freeQ, &cell);
+        MPIDI_POSIX_ENVELOPE_GET(MPIDI_POSIX_REQUEST(sreq), cell->rank, cell->tag,
+                                 cell->context_id);
+        dest = MPIDI_POSIX_REQUEST(sreq)->dest;
+        char *recv_buffer = (char *) cell->pkt.mpich.p.payload;
+        size_t data_sz = MPIDI_POSIX_REQUEST(sreq)->data_sz;
+        /*
+         * TODO: make request field dest_lpid (or even recvQ[dest_lpid]) instead of dest - no need to do rank_to_lpid each time
+         */
+        int grank = MPIDI_CH4U_rank_to_lpid(dest, sreq->comm);  /* taken before sreq may be dequeued below */
+        cell->pending = NULL;
+
+        if (MPIDI_POSIX_REQUEST(sreq)->type == MPIDI_POSIX_TYPESYNC) {
+            /* increase req cc in order to release req only after ACK, do it once per SYNC request */
+            /* non-NULL pending req signal receiver about sending ACK back */
+            /* the pending req should be sent back for sender to decrease cc, for it is dequeued already */
+            int c;
+            cell->pending = sreq;
+            MPIR_cc_incr(sreq->cc_ptr, &c);
+            MPIDI_POSIX_REQUEST(sreq)->type = MPIDI_POSIX_TYPESTANDARD;
+        }
+        /* what is left of the request fits within MPIDI_POSIX_EAGER_THRESHOLD: final (or only) packet */
+        if (data_sz <= MPIDI_POSIX_EAGER_THRESHOLD) {
+            cell->pkt.mpich.datalen = data_sz;
+
+            if (MPIDI_POSIX_REQUEST(sreq)->type == MPIDI_POSIX_TYPEACK) {
+                cell->pkt.mpich.type = MPIDI_POSIX_TYPEACK;
+                cell->pending = MPIDI_POSIX_REQUEST(sreq)->pending;
+            }
+            else {
+                /* eager message */
+                if (MPIDI_POSIX_REQUEST(sreq)->segment_ptr) {
+                    /* non-contig */
+                    MPID_Segment_pack(MPIDI_POSIX_REQUEST(sreq)->segment_ptr,
+                                      MPIDI_POSIX_REQUEST(sreq)->segment_first,
+                                      (MPI_Aint *) & MPIDI_POSIX_REQUEST(sreq)->segment_size,
+                                      recv_buffer);
+                    MPID_Segment_free(MPIDI_POSIX_REQUEST(sreq)->segment_ptr);
+                }
+                else {
+                    /* contig */
+                    MPIR_Memcpy((void *) recv_buffer, MPIDI_POSIX_REQUEST(sreq)->user_buf, data_sz);
+                }
+
+                cell->pkt.mpich.type = MPIDI_POSIX_TYPEEAGER;
+                /* set status */
+                /*
+                 * TODO: incorrect count for LMT - set to a last chunk of data
+                 * is send status required?
+                 */
+                sreq->status.MPI_SOURCE = cell->rank;
+                sreq->status.MPI_TAG = cell->tag;
+                MPIR_STATUS_SET_COUNT(sreq->status, data_sz);
+            }
+
+            /* dequeue sreq */
+            MPIDI_POSIX_REQUEST_DEQUEUE_AND_SET_ERROR(&sreq, prev_sreq, MPIDI_POSIX_sendq,
+                                                      mpi_errno);
+        }
+        else {
+            /* long message: ship one MPIDI_POSIX_EAGER_THRESHOLD-sized chunk; sreq stays queued */
+            if (MPIDI_POSIX_REQUEST(sreq)->segment_ptr) {
+                /* non-contig */
+                size_t last =
+                    MPIDI_POSIX_REQUEST(sreq)->segment_first + MPIDI_POSIX_EAGER_THRESHOLD;
+                MPID_Segment_pack(MPIDI_POSIX_REQUEST(sreq)->segment_ptr,
+                                  MPIDI_POSIX_REQUEST(sreq)->segment_first, (MPI_Aint *) & last,
+                                  recv_buffer);
+                MPIDI_POSIX_REQUEST(sreq)->segment_first = last;
+            }
+            else {
+                /* contig */
+                MPIR_Memcpy((void *) recv_buffer, MPIDI_POSIX_REQUEST(sreq)->user_buf,
+                            MPIDI_POSIX_EAGER_THRESHOLD);
+                MPIDI_POSIX_REQUEST(sreq)->user_buf += MPIDI_POSIX_EAGER_THRESHOLD;
+            }
+
+            cell->pkt.mpich.datalen = MPIDI_POSIX_EAGER_THRESHOLD;
+            MPIDI_POSIX_REQUEST(sreq)->data_sz -= MPIDI_POSIX_EAGER_THRESHOLD;
+            cell->pkt.mpich.type = MPIDI_POSIX_TYPELMT;
+        }
+        /* hand the filled cell to the destination's receive queue */
+        MPL_DBG_MSG_FMT(MPIR_DBG_HANDLE, TYPICAL,
+                        (MPL_DBG_FDEST, "Sent to grank %d from %d in progress %d,%d,%d\n", grank,
+                         cell->my_rank, cell->rank, cell->tag, cell->context_id));
+        MPIDI_POSIX_queue_enqueue(MPIDI_POSIX_mem_region.RecvQ[grank], cell);
+        (*completion_count)++;
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_DO_PROGRESS_SEND);
+    return mpi_errno;
+}
+
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_SHM_progress)
+/* Drive POSIX shm progress: one receive pass and one send pass, each under
+ * MPIDI_POSIX_SHM_MUTEX.  With blocking set, passes repeat until one reports work.
+ * The passes' return values are not inspected; the result is always MPI_SUCCESS. */
+static inline int MPIDI_SHM_progress(int blocking)
+{
+    int n_done = 0;             /* incremented by the recv/send passes */
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_PROGRESS);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_PROGRESS);
+
+    do {
+        /* Receive side */
+        MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+        MPIDI_POSIX_progress_recv(blocking, &n_done);
+        MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+        /* Send side */
+        MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+        MPIDI_POSIX_progress_send(blocking, &n_done);
+        MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+        /* Leave and immediately re-enter the GLOBAL critical section between passes */
+        MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
+        MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
+    } while (blocking && n_done <= 0);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_PROGRESS);
+    return MPI_SUCCESS;
+}
+
+static inline int MPIDI_SHM_progress_test(void)
+{
+    MPIR_Assert(0);             /* stub: no POSIX shm implementation here; any call trips this assertion */
+    return MPI_SUCCESS;         /* returned only if the assertion above does not stop execution */
+}
+
+static inline int MPIDI_SHM_progress_poke(void)
+{
+    MPIR_Assert(0);             /* stub: no POSIX shm implementation here; any call trips this assertion */
+    return MPI_SUCCESS;         /* returned only if the assertion above does not stop execution */
+}
+
+static inline void MPIDI_SHM_progress_start(MPID_Progress_state * state)
+{
+    MPIR_Assert(0);             /* stub: 'state' is ignored; any call trips this assertion */
+    return;
+}
+
+static inline void MPIDI_SHM_progress_end(MPID_Progress_state * state)
+{
+    MPIR_Assert(0);             /* stub: 'state' is ignored; any call trips this assertion */
+    return;
+}
+
+static inline int MPIDI_SHM_progress_wait(MPID_Progress_state * state)
+{
+    MPIR_Assert(0);             /* stub: 'state' is ignored; any call trips this assertion */
+    return MPI_SUCCESS;         /* returned only if the assertion above does not stop execution */
+}
+
+static inline int MPIDI_SHM_progress_register(int (*progress_fn) (int *))
+{
+    MPIR_Assert(0);             /* stub: 'progress_fn' is never stored or called; any call trips this assertion */
+    return MPI_SUCCESS;         /* returned only if the assertion above does not stop execution */
+}
+
+static inline int MPIDI_SHM_progress_deregister(int id)
+{
+    MPIR_Assert(0);             /* stub: 'id' is ignored; any call trips this assertion */
+    return MPI_SUCCESS;         /* returned only if the assertion above does not stop execution */
+}
+
+static inline int MPIDI_SHM_progress_activate(int id)
+{
+    MPIR_Assert(0);             /* stub: 'id' is ignored; any call trips this assertion */
+    return MPI_SUCCESS;         /* returned only if the assertion above does not stop execution */
+}
+
+static inline int MPIDI_SHM_progress_deactivate(int id)
+{
+    MPIR_Assert(0);             /* stub: 'id' is ignored; any call trips this assertion */
+    return MPI_SUCCESS;         /* returned only if the assertion above does not stop execution */
+}
+
+#endif /* SHM_POSIX_PROGRESS_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/posix/posix_queue.h b/src/mpid/ch4/shm/posix/posix_queue.h
new file mode 100644
index 0000000..be68f48
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_queue.h
@@ -0,0 +1,343 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_QUEUE_H
+#define SHM_QUEUE_H
+
+/* ------------------------------------------------------- */
+/* from mpid/ch3/channels/nemesis/include/mpid_nem_debug.h */
+/* ------------------------------------------------------- */
+
+/*#define MPIDI_POSIX_YIELD_IN_SKIP*/
+#ifdef MPIDI_POSIX_YIELD_IN_SKIP
+#define MPIDI_POSIX_SKIP MPL_sched_yield()
+#warning "MPIDI_POSIX_SKIP is yield"
+#else /* MPIDI_POSIX_YIELD_IN_SKIP */
+#define MPIDI_POSIX_SKIP do {} while (0)
+/*#warning "MPIDI_POSIX_SKIP is do ...while" */
+#endif /* MPIDI_POSIX_YIELD_IN_SKIP */
+
+/* ------------------------------------------------------- */
+/* from mpid/ch3/channels/nemesis/include/mpid_nem_queue.h */
+/* ------------------------------------------------------- */
+
+/* Assertion macros for nemesis queues.  We don't use the normal
+ * assertion macros because we don't usually want to assert several
+ * times per queue operation.  These assertions serve more as structured
+ * comments that can easily be transformed into real assertions */
+#define MPIDI_POSIX_Q_assert(a_) \
+    do {/*nothing*/} while (0)
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_POSIX_cell_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline void MPIDI_POSIX_cell_init(MPIDI_POSIX_cell_ptr_t cell, int rank)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_CELL_INIT);
+    /* Reset one cell: null its next link, zero the leading header bytes of pkt, store rank in my_rank */
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_CELL_INIT);
+
+    MPIDI_POSIX_SET_REL_NULL(cell->next);
+    memset((void *) &cell->pkt, 0, sizeof(MPIDI_POSIX_pkt_header_t));   /* only sizeof(header) bytes are cleared */
+    cell->my_rank = rank;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_CELL_INIT);
+}
+
+#if defined(MPIDI_POSIX_USE_LOCK_FREE_QUEUES)
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_POSIX_queue_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline void MPIDI_POSIX_queue_init(MPIDI_POSIX_queue_ptr_t qhead)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_QUEUE_INIT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_QUEUE_INIT);
+    /* Lock-free variant: an empty queue has head, my_head and tail all set to the relative null */
+    MPIDI_POSIX_SET_REL_NULL(qhead->head);
+    MPIDI_POSIX_SET_REL_NULL(qhead->my_head);
+    MPIDI_POSIX_SET_REL_NULL(qhead->tail);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_QUEUE_INIT);
+}
+
+#define MPIDI_POSIX_USE_SHADOW_HEAD
+
+static inline MPIDI_POSIX_cell_rel_ptr_t MPIDI_POSIX_SWAP_REL(MPIDI_POSIX_cell_rel_ptr_t * ptr,
+                                                              MPIDI_POSIX_cell_rel_ptr_t val)
+{       /* Swap val into *ptr through OPA_swap_ptr and return what that call yields as a relative pointer */
+    MPIDI_POSIX_cell_rel_ptr_t ret;
+    OPA_store_ptr(&ret.p, OPA_swap_ptr(&(ptr->p), OPA_load_ptr(&val.p)));       /* presumably the previous *ptr - see OPA docs */
+    return ret;
+}
+
+/* Compare-and-swap *ptr against oldv with MPIDI_POSIX_REL_NULL as the new value; returns what OPA_cas_ptr reports */
+static inline MPIDI_POSIX_cell_rel_ptr_t MPIDI_POSIX_CAS_REL_NULL(MPIDI_POSIX_cell_rel_ptr_t * ptr,
+                                                                  MPIDI_POSIX_cell_rel_ptr_t oldv)
+{
+    MPIDI_POSIX_cell_rel_ptr_t ret;
+    OPA_store_ptr(&ret.p, OPA_cas_ptr(&(ptr->p), OPA_load_ptr(&oldv.p), MPIDI_POSIX_REL_NULL));
+    return ret;         /* the dequeue path compares this result with oldv to tell whether the swap took place */
+}
+
+static inline void MPIDI_POSIX_queue_enqueue(MPIDI_POSIX_queue_ptr_t qhead,
+                                             MPIDI_POSIX_cell_ptr_t element)
+{
+    MPIDI_POSIX_cell_rel_ptr_t r_prev;
+    MPIDI_POSIX_cell_rel_ptr_t r_element = MPIDI_POSIX_ABS_TO_REL(element);
+    /* Append element: atomically swap it into qhead->tail, then link it from the old tail or publish it as head */
+    /* the _dequeue can break if this does not hold */
+    MPIDI_POSIX_Q_assert(MPIDI_POSIX_IS_REL_NULL(element->next));
+
+    /* Orders payload and e->next=NULL w.r.t. the SWAP, updating head, and
+     * updating prev->next.  We assert e->next==NULL above, but it may have been
+     * done by us in the preceding _dequeue operation.
+     *
+     * The SWAP itself does not need to be ordered w.r.t. the payload because
+     * the consumer does not directly inspect the tail.  But the subsequent
+     * update to the head or e->next field does need to be ordered w.r.t. the
+     * payload or the consumer may read incorrect data. */
+    OPA_write_barrier();
+
+    /* enqueue at tail */
+    r_prev = MPIDI_POSIX_SWAP_REL(&(qhead->tail), r_element);
+
+    if (MPIDI_POSIX_IS_REL_NULL(r_prev)) {
+        /* queue was empty, element is the new head too */
+
+        /* no write barrier needed, we believe atomic SWAP with a control
+         * dependence (if) will enforce ordering between the SWAP and the head
+         * assignment */
+        qhead->head = r_element;
+    }
+    else {
+        /* queue was not empty, swing old tail's next field to point to
+         * our element */
+        MPIDI_POSIX_Q_assert(MPIDI_POSIX_IS_REL_NULL(MPIDI_POSIX_REL_TO_ABS(r_prev)->next));
+
+        /* no write barrier needed, we believe atomic SWAP with a control
+         * dependence (if/else) will enforce ordering between the SWAP and the
+         * prev->next assignment */
+        MPIDI_POSIX_REL_TO_ABS(r_prev)->next = r_element;
+    }
+}
+
+/* Return the cell at my_head without removing it.  Only safe because this is a
+   single-dequeuer queue impl.  Assumes that MPIDI_POSIX_queue_empty was called
+   immediately prior to fix up any shadow head issues. */
+static inline MPIDI_POSIX_cell_ptr_t MPIDI_POSIX_queue_head(MPIDI_POSIX_queue_ptr_t qhead)
+{
+    MPIDI_POSIX_Q_assert(MPIDI_POSIX_IS_REL_NULL(qhead->head));        /* MPIDI_POSIX_Q_assert expands to nothing */
+    return MPIDI_POSIX_REL_TO_ABS(qhead->my_head);
+}
+
+static inline int MPIDI_POSIX_queue_empty(MPIDI_POSIX_queue_ptr_t qhead)
+{
+    /* outside of this routine my_head and head should never both
+     * contain a non-null value */
+    MPIDI_POSIX_Q_assert(MPIDI_POSIX_IS_REL_NULL(qhead->my_head) ||
+                         MPIDI_POSIX_IS_REL_NULL(qhead->head));
+    /* Returns 1 when my_head and head are both null; otherwise 0, after moving head into my_head if my_head was null */
+    if (MPIDI_POSIX_IS_REL_NULL(qhead->my_head)) {
+        /* the order of comparison between my_head and head does not
+         * matter, no read barrier needed here */
+        if (MPIDI_POSIX_IS_REL_NULL(qhead->head)) {
+            /* both null, nothing in queue */
+            return 1;
+        }
+        else {
+            /* shadow head null and head has value, move the value to
+             * our private shadow head and zero the real head */
+            qhead->my_head = qhead->head;
+            /* no barrier needed, my_head is entirely private to consumer */
+            MPIDI_POSIX_SET_REL_NULL(qhead->head);
+        }
+    }
+
+    /* the following assertions are present at the beginning of _dequeue:
+     * MPIDI_POSIX_Q_assert(!MPIDI_POSIX_IS_REL_NULL(qhead->my_head));
+     * MPIDI_POSIX_Q_assert(MPIDI_POSIX_IS_REL_NULL(qhead->head));
+     */
+    return 0;
+}
+
+
+/* Remove the cell at my_head and return it through *e (MPIDI_POSIX_queue_empty must have been called first) */
+static inline void MPIDI_POSIX_queue_dequeue(MPIDI_POSIX_queue_ptr_t qhead,
+                                             MPIDI_POSIX_cell_ptr_t * e)
+{
+    MPIDI_POSIX_cell_ptr_t _e;
+    MPIDI_POSIX_cell_rel_ptr_t _r_e;
+
+    /* _empty always called first, moving head-->my_head */
+    MPIDI_POSIX_Q_assert(!MPIDI_POSIX_IS_REL_NULL(qhead->my_head));
+    MPIDI_POSIX_Q_assert(MPIDI_POSIX_IS_REL_NULL(qhead->head));
+
+    _r_e = qhead->my_head;
+    _e = MPIDI_POSIX_REL_TO_ABS(_r_e);
+
+    /* no barrier needed, my_head is private to consumer, plus
+     * head/my_head and _e->next are ordered by a data dependency */
+    if (!MPIDI_POSIX_IS_REL_NULL(_e->next)) {
+        qhead->my_head = _e->next;
+    }
+    else {
+        /* we've reached the end (tail) of the queue */
+        MPIDI_POSIX_cell_rel_ptr_t old_tail;
+
+        MPIDI_POSIX_SET_REL_NULL(qhead->my_head);
+        /* no barrier needed, the caller doesn't need any ordering w.r.t.
+         * my_head or the tail */
+        old_tail = MPIDI_POSIX_CAS_REL_NULL(&(qhead->tail), _r_e);
+        /* a differing old_tail means the tail had already moved on: wait for _e->next to be linked */
+        if (!MPIDI_POSIX_REL_ARE_EQUAL(old_tail, _r_e)) {
+            /* FIXME is a barrier needed here because of the control-only dependency? */
+            while (MPIDI_POSIX_IS_REL_NULL(_e->next)) {
+                MPIDI_POSIX_SKIP;
+            }
+
+            /* no read barrier needed between loads from the same location */
+            qhead->my_head = _e->next;
+        }
+    }
+    /* detach the removed cell; the enqueue path expects next to be null */
+    MPIDI_POSIX_SET_REL_NULL(_e->next);
+
+    /* Conservative read barrier here to ensure loads from head are ordered
+     * w.r.t. payload reads by the caller.  The McKenney "whymb" document's
+     * Figure 11 indicates that we don't need a barrier, but we are currently
+     * unconvinced of this.  Further work, ideally using more formal methods,
+     * should justify removing this.  (note that this barrier won't cost us
+     * anything on many platforms, esp. x86) */
+    OPA_read_barrier();
+
+    *e = _e;
+}
+
+#else /* !defined(MPIDI_POSIX_USE_LOCK_FREE_QUEUES) */
+
+/* FIXME We shouldn't really be using the MPID_Thread_mutex_* code but the
+ * MPIDU_Process_locks code is a total mess right now.  In the long term we need
+   to resolve this, but in the short run it should be safe on most (all?)
+   platforms to use these instead.  Usually they will both boil down to a
+   pthread_mutex_t and its associated functions. */
+#define MPIDI_POSIX_queue_mutex_create MPID_Thread_mutex_create
+#define MPIDI_POSIX_queue_mutex_lock   MPID_Thread_mutex_lock
+#define MPIDI_POSIX_queue_mutex_unlock MPID_Thread_mutex_unlock
+
+/* must be called by exactly one process per queue */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_POSIX_queue_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline void MPIDI_POSIX_queue_init(MPIDI_POSIX_queue_ptr_t qhead)
+{
+    int err = 0;                /* written by the mutex create call; never inspected afterwards */
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_QUEUE_INIT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_QUEUE_INIT);
+    /* Mutex-based variant: null head, my_head and tail, then create the per-queue lock */
+    MPIDI_POSIX_SET_REL_NULL(qhead->head);
+    MPIDI_POSIX_SET_REL_NULL(qhead->my_head);
+    MPIDI_POSIX_SET_REL_NULL(qhead->tail);
+    MPIDI_POSIX_queue_mutex_create(&qhead->lock, &err);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_QUEUE_INIT);
+}
+
+static inline void MPIDI_POSIX_queue_enqueue(MPIDI_POSIX_queue_ptr_t qhead,
+                                             MPIDI_POSIX_cell_ptr_t element)
+{
+    int err = 0;                /* written by the lock/unlock calls; never inspected */
+    MPIDI_POSIX_cell_rel_ptr_t r_prev;
+    MPIDI_POSIX_cell_rel_ptr_t r_element = MPIDI_POSIX_ABS_TO_REL(element);
+    /* Append element at the tail while holding qhead->lock */
+    MPIDI_POSIX_queue_mutex_lock(&qhead->lock, &err);
+
+    r_prev = qhead->tail;
+    qhead->tail = r_element;
+    /* a null previous tail means the queue was empty, so element is also published as head */
+    if (MPIDI_POSIX_IS_REL_NULL(r_prev)) {
+        qhead->head = r_element;
+    }
+    else {
+        MPIDI_POSIX_REL_TO_ABS(r_prev)->next = r_element;
+    }
+
+    MPIDI_POSIX_queue_mutex_unlock(&qhead->lock, &err);
+}
+
+/* Return the cell at my_head without removing it; only safe because this is a single-dequeuer queue impl. */
+static inline MPIDI_POSIX_cell_ptr_t MPIDI_POSIX_queue_head(MPIDI_POSIX_queue_ptr_t qhead)
+{
+    return MPIDI_POSIX_REL_TO_ABS(qhead->my_head);
+}
+
+/* Assumption: regular loads & stores are atomic.  This may not be universally
+   true, but it's not uncommon.  We often need to use these "lock-ful" queues on
+   platforms where atomics are not yet implemented, so we can't rely on the
+   atomics to provide atomic load/store operations for us. */
+static inline int MPIDI_POSIX_queue_empty(MPIDI_POSIX_queue_ptr_t qhead)
+{
+    /* Cells are still pending on the consumer-side my_head: not empty. */
+    if (!MPIDI_POSIX_IS_REL_NULL(qhead->my_head))
+        return 0;
+    /* Nothing pending on my_head and nothing published in head: empty. */
+    if (MPIDI_POSIX_IS_REL_NULL(qhead->head))
+        return 1;
+    /* Take over the published chain as the new my_head and reset head to
+     * null for next time. */
+    qhead->my_head = qhead->head;
+    MPIDI_POSIX_SET_REL_NULL(qhead->head);
+    return 0;
+}
+
+static inline void MPIDI_POSIX_queue_dequeue(MPIDI_POSIX_queue_ptr_t qhead,
+                                             MPIDI_POSIX_cell_ptr_t * e)
+{
+    int err = 0;                /* written by the lock/unlock calls; never inspected */
+    MPIDI_POSIX_cell_ptr_t _e;
+    MPIDI_POSIX_cell_rel_ptr_t _r_e;
+    /* Remove the cell at my_head and return it through *e; my_head is read without a null check */
+    _r_e = qhead->my_head;
+    _e = MPIDI_POSIX_REL_TO_ABS(_r_e);
+
+
+    if (MPIDI_POSIX_IS_REL_NULL(_e->next)) {
+        /* a REL_NULL _e->next or writing qhead->tail both require locking */
+        MPIDI_POSIX_queue_mutex_lock(&qhead->lock, &err);
+        qhead->my_head = _e->next;
+
+        /* We have to check _e->next again because it may have changed between
+         * the time we checked it without the lock and the time that we acquired
+         * the lock. */
+        if (MPIDI_POSIX_IS_REL_NULL(_e->next)) {
+            MPIDI_POSIX_SET_REL_NULL(qhead->tail);
+        }
+
+        MPIDI_POSIX_queue_mutex_unlock(&qhead->lock, &err);
+    }
+    else {      /* !MPIDI_POSIX_IS_REL_NULL(_e->next) */
+        /* We don't need to lock because a non-null _e->next can't be changed by
+         * anyone but us (the dequeuer) and we don't need to modify qhead->tail
+         * because we aren't removing the last element from the queue. */
+        qhead->my_head = _e->next;
+    }
+    /* detach the removed cell before handing it to the caller */
+    MPIDI_POSIX_SET_REL_NULL(_e->next);
+    *e = _e;
+}
+
+#endif /* !defined(MPIDI_POSIX_USE_LOCK_FREE_QUEUES) */
+
+#endif /* ifndef SHM_QUEUE_H */
diff --git a/src/mpid/ch4/shm/posix/posix_recv.h b/src/mpid/ch4/shm/posix/posix_recv.h
new file mode 100644
index 0000000..d51e8e4
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_recv.h
@@ -0,0 +1,355 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_POSIX_RECV_H_INCLUDED
+#define SHM_POSIX_RECV_H_INCLUDED
+
+#include "posix_impl.h"
+#include "ch4_impl.h"
+
+/* ---------------------------------------------------- */
+/* general queues                                       */
+/* ---------------------------------------------------- */
+extern MPIDI_POSIX_request_queue_t MPIDI_POSIX_recvq_posted;
+extern MPIDI_POSIX_request_queue_t MPIDI_POSIX_recvq_unexpected;
+
+/* ---------------------------------------------------- */
+/* MPIDI_POSIX_do_irecv                                             */
+/* ---------------------------------------------------- */
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_POSIX_do_irecv)
+static inline int MPIDI_POSIX_do_irecv(void *buf,
+                                       int count,
+                                       MPI_Datatype datatype,
+                                       int rank,
+                                       int tag,
+                                       MPIR_Comm * comm, int context_offset,
+                                       MPIR_Request ** request)
+{       /* Creates a receive request, fills it in and enqueues it on MPIDI_POSIX_recvq_posted. */
+    int mpi_errno = MPI_SUCCESS, dt_contig;
+    size_t data_sz;
+    MPI_Aint dt_true_lb;
+    MPIR_Datatype *dt_ptr;
+    MPIR_Request *rreq = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_DO_IRECV);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_DO_IRECV);
+
+    MPIDI_POSIX_REQUEST_CREATE_RREQ(rreq);
+
+    if (unlikely(rank == MPI_PROC_NULL)) {
+        MPIR_Status_set_procnull(&(rreq->status));
+        *request = rreq;
+        goto fn_exit;   /* returned without being enqueued; NOTE(review): confirm it is seen as complete */
+    }
+
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
+    MPIDI_POSIX_ENVELOPE_SET(MPIDI_POSIX_REQUEST(rreq), rank, tag,
+                             comm->context_id + context_offset);
+    rreq->comm = comm;
+    MPIR_Comm_add_ref(comm);
+    MPIDI_POSIX_REQUEST(rreq)->user_buf = (char *) buf + dt_true_lb;    /* buffer shifted by dt_true_lb */
+    MPIDI_POSIX_REQUEST(rreq)->user_count = count;
+    MPIDI_POSIX_REQUEST(rreq)->datatype = datatype;
+    MPIDI_POSIX_REQUEST(rreq)->data_sz = data_sz;
+    MPIDI_POSIX_REQUEST(rreq)->next = NULL;
+    MPIDI_POSIX_REQUEST(rreq)->segment_ptr = NULL;      /* stays NULL for contiguous datatypes */
+    MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(rreq) = NULL;
+    MPIR_STATUS_SET_COUNT(rreq->status, 0);
+
+    if (!dt_contig) {
+        MPIDI_POSIX_REQUEST(rreq)->segment_ptr = MPID_Segment_alloc();
+        MPIR_ERR_CHKANDJUMP1((MPIDI_POSIX_REQUEST(rreq)->segment_ptr == NULL), mpi_errno,
+                             MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
+        MPID_Segment_init((char *) buf, MPIDI_POSIX_REQUEST(rreq)->user_count,      /* unshifted buf */
+                          MPIDI_POSIX_REQUEST(rreq)->datatype,
+                          MPIDI_POSIX_REQUEST(rreq)->segment_ptr, 0);
+        MPIDI_POSIX_REQUEST(rreq)->segment_first = 0;
+        MPIDI_POSIX_REQUEST(rreq)->segment_size = data_sz;
+    }
+
+    dtype_add_ref_if_not_builtin(datatype);
+    /* enqueue rreq */
+    MPIDI_POSIX_REQUEST_ENQUEUE(rreq, MPIDI_POSIX_recvq_posted);
+    MPL_DBG_MSG_FMT(MPIR_DBG_HANDLE, TYPICAL,
+                    (MPL_DBG_FDEST,
+                     "Enqueued from grank %d to %d (comm_kind %d) in recv %d,%d,%d\n",
+                     MPIDI_CH4U_rank_to_lpid(rank, comm), MPIDI_POSIX_mem_region.rank,
+                     comm->comm_kind, rank, tag, comm->context_id + context_offset));
+    *request = rreq;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_DO_IRECV);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;       /* NOTE(review): rreq and the comm reference are not released, *request is not written */
+}
+
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_SHM_recv)
+static inline int MPIDI_SHM_recv(void *buf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 int rank,
+                                 int tag,
+                                 MPIR_Comm * comm,
+                                 int context_offset, MPI_Status * status, MPIR_Request ** request)
+{
+    /* Posts the receive through MPIDI_POSIX_do_irecv() while holding the shm mutex and hands
+     * the request back via *request; 'status' is accepted but never written in this function. */
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPIDI_SHM_RECV);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPIDI_SHM_RECV);
+
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    mpi_errno =
+        MPIDI_POSIX_do_irecv(buf, count, datatype, rank, tag, comm, context_offset, request);
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPIDI_SHM_RECV);
+    return mpi_errno;
+}
+
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_SHM_recv_init)
+static inline int MPIDI_SHM_recv_init(void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      int rank,
+                                      int tag,
+                                      MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{       /* Builds a persistent receive request and returns it via *request; nothing is enqueued. */
+    int mpi_errno = MPI_SUCCESS;        /* never changed below: this function always returns MPI_SUCCESS */
+    MPIR_Request *rreq = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_RECV_INIT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_RECV_INIT);
+
+    MPIDI_POSIX_REQUEST_CREATE_RREQ(rreq);
+    MPIR_Object_set_ref(rreq, 1);
+    MPIR_cc_set(&rreq->cc, 0);  /* completion counter starts at zero */
+    rreq->kind = MPIR_REQUEST_KIND__PREQUEST_RECV;
+    rreq->comm = comm;
+    MPIR_Comm_add_ref(comm);
+    MPIDI_POSIX_ENVELOPE_SET(MPIDI_POSIX_REQUEST(rreq), rank, tag,
+                             comm->context_id + context_offset);
+    MPIDI_POSIX_REQUEST(rreq)->user_buf = (char *) buf; /* raw buffer; no true-lb offset applied here */
+    MPIDI_POSIX_REQUEST(rreq)->user_count = count;
+    MPIDI_POSIX_REQUEST(rreq)->datatype = datatype;     /* the 'type' field is left unset for receives */
+    *request = rreq;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_RECV_INIT);
+    return mpi_errno;
+}
+
+
+static inline int MPIDI_SHM_imrecv(void *buf,
+                                   int count,
+                                   MPI_Datatype datatype,
+                                   MPIR_Request * message, MPIR_Request ** rreqp)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int dt_contig;
+    size_t data_sz;
+    MPI_Aint dt_true_lb;
+    MPIR_Datatype *dt_ptr;
+    MPIR_Request *rreq = NULL, *sreq = NULL;
+    int rank, tag, context_id;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPIDI_SHM_IMRECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPIDI_SHM_IMRECV);
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+
+    if (message == NULL) {
+        MPIDI_Request_create_null_rreq(rreq, mpi_errno, goto fn_fail);
+        *rreqp = rreq;
+        goto fn_exit;
+    }
+
+    MPIR_Assert(message != NULL);
+
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
+    MPIDI_POSIX_REQUEST_CREATE_RREQ(rreq);
+    MPIR_Object_set_ref(rreq, 1);
+    MPIR_cc_set(&rreq->cc, 0);
+    MPIDI_POSIX_ENVELOPE_GET(MPIDI_POSIX_REQUEST(message), rank, tag, context_id);
+    MPIDI_POSIX_ENVELOPE_SET(MPIDI_POSIX_REQUEST(rreq), rank, tag, context_id);
+    rreq->comm = message->comm;
+    MPIR_Comm_add_ref(message->comm);
+    MPIDI_POSIX_REQUEST(rreq)->user_buf = (char *) buf + dt_true_lb;
+    MPIDI_POSIX_REQUEST(rreq)->user_count = count;
+    MPIDI_POSIX_REQUEST(rreq)->datatype = datatype;
+    MPIDI_POSIX_REQUEST(rreq)->next = NULL;
+    MPIDI_POSIX_REQUEST(rreq)->segment_ptr = NULL;
+    MPIR_STATUS_SET_COUNT(rreq->status, 0);
+
+    if (!dt_contig) {
+        MPIDI_POSIX_REQUEST(rreq)->segment_ptr = MPID_Segment_alloc();
+        MPIR_ERR_CHKANDJUMP1((MPIDI_POSIX_REQUEST(rreq)->segment_ptr == NULL), mpi_errno,
+                             MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
+        MPID_Segment_init((char *) buf, MPIDI_POSIX_REQUEST(rreq)->user_count,
+                          MPIDI_POSIX_REQUEST(rreq)->datatype,
+                          MPIDI_POSIX_REQUEST(rreq)->segment_ptr, 0);
+        MPIDI_POSIX_REQUEST(rreq)->segment_first = 0;
+        MPIDI_POSIX_REQUEST(rreq)->segment_size = data_sz;
+    }
+
+    if (MPIDI_POSIX_REQUEST(message)->pending) {
+        /* Sync send - we must send ACK */
+        int srank = message->status.MPI_SOURCE;
+        MPIR_Request *req_ack = NULL;
+        MPIDI_POSIX_REQUEST_CREATE_SREQ(req_ack);
+        MPIR_Object_set_ref(req_ack, 1);
+        req_ack->comm = message->comm;
+        MPIR_Comm_add_ref(message->comm);
+        MPIDI_POSIX_ENVELOPE_SET(MPIDI_POSIX_REQUEST(req_ack), message->comm->rank, tag,
+                                 context_id);
+        MPIDI_POSIX_REQUEST(req_ack)->user_buf = NULL;
+        MPIDI_POSIX_REQUEST(req_ack)->user_count = 0;
+        MPIDI_POSIX_REQUEST(req_ack)->datatype = MPI_BYTE;
+        MPIDI_POSIX_REQUEST(req_ack)->data_sz = 0;
+        MPIDI_POSIX_REQUEST(req_ack)->type = MPIDI_POSIX_TYPEACK;
+        MPIDI_POSIX_REQUEST(req_ack)->dest = srank;
+        MPIDI_POSIX_REQUEST(req_ack)->next = NULL;
+        MPIDI_POSIX_REQUEST(req_ack)->segment_ptr = NULL;
+        MPIDI_POSIX_REQUEST(req_ack)->pending = MPIDI_POSIX_REQUEST(message)->pending;
+        /* enqueue req_ack */
+        MPIDI_POSIX_REQUEST_ENQUEUE(req_ack, MPIDI_POSIX_sendq);
+    }
+
+    for (sreq = message; sreq;) {
+        MPIR_Request *next_req = NULL;
+        char *send_buffer = MPIDI_POSIX_REQUEST(sreq)->user_buf;
+        char *recv_buffer = (char *) MPIDI_POSIX_REQUEST(rreq)->user_buf;
+
+        if (MPIDI_POSIX_REQUEST(sreq)->type == MPIDI_POSIX_TYPEEAGER) {
+            /* eager message */
+            data_sz = MPIDI_POSIX_REQUEST(sreq)->data_sz;
+
+            if (MPIDI_POSIX_REQUEST(rreq)->segment_ptr) {
+                /* non-contig */
+                size_t last = MPIDI_POSIX_REQUEST(rreq)->segment_first + data_sz;
+                MPID_Segment_unpack(MPIDI_POSIX_REQUEST(rreq)->segment_ptr,
+                                    MPIDI_POSIX_REQUEST(rreq)->segment_first, (MPI_Aint *) & last,
+                                    send_buffer);
+                MPID_Segment_free(MPIDI_POSIX_REQUEST(rreq)->segment_ptr);
+            }
+            else
+                /* contig */
+            if (send_buffer)
+                MPIR_Memcpy(recv_buffer, (void *) send_buffer, data_sz);
+
+            /* set status */
+            rreq->status.MPI_SOURCE = sreq->status.MPI_SOURCE;
+            rreq->status.MPI_TAG = sreq->status.MPI_TAG;
+            count = MPIR_STATUS_GET_COUNT(rreq->status) + (MPI_Count) data_sz;
+            MPIR_STATUS_SET_COUNT(rreq->status, count);
+        }
+        else if (MPIDI_POSIX_REQUEST(sreq)->type == MPIDI_POSIX_TYPELMT) {
+            /* long message */
+            if (MPIDI_POSIX_REQUEST(rreq)->segment_ptr) {
+                /* non-contig */
+                size_t last =
+                    MPIDI_POSIX_REQUEST(rreq)->segment_first + MPIDI_POSIX_EAGER_THRESHOLD;
+                MPID_Segment_unpack(MPIDI_POSIX_REQUEST(rreq)->segment_ptr,
+                                    MPIDI_POSIX_REQUEST(rreq)->segment_first, (MPI_Aint *) & last,
+                                    send_buffer);
+                MPIDI_POSIX_REQUEST(rreq)->segment_first = last;
+            }
+            else
+                /* contig */
+            if (send_buffer)
+                MPIR_Memcpy(recv_buffer, (void *) send_buffer, MPIDI_POSIX_EAGER_THRESHOLD);
+
+            MPIDI_POSIX_REQUEST(rreq)->data_sz -= MPIDI_POSIX_EAGER_THRESHOLD;
+            MPIDI_POSIX_REQUEST(rreq)->user_buf += MPIDI_POSIX_EAGER_THRESHOLD;
+            count = MPIR_STATUS_GET_COUNT(rreq->status) + (MPI_Count) MPIDI_POSIX_EAGER_THRESHOLD;
+            MPIR_STATUS_SET_COUNT(rreq->status, count);
+        }
+
+        /* destroy unexpected req */
+        MPIDI_POSIX_REQUEST(sreq)->pending = NULL;
+        MPL_free(MPIDI_POSIX_REQUEST(sreq)->user_buf);
+        next_req = MPIDI_POSIX_REQUEST(sreq)->next;
+        MPIDI_POSIX_REQUEST_COMPLETE(sreq);
+        sreq = next_req;
+    }
+
+    *rreqp = rreq;
+
+  fn_exit:
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPIDI_SHM_IMRECV);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_SHM_irecv)
+static inline int MPIDI_SHM_irecv(void *buf,
+                                  int count,
+                                  MPI_Datatype datatype,
+                                  int rank,
+                                  int tag,
+                                  MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{       /* Thin wrapper: runs MPIDI_POSIX_do_irecv() inside the shm critical section. */
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPIDI_SHM_IRECV);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPIDI_SHM_IRECV);
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    mpi_errno =
+        MPIDI_POSIX_do_irecv(buf, count, datatype, rank, tag, comm, context_offset, request);
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPIDI_SHM_IRECV);
+    return mpi_errno;   /* whatever MPIDI_POSIX_do_irecv() reported */
+}
+
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_SHM_cancel_recv)
+static inline int MPIDI_SHM_cancel_recv(MPIR_Request * rreq)
+{
+    /* Look rreq up in the posted-receive queue; when it is there, unlink it,
+     * flag its status as cancelled with a zero count and complete it.  A
+     * request that is not queued is left alone.  MPI_SUCCESS is returned in
+     * either case. */
+    MPIR_Request *cur;
+    MPIR_Request *before = NULL;        /* predecessor of cur in the queue */
+
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+
+    /* Walk the list from the head, keeping track of the previous element. */
+    for (cur = MPIDI_POSIX_recvq_posted.head; cur != NULL;
+         before = cur, cur = MPIDI_POSIX_REQUEST(cur)->next) {
+        if (cur != rreq)
+            continue;
+
+        /* Splice cur out of the singly linked list. */
+        if (before == NULL)
+            MPIDI_POSIX_recvq_posted.head = MPIDI_POSIX_REQUEST(cur)->next;
+        else
+            MPIDI_POSIX_REQUEST(before)->next = MPIDI_POSIX_REQUEST(cur)->next;
+
+        /* Removing the last element moves the tail back by one. */
+        if (MPIDI_POSIX_recvq_posted.tail == cur)
+            MPIDI_POSIX_recvq_posted.tail = before;
+
+        MPIR_STATUS_SET_CANCEL_BIT(cur->status, TRUE);
+        MPIR_STATUS_SET_COUNT(cur->status, 0);
+        MPIDI_POSIX_REQUEST_COMPLETE(cur);
+        break;
+    }
+
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    return MPI_SUCCESS;
+}
+
+#endif /* SHM_POSIX_RECV_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/posix/posix_request.h b/src/mpid/ch4/shm/posix/posix_request.h
new file mode 100644
index 0000000..abf78e6
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_request.h
@@ -0,0 +1,26 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_POSIX_REQUEST_H_INCLUDED
+#define SHM_POSIX_REQUEST_H_INCLUDED
+
+#include "posix_impl.h"
+
+static inline void MPIDI_SHM_am_request_init(MPIR_Request * req)
+{
+    MPIR_Assert(0);     /* placeholder: always trips the assertion; 'req' is never touched */
+}
+
+static inline void MPIDI_SHM_am_request_finalize(MPIR_Request * req)
+{
+    MPIR_Assert(0);     /* placeholder: always trips the assertion; 'req' is never touched */
+}
+
+#endif /* SHM_POSIX_REQUEST_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/posix/posix_rma.h b/src/mpid/ch4/shm/posix/posix_rma.h
new file mode 100644
index 0000000..8ea78bf
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_rma.h
@@ -0,0 +1,143 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_POSIX_RMA_H_INCLUDED
+#define SHM_POSIX_RMA_H_INCLUDED
+
+#include "posix_impl.h"
+
+static inline int MPIDI_SHM_put(const void *origin_addr,
+                                int origin_count,
+                                MPI_Datatype origin_datatype,
+                                int target_rank,
+                                MPI_Aint target_disp,
+                                int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    MPIR_Assert(0);     /* placeholder: no put is performed, the assertion always trips */
+    return MPI_SUCCESS; /* only reached if the assertion above does not abort */
+}
+
+static inline int MPIDI_SHM_get(void *origin_addr,
+                                int origin_count,
+                                MPI_Datatype origin_datatype,
+                                int target_rank,
+                                MPI_Aint target_disp,
+                                int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    MPIR_Assert(0);     /* placeholder: no get is performed, the assertion always trips */
+    return MPI_SUCCESS; /* only reached if the assertion above does not abort */
+}
+
+static inline int MPIDI_SHM_rput(const void *origin_addr,
+                                 int origin_count,
+                                 MPI_Datatype origin_datatype,
+                                 int target_rank,
+                                 MPI_Aint target_disp,
+                                 int target_count,
+                                 MPI_Datatype target_datatype,
+                                 MPIR_Win * win, MPIR_Request ** request)
+{
+    MPIR_Assert(0);     /* placeholder: the assertion always trips; *request is never written */
+    return MPI_SUCCESS; /* only reached if the assertion above does not abort */
+}
+
+
+static inline int MPIDI_SHM_compare_and_swap(const void *origin_addr,
+                                             const void *compare_addr,
+                                             void *result_addr,
+                                             MPI_Datatype datatype,
+                                             int target_rank, MPI_Aint target_disp, MPIR_Win * win)
+{
+    MPIR_Assert(0);     /* placeholder: the assertion always trips; result_addr is never written */
+    return MPI_SUCCESS; /* only reached if the assertion above does not abort */
+}
+
+static inline int MPIDI_SHM_raccumulate(const void *origin_addr,
+                                        int origin_count,
+                                        MPI_Datatype origin_datatype,
+                                        int target_rank,
+                                        MPI_Aint target_disp,
+                                        int target_count,
+                                        MPI_Datatype target_datatype,
+                                        MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{
+    MPIR_Assert(0);     /* placeholder: the assertion always trips; *request is never written */
+    return MPI_SUCCESS; /* only reached if the assertion above does not abort */
+}
+
+static inline int MPIDI_SHM_rget_accumulate(const void *origin_addr,
+                                            int origin_count,
+                                            MPI_Datatype origin_datatype,
+                                            void *result_addr,
+                                            int result_count,
+                                            MPI_Datatype result_datatype,
+                                            int target_rank,
+                                            MPI_Aint target_disp,
+                                            int target_count,
+                                            MPI_Datatype target_datatype,
+                                            MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{
+    MPIR_Assert(0);     /* placeholder: the assertion always trips; no output argument is written */
+    return MPI_SUCCESS; /* only reached if the assertion above does not abort */
+}
+
+static inline int MPIDI_SHM_fetch_and_op(const void *origin_addr,
+                                         void *result_addr,
+                                         MPI_Datatype datatype,
+                                         int target_rank,
+                                         MPI_Aint target_disp, MPI_Op op, MPIR_Win * win)
+{
+    MPIR_Assert(0);     /* placeholder: the assertion always trips; result_addr is never written */
+    return MPI_SUCCESS; /* only reached if the assertion above does not abort */
+}
+
+
+static inline int MPIDI_SHM_rget(void *origin_addr,
+                                 int origin_count,
+                                 MPI_Datatype origin_datatype,
+                                 int target_rank,
+                                 MPI_Aint target_disp,
+                                 int target_count,
+                                 MPI_Datatype target_datatype,
+                                 MPIR_Win * win, MPIR_Request ** request)
+{
+    MPIR_Assert(0);     /* placeholder: the assertion always trips; *request is never written */
+    return MPI_SUCCESS; /* only reached if the assertion above does not abort */
+}
+
+
+static inline int MPIDI_SHM_get_accumulate(const void *origin_addr,
+                                           int origin_count,
+                                           MPI_Datatype origin_datatype,
+                                           void *result_addr,
+                                           int result_count,
+                                           MPI_Datatype result_datatype,
+                                           int target_rank,
+                                           MPI_Aint target_disp,
+                                           int target_count,
+                                           MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win)
+{
+    MPIR_Assert(0);     /* placeholder: the assertion always trips; result_addr is never written */
+    return MPI_SUCCESS; /* only reached if the assertion above does not abort */
+}
+
+static inline int MPIDI_SHM_accumulate(const void *origin_addr,
+                                       int origin_count,
+                                       MPI_Datatype origin_datatype,
+                                       int target_rank,
+                                       MPI_Aint target_disp,
+                                       int target_count,
+                                       MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win)
+{
+    MPIR_Assert(0);     /* placeholder: no accumulate is performed, the assertion always trips */
+    return MPI_SUCCESS; /* only reached if the assertion above does not abort */
+}
+
+#endif /* SHM_POSIX_RMA_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/posix/posix_send.h b/src/mpid/ch4/shm/posix/posix_send.h
new file mode 100644
index 0000000..afde708
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_send.h
@@ -0,0 +1,462 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_POSIX_SEND_H_INCLUDED
+#define SHM_POSIX_SEND_H_INCLUDED
+
+#include "posix_impl.h"
+#include "ch4_impl.h"
+#include <../mpi/pt2pt/bsendutil.h>
+/* ---------------------------------------------------- */
+/* from mpid/ch3/channels/nemesis/include/mpid_nem_impl.h */
+/* ---------------------------------------------------- */
+/* Acquire-loads (pbox_)->flag.value; assumes value!=0 means the fbox is
+ * full.  The acquire barrier ensures that later operations that depend on
+ * this check don't escape earlier than this check. */
+#define MPIDI_POSIX_fbox_is_full(pbox_) (OPA_load_acquire_int(&(pbox_)->flag.value))
+
+/* ---------------------------------------------------- */
+/* MPIDI_POSIX_do_isend                                             */
+/* ---------------------------------------------------- */
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_POSIX_do_isend)
+static inline int MPIDI_POSIX_do_isend(const void *buf,
+                                       int count,
+                                       MPI_Datatype datatype,
+                                       int rank,
+                                       int tag,
+                                       MPIR_Comm * comm, int context_offset,
+                                       MPIR_Request ** request, int type)
+{       /* Creates a send request of the given 'type', fills it in and enqueues it on MPIDI_POSIX_sendq. */
+    int dt_contig, mpi_errno = MPI_SUCCESS;
+    MPI_Aint dt_true_lb;
+    MPIR_Request *sreq = NULL;
+    size_t data_sz;
+    MPIR_Datatype *dt_ptr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_DO_ISEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_DO_ISEND);
+
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
+    MPIDI_POSIX_REQUEST_CREATE_SREQ(sreq);
+    sreq->comm = comm;
+    MPIR_Comm_add_ref(comm);
+    MPIDI_POSIX_ENVELOPE_SET(MPIDI_POSIX_REQUEST(sreq), comm->rank, tag,        /* envelope carries our own rank */
+                             comm->context_id + context_offset);
+    MPIDI_POSIX_REQUEST(sreq)->user_buf = (char *) buf + dt_true_lb;
+    MPIDI_POSIX_REQUEST(sreq)->user_count = count;
+    MPIDI_POSIX_REQUEST(sreq)->datatype = datatype;
+    MPIDI_POSIX_REQUEST(sreq)->data_sz = data_sz;
+    MPIDI_POSIX_REQUEST(sreq)->type = type;
+    MPIDI_POSIX_REQUEST(sreq)->dest = rank;     /* the destination is kept separately in 'dest' */
+    MPIDI_POSIX_REQUEST(sreq)->next = NULL;
+    MPIDI_POSIX_REQUEST(sreq)->pending = NULL;
+    MPIDI_POSIX_REQUEST(sreq)->segment_ptr = NULL;      /* stays NULL for contiguous datatypes */
+
+    if (!dt_contig) {
+        MPIDI_POSIX_REQUEST(sreq)->segment_ptr = MPID_Segment_alloc();
+        MPIR_ERR_CHKANDJUMP1((MPIDI_POSIX_REQUEST(sreq)->segment_ptr == NULL), mpi_errno,
+                             MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
+        MPID_Segment_init((char *) buf, MPIDI_POSIX_REQUEST(sreq)->user_count,
+                          MPIDI_POSIX_REQUEST(sreq)->datatype,
+                          MPIDI_POSIX_REQUEST(sreq)->segment_ptr, 0);
+        MPIDI_POSIX_REQUEST(sreq)->segment_first = 0;
+        MPIDI_POSIX_REQUEST(sreq)->segment_size = data_sz;
+    }
+
+    dtype_add_ref_if_not_builtin(datatype);
+    /* enqueue sreq */
+    MPIDI_POSIX_REQUEST_ENQUEUE(sreq, MPIDI_POSIX_sendq);
+    *request = sreq;
+    MPL_DBG_MSG_FMT(MPIR_DBG_HANDLE, TYPICAL,
+                    (MPL_DBG_FDEST,
+                     "Enqueued to grank %d from %d (comm_kind %d) in send %d,%d,%d\n",
+                     MPIDI_CH4U_rank_to_lpid(rank, comm), MPIDI_POSIX_mem_region.rank,
+                     comm->comm_kind, comm->rank, tag, comm->context_id + context_offset));
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_DO_ISEND);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;       /* NOTE(review): sreq and the comm reference are not released, *request is not written */
+}
+
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_SHM_SEND)
+static inline int MPIDI_SHM_send(const void *buf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 int rank,
+                                 int tag,
+                                 MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{       /* Copies a small contiguous message straight into a free cell, else queues a send request. */
+    int dt_contig __attribute__ ((__unused__)), mpi_errno = MPI_SUCCESS;
+    MPI_Aint dt_true_lb;
+    size_t data_sz;
+    MPIR_Datatype *dt_ptr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_SEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_SEND);
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
+
+    /* try to send immediately, contig, short message */
+    if (dt_contig && data_sz <= MPIDI_POSIX_EAGER_THRESHOLD) {
+        /* eager message */
+        int grank = MPIDI_CH4U_rank_to_lpid(rank, comm);        /* index into RecvQ[] below */
+
+        /* Try freeQ */
+        if (!MPIDI_POSIX_queue_empty(MPIDI_POSIX_mem_region.my_freeQ)) {
+            MPIDI_POSIX_cell_ptr_t cell;
+            MPIDI_POSIX_queue_dequeue(MPIDI_POSIX_mem_region.my_freeQ, &cell);
+            MPIDI_POSIX_ENVELOPE_SET(cell, comm->rank, tag, comm->context_id + context_offset);
+            cell->pkt.mpich.datalen = data_sz;
+            cell->pkt.mpich.type = MPIDI_POSIX_TYPEEAGER;
+            MPIR_Memcpy((void *) cell->pkt.mpich.p.payload, (char *) buf + dt_true_lb, data_sz);
+            cell->pending = NULL;
+            MPIDI_POSIX_queue_enqueue(MPIDI_POSIX_mem_region.RecvQ[grank], cell);
+            *request = NULL;    /* payload already copied into the cell, so no request is returned */
+            MPL_DBG_MSG_FMT(MPIR_DBG_HANDLE, TYPICAL,
+                            (MPL_DBG_FDEST, "Sent to grank %d from %d in send %d,%d,%d\n", grank,
+                             cell->my_rank, cell->rank, cell->tag, cell->context_id));
+            goto fn_exit;
+        }
+    }
+
+    /* Long message or */
+    /* Failed to send immediately - create and return request */
+    mpi_errno =
+        MPIDI_POSIX_do_isend(buf, count, datatype, rank, tag, comm, context_offset, request,
+                             MPIDI_POSIX_TYPESTANDARD);
+
+  fn_exit:
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_SEND);
+    return mpi_errno;   /* MPI_SUCCESS on the fast path, otherwise MPIDI_POSIX_do_isend()'s result */
+}
+
+
+
+
+static inline int MPIDI_SHM_irsend(const void *buf,
+                                   int count,
+                                   MPI_Datatype datatype,
+                                   int rank,
+                                   int tag,
+                                   MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{       /* Thin wrapper: queues a MPIDI_POSIX_TYPEREADY send inside the shm critical section. */
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_ISEND);   /* NOTE(review): ISEND state id, not IRSEND */
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_ISEND);
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    mpi_errno =
+        MPIDI_POSIX_do_isend(buf, count, datatype, rank, tag, comm, context_offset, request,
+                             MPIDI_POSIX_TYPEREADY);
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_ISEND);
+    return mpi_errno;   /* whatever MPIDI_POSIX_do_isend() reported */
+}
+
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_SHM_SSEND)
+static inline int MPIDI_SHM_ssend(const void *buf,
+                                  int count,
+                                  MPI_Datatype datatype,
+                                  int rank,
+                                  int tag,
+                                  MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{       /* Thin wrapper: queues a MPIDI_POSIX_TYPESYNC send inside the shm critical section. */
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_SSEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_SSEND);
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    mpi_errno =
+        MPIDI_POSIX_do_isend(buf, count, datatype, rank, tag, comm, context_offset, request,
+                             MPIDI_POSIX_TYPESYNC);
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_SSEND);
+    return mpi_errno;   /* whatever MPIDI_POSIX_do_isend() reported */
+}
+
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_SHM_STARTALL)
+static inline int MPIDI_SHM_startall(int count, MPIR_Request * requests[])
+{       /* Starts every persistent request in requests[0..count-1] inside the shm critical section. */
+    int i, mpi_errno = MPI_SUCCESS;     /* reassigned per request: the return value reflects the last start */
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_STARTALL);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_STARTALL);
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+
+    for (i = 0; i < count; i++) {
+        MPIR_Request *preq = requests[i];
+
+        if (preq->kind == MPIR_REQUEST_KIND__PREQUEST_SEND) {
+            if (MPIDI_POSIX_REQUEST(preq)->type != MPIDI_POSIX_TYPEBUFFERED) {
+                mpi_errno =
+                    MPIDI_POSIX_do_isend(MPIDI_POSIX_REQUEST(preq)->user_buf,
+                                         MPIDI_POSIX_REQUEST(preq)->user_count,
+                                         MPIDI_POSIX_REQUEST(preq)->datatype,
+                                         MPIDI_POSIX_REQUEST(preq)->dest,
+                                         MPIDI_POSIX_REQUEST(preq)->tag, preq->comm,
+                                         MPIDI_POSIX_REQUEST(preq)->context_id -
+                                         preq->comm->context_id, &preq->u.persist.real_request,
+                                         MPIDI_POSIX_REQUEST(preq)->type);
+            }
+            else {
+                MPI_Request sreq_handle;
+                mpi_errno =
+                    MPIR_Ibsend_impl(MPIDI_POSIX_REQUEST(preq)->user_buf,
+                                     MPIDI_POSIX_REQUEST(preq)->user_count,
+                                     MPIDI_POSIX_REQUEST(preq)->datatype,
+                                     MPIDI_POSIX_REQUEST(preq)->dest,
+                                     MPIDI_POSIX_REQUEST(preq)->tag, preq->comm, &sreq_handle);
+
+                if (mpi_errno == MPI_SUCCESS)
+                    MPIR_Request_get_ptr(sreq_handle, preq->u.persist.real_request);
+            }
+        }
+        else if (preq->kind == MPIR_REQUEST_KIND__PREQUEST_RECV) {
+            mpi_errno =
+                MPIDI_POSIX_do_irecv(MPIDI_POSIX_REQUEST(preq)->user_buf,
+                                     MPIDI_POSIX_REQUEST(preq)->user_count,
+                                     MPIDI_POSIX_REQUEST(preq)->datatype,
+                                     MPIDI_POSIX_REQUEST(preq)->rank,
+                                     MPIDI_POSIX_REQUEST(preq)->tag, preq->comm,
+                                     MPIDI_POSIX_REQUEST(preq)->context_id - preq->comm->context_id,
+                                     &preq->u.persist.real_request);
+        }
+        else {
+            MPIR_Assert(0);     /* only persistent send and receive requests are expected here */
+        }
+
+        if (mpi_errno == MPI_SUCCESS) {
+            preq->status.MPI_ERROR = MPI_SUCCESS;
+            if (preq->kind == MPIR_REQUEST_KIND__PREQUEST_SEND &&       /* 'type' is unset for receives */
+                MPIDI_POSIX_REQUEST(preq)->type == MPIDI_POSIX_TYPEBUFFERED) {
+                preq->cc_ptr = &preq->cc;
+                MPIR_cc_set(&preq->cc, 0);
+            }
+            else
+                preq->cc_ptr = &preq->u.persist.real_request->cc;       /* completion follows the real request */
+        }
+        else {
+            preq->u.persist.real_request = NULL;
+            preq->status.MPI_ERROR = mpi_errno;
+            preq->cc_ptr = &preq->cc;
+            MPIR_cc_set(&preq->cc, 0);  /* a failed start leaves the persistent request with cc == 0 */
+        }
+    }
+
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_STARTALL);
+    return mpi_errno;
+}
+
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_SHM_SEND_INIT)
+static inline int MPIDI_SHM_send_init(const void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      int rank,
+                                      int tag,
+                                      MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{       /* Build a persistent standard-mode send request in *request; nothing is sent here. */
+    int mpi_errno = MPI_SUCCESS;        /* never modified below: always returns MPI_SUCCESS */
+    MPIR_Request *sreq = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_SEND_INIT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_SEND_INIT);
+    MPIDI_POSIX_REQUEST_CREATE_SREQ(sreq);
+    MPIR_Object_set_ref(sreq, 1);       /* reference count starts at 1 */
+    MPIR_cc_set(&(sreq)->cc, 0);        /* own counter = 0; startall repoints cc_ptr later */
+    sreq->kind = MPIR_REQUEST_KIND__PREQUEST_SEND;
+    sreq->comm = comm;
+    MPIR_Comm_add_ref(comm);            /* the request takes its own reference on comm */
+    MPIDI_POSIX_ENVELOPE_SET(MPIDI_POSIX_REQUEST(sreq), comm->rank, tag,
+                             comm->context_id + context_offset);
+    MPIDI_POSIX_REQUEST(sreq)->user_buf = (char *) buf; /* the cast drops const */
+    MPIDI_POSIX_REQUEST(sreq)->user_count = count;
+    MPIDI_POSIX_REQUEST(sreq)->dest = rank;     /* the envelope above got comm->rank instead */
+    MPIDI_POSIX_REQUEST(sreq)->datatype = datatype;
+    MPIDI_POSIX_REQUEST(sreq)->type = MPIDI_POSIX_TYPESTANDARD; /* mode replayed by startall */
+    *request = sreq;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_SEND_INIT);
+    return mpi_errno;
+}
+
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_SHM_SSEND_INIT)
+static inline int MPIDI_SHM_ssend_init(const void *buf,
+                                       int count,
+                                       MPI_Datatype datatype,
+                                       int rank,
+                                       int tag,
+                                       MPIR_Comm * comm,
+                                       int context_offset, MPIR_Request ** request)
+{       /* Build a persistent synchronous-mode send request in *request; nothing is sent. */
+    int mpi_errno = MPI_SUCCESS;        /* never modified below: always returns MPI_SUCCESS */
+    MPIR_Request *sreq = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_SSEND_INIT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_SSEND_INIT);
+    MPIDI_POSIX_REQUEST_CREATE_SREQ(sreq);
+    MPIR_Object_set_ref(sreq, 1);       /* reference count starts at 1 */
+    MPIR_cc_set(&(sreq)->cc, 0);        /* same counter reset as send_init and rsend_init */
+    sreq->kind = MPIR_REQUEST_KIND__PREQUEST_SEND;
+    sreq->comm = comm;
+    MPIR_Comm_add_ref(comm);            /* the request takes its own reference on comm */
+    MPIDI_POSIX_ENVELOPE_SET(MPIDI_POSIX_REQUEST(sreq), comm->rank, tag,
+                             comm->context_id + context_offset);
+    MPIDI_POSIX_REQUEST(sreq)->user_buf = (char *) buf; /* the cast drops const */
+    MPIDI_POSIX_REQUEST(sreq)->user_count = count;
+    MPIDI_POSIX_REQUEST(sreq)->dest = rank;     /* the envelope above got comm->rank instead */
+    MPIDI_POSIX_REQUEST(sreq)->datatype = datatype;
+    MPIDI_POSIX_REQUEST(sreq)->type = MPIDI_POSIX_TYPESYNC;     /* mode replayed by startall */
+    *request = sreq;
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_SSEND_INIT);
+    return mpi_errno;
+}
+
+static inline int MPIDI_SHM_bsend_init(const void *buf,
+                                       int count,
+                                       MPI_Datatype datatype,
+                                       int rank,
+                                       int tag,
+                                       MPIR_Comm * comm,
+                                       int context_offset, MPIR_Request ** request)
+{       /* Build a persistent buffered-mode send request in *request; nothing is sent. */
+    int mpi_errno = MPI_SUCCESS;        /* never modified below: always returns MPI_SUCCESS */
+    MPIR_Request *sreq = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_BSEND_INIT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_BSEND_INIT);
+    MPIDI_POSIX_REQUEST_CREATE_SREQ(sreq);
+    MPIR_Object_set_ref(sreq, 1);       /* reference count starts at 1 */
+    MPIR_cc_set(&(sreq)->cc, 0);        /* same counter reset as send_init and rsend_init */
+    sreq->kind = MPIR_REQUEST_KIND__PREQUEST_SEND;
+    sreq->comm = comm;
+    MPIR_Comm_add_ref(comm);            /* the request takes its own reference on comm */
+    MPIDI_POSIX_ENVELOPE_SET(MPIDI_POSIX_REQUEST(sreq), comm->rank, tag,
+                             comm->context_id + context_offset);
+    MPIDI_POSIX_REQUEST(sreq)->user_buf = (char *) buf; /* the cast drops const */
+    MPIDI_POSIX_REQUEST(sreq)->user_count = count;
+    MPIDI_POSIX_REQUEST(sreq)->dest = rank;     /* the envelope above got comm->rank instead */
+    MPIDI_POSIX_REQUEST(sreq)->datatype = datatype;
+    MPIDI_POSIX_REQUEST(sreq)->type = MPIDI_POSIX_TYPEBUFFERED; /* startall uses Ibsend for it */
+    *request = sreq;
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_BSEND_INIT);
+    return mpi_errno;
+}
+
+static inline int MPIDI_SHM_rsend_init(const void *buf,
+                                       int count,
+                                       MPI_Datatype datatype,
+                                       int rank,
+                                       int tag,
+                                       MPIR_Comm * comm,
+                                       int context_offset, MPIR_Request ** request)
+{       /* Build a persistent ready-mode send request in *request; nothing is sent here. */
+    int mpi_errno = MPI_SUCCESS;        /* never modified below: always returns MPI_SUCCESS */
+    MPIR_Request *sreq = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_RSEND_INIT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_RSEND_INIT);
+    MPIDI_POSIX_REQUEST_CREATE_SREQ(sreq);
+    MPIR_Object_set_ref(sreq, 1);       /* reference count starts at 1 */
+    MPIR_cc_set(&(sreq)->cc, 0);        /* own counter = 0; startall repoints cc_ptr later */
+    sreq->kind = MPIR_REQUEST_KIND__PREQUEST_SEND;
+    sreq->comm = comm;
+    MPIR_Comm_add_ref(comm);            /* the request takes its own reference on comm */
+    MPIDI_POSIX_ENVELOPE_SET(MPIDI_POSIX_REQUEST(sreq), comm->rank, tag,
+                             comm->context_id + context_offset);
+    MPIDI_POSIX_REQUEST(sreq)->user_buf = (char *) buf; /* the cast drops const */
+    MPIDI_POSIX_REQUEST(sreq)->user_count = count;
+    MPIDI_POSIX_REQUEST(sreq)->dest = rank;     /* the envelope above got comm->rank instead */
+    MPIDI_POSIX_REQUEST(sreq)->datatype = datatype;
+    MPIDI_POSIX_REQUEST(sreq)->type = MPIDI_POSIX_TYPEREADY;    /* mode replayed by startall */
+    *request = sreq;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_RSEND_INIT);
+    return mpi_errno;
+}
+
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_SHM_isend)
+static inline int MPIDI_SHM_isend(const void *buf,
+                                  int count,
+                                  MPI_Datatype datatype,
+                                  int rank,
+                                  int tag,
+                                  MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{       /* Nonblocking standard-mode send: a locked call to MPIDI_POSIX_do_isend. */
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_ISEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_ISEND);
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    mpi_errno =                 /* all arguments are forwarded unchanged */
+        MPIDI_POSIX_do_isend(buf, count, datatype, rank, tag, comm, context_offset, request,
+                             MPIDI_POSIX_TYPESTANDARD);
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_ISEND);
+    return mpi_errno;           /* whatever MPIDI_POSIX_do_isend reported */
+}
+
+static inline int MPIDI_SHM_issend(const void *buf,
+                                   int count,
+                                   MPI_Datatype datatype,
+                                   int rank,
+                                   int tag,
+                                   MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{       /* Nonblocking synchronous-mode send: a locked call to MPIDI_POSIX_do_isend. */
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_SHM_ISSEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_SHM_ISSEND);
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    mpi_errno =                 /* same call as MPIDI_SHM_isend except for the mode constant */
+        MPIDI_POSIX_do_isend(buf, count, datatype, rank, tag, comm, context_offset, request,
+                             MPIDI_POSIX_TYPESYNC);
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_SHM_ISSEND);
+    return mpi_errno;           /* whatever MPIDI_POSIX_do_isend reported */
+}
+
+static inline int MPIDI_SHM_cancel_send(MPIR_Request * sreq)
+{       /* Cancel sreq if it is still linked on MPIDI_POSIX_sendq; otherwise change nothing. */
+    MPIR_Request *req = NULL;           /* declared before any statement, like the siblings */
+    MPIR_Request *prev_req = NULL;      /* trails req; handed to the dequeue macro */
+    int mpi_errno = MPI_SUCCESS;
+
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    req = MPIDI_POSIX_sendq.head;       /* head is read only inside the critical section */
+    while (req) {
+        if (req == sreq) {
+            MPIR_STATUS_SET_CANCEL_BIT(sreq->status, TRUE);
+            MPIR_STATUS_SET_COUNT(sreq->status, 0);
+            MPIDI_POSIX_REQUEST_COMPLETE(sreq); /* NOTE(review): sreq is used again below */
+            MPIDI_POSIX_REQUEST_DEQUEUE_AND_SET_ERROR(&sreq, prev_req, MPIDI_POSIX_sendq,
+                                                      mpi_errno);
+            break;
+        }
+
+        prev_req = req;
+        req = MPIDI_POSIX_REQUEST(req)->next;
+    }
+
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_POSIX_SHM_MUTEX);
+    return mpi_errno;           /* MPI_SUCCESS when sreq was not found on the queue */
+}
+
+#endif /* SHM_POSIX_SEND_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/posix/posix_spawn.h b/src/mpid/ch4/shm/posix/posix_spawn.h
new file mode 100644
index 0000000..2a2e001
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_spawn.h
@@ -0,0 +1,50 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_POSIX_DYNPROC_H_INCLUDED
+#define SHM_POSIX_DYNPROC_H_INCLUDED
+
+#include "posix_impl.h"
+
+static inline int MPIDI_SHM_comm_connect(const char *port_name,
+                                         MPIR_Info * info,
+                                         int root, MPIR_Comm * comm, MPIR_Comm ** newcomm_ptr)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_comm_disconnect(MPIR_Comm * comm_ptr)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_open_port(MPIR_Info * info_ptr, char *port_name)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm; port_name is never written */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_close_port(const char *port_name)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_comm_accept(const char *port_name,
+                                        MPIR_Info * info,
+                                        int root, MPIR_Comm * comm, MPIR_Comm ** newcomm_ptr)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm; *newcomm_ptr is never set */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+#endif /* SHM_POSIX_DYNPROC_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/posix/posix_unimpl.h b/src/mpid/ch4/shm/posix/posix_unimpl.h
new file mode 100644
index 0000000..db8e66b
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_unimpl.h
@@ -0,0 +1,19 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/mpid/ch4/shm/posix/posix_win.h b/src/mpid/ch4/shm/posix/posix_win.h
new file mode 100644
index 0000000..aeac908
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/posix_win.h
@@ -0,0 +1,185 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_POSIX_WIN_H_INCLUDED
+#define SHM_POSIX_WIN_H_INCLUDED
+
+#include "posix_impl.h"
+
+static inline int MPIDI_SHM_win_set_info(MPIR_Win * win, MPIR_Info * info)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+
+static inline int MPIDI_SHM_win_start(MPIR_Group * group, int assert, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+
+static inline int MPIDI_SHM_win_complete(MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_post(MPIR_Group * group, int assert, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+
+static inline int MPIDI_SHM_win_wait(MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+
+static inline int MPIDI_SHM_win_test(MPIR_Win * win, int *flag)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm; *flag is never written */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_lock(int lock_type, int rank, int assert, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+
+static inline int MPIDI_SHM_win_unlock(int rank, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_get_info(MPIR_Win * win, MPIR_Info ** info_p_p)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm; *info_p_p is never set */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+
+static inline int MPIDI_SHM_win_free(MPIR_Win ** win_ptr)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm; *win_ptr is left untouched */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_fence(int assert, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_create(void *base,
+                                       MPI_Aint length,
+                                       int disp_unit,
+                                       MPIR_Info * info, MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm; *win_ptr is never set */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_attach(MPIR_Win * win, void *base, MPI_Aint size)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_allocate_shared(MPI_Aint size,
+                                                int disp_unit,
+                                                MPIR_Info * info_ptr,
+                                                MPIR_Comm * comm_ptr,
+                                                void **base_ptr, MPIR_Win ** win_ptr)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm; no output argument is set */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_detach(MPIR_Win * win, const void *base)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_shared_query(MPIR_Win * win,
+                                             int rank,
+                                             MPI_Aint * size, int *disp_unit, void *baseptr)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm; no output argument is set */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_allocate(MPI_Aint size,
+                                         int disp_unit,
+                                         MPIR_Info * info,
+                                         MPIR_Comm * comm, void *baseptr, MPIR_Win ** win)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm; no output argument is set */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_flush(int rank, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_flush_local_all(MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_unlock_all(MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_create_dynamic(MPIR_Info * info, MPIR_Comm * comm, MPIR_Win ** win)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm; *win is never set */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_flush_local(int rank, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_sync(MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_flush_all(MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_win_lock_all(int assert, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* unimplemented in POSIX shm: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+
+#endif /* SHM_POSIX_WIN_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/posix/shm_direct.h b/src/mpid/ch4/shm/posix/shm_direct.h
new file mode 100644
index 0000000..9173da2
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/shm_direct.h
@@ -0,0 +1,27 @@
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_DIRECT_H_INCLUDED
+#define SHM_DIRECT_H_INCLUDED
+
+#include "posix_init.h"
+#include "posix_probe.h"
+#include "posix_progress.h"
+#include "posix_recv.h"
+#include "posix_request.h"
+#include "posix_send.h"
+#include "posix_win.h"
+#include "posix_rma.h"
+#include "posix_am.h"
+#include "posix_spawn.h"
+#include "posix_comm.h"
+#include "posix_coll.h"
+#include "posix_unimpl.h"
+
+#endif /* SHM_DIRECT_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/posix/subconfigure.m4 b/src/mpid/ch4/shm/posix/subconfigure.m4
new file mode 100644
index 0000000..0a32273
--- /dev/null
+++ b/src/mpid/ch4/shm/posix/subconfigure.m4
@@ -0,0 +1,53 @@
+[#] start of __file__
+dnl MPICH_SUBCFG_BEFORE=src/mpid/common/shm
+dnl MPICH_SUBCFG_AFTER=src/mpid/ch4
+
+AC_DEFUN([PAC_SUBCFG_PREREQ_]PAC_SUBCFG_AUTO_SUFFIX,[
+    AM_COND_IF([BUILD_CH4],[
+        for shm in $ch4_shm ; do        # was the posix shmmod selected?
+            AS_CASE([$shm],[posix],[build_ch4_shm_posix=yes])
+        done
+
+        AC_ARG_WITH(ch4-shmmod-posix-args,
+        [  --with-ch4-shmmod-posix-args=arg1:arg2:arg3
+        CH4 POSIX shmmod arguments:
+                disable-lock-free-queues - Disable atomics and lock-free queues
+                ],
+                [posix_shmmod_args=$withval],
+                [posix_shmmod_args=])
+
+dnl Parse the shmmod arguments
+        SAVE_IFS=$IFS                   # restored once the loop below is done
+        IFS=':'                         # the argument list is colon separated
+        args_array=$posix_shmmod_args
+        do_disable_lock_free_queues=false       # default: lock free queues stay enabled
+        echo "Parsing Arguments for POSIX shmmod"
+        for arg in $args_array; do
+        case ${arg} in                  # unrecognized arguments are silently ignored
+            disable-lock-free-queues)
+                do_disable_lock_free_queues=true
+                echo " ---> CH4::SHM::POSIX : $arg"
+                ;;
+            esac
+        done
+        IFS=$SAVE_IFS
+
+        if [test "$do_disable_lock_free_queues" = "true"]; then
+            AC_MSG_NOTICE([Disabling POSIX shared memory lock free queues])
+        else    # NOTE review: also taken when posix is not among the selected shmmods
+            AC_MSG_NOTICE([Enabling POSIX shared memory lock free queues])
+            PAC_APPEND_FLAG([-DMPIDI_POSIX_USE_LOCK_FREE_QUEUES],[CPPFLAGS])
+        fi
+        # the POSIX shmmod depends on the common shm code
+        build_mpid_common_shm=yes
+    ])
+    AM_CONDITIONAL([BUILD_SHM_POSIX],[test "X$build_ch4_shm_posix" = "Xyes"])
+])dnl
+
+AC_DEFUN([PAC_SUBCFG_BODY_]PAC_SUBCFG_AUTO_SUFFIX,[
+AM_COND_IF([BUILD_SHM_POSIX],[
+    AC_MSG_NOTICE([RUNNING CONFIGURE FOR ch4:shm:posix])  # banner only - no checks are run here
+])dnl end AM_COND_IF(BUILD_SHM_POSIX,...)
+])dnl end _BODY
+
+[#] end of __file__
diff --git a/src/mpid/ch4/shm/stubshm/Makefile.mk b/src/mpid/ch4/shm/stubshm/Makefile.mk
new file mode 100644
index 0000000..b1dcb01
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/Makefile.mk
@@ -0,0 +1,7 @@
+if BUILD_SHM_STUBSHM
+
+mpi_core_sources += src/mpid/ch4/shm/stubshm/globals.c    \
+                    src/mpid/ch4/shm/stubshm/func_table.c
+# errnames_txt_files += src/mpid/ch4/shm/stubshm/errnames.txt
+
+endif
diff --git a/src/mpid/ch4/shm/stubshm/func_table.c b/src/mpid/ch4/shm/stubshm/func_table.c
new file mode 100644
index 0000000..1244e17
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/func_table.c
@@ -0,0 +1,152 @@
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#ifndef SHM_DIRECT
+#define SHM_DISABLE_INLINES
+#include <mpidimpl.h>
+#include "shm_direct.h"
+MPIDI_SHM_funcs_t MPIDI_SHM_stubshm_funcs = {     /* positional: order must match the struct */
+    MPIDI_SHM_init,
+    MPIDI_SHM_finalize,
+    MPIDI_SHM_progress,
+    MPIDI_SHM_reg_hdr_handler,
+    MPIDI_SHM_comm_connect,
+    MPIDI_SHM_comm_disconnect,
+    MPIDI_SHM_open_port,
+    MPIDI_SHM_close_port,
+    MPIDI_SHM_comm_accept,
+    MPIDI_SHM_send_am_hdr,
+    MPIDI_SHM_inject_am_hdr,
+    MPIDI_SHM_send_am,
+    MPIDI_SHM_inject_am,
+    MPIDI_SHM_send_amv,
+    MPIDI_SHM_inject_amv,
+    MPIDI_SHM_send_am_hdr_reply,
+    MPIDI_SHM_inject_am_hdr_reply,
+    MPIDI_SHM_send_am_reply,
+    MPIDI_SHM_inject_am_reply,
+    MPIDI_SHM_send_amv_reply,
+    MPIDI_SHM_inject_amv_reply,
+    MPIDI_SHM_am_hdr_max_sz,
+    MPIDI_SHM_am_inject_max_sz,
+    MPIDI_SHM_am_recv,
+    MPIDI_SHM_comm_get_lpid,
+    MPIDI_SHM_gpid_get,
+    MPIDI_SHM_get_node_id,
+    MPIDI_SHM_get_max_node_id,
+    MPIDI_SHM_getallincomm,
+    MPIDI_SHM_gpid_tolpidarray,
+    MPIDI_SHM_create_intercomm_from_lpids,
+    MPIDI_SHM_comm_create,
+    MPIDI_SHM_comm_destroy,
+    MPIDI_SHM_am_request_init,
+};
+
+MPIDI_SHM_native_funcs_t MPIDI_SHM_native_stubshm_funcs = {       /* positional initializer */
+    MPIDI_SHM_send,             /* NOTE(review): entry order must match the struct's fields */
+    MPIDI_SHM_ssend,
+    MPIDI_SHM_startall,
+    MPIDI_SHM_send_init,
+    MPIDI_SHM_ssend_init,
+    MPIDI_SHM_rsend_init,
+    MPIDI_SHM_bsend_init,
+    MPIDI_SHM_isend,
+    MPIDI_SHM_issend,
+    MPIDI_SHM_cancel_send,
+    MPIDI_SHM_recv_init,
+    MPIDI_SHM_recv,
+    MPIDI_SHM_irecv,
+    MPIDI_SHM_imrecv,
+    MPIDI_SHM_cancel_recv,
+    MPIDI_SHM_alloc_mem,
+    MPIDI_SHM_free_mem,
+    MPIDI_SHM_improbe,
+    MPIDI_SHM_iprobe,
+    MPIDI_SHM_win_set_info,
+    MPIDI_SHM_win_shared_query,
+    MPIDI_SHM_put,
+    MPIDI_SHM_win_start,
+    MPIDI_SHM_win_complete,
+    MPIDI_SHM_win_post,
+    MPIDI_SHM_win_wait,
+    MPIDI_SHM_win_test,
+    MPIDI_SHM_win_lock,
+    MPIDI_SHM_win_unlock,
+    MPIDI_SHM_win_get_info,
+    MPIDI_SHM_get,
+    MPIDI_SHM_win_free,
+    MPIDI_SHM_win_fence,
+    MPIDI_SHM_win_create,
+    MPIDI_SHM_accumulate,
+    MPIDI_SHM_win_attach,
+    MPIDI_SHM_win_allocate_shared,
+    MPIDI_SHM_rput,
+    MPIDI_SHM_win_flush_local,
+    MPIDI_SHM_win_detach,
+    MPIDI_SHM_compare_and_swap,
+    MPIDI_SHM_raccumulate,
+    MPIDI_SHM_rget_accumulate,
+    MPIDI_SHM_fetch_and_op,
+    MPIDI_SHM_win_allocate,
+    MPIDI_SHM_win_flush,
+    MPIDI_SHM_win_flush_local_all,
+    MPIDI_SHM_win_unlock_all,
+    MPIDI_SHM_win_create_dynamic,
+    MPIDI_SHM_rget,
+    MPIDI_SHM_win_sync,
+    MPIDI_SHM_win_flush_all,
+    MPIDI_SHM_get_accumulate,
+    MPIDI_SHM_win_lock_all,
+    MPIDI_SHM_barrier,
+    MPIDI_SHM_bcast,
+    MPIDI_SHM_allreduce,
+    MPIDI_SHM_allgather,
+    MPIDI_SHM_allgatherv,
+    MPIDI_SHM_scatter,
+    MPIDI_SHM_scatterv,
+    MPIDI_SHM_gather,
+    MPIDI_SHM_gatherv,
+    MPIDI_SHM_alltoall,
+    MPIDI_SHM_alltoallv,
+    MPIDI_SHM_alltoallw,
+    MPIDI_SHM_reduce,
+    MPIDI_SHM_reduce_scatter,
+    MPIDI_SHM_reduce_scatter_block,
+    MPIDI_SHM_scan,
+    MPIDI_SHM_exscan,
+    MPIDI_SHM_neighbor_allgather,
+    MPIDI_SHM_neighbor_allgatherv,
+    MPIDI_SHM_neighbor_alltoall,
+    MPIDI_SHM_neighbor_alltoallv,
+    MPIDI_SHM_neighbor_alltoallw,
+    MPIDI_SHM_ineighbor_allgather,
+    MPIDI_SHM_ineighbor_allgatherv,
+    MPIDI_SHM_ineighbor_alltoall,
+    MPIDI_SHM_ineighbor_alltoallv,
+    MPIDI_SHM_ineighbor_alltoallw,
+    MPIDI_SHM_ibarrier,
+    MPIDI_SHM_ibcast,
+    MPIDI_SHM_iallgather,
+    MPIDI_SHM_iallgatherv,
+    MPIDI_SHM_iallreduce,
+    MPIDI_SHM_ialltoall,
+    MPIDI_SHM_ialltoallv,
+    MPIDI_SHM_ialltoallw,
+    MPIDI_SHM_iexscan,
+    MPIDI_SHM_igather,
+    MPIDI_SHM_igatherv,
+    MPIDI_SHM_ireduce_scatter_block,
+    MPIDI_SHM_ireduce_scatter,
+    MPIDI_SHM_ireduce,
+    MPIDI_SHM_iscan,
+    MPIDI_SHM_iscatter,
+    MPIDI_SHM_iscatterv,
+};
+#endif
diff --git a/src/mpid/ch4/shm/stubshm/globals.c b/src/mpid/ch4/shm/stubshm/globals.c
new file mode 100644
index 0000000..74ad1a3
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/globals.c
@@ -0,0 +1,13 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#include <mpidimpl.h>
+#include "stubshm_impl.h"
diff --git a/src/mpid/ch4/shm/stubshm/shm_direct.h b/src/mpid/ch4/shm/stubshm/shm_direct.h
new file mode 100644
index 0000000..69fd12b
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/shm_direct.h
@@ -0,0 +1,27 @@
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_DIRECT_H_INCLUDED
+#define SHM_DIRECT_H_INCLUDED
+
+#include "stubshm_init.h"
+#include "stubshm_probe.h"
+#include "stubshm_progress.h"
+#include "stubshm_recv.h"
+#include "stubshm_request.h"
+#include "stubshm_send.h"
+#include "stubshm_win.h"
+#include "stubshm_rma.h"
+#include "stubshm_am.h"
+#include "stubshm_spawn.h"
+#include "stubshm_comm.h"
+#include "stubshm_coll.h"
+#include "stubshm_unimpl.h"
+
+#endif /* SHM_DIRECT_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/stubshm_am.h b/src/mpid/ch4/shm/stubshm/stubshm_am.h
new file mode 100644
index 0000000..1ff7dab
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/stubshm_am.h
@@ -0,0 +1,173 @@
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_STUBSHM_AM_H_INCLUDED
+#define SHM_STUBSHM_AM_H_INCLUDED
+
+#include "stubshm_impl.h"
+
+static inline int MPIDI_SHM_reg_hdr_handler(int handler_id,
+                                            MPIDI_SHM_am_origin_handler_fn origin_handler_fn,
+                                            MPIDI_SHM_am_target_handler_fn target_handler_fn)
+{
+    MPIR_Assert(0);             /* stubshm placeholder: no handler is ever registered */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_send_am_hdr(int rank,
+                                        MPIR_Comm * comm,
+                                        int handler_id,
+                                        const void *am_hdr,
+                                        size_t am_hdr_sz, MPIR_Request * sreq, void *shm_context)
+{
+    MPIR_Assert(0);             /* stubshm placeholder: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_send_am(int rank,
+                                    MPIR_Comm * comm,
+                                    int handler_id,
+                                    const void *am_hdr,
+                                    size_t am_hdr_sz,
+                                    const void *data,
+                                    MPI_Count count,
+                                    MPI_Datatype datatype, MPIR_Request * sreq, void *shm_context)
+{
+    MPIR_Assert(0);             /* stubshm placeholder: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_send_amv(int rank,
+                                     MPIR_Comm * comm,
+                                     int handler_id,
+                                     struct iovec *am_hdr,
+                                     size_t iov_len,
+                                     const void *data,
+                                     MPI_Count count,
+                                     MPI_Datatype datatype, MPIR_Request * sreq, void *shm_context)
+{
+    MPIR_Assert(0);             /* stubshm placeholder: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_send_am_hdr_reply(MPIR_Context_id_t context_id, int src_rank,
+                                              int handler_id,
+                                              const void *am_hdr,
+                                              size_t am_hdr_sz, MPIR_Request * sreq)
+{
+    MPIR_Assert(0);             /* stubshm placeholder: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_send_am_reply(MPIR_Context_id_t context_id, int src_rank,
+                                          int handler_id,
+                                          const void *am_hdr,
+                                          size_t am_hdr_sz,
+                                          const void *data,
+                                          MPI_Count count,
+                                          MPI_Datatype datatype, MPIR_Request * sreq)
+{
+    MPIR_Assert(0);             /* stubshm placeholder: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_send_amv_reply(MPIR_Context_id_t context_id, int src_rank,
+                                           int handler_id,
+                                           struct iovec *am_hdr,
+                                           size_t iov_len,
+                                           const void *data,
+                                           MPI_Count count,
+                                           MPI_Datatype datatype, MPIR_Request * sreq)
+{
+    MPIR_Assert(0);             /* stubshm placeholder: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline size_t MPIDI_SHM_am_hdr_max_sz(void)
+{
+    MPIR_Assert(0);             /* stubshm placeholder: every call trips this assertion */
+    return 0;                   /* placeholder size if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_inject_am_hdr(int rank,
+                                          MPIR_Comm * comm,
+                                          int handler_id,
+                                          const void *am_hdr, size_t am_hdr_sz, void *shm_context)
+{
+    MPIR_Assert(0);             /* stubshm placeholder: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_inject_am(int rank,
+                                      MPIR_Comm * comm,
+                                      int handler_id,
+                                      const void *am_hdr,
+                                      size_t am_hdr_sz,
+                                      const void *data,
+                                      MPI_Count count, MPI_Datatype datatype, void *shm_context)
+{
+    MPIR_Assert(0);             /* stubshm placeholder: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_inject_amv(int rank,
+                                       MPIR_Comm * comm,
+                                       int handler_id,
+                                       struct iovec *am_hdr,
+                                       size_t iov_len,
+                                       const void *data,
+                                       MPI_Count count, MPI_Datatype datatype, void *shm_context)
+{
+    MPIR_Assert(0);             /* stubshm placeholder: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_inject_am_hdr_reply(MPIR_Context_id_t context_id, int src_rank,
+                                                int handler_id,
+                                                const void *am_hdr, size_t am_hdr_sz)
+{
+    MPIR_Assert(0);             /* stubshm placeholder: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_inject_am_reply(MPIR_Context_id_t context_id, int src_rank,
+                                            int handler_id,
+                                            const void *am_hdr,
+                                            size_t am_hdr_sz,
+                                            const void *data,
+                                            MPI_Count count, MPI_Datatype datatype)
+{
+    MPIR_Assert(0);             /* stubshm placeholder: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_inject_amv_reply(MPIR_Context_id_t context_id, int src_rank,
+                                             int handler_id,
+                                             struct iovec *am_hdr,
+                                             size_t iov_len,
+                                             const void *data,
+                                             MPI_Count count, MPI_Datatype datatype)
+{
+    MPIR_Assert(0);             /* stubshm placeholder: every call trips this assertion */
+    return MPI_SUCCESS;         /* placeholder result if the assertion lets execution continue */
+}
+
+static inline size_t MPIDI_SHM_am_inject_max_sz(void)
+{
+    MPIR_Assert(0);             /* stubshm placeholder: every call trips this assertion */
+    return 0;                   /* placeholder size if the assertion lets execution continue */
+}
+
+static inline int MPIDI_SHM_am_recv(MPIR_Request * req)
+{
+    MPIR_Assert(0);             /* stubshm placeholder: every call trips this assertion */
+    return 0;                   /* NOTE(review): sibling stubs return MPI_SUCCESS - confirm */
+}
+
+#endif /* SHM_STUBSHM_AM_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/stubshm_coll.h b/src/mpid/ch4/shm/stubshm/stubshm_coll.h
new file mode 100644
index 0000000..73ef4e8
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/stubshm_coll.h
@@ -0,0 +1,695 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_STUBSHM_COLL_H_INCLUDED
+#define SHM_STUBSHM_COLL_H_INCLUDED
+
+#include "stubshm_impl.h"
+#include "ch4_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_barrier
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_barrier(MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no barrier; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_bcast
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_bcast(void *buffer, int count, MPI_Datatype datatype,
+                                  int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no broadcast; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_allreduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_allreduce(const void *sendbuf, void *recvbuf, int count,
+                                      MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                      MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no allreduce; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                      void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no allgather; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                       void *recvbuf, const int *recvcounts, const int *displs,
+                                       MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                       MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no allgatherv; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_gather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                   int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no gather; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_gatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, const int *recvcounts, const int *displs,
+                                    MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr,
+                                    MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no gatherv; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                    int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no scatter; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_scatterv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_scatterv(const void *sendbuf, const int *sendcounts,
+                                     const int *displs, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                     int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no scatterv; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                     MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no alltoall; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_alltoallv(const void *sendbuf, const int *sendcounts,
+                                      const int *sdispls, MPI_Datatype sendtype,
+                                      void *recvbuf, const int *recvcounts,
+                                      const int *rdispls, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no alltoallv; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_alltoallw(const void *sendbuf, const int sendcounts[],
+                                      const int sdispls[], const MPI_Datatype sendtypes[],
+                                      void *recvbuf, const int recvcounts[],
+                                      const int rdispls[], const MPI_Datatype recvtypes[],
+                                      MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no alltoallw; unconditionally asserts. */
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_reduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_reduce(const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, int root,
+                                   MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no reduce; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_reduce_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_reduce_scatter(const void *sendbuf, void *recvbuf,
+                                           const int recvcounts[], MPI_Datatype datatype,
+                                           MPI_Op op, MPIR_Comm * comm_ptr,
+                                           MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no reduce_scatter; unconditionally asserts. */
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_reduce_scatter_block
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_reduce_scatter_block(const void *sendbuf, void *recvbuf,
+                                                 int recvcount, MPI_Datatype datatype,
+                                                 MPI_Op op, MPIR_Comm * comm_ptr,
+                                                 MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no reduce_scatter_block; unconditionally asserts. */
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_scan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_scan(const void *sendbuf, void *recvbuf, int count,
+                                 MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                 MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no scan; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_exscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_exscan(const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                   MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no exscan; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_neighbor_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_neighbor_allgather(const void *sendbuf, int sendcount,
+                                               MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                               MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                               MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no neighbor allgather; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_neighbor_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_neighbor_allgatherv(const void *sendbuf, int sendcount,
+                                                MPI_Datatype sendtype, void *recvbuf,
+                                                const int recvcounts[], const int displs[],
+                                                MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                                MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no neighbor allgatherv; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_neighbor_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_neighbor_alltoall(const void *sendbuf, int sendcount,
+                                              MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                              MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                              MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no neighbor alltoall; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_neighbor_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_neighbor_alltoallv(const void *sendbuf, const int sendcounts[],
+                                               const int sdispls[], MPI_Datatype sendtype,
+                                               void *recvbuf, const int recvcounts[],
+                                               const int rdispls[], MPI_Datatype recvtype,
+                                               MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no neighbor alltoallv; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_neighbor_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_neighbor_alltoallw(const void *sendbuf, const int sendcounts[],
+                                               const MPI_Aint sdispls[],
+                                               const MPI_Datatype sendtypes[], void *recvbuf,
+                                               const int recvcounts[], const MPI_Aint rdispls[],
+                                               const MPI_Datatype recvtypes[], MPIR_Comm * comm_ptr,
+                                               MPIR_Errflag_t * errflag)
+{
+    /* Stub SHM module: performs no neighbor alltoallw; unconditionally asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ineighbor_allgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ineighbor_allgather(const void *sendbuf, int sendcount,
+                                                MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                                MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                                MPI_Request * req)
+{
+    /* Stub SHM module: starts no neighbor allgather and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ineighbor_allgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ineighbor_allgatherv(const void *sendbuf, int sendcount,
+                                                 MPI_Datatype sendtype, void *recvbuf,
+                                                 const int recvcounts[], const int displs[],
+                                                 MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                                 MPI_Request * req)
+{
+    /* Stub SHM module: starts no neighbor allgatherv and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ineighbor_alltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ineighbor_alltoall(const void *sendbuf, int sendcount,
+                                               MPI_Datatype sendtype, void *recvbuf,
+                                               int recvcount, MPI_Datatype recvtype,
+                                               MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    /* Stub SHM module: starts no neighbor alltoall and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ineighbor_alltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ineighbor_alltoallv(const void *sendbuf, const int sendcounts[],
+                                                const int sdispls[], MPI_Datatype sendtype,
+                                                void *recvbuf, const int recvcounts[],
+                                                const int rdispls[], MPI_Datatype recvtype,
+                                                MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    /* Stub SHM module: starts no neighbor alltoallv and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ineighbor_alltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ineighbor_alltoallw(const void *sendbuf, const int sendcounts[],
+                                                const MPI_Aint sdispls[],
+                                                const MPI_Datatype sendtypes[], void *recvbuf,
+                                                const int recvcounts[], const MPI_Aint rdispls[],
+                                                const MPI_Datatype recvtypes[],
+                                                MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    /* Stub SHM module: starts no neighbor alltoallw and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ibarrier
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ibarrier(MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    /* Stub SHM module: starts no barrier and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ibcast
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ibcast(void *buffer, int count, MPI_Datatype datatype,
+                                   int root, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    /* Stub SHM module: starts no broadcast and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_iallgather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                       void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                       MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    /* Stub SHM module: starts no allgather and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_iallgatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_iallgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                        void *recvbuf, const int *recvcounts, const int *displs,
+                                        MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
+                                        MPI_Request * req)
+{
+    /* Stub SHM module: starts no allgatherv and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ialltoall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ialltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                      void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                      MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    /* Stub SHM module: starts no alltoall and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ialltoallv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ialltoallv(const void *sendbuf, const int *sendcounts,
+                                       const int *sdispls, MPI_Datatype sendtype,
+                                       void *recvbuf, const int *recvcounts,
+                                       const int *rdispls, MPI_Datatype recvtype,
+                                       MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    /* Stub SHM module: starts no alltoallv and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ialltoallw
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ialltoallw(const void *sendbuf, const int *sendcounts,
+                                       const int *sdispls, const MPI_Datatype sendtypes[],
+                                       void *recvbuf, const int *recvcounts,
+                                       const int *rdispls, const MPI_Datatype recvtypes[],
+                                       MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    /* Stub SHM module: starts no alltoallw and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_iexscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_iexscan(const void *sendbuf, void *recvbuf, int count,
+                                    MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                    MPI_Request * req)
+{
+    /* Stub SHM module: starts no exscan and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_igather
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                    int root, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    /* Stub SHM module: starts no gather and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_igatherv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, const int *recvcounts, const int *displs,
+                                     MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr,
+                                     MPI_Request * req)
+{
+    /* Stub SHM module: starts no gatherv and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ireduce_scatter_block
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ireduce_scatter_block(const void *sendbuf, void *recvbuf,
+                                                  int recvcount, MPI_Datatype datatype,
+                                                  MPI_Op op, MPIR_Comm * comm_ptr,
+                                                  MPI_Request * req)
+{
+    /* Stub SHM module: starts no reduce_scatter_block and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ireduce_scatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ireduce_scatter(const void *sendbuf, void *recvbuf,
+                                            const int recvcounts[], MPI_Datatype datatype,
+                                            MPI_Op op, MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    /* Stub SHM module: starts no reduce_scatter and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_ireduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_ireduce(const void *sendbuf, void *recvbuf, int count,
+                                    MPI_Datatype datatype, MPI_Op op, int root,
+                                    MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    /* Stub SHM module: starts no reduce and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_iallreduce
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_iallreduce(const void *sendbuf, void *recvbuf, int count,
+                                       MPI_Datatype datatype, MPI_Op op,
+                                       MPIR_Comm * comm_ptr, MPI_Request * req)
+{
+    /* Stub SHM module: starts no allreduce and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_iscan
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_iscan(const void *sendbuf, void *recvbuf, int count,
+                                  MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr,
+                                  MPI_Request * req)
+{
+    /* Stub SHM module: starts no scan and leaves *req untouched; always asserts. */
+    MPIR_Assert(0);
+
+    /* Returned if execution continues past the assertion. */
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_iscatter
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_iscatter(const void *sendbuf, int sendcount,
+                                     MPI_Datatype sendtype, void *recvbuf,
+                                     int recvcount, MPI_Datatype recvtype,
+                                     int root, MPIR_Comm * comm, MPI_Request * request)
+{
+    /* Stub SHM module: starts no scatter and leaves *request untouched; always asserts. */
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_SHM_iscatterv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_SHM_iscatterv(const void *sendbuf, const int *sendcounts,
+                                      const int *displs, MPI_Datatype sendtype,
+                                      void *recvbuf, int recvcount,
+                                      MPI_Datatype recvtype, int root,
+                                      MPIR_Comm * comm, MPI_Request * request)
+{
+    /* Stub SHM module: starts no scatterv and leaves *request untouched; always asserts. */
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+#endif /* SHM_STUBSHM_COLL_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/stubshm_comm.h b/src/mpid/ch4/shm/stubshm/stubshm_comm.h
new file mode 100644
index 0000000..db0b3a6
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/stubshm_comm.h
@@ -0,0 +1,29 @@
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_STUBSHM_COMM_H_INCLUDED
+#define SHM_STUBSHM_COMM_H_INCLUDED
+
+#include "stubshm_impl.h"
+static inline int MPIDI_SHM_comm_create(MPIR_Comm * comm)
+{
+    /* Stub SHM module: comm is left untouched; unconditionally asserts. */
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+static inline int MPIDI_SHM_comm_destroy(MPIR_Comm * comm)
+{
+    /* Stub SHM module: nothing is released from comm; unconditionally asserts. */
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+
+#endif /* SHM_STUBSHM_COMM_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/stubshm_impl.h b/src/mpid/ch4/shm/stubshm/stubshm_impl.h
new file mode 100644
index 0000000..a0b274c
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/stubshm_impl.h
@@ -0,0 +1,14 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_STUBSHM_IMPL_H_INCLUDED
+#define SHM_STUBSHM_IMPL_H_INCLUDED
+
+#endif /* SHM_STUBSHM_IMPL_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/stubshm_init.h b/src/mpid/ch4/shm/stubshm/stubshm_init.h
new file mode 100644
index 0000000..cfe95fe
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/stubshm_init.h
@@ -0,0 +1,85 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_STUBSHM_INIT_H_INCLUDED
+#define SHM_STUBSHM_INIT_H_INCLUDED
+
+#include "stubshm_impl.h"
+
+static inline int MPIDI_SHM_init(int rank, int size)
+{
+    MPIR_Assert(0);             /* stub: rank and size are ignored; always asserts */
+    return MPI_SUCCESS;         /* returned if execution continues past the assertion */
+}
+
+static inline int MPIDI_SHM_finalize(void)
+{
+    MPIR_Assert(0);             /* stub: nothing is torn down; always asserts */
+    return MPI_SUCCESS;         /* returned if execution continues past the assertion */
+}
+
+static inline void *MPIDI_SHM_alloc_mem(size_t size, MPIR_Info * info_ptr)
+{
+    MPIR_Assert(0);             /* stub: no memory is allocated; always asserts */
+    return NULL;                /* NULL if execution continues past the assertion */
+}
+
+static inline int MPIDI_SHM_free_mem(void *ptr)
+{
+    MPIR_Assert(0);             /* stub: ptr is not freed; always asserts */
+    return MPI_SUCCESS;         /* returned if execution continues past the assertion */
+}
+
+static inline int MPIDI_SHM_comm_get_lpid(MPIR_Comm * comm_ptr,
+                                          int idx, int *lpid_ptr, MPL_bool is_remote)
+{
+    MPIR_Assert(0);             /* stub: *lpid_ptr is never written; always asserts */
+    return MPI_SUCCESS;         /* returned if execution continues past the assertion */
+}
+
+static inline int MPIDI_SHM_gpid_get(MPIR_Comm * comm_ptr, int rank, MPIR_Gpid * gpid)
+{
+    MPIR_Assert(0);             /* stub: *gpid is never written; always asserts */
+    return MPI_SUCCESS;         /* returned if execution continues past the assertion */
+}
+
+static inline int MPIDI_SHM_get_node_id(MPIR_Comm * comm, int rank, MPID_Node_id_t * id_p)
+{
+    *id_p = (MPID_Node_id_t) 0; /* every rank is reported as node 0; comm and rank are unused */
+    return MPI_SUCCESS;         /* no assertion here: this stub always succeeds */
+}
+
+static inline int MPIDI_SHM_get_max_node_id(MPIR_Comm * comm, MPID_Node_id_t * max_id_p)
+{
+    *max_id_p = (MPID_Node_id_t) 1;     /* fixed value; comm is not consulted */
+    return MPI_SUCCESS; /* NOTE(review): get_node_id only reports 0 -- confirm 1 is intended */
+}
+
+static inline int MPIDI_SHM_getallincomm(MPIR_Comm * comm_ptr,
+                                         int local_size, MPIR_Gpid local_gpids[], int *singleAVT)
+{
+    MPIR_Assert(0);             /* stub: local_gpids and *singleAVT are never written; asserts */
+    return MPI_SUCCESS;         /* returned if execution continues past the assertion */
+}
+
+static inline int MPIDI_SHM_gpid_tolpidarray(int size, MPIR_Gpid gpid[], int lpid[])
+{
+    MPIR_Assert(0);             /* stub: lpid[] is never filled in; always asserts */
+    return MPI_SUCCESS;         /* returned if execution continues past the assertion */
+}
+
+static inline int MPIDI_SHM_create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr,
+                                                        int size, const int lpids[])
+{
+    MPIR_Assert(0);             /* stub: newcomm_ptr is left untouched; always asserts */
+    return MPI_SUCCESS;         /* returned if execution continues past the assertion */
+}
+
+#endif /* SHM_STUBSHM_INIT_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/stubshm_pre.h b/src/mpid/ch4/shm/stubshm/stubshm_pre.h
new file mode 100644
index 0000000..fc0f625
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/stubshm_pre.h
@@ -0,0 +1,27 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#ifndef SHM_STUBSHM_PRE_H_INCLUDED
+#define SHM_STUBSHM_PRE_H_INCLUDED
+
+typedef struct {
+    int dummy;                  /* sole member; presumably a placeholder -- TODO confirm */
+} MPIDI_STUBSHM_am_request_t;
+
+typedef struct {
+    int dummy;                  /* sole member; presumably a placeholder -- TODO confirm */
+} MPIDI_STUBSHM_request_t;
+
+typedef struct MPIDI_STUBSHM_comm_t {
+    int dummy;                  /* sole member; presumably a placeholder -- TODO confirm */
+} MPIDI_STUBSHM_comm_t;
+
+#endif /* SHM_STUBSHM_PRE_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/stubshm_probe.h b/src/mpid/ch4/shm/stubshm/stubshm_probe.h
new file mode 100644
index 0000000..0ba72f6
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/stubshm_probe.h
@@ -0,0 +1,36 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_STUBSHM_PROBE_H_INCLUDED
+#define SHM_STUBSHM_PROBE_H_INCLUDED
+
+#include "stubshm_impl.h"
+
+
+static inline int MPIDI_SHM_improbe(int source,
+                                    int tag,
+                                    MPIR_Comm * comm,
+                                    int context_offset,
+                                    int *flag, MPIR_Request ** message, MPI_Status * status)
+{
+    MPIR_Assert(0);             /* stub: *flag, *message and *status are never written; asserts */
+    return MPI_SUCCESS;         /* returned if execution continues past the assertion */
+}
+
+static inline int MPIDI_SHM_iprobe(int source,
+                                   int tag,
+                                   MPIR_Comm * comm,
+                                   int context_offset, int *flag, MPI_Status * status)
+{
+    MPIR_Assert(0);             /* stub: *flag and *status are never written; always asserts */
+    return MPI_SUCCESS;         /* returned if execution continues past the assertion */
+}
+
+#endif /* SHM_STUBSHM_PROBE_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/stubshm_proc.h b/src/mpid/ch4/shm/stubshm/stubshm_proc.h
new file mode 100644
index 0000000..984002e
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/stubshm_proc.h
@@ -0,0 +1,21 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_STUBSHM_PROC_H_INCLUDED
+#define SHM_STUBSHM_PROC_H_INCLUDED
+
+#include "stubshm_impl.h"
+
+static inline int MPIDI_SHM_rank_is_local(int rank, MPIR_Comm * comm)
+{
+    MPIR_Assert(0);             /* stub: locality is never computed; always asserts */
+    return MPI_SUCCESS;         /* NOTE(review): name suggests a boolean result -- confirm value */
+}
+#endif /* SHM_STUBSHM_PROC_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/stubshm_progress.h b/src/mpid/ch4/shm/stubshm/stubshm_progress.h
new file mode 100644
index 0000000..5c39ce7
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/stubshm_progress.h
@@ -0,0 +1,88 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_STUBSHM_PROGRESS_H_INCLUDED
+#define SHM_STUBSHM_PROGRESS_H_INCLUDED
+
+#include "stubshm_impl.h"
+
+static inline int MPIDI_SHM_do_progress_recv(int blocking, int *completion_count)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_do_progress_recv is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_do_progress_send(int blocking, int *completion_count)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_do_progress_send is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_progress(int blocking)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_progress is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_progress_test(void)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_progress_test is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_progress_poke(void)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_progress_poke is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline void MPIDI_SHM_progress_start(MPID_Progress_state * state)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_progress_start is unimplemented; state is untouched */
+    return;                     /* reached only if the assertion does not abort */
+}
+
+static inline void MPIDI_SHM_progress_end(MPID_Progress_state * state)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_progress_end is unimplemented; state is untouched */
+    return;                     /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_progress_wait(MPID_Progress_state * state)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_progress_wait is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_progress_register(int (*progress_fn) (int *))
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_progress_register is unimplemented; progress_fn is ignored */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_progress_deregister(int id)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_progress_deregister is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_progress_activate(int id)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_progress_activate is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_progress_deactivate(int id)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_progress_deactivate is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+#endif /* SHM_STUBSHM_PROGRESS_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/stubshm_recv.h b/src/mpid/ch4/shm/stubshm/stubshm_recv.h
new file mode 100644
index 0000000..63eaf7a
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/stubshm_recv.h
@@ -0,0 +1,70 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_STUBSHM_RECV_H_INCLUDED
+#define SHM_STUBSHM_RECV_H_INCLUDED
+
+#include "stubshm_impl.h"
+
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_SHM_recv)        /* not referenced by the stub body below */
+static inline int MPIDI_SHM_recv(void *buf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 int rank,
+                                 int tag,
+                                 MPIR_Comm * comm,
+                                 int context_offset, MPI_Status * status, MPIR_Request ** request)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_recv is unimplemented; *request is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_recv_init(void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      int rank,
+                                      int tag,
+                                      MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_recv_init is unimplemented; *request is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+
+static inline int MPIDI_SHM_imrecv(void *buf,
+                                   int count,
+                                   MPI_Datatype datatype,
+                                   MPIR_Request * message, MPIR_Request ** rreqp)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_imrecv is unimplemented; *rreqp is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_SHM_irecv)       /* not referenced by the stub body below */
+static inline int MPIDI_SHM_irecv(void *buf,
+                                  int count,
+                                  MPI_Datatype datatype,
+                                  int rank,
+                                  int tag,
+                                  MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_irecv is unimplemented; *request is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_cancel_recv(MPIR_Request * rreq)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_cancel_recv is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+#endif /* SHM_STUBSHM_RECV_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/stubshm_request.h b/src/mpid/ch4/shm/stubshm/stubshm_request.h
new file mode 100644
index 0000000..edbb381
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/stubshm_request.h
@@ -0,0 +1,26 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_STUBSHM_REQUEST_H_INCLUDED
+#define SHM_STUBSHM_REQUEST_H_INCLUDED
+
+#include "stubshm_impl.h"
+
+static inline void MPIDI_SHM_am_request_init(MPIR_Request * req)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_am_request_init is unimplemented; req is untouched */
+}
+
+static inline void MPIDI_SHM_am_request_finalize(MPIR_Request * req)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_am_request_finalize is unimplemented; req is untouched */
+}
+
+#endif /* SHM_STUBSHM_REQUEST_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/stubshm_rma.h b/src/mpid/ch4/shm/stubshm/stubshm_rma.h
new file mode 100644
index 0000000..34256d1
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/stubshm_rma.h
@@ -0,0 +1,143 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_STUBSHM_RMA_H_INCLUDED
+#define SHM_STUBSHM_RMA_H_INCLUDED
+
+#include "stubshm_impl.h"
+
+static inline int MPIDI_SHM_put(const void *origin_addr,
+                                int origin_count,
+                                MPI_Datatype origin_datatype,
+                                int target_rank,
+                                MPI_Aint target_disp,
+                                int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_put is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_get(void *origin_addr,
+                                int origin_count,
+                                MPI_Datatype origin_datatype,
+                                int target_rank,
+                                MPI_Aint target_disp,
+                                int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_get is unimplemented; origin_addr is not written */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_rput(const void *origin_addr,
+                                 int origin_count,
+                                 MPI_Datatype origin_datatype,
+                                 int target_rank,
+                                 MPI_Aint target_disp,
+                                 int target_count,
+                                 MPI_Datatype target_datatype,
+                                 MPIR_Win * win, MPIR_Request ** request)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_rput is unimplemented; *request is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+
+static inline int MPIDI_SHM_compare_and_swap(const void *origin_addr,
+                                             const void *compare_addr,
+                                             void *result_addr,
+                                             MPI_Datatype datatype,
+                                             int target_rank, MPI_Aint target_disp, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_compare_and_swap is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_raccumulate(const void *origin_addr,
+                                        int origin_count,
+                                        MPI_Datatype origin_datatype,
+                                        int target_rank,
+                                        MPI_Aint target_disp,
+                                        int target_count,
+                                        MPI_Datatype target_datatype,
+                                        MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_raccumulate is unimplemented; *request is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_rget_accumulate(const void *origin_addr,
+                                            int origin_count,
+                                            MPI_Datatype origin_datatype,
+                                            void *result_addr,
+                                            int result_count,
+                                            MPI_Datatype result_datatype,
+                                            int target_rank,
+                                            MPI_Aint target_disp,
+                                            int target_count,
+                                            MPI_Datatype target_datatype,
+                                            MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_rget_accumulate is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_fetch_and_op(const void *origin_addr,
+                                         void *result_addr,
+                                         MPI_Datatype datatype,
+                                         int target_rank,
+                                         MPI_Aint target_disp, MPI_Op op, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_fetch_and_op is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+
+static inline int MPIDI_SHM_rget(void *origin_addr,
+                                 int origin_count,
+                                 MPI_Datatype origin_datatype,
+                                 int target_rank,
+                                 MPI_Aint target_disp,
+                                 int target_count,
+                                 MPI_Datatype target_datatype,
+                                 MPIR_Win * win, MPIR_Request ** request)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_rget is unimplemented; *request is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+
+static inline int MPIDI_SHM_get_accumulate(const void *origin_addr,
+                                           int origin_count,
+                                           MPI_Datatype origin_datatype,
+                                           void *result_addr,
+                                           int result_count,
+                                           MPI_Datatype result_datatype,
+                                           int target_rank,
+                                           MPI_Aint target_disp,
+                                           int target_count,
+                                           MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_get_accumulate is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_accumulate(const void *origin_addr,
+                                       int origin_count,
+                                       MPI_Datatype origin_datatype,
+                                       int target_rank,
+                                       MPI_Aint target_disp,
+                                       int target_count,
+                                       MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_accumulate is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+#endif /* SHM_STUBSHM_RMA_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/stubshm_send.h b/src/mpid/ch4/shm/stubshm/stubshm_send.h
new file mode 100644
index 0000000..446959f
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/stubshm_send.h
@@ -0,0 +1,140 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_STUBSHM_SEND_H_INCLUDED
+#define SHM_STUBSHM_SEND_H_INCLUDED
+
+#include "stubshm_impl.h"
+
+static inline int MPIDI_SHM_send(const void *buf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 int rank,
+                                 int tag,
+                                 MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    int err = MPI_SUCCESS;      /* never modified; returned as-is below */
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_send is unimplemented; *request is not set */
+    return err;                 /* reached only if the assertion does not abort */
+}
+
+
+
+
+static inline int MPIDI_SHM_irsend(const void *buf,
+                                   int count,
+                                   MPI_Datatype datatype,
+                                   int rank,
+                                   int tag,
+                                   MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_irsend is unimplemented; *request is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_ssend(const void *buf,
+                                  int count,
+                                  MPI_Datatype datatype,
+                                  int rank,
+                                  int tag,
+                                  MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    int err = MPI_SUCCESS;      /* never modified; returned as-is below */
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_ssend is unimplemented; *request is not set */
+
+    return err;                 /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_startall(int count, MPIR_Request * requests[])
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_startall is unimplemented; no request is started */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_send_init(const void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      int rank,
+                                      int tag,
+                                      MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_send_init is unimplemented; *request is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_ssend_init(const void *buf,
+                                       int count,
+                                       MPI_Datatype datatype,
+                                       int rank,
+                                       int tag,
+                                       MPIR_Comm * comm,
+                                       int context_offset, MPIR_Request ** request)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_ssend_init is unimplemented; *request is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_bsend_init(const void *buf,
+                                       int count,
+                                       MPI_Datatype datatype,
+                                       int rank,
+                                       int tag,
+                                       MPIR_Comm * comm,
+                                       int context_offset, MPIR_Request ** request)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_bsend_init is unimplemented; *request is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_rsend_init(const void *buf,
+                                       int count,
+                                       MPI_Datatype datatype,
+                                       int rank,
+                                       int tag,
+                                       MPIR_Comm * comm,
+                                       int context_offset, MPIR_Request ** request)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_rsend_init is unimplemented; *request is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+#undef FCNAME
+#define FCNAME DECL_FUNC(MPIDI_SHM_isend)       /* not referenced by the stub body below */
+static inline int MPIDI_SHM_isend(const void *buf,
+                                  int count,
+                                  MPI_Datatype datatype,
+                                  int rank,
+                                  int tag,
+                                  MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    int err = MPI_SUCCESS;      /* never modified; returned as-is below */
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_isend is unimplemented; *request is not set */
+    return err;                 /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_issend(const void *buf,
+                                   int count,
+                                   MPI_Datatype datatype,
+                                   int rank,
+                                   int tag,
+                                   MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    int err = MPI_SUCCESS;      /* never modified; returned as-is below */
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_issend is unimplemented; *request is not set */
+    return err;                 /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_cancel_send(MPIR_Request * sreq)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_cancel_send is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+#endif /* SHM_STUBSHM_SEND_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/stubshm_spawn.h b/src/mpid/ch4/shm/stubshm/stubshm_spawn.h
new file mode 100644
index 0000000..cedb3f4
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/stubshm_spawn.h
@@ -0,0 +1,50 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_STUBSHM_SPAWN_H_INCLUDED
+#define SHM_STUBSHM_SPAWN_H_INCLUDED
+
+#include "stubshm_impl.h"
+
+static inline int MPIDI_SHM_comm_connect(const char *port_name,
+                                         MPIR_Info * info,
+                                         int root, MPIR_Comm * comm, MPIR_Comm ** newcomm_ptr)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_comm_connect is unimplemented; *newcomm_ptr is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_comm_disconnect(MPIR_Comm * comm_ptr)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_comm_disconnect is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_open_port(MPIR_Info * info_ptr, char *port_name)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_open_port is unimplemented; port_name is not written */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_close_port(const char *port_name)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_close_port is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_comm_accept(const char *port_name,
+                                        MPIR_Info * info,
+                                        int root, MPIR_Comm * comm, MPIR_Comm ** newcomm_ptr)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_comm_accept is unimplemented; *newcomm_ptr is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+#endif /* SHM_STUBSHM_SPAWN_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/stubshm_unimpl.h b/src/mpid/ch4/shm/stubshm/stubshm_unimpl.h
new file mode 100644
index 0000000..db8e66b
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/stubshm_unimpl.h
@@ -0,0 +1,23 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_STUBSHM_UNIMPL_H_INCLUDED
+#define SHM_STUBSHM_UNIMPL_H_INCLUDED
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* No declarations yet: the linkage block below is currently empty. */
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SHM_STUBSHM_UNIMPL_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/stubshm_win.h b/src/mpid/ch4/shm/stubshm/stubshm_win.h
new file mode 100644
index 0000000..24a9b1b
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/stubshm_win.h
@@ -0,0 +1,185 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef SHM_STUBSHM_WIN_H_INCLUDED
+#define SHM_STUBSHM_WIN_H_INCLUDED
+
+#include "stubshm_impl.h"
+
+static inline int MPIDI_SHM_win_set_info(MPIR_Win * win, MPIR_Info * info)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_set_info is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+
+static inline int MPIDI_SHM_win_start(MPIR_Group * group, int assert, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_start is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+
+static inline int MPIDI_SHM_win_complete(MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_complete is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_post(MPIR_Group * group, int assert, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_post is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+
+static inline int MPIDI_SHM_win_wait(MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_wait is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+
+static inline int MPIDI_SHM_win_test(MPIR_Win * win, int *flag)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_test is unimplemented; *flag is not written */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_lock(int lock_type, int rank, int assert, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_lock is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+
+static inline int MPIDI_SHM_win_unlock(int rank, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_unlock is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_get_info(MPIR_Win * win, MPIR_Info ** info_p_p)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_get_info is unimplemented; *info_p_p is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+
+static inline int MPIDI_SHM_win_free(MPIR_Win ** win_ptr)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_free is unimplemented; *win_ptr is left as-is */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_fence(int assert, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_fence is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_create(void *base,
+                                       MPI_Aint length,
+                                       int disp_unit,
+                                       MPIR_Info * info, MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_create is unimplemented; *win_ptr is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_attach(MPIR_Win * win, void *base, MPI_Aint size)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_attach is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_allocate_shared(MPI_Aint size,
+                                                int disp_unit,
+                                                MPIR_Info * info_ptr,
+                                                MPIR_Comm * comm_ptr,
+                                                void **base_ptr, MPIR_Win ** win_ptr)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_allocate_shared is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_detach(MPIR_Win * win, const void *base)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_detach is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_shared_query(MPIR_Win * win,
+                                             int rank,
+                                             MPI_Aint * size, int *disp_unit, void *baseptr)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_shared_query is unimplemented; outputs not written */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_allocate(MPI_Aint size,
+                                         int disp_unit,
+                                         MPIR_Info * info,
+                                         MPIR_Comm * comm, void *baseptr, MPIR_Win ** win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_allocate is unimplemented; *win is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_flush(int rank, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_flush is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_flush_local_all(MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_flush_local_all is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_unlock_all(MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_unlock_all is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_create_dynamic(MPIR_Info * info, MPIR_Comm * comm, MPIR_Win ** win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_create_dynamic is unimplemented; *win is not set */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_flush_local(int rank, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_flush_local is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_sync(MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_sync is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_flush_all(MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_flush_all is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+static inline int MPIDI_SHM_win_lock_all(int assert, MPIR_Win * win)
+{
+    MPIR_Assert(0);             /* stubshm: MPIDI_SHM_win_lock_all is unimplemented */
+    return MPI_SUCCESS;         /* reached only if the assertion does not abort */
+}
+
+
+#endif /* SHM_STUBSHM_WIN_H_INCLUDED */
diff --git a/src/mpid/ch4/shm/stubshm/subconfigure.m4 b/src/mpid/ch4/shm/stubshm/subconfigure.m4
new file mode 100644
index 0000000..9e6d42a
--- /dev/null
+++ b/src/mpid/ch4/shm/stubshm/subconfigure.m4
@@ -0,0 +1,19 @@
+[#] start of __file__
+dnl MPICH_SUBCFG_AFTER=src/mpid/ch4
+
+AC_DEFUN([PAC_SUBCFG_PREREQ_]PAC_SUBCFG_AUTO_SUFFIX,[
+    AM_COND_IF([BUILD_CH4],[
+        for shm in $ch4_shm ; do
+            AS_CASE([$shm],[stubshm],[build_ch4_shm_stubshm=yes])
+        done
+    ])
+    AM_CONDITIONAL([BUILD_SHM_STUBSHM],[test "X$build_ch4_shm_stubshm" = "Xyes"])
+])dnl end _PREREQ
+
+AC_DEFUN([PAC_SUBCFG_BODY_]PAC_SUBCFG_AUTO_SUFFIX,[
+AM_COND_IF([BUILD_SHM_STUBSHM],[
+    AC_MSG_NOTICE([RUNNING CONFIGURE FOR ch4:shm:stubshm])
+])dnl end AM_COND_IF(BUILD_SHM_STUBSHM,...)
+])dnl end _BODY
+
+[#] end of __file__
diff --git a/src/mpid/ch4/src/Makefile.mk b/src/mpid/ch4/src/Makefile.mk
new file mode 100644
index 0000000..db04a3b
--- /dev/null
+++ b/src/mpid/ch4/src/Makefile.mk
@@ -0,0 +1,47 @@
+## -*- Mode: Makefile; -*-
+## vim: set ft=automake :
+##
+## (C) 2016 by Argonne National Laboratory.
+##     See COPYRIGHT in top-level directory.
+##
+##  Portions of this code were written by Intel Corporation.
+##  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+##  to Argonne National Laboratory subject to Software Grant and Corporate
+##  Contributor License Agreement dated February 8, 2012.
+##
+
+AM_CPPFLAGS += -I$(top_srcdir)/src/mpid/ch4/src
+
+## Every header in this directory must be listed here, otherwise it is left
+## out of the "make dist" tarball (noinst = distributed but never installed).
+noinst_HEADERS += src/mpid/ch4/src/ch4_coll.h     \
+                  src/mpid/ch4/src/ch4_comm.h     \
+                  src/mpid/ch4/src/ch4_init.h     \
+                  src/mpid/ch4/src/ch4_progress.h \
+                  src/mpid/ch4/src/ch4_request.h  \
+                  src/mpid/ch4/src/ch4_send.h     \
+                  src/mpid/ch4/src/ch4_types.h    \
+                  src/mpid/ch4/src/ch4_impl.h     \
+                  src/mpid/ch4/src/ch4_probe.h    \
+                  src/mpid/ch4/src/ch4_proc.h     \
+                  src/mpid/ch4/src/ch4_recv.h     \
+                  src/mpid/ch4/src/ch4_rma.h      \
+                  src/mpid/ch4/src/ch4_spawn.h    \
+                  src/mpid/ch4/src/ch4_win.h      \
+                  src/mpid/ch4/src/ch4r_probe.h   \
+                  src/mpid/ch4/src/ch4r_recv.h    \
+                  src/mpid/ch4/src/ch4r_rma.h     \
+                  src/mpid/ch4/src/ch4r_win.h     \
+                  src/mpid/ch4/src/ch4r_init.h    \
+                  src/mpid/ch4/src/ch4r_proc.h    \
+                  src/mpid/ch4/src/ch4i_comm.h    \
+                  src/mpid/ch4/src/ch4r_recvq.h   \
+                  src/mpid/ch4/src/ch4i_util.h    \
+                  src/mpid/ch4/src/ch4r_symheap.h \
+                  src/mpid/ch4/src/ch4r_send.h    \
+                  src/mpid/ch4/src/ch4r_buf.h     \
+                  src/mpid/ch4/src/ch4r_request.h
+
+mpi_core_sources += src/mpid/ch4/src/ch4_globals.c        \
+                    src/mpid/ch4/src/mpid_ch4_net_array.c \
+                    src/mpid/ch4/src/mpid_ch4_shm_array.c
diff --git a/src/mpid/ch4/src/ch4_coll.h b/src/mpid/ch4/src/ch4_coll.h
new file mode 100644
index 0000000..03dae8f
--- /dev/null
+++ b/src/mpid/ch4/src/ch4_coll.h
@@ -0,0 +1,378 @@
+/* -*- Mode: C ; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4_COLL_H_INCLUDED
+#define CH4_COLL_H_INCLUDED
+
+#include "ch4_impl.h"
+#include "ch4r_proc.h"
+
+/* CH4 entry point for Barrier: forwards both arguments unchanged to
+ * MPIDI_NM_barrier and returns its result. */
+__CH4_INLINE__ int MPIDI_Barrier(MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPIDI_NM_barrier(comm, errflag);
+
+    return mpi_errno;
+}
+
+/* CH4 entry point for Bcast: forwards all arguments unchanged to
+ * MPIDI_NM_bcast and returns its result. */
+__CH4_INLINE__ int MPIDI_Bcast(void *buffer, int count, MPI_Datatype datatype,
+                               int root, MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPIDI_NM_bcast(buffer, count, datatype, root, comm, errflag);
+
+    return mpi_errno;
+}
+
+/* CH4 entry point for Allreduce: forwards all arguments unchanged to
+ * MPIDI_NM_allreduce and returns its result. */
+__CH4_INLINE__ int MPIDI_Allreduce(const void *sendbuf, void *recvbuf, int count,
+                                   MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                   MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPIDI_NM_allreduce(sendbuf, recvbuf, count, datatype, op, comm, errflag);
+
+    return mpi_errno;
+}
+
+/* CH4 entry point for Allgather: forwards all arguments unchanged to
+ * MPIDI_NM_allgather and returns its result. */
+__CH4_INLINE__ int MPIDI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                   MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_allgather(sendbuf, sendcount, sendtype,
+                                   recvbuf, recvcount, recvtype, comm, errflag);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Allgatherv: forwards all arguments unchanged to
+ * MPIDI_NM_allgatherv and returns its result. */
+__CH4_INLINE__ int MPIDI_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, const int *recvcounts, const int *displs,
+                                    MPI_Datatype recvtype, MPIR_Comm * comm,
+                                    MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_allgatherv(sendbuf, sendcount, sendtype,
+                                    recvbuf, recvcounts, displs, recvtype, comm, errflag);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Scatter: forwards all arguments unchanged to
+ * MPIDI_NM_scatter and returns its result. */
+__CH4_INLINE__ int MPIDI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                 void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                 int root, MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_scatter(sendbuf, sendcount, sendtype,
+                                 recvbuf, recvcount, recvtype, root, comm, errflag);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Scatterv: forwards all arguments unchanged to
+ * MPIDI_NM_scatterv and returns its result. */
+__CH4_INLINE__ int MPIDI_Scatterv(const void *sendbuf, const int *sendcounts,
+                                  const int *displs, MPI_Datatype sendtype,
+                                  void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                  int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_scatterv(sendbuf, sendcounts, displs, sendtype,
+                                  recvbuf, recvcount, recvtype, root, comm_ptr, errflag);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Gather: forwards all arguments unchanged to
+ * MPIDI_NM_gather and returns its result. */
+__CH4_INLINE__ int MPIDI_Gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                int root, MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_gather(sendbuf, sendcount, sendtype,
+                                recvbuf, recvcount, recvtype, root, comm, errflag);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Gatherv: forwards all arguments unchanged to
+ * MPIDI_NM_gatherv and returns its result. */
+__CH4_INLINE__ int MPIDI_Gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                 void *recvbuf, const int *recvcounts,
+                                 const int *displs, MPI_Datatype recvtype,
+                                 int root, MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_gatherv(sendbuf, sendcount, sendtype,
+                                 recvbuf, recvcounts, displs, recvtype, root, comm, errflag);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Alltoall: forwards all arguments unchanged to
+ * MPIDI_NM_alltoall and returns its result. */
+__CH4_INLINE__ int MPIDI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                  void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                  MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_alltoall(sendbuf, sendcount, sendtype,
+                                  recvbuf, recvcount, recvtype, comm, errflag);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Alltoallv: forwards all arguments unchanged to
+ * MPIDI_NM_alltoallv and returns its result. */
+__CH4_INLINE__ int MPIDI_Alltoallv(const void *sendbuf, const int *sendcounts,
+                                   const int *sdispls, MPI_Datatype sendtype,
+                                   void *recvbuf, const int *recvcounts,
+                                   const int *rdispls, MPI_Datatype recvtype,
+                                   MPIR_Comm * comm, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_alltoallv(sendbuf, sendcounts, sdispls, sendtype,
+                                   recvbuf, recvcounts, rdispls, recvtype, comm, errflag);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Alltoallw: forwards all arguments unchanged to
+ * MPIDI_NM_alltoallw and returns its result. */
+__CH4_INLINE__ int MPIDI_Alltoallw(const void *sendbuf, const int sendcounts[],
+                                   const int sdispls[], const MPI_Datatype sendtypes[],
+                                   void *recvbuf, const int recvcounts[],
+                                   const int rdispls[], const MPI_Datatype recvtypes[],
+                                   MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_alltoallw(sendbuf, sendcounts, sdispls, sendtypes,
+                                   recvbuf, recvcounts, rdispls, recvtypes, comm_ptr, errflag);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Reduce: forwards all arguments unchanged to
+ * MPIDI_NM_reduce and returns its result. */
+__CH4_INLINE__ int MPIDI_Reduce(const void *sendbuf, void *recvbuf,
+                                int count, MPI_Datatype datatype, MPI_Op op,
+                                int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_reduce(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, errflag);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Reduce_scatter: forwards all arguments unchanged to
+ * MPIDI_NM_reduce_scatter and returns its result. */
+__CH4_INLINE__ int MPIDI_Reduce_scatter(const void *sendbuf, void *recvbuf,
+                                        const int recvcounts[], MPI_Datatype datatype,
+                                        MPI_Op op, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_reduce_scatter(sendbuf, recvbuf, recvcounts,
+                                        datatype, op, comm_ptr, errflag);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Reduce_scatter_block: forwards all arguments unchanged
+ * to MPIDI_NM_reduce_scatter_block and returns its result. */
+__CH4_INLINE__ int MPIDI_Reduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount,
+                                              MPI_Datatype datatype, MPI_Op op,
+                                              MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_reduce_scatter_block(sendbuf, recvbuf, recvcount,
+                                              datatype, op, comm_ptr, errflag);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Scan: forwards all arguments unchanged to
+ * MPIDI_NM_scan and returns its result. */
+__CH4_INLINE__ int MPIDI_Scan(const void *sendbuf, void *recvbuf, int count,
+                              MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                              MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPIDI_NM_scan(sendbuf, recvbuf, count, datatype, op, comm, errflag);
+
+    return mpi_errno;
+}
+
+/* CH4 entry point for Exscan: forwards all arguments unchanged to
+ * MPIDI_NM_exscan and returns its result. */
+__CH4_INLINE__ int MPIDI_Exscan(const void *sendbuf, void *recvbuf, int count,
+                                MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                MPIR_Errflag_t * errflag)
+{
+    int mpi_errno = MPIDI_NM_exscan(sendbuf, recvbuf, count, datatype, op, comm, errflag);
+
+    return mpi_errno;
+}
+
+/* CH4 entry point for Neighbor_allgather: forwards all arguments unchanged to
+ * MPIDI_NM_neighbor_allgather and returns its result. */
+__CH4_INLINE__ int MPIDI_Neighbor_allgather(const void *sendbuf, int sendcount,
+                                            MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                            MPI_Datatype recvtype, MPIR_Comm * comm)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_neighbor_allgather(sendbuf, sendcount, sendtype,
+                                            recvbuf, recvcount, recvtype, comm);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Neighbor_allgatherv: forwards all arguments unchanged to
+ * MPIDI_NM_neighbor_allgatherv and returns its result. */
+__CH4_INLINE__ int MPIDI_Neighbor_allgatherv(const void *sendbuf, int sendcount,
+                                             MPI_Datatype sendtype, void *recvbuf,
+                                             const int *recvcounts, const int *displs,
+                                             MPI_Datatype recvtype, MPIR_Comm * comm)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_neighbor_allgatherv(sendbuf, sendcount, sendtype,
+                                             recvbuf, recvcounts, displs, recvtype, comm);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Neighbor_alltoallv: forwards all arguments unchanged to
+ * MPIDI_NM_neighbor_alltoallv and returns its result. */
+__CH4_INLINE__ int MPIDI_Neighbor_alltoallv(const void *sendbuf, const int *sendcounts,
+                                            const int *sdispls, MPI_Datatype sendtype,
+                                            void *recvbuf, const int *recvcounts,
+                                            const int *rdispls, MPI_Datatype recvtype,
+                                            MPIR_Comm * comm)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_neighbor_alltoallv(sendbuf, sendcounts, sdispls, sendtype,
+                                            recvbuf, recvcounts, rdispls, recvtype, comm);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Neighbor_alltoallw: forwards all arguments unchanged to
+ * MPIDI_NM_neighbor_alltoallw and returns its result. */
+__CH4_INLINE__ int MPIDI_Neighbor_alltoallw(const void *sendbuf, const int *sendcounts,
+                                            const MPI_Aint * sdispls,
+                                            const MPI_Datatype * sendtypes, void *recvbuf,
+                                            const int *recvcounts, const MPI_Aint * rdispls,
+                                            const MPI_Datatype * recvtypes, MPIR_Comm * comm)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_neighbor_alltoallw(sendbuf, sendcounts, sdispls, sendtypes,
+                                            recvbuf, recvcounts, rdispls, recvtypes, comm);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Neighbor_alltoall: forwards all arguments unchanged to
+ * MPIDI_NM_neighbor_alltoall and returns its result. */
+__CH4_INLINE__ int MPIDI_Neighbor_alltoall(const void *sendbuf, int sendcount,
+                                           MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                           MPI_Datatype recvtype, MPIR_Comm * comm)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_neighbor_alltoall(sendbuf, sendcount, sendtype,
+                                           recvbuf, recvcount, recvtype, comm);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Ineighbor_allgather: forwards all arguments (including
+ * the request pointer) unchanged to MPIDI_NM_ineighbor_allgather and returns
+ * its result. */
+__CH4_INLINE__ int MPIDI_Ineighbor_allgather(const void *sendbuf, int sendcount,
+                                             MPI_Datatype sendtype, void *recvbuf,
+                                             int recvcount, MPI_Datatype recvtype,
+                                             MPIR_Comm * comm, MPI_Request * req)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_ineighbor_allgather(sendbuf, sendcount, sendtype,
+                                             recvbuf, recvcount, recvtype, comm, req);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Ineighbor_allgatherv: forwards all arguments (including
+ * the request pointer) unchanged to MPIDI_NM_ineighbor_allgatherv and returns
+ * its result. */
+__CH4_INLINE__ int MPIDI_Ineighbor_allgatherv(const void *sendbuf, int sendcount,
+                                              MPI_Datatype sendtype, void *recvbuf,
+                                              const int *recvcounts, const int *displs,
+                                              MPI_Datatype recvtype, MPIR_Comm * comm,
+                                              MPI_Request * req)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_ineighbor_allgatherv(sendbuf, sendcount, sendtype,
+                                              recvbuf, recvcounts, displs, recvtype, comm, req);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Ineighbor_alltoall: forwards all arguments (including
+ * the request pointer) unchanged to MPIDI_NM_ineighbor_alltoall and returns
+ * its result. */
+__CH4_INLINE__ int MPIDI_Ineighbor_alltoall(const void *sendbuf, int sendcount,
+                                            MPI_Datatype sendtype, void *recvbuf,
+                                            int recvcount, MPI_Datatype recvtype,
+                                            MPIR_Comm * comm, MPI_Request * req)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_ineighbor_alltoall(sendbuf, sendcount, sendtype,
+                                            recvbuf, recvcount, recvtype, comm, req);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Ineighbor_alltoallv: forwards all arguments (including
+ * the request pointer) unchanged to MPIDI_NM_ineighbor_alltoallv and returns
+ * its result. */
+__CH4_INLINE__ int MPIDI_Ineighbor_alltoallv(const void *sendbuf, const int *sendcounts,
+                                             const int *sdispls, MPI_Datatype sendtype,
+                                             void *recvbuf, const int *recvcounts,
+                                             const int *rdispls, MPI_Datatype recvtype,
+                                             MPIR_Comm * comm, MPI_Request * req)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_ineighbor_alltoallv(sendbuf, sendcounts, sdispls, sendtype,
+                                             recvbuf, recvcounts, rdispls, recvtype, comm, req);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Ineighbor_alltoallw: forwards all arguments (including
+ * the request pointer) unchanged to MPIDI_NM_ineighbor_alltoallw and returns
+ * its result. */
+__CH4_INLINE__ int MPIDI_Ineighbor_alltoallw(const void *sendbuf, const int *sendcounts,
+                                             const MPI_Aint * sdispls,
+                                             const MPI_Datatype * sendtypes, void *recvbuf,
+                                             const int *recvcounts, const MPI_Aint * rdispls,
+                                             const MPI_Datatype * recvtypes, MPIR_Comm * comm,
+                                             MPI_Request * req)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_ineighbor_alltoallw(sendbuf, sendcounts, sdispls, sendtypes,
+                                             recvbuf, recvcounts, rdispls, recvtypes, comm, req);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Ibarrier: forwards both arguments unchanged to
+ * MPIDI_NM_ibarrier and returns its result. */
+__CH4_INLINE__ int MPIDI_Ibarrier(MPIR_Comm * comm, MPI_Request * req)
+{
+    int mpi_errno = MPIDI_NM_ibarrier(comm, req);
+
+    return mpi_errno;
+}
+
+/* CH4 entry point for Ibcast: forwards all arguments unchanged to
+ * MPIDI_NM_ibcast and returns its result. */
+__CH4_INLINE__ int MPIDI_Ibcast(void *buffer, int count, MPI_Datatype datatype,
+                                int root, MPIR_Comm * comm, MPI_Request * req)
+{
+    int mpi_errno = MPIDI_NM_ibcast(buffer, count, datatype, root, comm, req);
+
+    return mpi_errno;
+}
+
+/* CH4 entry point for Iallgather: forwards all arguments unchanged to
+ * MPIDI_NM_iallgather and returns its result. */
+__CH4_INLINE__ int MPIDI_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                    MPIR_Comm * comm, MPI_Request * req)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_iallgather(sendbuf, sendcount, sendtype,
+                                    recvbuf, recvcount, recvtype, comm, req);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Iallgatherv: forwards all arguments unchanged to
+ * MPIDI_NM_iallgatherv and returns its result. */
+__CH4_INLINE__ int MPIDI_Iallgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                     void *recvbuf, const int *recvcounts, const int *displs,
+                                     MPI_Datatype recvtype, MPIR_Comm * comm, MPI_Request * req)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_iallgatherv(sendbuf, sendcount, sendtype,
+                                     recvbuf, recvcounts, displs, recvtype, comm, req);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Iallreduce: forwards all arguments unchanged to
+ * MPIDI_NM_iallreduce and returns its result. */
+__CH4_INLINE__ int MPIDI_Iallreduce(const void *sendbuf, void *recvbuf, int count,
+                                    MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                    MPI_Request * req)
+{
+    int mpi_errno = MPIDI_NM_iallreduce(sendbuf, recvbuf, count, datatype, op, comm, req);
+
+    return mpi_errno;
+}
+
+/* CH4 entry point for Ialltoall: forwards all arguments unchanged to
+ * MPIDI_NM_ialltoall and returns its result. */
+__CH4_INLINE__ int MPIDI_Ialltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                   MPIR_Comm * comm, MPI_Request * req)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_ialltoall(sendbuf, sendcount, sendtype,
+                                   recvbuf, recvcount, recvtype, comm, req);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Ialltoallv: forwards all arguments unchanged to
+ * MPIDI_NM_ialltoallv and returns its result. */
+__CH4_INLINE__ int MPIDI_Ialltoallv(const void *sendbuf, const int *sendcounts,
+                                    const int *sdispls, MPI_Datatype sendtype,
+                                    void *recvbuf, const int *recvcounts,
+                                    const int *rdispls, MPI_Datatype recvtype,
+                                    MPIR_Comm * comm, MPI_Request * req)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_ialltoallv(sendbuf, sendcounts, sdispls, sendtype,
+                                    recvbuf, recvcounts, rdispls, recvtype, comm, req);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Ialltoallw: forwards all arguments unchanged to
+ * MPIDI_NM_ialltoallw and returns its result. */
+__CH4_INLINE__ int MPIDI_Ialltoallw(const void *sendbuf, const int *sendcounts,
+                                    const int *sdispls, const MPI_Datatype * sendtypes,
+                                    void *recvbuf, const int *recvcounts,
+                                    const int *rdispls, const MPI_Datatype * recvtypes,
+                                    MPIR_Comm * comm, MPI_Request * req)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_ialltoallw(sendbuf, sendcounts, sdispls, sendtypes,
+                                    recvbuf, recvcounts, rdispls, recvtypes, comm, req);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Iexscan: forwards all arguments unchanged to
+ * MPIDI_NM_iexscan and returns its result. */
+__CH4_INLINE__ int MPIDI_Iexscan(const void *sendbuf, void *recvbuf, int count,
+                                 MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                 MPI_Request * req)
+{
+    int mpi_errno = MPIDI_NM_iexscan(sendbuf, recvbuf, count, datatype, op, comm, req);
+
+    return mpi_errno;
+}
+
+/* CH4 entry point for Igather: forwards all arguments unchanged to
+ * MPIDI_NM_igather and returns its result. */
+__CH4_INLINE__ int MPIDI_Igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                 void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                 int root, MPIR_Comm * comm, MPI_Request * req)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_igather(sendbuf, sendcount, sendtype,
+                                 recvbuf, recvcount, recvtype, root, comm, req);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Igatherv: forwards all arguments unchanged to
+ * MPIDI_NM_igatherv and returns its result. */
+__CH4_INLINE__ int MPIDI_Igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                  void *recvbuf, const int *recvcounts, const int *displs,
+                                  MPI_Datatype recvtype, int root, MPIR_Comm * comm,
+                                  MPI_Request * req)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_igatherv(sendbuf, sendcount, sendtype,
+                                  recvbuf, recvcounts, displs, recvtype, root, comm, req);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Ireduce_scatter_block: forwards all arguments unchanged
+ * to MPIDI_NM_ireduce_scatter_block and returns its result. */
+__CH4_INLINE__ int MPIDI_Ireduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount,
+                                               MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                                               MPI_Request * req)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_ireduce_scatter_block(sendbuf, recvbuf, recvcount,
+                                               datatype, op, comm, req);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Ireduce_scatter: forwards all arguments unchanged to
+ * MPIDI_NM_ireduce_scatter and returns its result. */
+__CH4_INLINE__ int MPIDI_Ireduce_scatter(const void *sendbuf, void *recvbuf,
+                                         const int *recvcounts, MPI_Datatype datatype,
+                                         MPI_Op op, MPIR_Comm * comm, MPI_Request * req)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_ireduce_scatter(sendbuf, recvbuf, recvcounts, datatype, op, comm, req);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Ireduce: forwards all arguments unchanged to
+ * MPIDI_NM_ireduce and returns its result. */
+__CH4_INLINE__ int MPIDI_Ireduce(const void *sendbuf, void *recvbuf, int count,
+                                 MPI_Datatype datatype, MPI_Op op, int root,
+                                 MPIR_Comm * comm, MPI_Request * req)
+{
+    int mpi_errno = MPIDI_NM_ireduce(sendbuf, recvbuf, count, datatype, op, root, comm, req);
+
+    return mpi_errno;
+}
+
+/* CH4 entry point for Iscan: forwards all arguments unchanged to
+ * MPIDI_NM_iscan and returns its result. */
+__CH4_INLINE__ int MPIDI_Iscan(const void *sendbuf, void *recvbuf, int count,
+                               MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm,
+                               MPI_Request * req)
+{
+    int mpi_errno = MPIDI_NM_iscan(sendbuf, recvbuf, count, datatype, op, comm, req);
+
+    return mpi_errno;
+}
+
+/* CH4 entry point for Iscatter: forwards all arguments unchanged to
+ * MPIDI_NM_iscatter and returns its result. */
+__CH4_INLINE__ int MPIDI_Iscatter(const void *sendbuf, int sendcount,
+                                  MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                                  MPI_Datatype recvtype, int root, MPIR_Comm * comm,
+                                  MPI_Request * req)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_iscatter(sendbuf, sendcount, sendtype,
+                                  recvbuf, recvcount, recvtype, root, comm, req);
+    return mpi_errno;
+}
+
+/* CH4 entry point for Iscatterv: forwards all arguments unchanged to
+ * MPIDI_NM_iscatterv and returns its result. */
+__CH4_INLINE__ int MPIDI_Iscatterv(const void *sendbuf, const int *sendcounts,
+                                   const int *displs, MPI_Datatype sendtype,
+                                   void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                   int root, MPIR_Comm * comm, MPI_Request * req)
+{
+    int mpi_errno;
+
+    mpi_errno = MPIDI_NM_iscatterv(sendbuf, sendcounts, displs, sendtype,
+                                   recvbuf, recvcount, recvtype, root, comm, req);
+    return mpi_errno;
+}
+
+#endif /* CH4_COLL_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4_comm.h b/src/mpid/ch4/src/ch4_comm.h
new file mode 100644
index 0000000..695566d
--- /dev/null
+++ b/src/mpid/ch4/src/ch4_comm.h
@@ -0,0 +1,249 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4_COMM_H_INCLUDED
+#define CH4_COMM_H_INCLUDED
+
+#include "ch4_impl.h"
+#include "ch4i_comm.h"
+
+/* Stub: the body only trips MPIR_Assert(0).  MPI_SUCCESS is returned if
+ * execution continues past the assertion.  `comm` is not used.
+ * NOTE(review): looks like an unimplemented placeholder - confirm. */
+__CH4_INLINE__ int MPIDI_Comm_AS_enabled(MPIR_Comm * comm)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+    return mpi_errno;
+}
+
+/* Stub: the body only trips MPIR_Assert(0).  MPI_SUCCESS is returned if
+ * execution continues past the assertion.  Neither argument is used and
+ * *failed_group_ptr is not written.
+ * NOTE(review): looks like an unimplemented placeholder - confirm. */
+__CH4_INLINE__ int MPIDI_Comm_reenable_anysource(MPIR_Comm * comm, MPIR_Group ** failed_group_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+    return mpi_errno;
+}
+
+/* Stub: the body only trips MPIR_Assert(0).  MPI_SUCCESS is returned if
+ * execution continues past the assertion.  Neither argument is used and
+ * *failed_group_ptr is not written.
+ * NOTE(review): looks like an unimplemented placeholder - confirm. */
+__CH4_INLINE__ int MPIDI_Comm_remote_group_failed(MPIR_Comm * comm, MPIR_Group ** failed_group_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+    return mpi_errno;
+}
+
+/* Stub: the body only trips MPIR_Assert(0).  MPI_SUCCESS is returned if
+ * execution continues past the assertion.  Neither argument is used and
+ * *failed_group_ptr is not written.
+ * NOTE(review): looks like an unimplemented placeholder - confirm. */
+__CH4_INLINE__ int MPIDI_Comm_group_failed(MPIR_Comm * comm_ptr, MPIR_Group ** failed_group_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Assert(0);
+    return mpi_errno;
+}
+
+/* Stub: the body only trips MPIR_Assert(0).  If execution continues past the
+ * assertion the function reports MPI_SUCCESS (spelled symbolically, like the
+ * other stubs in this header, instead of a bare 0).  `comm_ptr` is not used.
+ * NOTE(review): looks like an unimplemented placeholder - confirm. */
+__CH4_INLINE__ int MPIDI_Comm_failure_ack(MPIR_Comm * comm_ptr)
+{
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+/* Stub: the body only trips MPIR_Assert(0).  If execution continues past the
+ * assertion the function reports MPI_SUCCESS (spelled symbolically, like the
+ * other stubs in this header, instead of a bare 0).  Neither argument is used
+ * and *failed_group_ptr is not written.
+ * NOTE(review): looks like an unimplemented placeholder - confirm. */
+__CH4_INLINE__ int MPIDI_Comm_failure_get_acked(MPIR_Comm * comm_ptr,
+                                                MPIR_Group ** failed_group_ptr)
+{
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+/* Stub: the body only trips MPIR_Assert(0).  If execution continues past the
+ * assertion the function reports MPI_SUCCESS (spelled symbolically, like the
+ * other stubs in this header, instead of a bare 0).  Neither argument is used.
+ * NOTE(review): looks like an unimplemented placeholder - confirm. */
+__CH4_INLINE__ int MPIDI_Comm_revoke(MPIR_Comm * comm_ptr, int is_remote)
+{
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+/* Stub: the body only trips MPIR_Assert(0).  If execution continues past the
+ * assertion the function reports MPI_SUCCESS (spelled symbolically, like the
+ * other stubs in this header, instead of a bare 0).  None of the arguments is
+ * used and *failed_group is not written.
+ * NOTE(review): looks like an unimplemented placeholder - confirm. */
+__CH4_INLINE__ int MPIDI_Comm_get_all_failed_procs(MPIR_Comm * comm_ptr, MPIR_Group ** failed_group,
+                                                   int tag)
+{
+    MPIR_Assert(0);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Comm_split_type
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Split comm_ptr according to split_type.
+ *
+ * For MPI_COMM_TYPE_SHARED the calling rank's node id (obtained through
+ * MPIDI_Get_node_id) is used as the split color; for every other split_type
+ * the color is MPI_UNDEFINED.  The split itself is done by
+ * MPIR_Comm_split_impl, whose return value is passed back unchanged.
+ * info_ptr is not consulted.
+ */
+__CH4_INLINE__ int MPIDI_Comm_split_type(MPIR_Comm * comm_ptr,
+                                         int split_type,
+                                         int key, MPIR_Info * info_ptr, MPIR_Comm ** newcomm_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int color = MPI_UNDEFINED;
+    int lpid;                   /* written by MPIDI_Comm_get_lpid, not read afterwards */
+    MPID_Node_id_t my_node;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_COMM_SPLIT_TYPE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_COMM_SPLIT_TYPE);
+
+    if (split_type == MPI_COMM_TYPE_SHARED) {
+        MPIDI_Comm_get_lpid(comm_ptr, comm_ptr->rank, &lpid, FALSE);
+        MPIDI_Get_node_id(comm_ptr, comm_ptr->rank, &my_node);
+        color = my_node;
+    }
+
+    mpi_errno = MPIR_Comm_split_impl(comm_ptr, color, key, newcomm_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_COMM_SPLIT_TYPE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Comm_create
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Device-level part of communicator creation.
+ *
+ * Calls MPIDI_NM_comm_create and, when MPIDI_BUILD_CH4_SHM is defined,
+ * MPIDI_SHM_comm_create; a failure of either is returned immediately.
+ * For every communicator other than MPIR_Process.comm_world and
+ * MPIR_Process.comm_self it then builds the rank maps with
+ * MPIDII_comm_create_rank_map and takes one reference (MPIDIU_avt_add_ref) on
+ * each distinct avtid used by the `map` and `local_map` rank maps:
+ *   - MPIDII_RANK_MAP_NONE: nothing to reference;
+ *   - MPIDII_RANK_MAP_MLUT: one reference per distinct gpid[i].avtid;
+ *   - any other mode:       one reference on the map's single avtid.
+ *
+ * Returns MPI_SUCCESS, or the error code of the failing netmod/shm hook.
+ */
+__CH4_INLINE__ int MPIDI_Comm_create(MPIR_Comm * comm)
+{
+    int mpi_errno;
+    int i, *uniq_avtids;
+    int max_n_avts;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_COMM_CREATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_COMM_CREATE);
+    mpi_errno = MPIDI_NM_comm_create(comm);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+#if defined(MPIDI_BUILD_CH4_SHM)
+    mpi_errno = MPIDI_SHM_comm_create(comm);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+#endif
+
+    /* comm_world and comm_self are already initialized */
+    if (comm != MPIR_Process.comm_world && comm != MPIR_Process.comm_self) {
+        MPIDII_comm_create_rank_map(comm);
+        /* add ref to avts */
+        switch (MPIDII_COMM(comm, map).mode) {
+        case MPIDII_RANK_MAP_NONE:
+            break;
+        case MPIDII_RANK_MAP_MLUT:
+            /* uniq_avtids[a] becomes 1 once avtid a has been referenced, so
+             * every distinct avtid is referenced exactly once */
+            max_n_avts = MPIDIU_get_max_n_avts();
+            uniq_avtids = (int *) MPL_malloc(max_n_avts * sizeof(int));
+            MPIR_Assert(uniq_avtids != NULL);
+            /* memset takes a byte count: clear all max_n_avts ints, not just
+             * the first max_n_avts bytes */
+            memset(uniq_avtids, 0, max_n_avts * sizeof(int));
+            for (i = 0; i < MPIDII_COMM(comm, map).size; i++) {
+                if (uniq_avtids[MPIDII_COMM(comm, map).irreg.mlut.gpid[i].avtid] == 0) {
+                    uniq_avtids[MPIDII_COMM(comm, map).irreg.mlut.gpid[i].avtid] = 1;
+                    MPIDIU_avt_add_ref(MPIDII_COMM(comm, map).irreg.mlut.gpid[i].avtid);
+                }
+            }
+            MPL_free(uniq_avtids);
+            break;
+        default:
+            MPIDIU_avt_add_ref(MPIDII_COMM(comm, map).avtid);
+        }
+
+        switch (MPIDII_COMM(comm, local_map).mode) {
+        case MPIDII_RANK_MAP_NONE:
+            break;
+        case MPIDII_RANK_MAP_MLUT:
+            /* same de-duplication as above, for the local map */
+            max_n_avts = MPIDIU_get_max_n_avts();
+            uniq_avtids = (int *) MPL_malloc(max_n_avts * sizeof(int));
+            MPIR_Assert(uniq_avtids != NULL);
+            memset(uniq_avtids, 0, max_n_avts * sizeof(int));
+            for (i = 0; i < MPIDII_COMM(comm, local_map).size; i++) {
+                if (uniq_avtids[MPIDII_COMM(comm, local_map).irreg.mlut.gpid[i].avtid] == 0) {
+                    uniq_avtids[MPIDII_COMM(comm, local_map).irreg.mlut.gpid[i].avtid] = 1;
+                    MPIDIU_avt_add_ref(MPIDII_COMM(comm, local_map).irreg.mlut.gpid[i].avtid);
+                }
+            }
+            MPL_free(uniq_avtids);
+            break;
+        default:
+            MPIDIU_avt_add_ref(MPIDII_COMM(comm, local_map).avtid);
+        }
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_COMM_CREATE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Comm_destroy
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Device-level part of communicator destruction.
+ *
+ * Order of operations:
+ *   1. release one reference (MPIDIU_avt_release_ref) on each distinct avtid
+ *      used by the `map` and `local_map` rank maps (nothing for
+ *      MPIDII_RANK_MAP_NONE, one per distinct gpid[i].avtid for
+ *      MPIDII_RANK_MAP_MLUT, the map's single avtid otherwise);
+ *   2. call MPIDI_NM_comm_destroy and, when MPIDI_BUILD_CH4_SHM is defined,
+ *      MPIDI_SHM_comm_destroy; a failure of either is returned immediately and
+ *      step 3 is skipped;
+ *   3. release the LUT / MLUT tables of both rank maps.
+ *
+ * Returns MPI_SUCCESS, or the error code of the failing netmod/shm hook.
+ */
+__CH4_INLINE__ int MPIDI_Comm_destroy(MPIR_Comm * comm)
+{
+    int mpi_errno;
+    int i, *uniq_avtids;
+    int max_n_avts;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_COMM_DESTROY);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_COMM_DESTROY);
+    /* release ref to avts */
+    switch (MPIDII_COMM(comm, map).mode) {
+    case MPIDII_RANK_MAP_NONE:
+        break;
+    case MPIDII_RANK_MAP_MLUT:
+        /* uniq_avtids[a] becomes 1 once avtid a has been released, so every
+         * distinct avtid is released exactly once */
+        max_n_avts = MPIDIU_get_max_n_avts();
+        uniq_avtids = (int *) MPL_malloc(max_n_avts * sizeof(int));
+        MPIR_Assert(uniq_avtids != NULL);
+        /* memset takes a byte count: clear all max_n_avts ints, not just the
+         * first max_n_avts bytes */
+        memset(uniq_avtids, 0, max_n_avts * sizeof(int));
+        for (i = 0; i < MPIDII_COMM(comm, map).size; i++) {
+            if (uniq_avtids[MPIDII_COMM(comm, map).irreg.mlut.gpid[i].avtid] == 0) {
+                uniq_avtids[MPIDII_COMM(comm, map).irreg.mlut.gpid[i].avtid] = 1;
+                MPIDIU_avt_release_ref(MPIDII_COMM(comm, map).irreg.mlut.gpid[i].avtid);
+            }
+        }
+        MPL_free(uniq_avtids);
+        break;
+    default:
+        MPIDIU_avt_release_ref(MPIDII_COMM(comm, map).avtid);
+    }
+
+    switch (MPIDII_COMM(comm, local_map).mode) {
+    case MPIDII_RANK_MAP_NONE:
+        break;
+    case MPIDII_RANK_MAP_MLUT:
+        /* same de-duplication as above, for the local map */
+        max_n_avts = MPIDIU_get_max_n_avts();
+        uniq_avtids = (int *) MPL_malloc(max_n_avts * sizeof(int));
+        MPIR_Assert(uniq_avtids != NULL);
+        memset(uniq_avtids, 0, max_n_avts * sizeof(int));
+        for (i = 0; i < MPIDII_COMM(comm, local_map).size; i++) {
+            if (uniq_avtids[MPIDII_COMM(comm, local_map).irreg.mlut.gpid[i].avtid] == 0) {
+                uniq_avtids[MPIDII_COMM(comm, local_map).irreg.mlut.gpid[i].avtid] = 1;
+                MPIDIU_avt_release_ref(MPIDII_COMM(comm, local_map).irreg.mlut.gpid[i].avtid);
+            }
+        }
+        MPL_free(uniq_avtids);
+        break;
+    default:
+        MPIDIU_avt_release_ref(MPIDII_COMM(comm, local_map).avtid);
+    }
+
+    mpi_errno = MPIDI_NM_comm_destroy(comm);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+#if defined(MPIDI_BUILD_CH4_SHM)
+    mpi_errno = MPIDI_SHM_comm_destroy(comm);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+#endif
+
+    /* drop the lookup tables owned by the rank maps */
+    if (MPIDII_COMM(comm, map).mode == MPIDII_RANK_MAP_LUT
+        || MPIDII_COMM(comm, map).mode == MPIDII_RANK_MAP_LUT_INTRA) {
+        MPIDIU_release_lut(MPIDII_COMM(comm, map).irreg.lut.t);
+    }
+    if (MPIDII_COMM(comm, local_map).mode == MPIDII_RANK_MAP_LUT
+        || MPIDII_COMM(comm, local_map).mode == MPIDII_RANK_MAP_LUT_INTRA) {
+        MPIDIU_release_lut(MPIDII_COMM(comm, local_map).irreg.lut.t);
+    }
+    if (MPIDII_COMM(comm, map).mode == MPIDII_RANK_MAP_MLUT) {
+        MPIDIU_release_mlut(MPIDII_COMM(comm, map).irreg.mlut.t);
+    }
+    if (MPIDII_COMM(comm, local_map).mode == MPIDII_RANK_MAP_MLUT) {
+        MPIDIU_release_mlut(MPIDII_COMM(comm, local_map).irreg.mlut.t);
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_COMM_DESTROY);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#endif /* CH4_COMM_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4_globals.c b/src/mpid/ch4/src/ch4_globals.c
new file mode 100644
index 0000000..44ccbcd
--- /dev/null
+++ b/src/mpid/ch4/src/ch4_globals.c
@@ -0,0 +1,62 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+/* All global ADI data structures need to go in this file */
+/* reference them with externs from other files           */
+
+#include <mpidimpl.h>
+#include "ch4_impl.h"
+
+/* Device-wide CH4 state; among other things it provides the
+ * MPIR_Comm_fns_store storage that init_comm() in this file installs. */
+MPIDI_CH4_Global_t MPIDI_CH4_Global;
+/* Address-vector tables.  NOTE(review): presumably indexed by avtid, with
+ * MPIDII_av_table0 being the first table - confirm against ch4_types.h. */
+MPIDII_av_table_t **MPIDII_av_table;
+MPIDII_av_table_t *MPIDII_av_table0;
+
+/* Function tables of the network module (MPIDI_NM_*). */
+MPIDI_NM_funcs_t *MPIDI_NM_func;
+MPIDI_NM_native_funcs_t *MPIDI_NM_native_func;
+
+/* Function tables of the shared-memory module; defined only when
+ * MPIDI_BUILD_CH4_SHM is set. */
+#ifdef MPIDI_BUILD_CH4_SHM
+MPIDI_SHM_funcs_t *MPIDI_SHM_func;
+MPIDI_SHM_native_funcs_t *MPIDI_SHM_native_func;
+#endif
+
+/* Array of MPIDI_NUM_LOCKS pthread mutexes, defined only when
+ * MPID_DEVICE_DEFINES_THREAD_CS is set. */
+#ifdef MPID_DEVICE_DEFINES_THREAD_CS
+pthread_mutex_t MPIDI_Mutex_lock[MPIDI_NUM_LOCKS];
+#endif
+
+/* The MPID_Abort ADI is strangely defined by the upper layers. */
+/* We should fix the upper layer to define MPID_Abort like any */
+/* other ADI. */
+/* If MPID_Abort exists as a macro, it is temporarily removed so that a real
+ * function named MPID_Abort, forwarding to MPIDI_Abort, can be defined; the
+ * macro name is re-established afterwards.  Note that MPID_TMP is an
+ * object-like macro expanding to the token MPID_Abort, so the final #define
+ * does not capture the macro's original replacement text. */
+#ifdef MPID_Abort
+#define MPID_TMP MPID_Abort
+#undef MPID_Abort
+int MPID_Abort(MPIR_Comm * comm, int mpi_errno, int exit_code, const char *error_msg)
+{
+    return MPIDI_Abort(comm, mpi_errno, exit_code, error_msg);
+}
+
+#define MPID_Abort MPID_TMP
+#endif
+
+/*
+ * Another ADI hook that does not follow the usual convention: instead of being
+ * a regular MPID_ function it is installed through the MPIR_Comm_fns table.
+ *
+ * init_comm carries __attribute__((constructor)).  It points MPIR_Comm_fns at
+ * the storage inside MPIDI_CH4_Global and sets its split_type entry to
+ * MPIDI_Comm_split_type.  The function takes no arguments, which is now stated
+ * with a proper (void) prototype rather than an empty parameter list.
+ */
+static void init_comm(void) __attribute__ ((constructor));
+static void init_comm(void)
+{
+    MPIR_Comm_fns = &MPIDI_CH4_Global.MPIR_Comm_fns_store;
+    MPIR_Comm_fns->split_type = MPIDI_Comm_split_type;
+}
+
+/* Debug-logging class handles for CH4.  MPIDI_CH4_DBG_GENERAL is defined once,
+ * unconditionally; the MAP and MEMORY classes exist only when
+ * MPL_USE_DBG_LOGGING is defined. */
+MPL_dbg_class MPIDI_CH4_DBG_GENERAL;
+
+#if defined(MPL_USE_DBG_LOGGING)
+MPL_dbg_class MPIDI_CH4_DBG_MAP;
+MPL_dbg_class MPIDI_CH4_DBG_MEMORY;
+#endif
diff --git a/src/mpid/ch4/src/ch4_impl.h b/src/mpid/ch4/src/ch4_impl.h
new file mode 100644
index 0000000..f8a8b75
--- /dev/null
+++ b/src/mpid/ch4/src/ch4_impl.h
@@ -0,0 +1,553 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4_IMPL_H_INCLUDED
+#define CH4_IMPL_H_INCLUDED
+
+#include "ch4_types.h"
+#include <mpidch4.h>
+
+/* Static inlines */
+/* Extract the MPI tag stored in the tag field of a match-bits word. */
+static inline int MPIDI_CH4U_get_tag(uint64_t match_bits)
+{
+    int field = (match_bits & MPIDI_CH4U_TAG_MASK);
+
+    /* Shifting left and then right by MPIDI_CH4U_TAG_SHIFT_UNPACK makes sure
+     * the sign of the tag is retained. */
+    field = field << MPIDI_CH4U_TAG_SHIFT_UNPACK;
+    return field >> MPIDI_CH4U_TAG_SHIFT_UNPACK;
+}
+
+/* Extract the context field of a match-bits word: mask it out, then shift it
+ * down past the tag and source fields. */
+static inline int MPIDI_CH4U_get_context(uint64_t match_bits)
+{
+    uint64_t context_bits = match_bits & MPIDI_CH4U_CONTEXT_MASK;
+
+    context_bits >>= (MPIDI_CH4U_TAG_SHIFT + MPIDI_CH4U_SOURCE_SHIFT);
+    return (int) context_bits;
+}
+
+/*
+ * Map a context id to an index derived from its PREFIX field: the prefix is
+ * split into a word number and a bit position (both relative to
+ * MPIR_CONTEXT_INT_BITS) and the bit position is mirrored within the word.
+ */
+static inline int MPIDI_CH4U_get_context_index(uint64_t context_id)
+{
+    const int prefix = MPIR_CONTEXT_READ_FIELD(PREFIX, context_id);
+    const int word = prefix / MPIR_CONTEXT_INT_BITS;
+    const int bit = prefix % MPIR_CONTEXT_INT_BITS;
+
+    /* NOTE(review): 31 is presumably MPIR_CONTEXT_INT_BITS - 1 -- confirm. */
+    return (word * MPIR_CONTEXT_INT_BITS) + (31 - bit);
+}
+
+/*
+ * Look up the communicator registered for a context id.  The slot is selected
+ * by the context index plus the IS_LOCALCOMM and SUBCOMM fields of the id.
+ */
+static inline MPIR_Comm *MPIDI_CH4U_context_id_to_comm(uint64_t context_id)
+{
+    const int slot = MPIDI_CH4U_get_context_index(context_id);
+    const int sub = MPIR_CONTEXT_READ_FIELD(SUBCOMM, context_id);
+    const int local = MPIR_CONTEXT_READ_FIELD(IS_LOCALCOMM, context_id);
+
+    /* NOTE(review): these only bound the indices from above; confirm the
+     * limits agree with the dimensions of comm[][] in ch4_types.h. */
+    MPIR_Assert(sub <= 3);
+    MPIR_Assert(local <= 2);
+
+    return MPIDI_CH4_Global.comm_req_lists[slot].comm[local][sub];
+}
+
+/*
+ * Return the address of the uelist head that belongs to a context id.  The
+ * slot is chosen exactly as in MPIDI_CH4U_context_id_to_comm: context index,
+ * then IS_LOCALCOMM, then SUBCOMM.
+ */
+static inline MPIDI_CH4U_rreq_t **MPIDI_CH4U_context_id_to_uelist(uint64_t context_id)
+{
+    const int slot = MPIDI_CH4U_get_context_index(context_id);
+    const int sub = MPIR_CONTEXT_READ_FIELD(SUBCOMM, context_id);
+    const int local = MPIR_CONTEXT_READ_FIELD(IS_LOCALCOMM, context_id);
+
+    MPIR_Assert(sub <= 3);
+    MPIR_Assert(local <= 2);
+
+    return &(MPIDI_CH4_Global.comm_req_lists[slot].uelist[local][sub]);
+}
+
+/*
+ * Build a window id for a new window on comm_ptr: the communicator's context
+ * id occupies the lower bits, the per-communicator window instance counter the
+ * upper 32 bits, and the packed value is offset by one.  The instance counter
+ * is post-incremented as a side effect.
+ */
+static inline uint64_t MPIDI_CH4U_generate_win_id(MPIR_Comm * comm_ptr)
+{
+    const uint64_t instance = (uint64_t) ((MPIDI_CH4U_COMM(comm_ptr, window_instance))++);
+    const uint64_t context = (uint64_t) comm_ptr->context_id;
+
+    return 1 + (context | (instance << 32));
+}
+
+/* Inverse of the context half of MPIDI_CH4U_generate_win_id: undo the +1
+ * offset and keep the lower 32 bits, which hold the context id. */
+static inline MPIR_Context_id_t MPIDI_CH4U_win_id_to_context(uint64_t win_id)
+{
+    const uint64_t packed = win_id - 1;
+
+    return packed & 0xffffffff;
+}
+
+/* Context id encoded in the win_id stored on a window object. */
+static inline MPIR_Context_id_t MPIDI_CH4U_win_to_context(const MPIR_Win * win)
+{
+    const uint64_t id = MPIDI_CH4U_WIN(win, win_id);
+
+    return MPIDI_CH4U_win_id_to_context(id);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_request_release
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Drop one reference on req via MPIR_Request_free.  For a persistent receive
+ * request (MPIR_REQUEST_KIND__PREQUEST_RECV) that still has an any-source
+ * partner request attached, the partner is freed first.
+ */
+__CH4_INLINE__ void MPIDI_CH4U_request_release(MPIR_Request * req)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4R_REQUEST_RELEASE);
+    /* The state name must match the declaration above (it used to be
+     * misspelled "REQEUST", which breaks builds with function logging). */
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4R_REQUEST_RELEASE);
+
+    if (req->kind == MPIR_REQUEST_KIND__PREQUEST_RECV &&
+        NULL != MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(req)) {
+        MPIR_Request_free(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(req));
+    }
+    MPIR_Request_free(req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4R_REQUEST_RELEASE);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_request_complete
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Decrement the completion counter that req->cc_ptr points at and, once
+ * MPIR_cc_decr reports nothing left outstanding, release the request.
+ */
+__CH4_INLINE__ void MPIDI_CH4U_request_complete(MPIR_Request * req)
+{
+    int still_pending;
+
+    MPIR_cc_decr(req->cc_ptr, &still_pending);
+    if (still_pending)
+        return;
+
+    MPIDI_CH4U_request_release(req);
+}
+
+#ifndef dtype_add_ref_if_not_builtin
+/*
+ * Add a reference to the datatype object behind handle datatype_, unless the
+ * handle is MPI_DATATYPE_NULL or names a builtin type (those are skipped).
+ * datatype_ is expanded more than once, so pass a side-effect-free expression.
+ */
+#define dtype_add_ref_if_not_builtin(datatype_)                         \
+    do {								\
+	if ((datatype_) != MPI_DATATYPE_NULL &&				\
+	    HANDLE_GET_KIND((datatype_)) != HANDLE_KIND_BUILTIN)	\
+	{								\
+	    MPIR_Datatype *dtp_ = NULL;					\
+	    MPID_Datatype_get_ptr((datatype_), dtp_);			\
+	    MPID_Datatype_add_ref(dtp_);				\
+	}								\
+    } while (0)
+#endif
+
+#ifndef dtype_release_if_not_builtin
+/*
+ * Counterpart of dtype_add_ref_if_not_builtin: release the datatype object
+ * behind handle datatype_, unless the handle is MPI_DATATYPE_NULL or builtin.
+ * datatype_ is expanded more than once, so pass a side-effect-free expression.
+ */
+#define dtype_release_if_not_builtin(datatype_)				\
+    do {								\
+	if ((datatype_) != MPI_DATATYPE_NULL &&				\
+	    HANDLE_GET_KIND((datatype_)) != HANDLE_KIND_BUILTIN)	\
+	{								\
+	    MPIR_Datatype *dtp_ = NULL;					\
+	    MPID_Datatype_get_ptr((datatype_), dtp_);			\
+	    MPID_Datatype_release(dtp_);				\
+	}								\
+    } while (0)
+#endif
+
+/*
+ * Gather the basic facts about (_count, _datatype) into four output lvalues:
+ *   _dt_ptr        - datatype object pointer (NULL for builtin handles)
+ *   _dt_contig_out - contiguity flag
+ *   _data_sz_out   - total size, computed as (size_t)_count * element size
+ *   _dt_true_lb    - true lower bound
+ * Builtin handles: contiguous, true_lb 0, size from MPID_Datatype_get_basic_size.
+ * Other handles: values are read from the object MPID_Datatype_get_ptr yields;
+ * if that pointer is NULL the outputs fall back to contig = 1, lb = 0, size = 0.
+ * Arguments are expanded several times; avoid side effects in them.
+ */
+#define MPIDI_Datatype_get_info(_count, _datatype,                      \
+                                _dt_contig_out, _data_sz_out,           \
+                                _dt_ptr, _dt_true_lb)                   \
+    do {								\
+	if (IS_BUILTIN(_datatype))					\
+	{								\
+	    (_dt_ptr)        = NULL;					\
+	    (_dt_contig_out) = TRUE;					\
+	    (_dt_true_lb)    = 0;					\
+	    (_data_sz_out)   = (size_t)(_count) *		\
+		MPID_Datatype_get_basic_size(_datatype);		\
+	}								\
+	else								\
+	{								\
+	    MPID_Datatype_get_ptr((_datatype), (_dt_ptr));		\
+            if (_dt_ptr)                                                \
+            {                                                           \
+                (_dt_contig_out) = (_dt_ptr)->is_contig;                \
+                (_dt_true_lb)    = (_dt_ptr)->true_lb;                  \
+                (_data_sz_out)   = (size_t)(_count) *           \
+                    (_dt_ptr)->size;                                    \
+            }                                                           \
+            else                                                        \
+            {                                                           \
+                (_dt_contig_out) = 1;                                   \
+                (_dt_true_lb)    = 0;                                   \
+                (_data_sz_out)   = 0;                                   \
+            }								\
+        }                                                               \
+    } while (0)
+
+/*
+ * Reduced form of MPIDI_Datatype_get_info: stores only the datatype object
+ * pointer in _dt_ptr (NULL for builtin handles) and the total size in
+ * _data_sz_out.  For a non-builtin handle whose object pointer comes back
+ * NULL the size is set to 0.
+ */
+#define MPIDI_Datatype_get_size_dt_ptr(_count, _datatype,               \
+                                       _data_sz_out, _dt_ptr)           \
+    do {								\
+	if (IS_BUILTIN(_datatype))					\
+	{								\
+	    (_dt_ptr)        = NULL;					\
+	    (_data_sz_out)   = (size_t)(_count) *		\
+		MPID_Datatype_get_basic_size(_datatype);		\
+	}								\
+	else								\
+	{								\
+	    MPID_Datatype_get_ptr((_datatype), (_dt_ptr));		\
+	    (_data_sz_out)   = (_dt_ptr) ? (size_t)(_count) *   \
+                (_dt_ptr)->size : 0;                                    \
+	}								\
+    } while (0)
+
+/*
+ * Store the contiguity flag of _datatype in _dt_contig_out.  Builtin handles
+ * are always reported contiguous (TRUE); otherwise the flag is read from the
+ * datatype object, defaulting to 1 when the object pointer is NULL.
+ * The macro declares its own local named _dt_ptr, so the arguments must not
+ * refer to a caller variable of that name.
+ */
+#define MPIDI_Datatype_check_contig(_datatype,_dt_contig_out)	\
+    do {							\
+      if (IS_BUILTIN(_datatype))				\
+      {								\
+       (_dt_contig_out) = TRUE;					\
+       }							\
+      else							\
+      {								\
+       MPIR_Datatype *_dt_ptr;					\
+       MPID_Datatype_get_ptr((_datatype), (_dt_ptr));		\
+       (_dt_contig_out) = (_dt_ptr) ? (_dt_ptr)->is_contig : 1; \
+      }                                                         \
+    } while (0)
+
+/*
+ * Store both the contiguity flag (_dt_contig_out) and the total size
+ * (_data_sz_out = (size_t)_count * element size) of (_datatype, _count).
+ * Builtin handles: TRUE and the basic size.  Other handles: values from the
+ * datatype object, or contig = 1 / size = 0 when the object pointer is NULL.
+ * Note the argument order: datatype first, then count.
+ * The macro declares its own local named _dt_ptr.
+ */
+#define MPIDI_Datatype_check_contig_size(_datatype,_count,              \
+                                         _dt_contig_out,                \
+                                         _data_sz_out)                  \
+    do {								\
+      if (IS_BUILTIN(_datatype))					\
+      {                                                                 \
+	  (_dt_contig_out) = TRUE;					\
+	  (_data_sz_out)   = (size_t)(_count) *			\
+	      MPID_Datatype_get_basic_size(_datatype);			\
+      }                                                                 \
+      else								\
+      {                                                                 \
+	  MPIR_Datatype *_dt_ptr;					\
+	  MPID_Datatype_get_ptr((_datatype), (_dt_ptr));		\
+          if (_dt_ptr)                                                  \
+          {                                                             \
+              (_dt_contig_out) = (_dt_ptr)->is_contig;                  \
+              (_data_sz_out)   = (size_t)(_count) *             \
+                  (_dt_ptr)->size;                                      \
+          }                                                             \
+          else                                                          \
+          {                                                             \
+              (_dt_contig_out) = 1;                                     \
+              (_data_sz_out)   = 0;                                     \
+          }                                                             \
+      }                                                                 \
+    } while (0)
+
+/*
+ * Store the total size of _count elements of _datatype in _data_sz_out.
+ * Builtin handles use MPID_Datatype_get_basic_size; other handles use the
+ * size field of the datatype object, or 0 when the object pointer is NULL.
+ * The macro declares its own local named _dt_ptr.
+ */
+#define MPIDI_Datatype_check_size(_datatype,_count,_data_sz_out)        \
+    do {								\
+        if (IS_BUILTIN(_datatype))                                      \
+        {                                                               \
+            (_data_sz_out)   = (size_t)(_count) *               \
+                MPID_Datatype_get_basic_size(_datatype);                \
+        }                                                               \
+        else                                                            \
+        {                                                               \
+            MPIR_Datatype *_dt_ptr;                                     \
+            MPID_Datatype_get_ptr((_datatype), (_dt_ptr));              \
+            (_data_sz_out)   = (_dt_ptr) ? (size_t)(_count) *   \
+                (_dt_ptr)->size : 0;                                    \
+        }                                                               \
+    } while (0)
+
+/*
+ * Like MPIDI_Datatype_check_contig_size, and additionally stores the true
+ * lower bound in _dt_true_lb.  Builtin handles: TRUE, basic size, lb 0.
+ * Other handles: values from the datatype object, or contig = 1 / size = 0 /
+ * lb = 0 when the object pointer is NULL.
+ * The macro declares its own local named _dt_ptr.
+ */
+#define MPIDI_Datatype_check_contig_size_lb(_datatype,_count,           \
+                                            _dt_contig_out,             \
+                                            _data_sz_out,               \
+                                            _dt_true_lb)                \
+    do {								\
+	if (IS_BUILTIN(_datatype))					\
+	{								\
+	    (_dt_contig_out) = TRUE;					\
+	    (_data_sz_out)   = (size_t)(_count) *		\
+		MPID_Datatype_get_basic_size(_datatype);		\
+	    (_dt_true_lb)    = 0;					\
+	}								\
+	else								\
+	{								\
+	    MPIR_Datatype *_dt_ptr;					\
+	    MPID_Datatype_get_ptr((_datatype), (_dt_ptr));		\
+            if (_dt_ptr)                                                \
+            {                                                           \
+                (_dt_contig_out) = (_dt_ptr)->is_contig;                \
+                (_data_sz_out)   = (size_t)(_count) *           \
+                    (_dt_ptr)->size;                                    \
+                (_dt_true_lb)    = (_dt_ptr)->true_lb;                  \
+            }                                                           \
+            else                                                        \
+            {                                                           \
+                (_dt_contig_out) = 1;                                   \
+                (_data_sz_out)   = 0;                                   \
+                (_dt_true_lb)    = 0;                                   \
+            }                                                           \
+	}								\
+    } while (0)
+
+/*
+ * Create a receive request in rreq_ that is already finished: its completion
+ * counter is set to 0 and its status is filled in by MPIR_Status_set_procnull.
+ * If MPIR_Request_create returns NULL, mpi_errno_ is set through
+ * MPIR_ERR_SETANDJUMP with MPI_ERR_OTHER / "**nomemreq".
+ * NOTE(review): the FAIL_ argument is never used in the expansion; the jump
+ * target is whatever MPIR_ERR_SETANDJUMP hard-codes -- confirm callers expect
+ * that.
+ */
+#define MPIDI_Request_create_null_rreq(rreq_, mpi_errno_, FAIL_)        \
+  do {                                                                  \
+    (rreq_) = MPIR_Request_create(MPIR_REQUEST_KIND__RECV);             \
+    if ((rreq_) != NULL) {                                              \
+      MPIR_cc_set(&(rreq_)->cc, 0);                                     \
+      MPIR_Status_set_procnull(&(rreq_)->status);                       \
+    }                                                                   \
+    else {                                                              \
+      MPIR_ERR_SETANDJUMP(mpi_errno_,MPI_ERR_OTHER,"**nomemreq");       \
+    }                                                                   \
+  } while (0)
+
+/* True when the handle kind of _datatype is HANDLE_KIND_BUILTIN.  The macros
+ * above use it before this point; that is fine because macro bodies are only
+ * expanded where they are invoked. */
+#define IS_BUILTIN(_datatype)				\
+    (HANDLE_GET_KIND(_datatype) == HANDLE_KIND_BUILTIN)
+
+#ifndef container_of
+/*
+ * Given a pointer to member `field` of a `type` object, return a pointer to
+ * the enclosing object.  ptr is parenthesized so that expression arguments
+ * such as `p + 1` are evaluated before the cast to char *.
+ */
+#define container_of(ptr, type, field)			\
+    ((type *) ((char *)(ptr) - offsetof(type, field)))
+#endif
+
+/*
+ * Pack (contextid, source, tag) into a match-bits word for a send.  Fields
+ * are laid out from high to low as context, source, tag; source and tag are
+ * masked to their field widths before being merged in.
+ */
+static inline uint64_t MPIDI_CH4U_init_send_tag(MPIR_Context_id_t contextid, int source, int tag)
+{
+    const uint64_t source_field = (source & (MPIDI_CH4U_SOURCE_MASK >> MPIDI_CH4U_TAG_SHIFT));
+    const uint64_t tag_field = (MPIDI_CH4U_TAG_MASK & tag);
+    uint64_t match_bits = contextid;
+
+    match_bits <<= MPIDI_CH4U_SOURCE_SHIFT;
+    match_bits |= source_field;
+    match_bits <<= MPIDI_CH4U_TAG_SHIFT;
+    match_bits |= tag_field;
+
+    return match_bits;
+}
+
+/*
+ * Build the match bits and the ignore mask for a receive.
+ *   *mask_bits  starts as MPIDI_CH4U_PROTOCOL_MASK and additionally gets the
+ *               source mask for MPI_ANY_SOURCE and the tag mask for MPI_ANY_TAG.
+ *   return      context | source | tag packed like MPIDI_CH4U_init_send_tag,
+ *               with wildcarded fields left as zero.
+ * NOTE(review): unlike the send path, a concrete source is OR-ed in without
+ * masking -- confirm callers never pass an out-of-range rank here.
+ */
+static inline uint64_t MPIDI_CH4U_init_recvtag(uint64_t * mask_bits,
+                                               MPIR_Context_id_t contextid, int source, int tag)
+{
+    uint64_t match_bits;
+
+    *mask_bits = MPIDI_CH4U_PROTOCOL_MASK;
+
+    match_bits = contextid;
+    match_bits <<= MPIDI_CH4U_SOURCE_SHIFT;
+
+    /* Source field: wildcard goes into the mask, a real rank into the bits. */
+    if (source == MPI_ANY_SOURCE)
+        *mask_bits |= MPIDI_CH4U_SOURCE_MASK;
+    else
+        match_bits |= source;
+    match_bits <<= MPIDI_CH4U_TAG_SHIFT;
+
+    /* Tag field: same treatment. */
+    if (tag == MPI_ANY_TAG) {
+        *mask_bits |= MPIDI_CH4U_TAG_MASK;
+    }
+    else {
+        match_bits |= (MPIDI_CH4U_TAG_MASK & tag);
+    }
+
+    return match_bits;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4I_valid_group_rank
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Check whether `rank` of communicator `comm` is a member of group `grp`.
+ *
+ * The rank is translated to an lpid with MPIDI_NM_comm_get_lpid and searched
+ * linearly in grp->lrank_to_lpid[].  MPI_PROC_NULL is always accepted.
+ *
+ * Returns 1 when the rank is valid for the group, 0 otherwise.
+ */
+static inline int MPIDI_CH4I_valid_group_rank(MPIR_Comm * comm, int rank, MPIR_Group * grp)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_VALID_GROUP_RANK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_VALID_GROUP_RANK);
+
+    int lpid;
+    int size = grp->size;
+    int z;
+    int ret;
+
+    if (unlikely(rank == MPI_PROC_NULL)) {
+        /* Treat PROC_NULL as always valid */
+        ret = 1;
+        goto fn_exit;
+    }
+
+    MPIDI_NM_comm_get_lpid(comm, rank, &lpid, FALSE);
+
+    /* Linear search; z == size afterwards means the lpid was not found. */
+    for (z = 0; z < size && lpid != grp->lrank_to_lpid[z].lpid; ++z) {
+    }
+
+    ret = (z < size);
+
+  fn_exit:
+    /* The label sits before the EXIT marker so that the PROC_NULL shortcut
+     * also balances the ENTER above. */
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_VALID_GROUP_RANK);
+    return ret;
+}
+
+/*
+ * Run one MPIDI_Progress_test() pass.  The result is stored in a variable
+ * named mpi_errno that must exist in the caller's scope; on anything other
+ * than MPI_SUCCESS the error is handed to MPIR_ERR_POP.
+ */
+#define MPIDI_CH4R_PROGRESS()                                   \
+    do {							\
+	mpi_errno = MPIDI_Progress_test();			\
+	if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno);	\
+    } while (0)
+
+/*
+ * Keep invoking MPIDI_CH4R_PROGRESS() for as long as cond evaluates true.
+ * cond is re-evaluated before every pass; the caller-scope requirements of
+ * MPIDI_CH4R_PROGRESS (an mpi_errno variable) apply here as well.
+ */
+#define MPIDI_CH4R_PROGRESS_WHILE(cond)         \
+    do {					\
+	while (cond)				\
+	    MPIDI_CH4R_PROGRESS();              \
+    } while (0)
+
+#ifdef HAVE_ERROR_CHECKING
+/*
+ * Epoch check with a side effect: if the origin and target epoch types of win
+ * are equal and both are REFENCE, both are switched to FENCE.  Afterwards an
+ * origin epoch of NONE or POST is reported as MPI_ERR_RMA_SYNC ("**rmasync")
+ * through MPIR_ERR_SETANDSTMT, which is handed stmt.
+ */
+#define MPIDI_CH4U_EPOCH_CHECK_SYNC(win, mpi_errno, stmt)               \
+    do {                                                                \
+        MPID_BEGIN_ERROR_CHECKS;                                        \
+        if (MPIDI_CH4U_WIN(win, sync).origin_epoch_type == MPIDI_CH4U_WIN(win, sync).target_epoch_type && \
+           MPIDI_CH4U_WIN(win, sync).origin_epoch_type == MPIDI_CH4U_EPOTYPE_REFENCE) \
+        {                                                               \
+            MPIDI_CH4U_WIN(win, sync).origin_epoch_type = MPIDI_CH4U_EPOTYPE_FENCE; \
+            MPIDI_CH4U_WIN(win, sync).target_epoch_type = MPIDI_CH4U_EPOTYPE_FENCE; \
+        }                                                               \
+        if (MPIDI_CH4U_WIN(win, sync).origin_epoch_type == MPIDI_CH4U_EPOTYPE_NONE || \
+           MPIDI_CH4U_WIN(win, sync).origin_epoch_type == MPIDI_CH4U_EPOTYPE_POST) \
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC,            \
+                                stmt, "**rmasync");                     \
+        MPID_END_ERROR_CHECKS;                                          \
+    } while (0)
+
+/*
+ * Report MPI_ERR_RMA_SYNC ("**rmasync") unless the origin epoch type of win
+ * is NONE or REFENCE.
+ */
+#define MPIDI_CH4U_EPOCH_CHECK_TYPE(win,mpi_errno,stmt)            \
+    do {                                                                \
+        MPID_BEGIN_ERROR_CHECKS;                                        \
+        if (MPIDI_CH4U_WIN(win, sync).origin_epoch_type != MPIDI_CH4U_EPOTYPE_NONE && \
+           MPIDI_CH4U_WIN(win, sync).origin_epoch_type != MPIDI_CH4U_EPOTYPE_REFENCE) \
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC,            \
+                                stmt, "**rmasync");                     \
+        MPID_END_ERROR_CHECKS;                                          \
+    } while (0)
+
+/*
+ * When the origin epoch type of win is START, report MPI_ERR_RMA_SYNC
+ * ("**rmasync") if the target rank is not a member of sync.sc.group (as
+ * decided by MPIDI_CH4I_valid_group_rank).
+ * Note: `target_rank` is not a macro parameter; it is taken from the scope of
+ * the call site, and `win` is used unparenthesized in `win->comm_ptr`.
+ */
+#define MPIDI_CH4U_EPOCH_START_CHECK(win,mpi_errno,stmt)                \
+    do {                                                                \
+        MPID_BEGIN_ERROR_CHECKS;                                        \
+        if (MPIDI_CH4U_WIN(win, sync).origin_epoch_type == MPIDI_CH4U_EPOTYPE_START && \
+            !MPIDI_CH4I_valid_group_rank(win->comm_ptr, target_rank,    \
+                                         MPIDI_CH4U_WIN(win, sync).sc.group)) \
+            MPIR_ERR_SETANDSTMT(mpi_errno,                              \
+                                MPI_ERR_RMA_SYNC,                       \
+                                stmt,                                   \
+                                "**rmasync");                           \
+        MPID_END_ERROR_CHECKS;                                          \
+    } while (0)
+
+/*
+ * Report MPI_ERR_RMA_SYNC ("**rmasync") unless the origin epoch type of win
+ * is START.
+ */
+#define MPIDI_CH4U_EPOCH_START_CHECK2(win,mpi_errno,stmt)               \
+    do {                                                                \
+        MPID_BEGIN_ERROR_CHECKS;                                        \
+        if (MPIDI_CH4U_WIN(win, sync).origin_epoch_type != MPIDI_CH4U_EPOTYPE_START) { \
+            MPIR_ERR_SETANDSTMT(mpi_errno,                              \
+                                MPI_ERR_RMA_SYNC,                       \
+                                stmt,                                   \
+                                "**rmasync");                           \
+        }                                                               \
+        MPID_END_ERROR_CHECKS;                                          \
+    } while (0)
+
+/*
+ * Two checks, each reporting MPI_ERR_RMA_SYNC ("**rmasync"):
+ *   1. the origin and target epoch types of win differ;
+ *   2. MPI_MODE_NOPRECEDE is not set in the assert flags and the origin epoch
+ *      type is none of FENCE, REFENCE, NONE.
+ * Note: `massert` is not a macro parameter; it is taken from the scope of the
+ * call site.
+ */
+#define MPIDI_CH4U_EPOCH_FENCE_CHECK(win,mpi_errno,stmt)                \
+    do {                                                                \
+        MPID_BEGIN_ERROR_CHECKS;                                        \
+        if (MPIDI_CH4U_WIN(win, sync).origin_epoch_type != MPIDI_CH4U_WIN(win, sync).target_epoch_type) \
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC,            \
+                                stmt, "**rmasync");                     \
+        if (!(massert & MPI_MODE_NOPRECEDE) &&                          \
+            MPIDI_CH4U_WIN(win, sync).origin_epoch_type != MPIDI_CH4U_EPOTYPE_FENCE && \
+            MPIDI_CH4U_WIN(win, sync).origin_epoch_type != MPIDI_CH4U_EPOTYPE_REFENCE && \
+            MPIDI_CH4U_WIN(win, sync).origin_epoch_type != MPIDI_CH4U_EPOTYPE_NONE) \
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC,            \
+                                stmt, "**rmasync");                     \
+        MPID_END_ERROR_CHECKS;                                          \
+    } while (0)
+
+/*
+ * Report MPI_ERR_RMA_SYNC ("**rmasync") unless the target epoch type of win
+ * is NONE or REFENCE.
+ */
+#define MPIDI_CH4U_EPOCH_POST_CHECK(win,mpi_errno,stmt)            \
+    do {                                                                \
+        MPID_BEGIN_ERROR_CHECKS;                                        \
+        if (MPIDI_CH4U_WIN(win, sync).target_epoch_type != MPIDI_CH4U_EPOTYPE_NONE && \
+           MPIDI_CH4U_WIN(win, sync).target_epoch_type != MPIDI_CH4U_EPOTYPE_REFENCE) \
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC,            \
+                                stmt, "**rmasync");                     \
+        MPID_END_ERROR_CHECKS;                                          \
+    } while (0)
+
+/*
+ * Report MPI_ERR_RMA_SYNC ("**rmasync") unless the origin epoch type of win
+ * is LOCK or LOCK_ALL.
+ */
+#define MPIDI_CH4U_EPOCH_LOCK_CHECK(win,mpi_errno,stmt)               \
+do {                                                                  \
+    MPID_BEGIN_ERROR_CHECKS;                                      \
+    if ((MPIDI_CH4U_WIN(win, sync).origin_epoch_type != MPIDI_CH4U_EPOTYPE_LOCK) && \
+       (MPIDI_CH4U_WIN(win, sync).origin_epoch_type != MPIDI_CH4U_EPOTYPE_LOCK_ALL)) \
+        MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC,                \
+                            stmt, "**rmasync");                 \
+    MPID_END_ERROR_CHECKS;                                              \
+} while (0)
+
+/*
+ * Report MPI_ERR_RMA_SYNC ("**rmasync") when the origin and target epoch
+ * types of win differ, or when the origin epoch type is neither NONE nor
+ * REFENCE.
+ */
+#define MPIDI_CH4U_EPOCH_FREE_CHECK(win,mpi_errno,stmt)                 \
+    do {                                                                \
+        MPID_BEGIN_ERROR_CHECKS;                                        \
+        if (MPIDI_CH4U_WIN(win, sync).origin_epoch_type != MPIDI_CH4U_WIN(win, sync).target_epoch_type || \
+           (MPIDI_CH4U_WIN(win, sync).origin_epoch_type != MPIDI_CH4U_EPOTYPE_NONE && \
+            MPIDI_CH4U_WIN(win, sync).origin_epoch_type != MPIDI_CH4U_EPOTYPE_REFENCE)) \
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, stmt, "**rmasync"); \
+        MPID_END_ERROR_CHECKS;                                          \
+    } while (0)
+
+/*
+ * Report MPI_ERR_RMA_SYNC ("**rmasync") unless the origin epoch type of win
+ * equals epoch_type.  epoch_type is expanded unparenthesized, so pass a
+ * simple expression.
+ */
+#define MPIDI_CH4U_EPOCH_ORIGIN_CHECK(win, epoch_type, mpi_errno, stmt) \
+    do {                                                                \
+        MPID_BEGIN_ERROR_CHECKS;                                        \
+        if (MPIDI_CH4U_WIN(win, sync).origin_epoch_type != epoch_type)    \
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC,            \
+                                stmt, "**rmasync");                     \
+        MPID_END_ERROR_CHECKS;                                          \
+    } while (0)
+
+/*
+ * Report MPI_ERR_RMA_SYNC ("**rmasync") unless the target epoch type of win
+ * equals epoch_type.  epoch_type is expanded unparenthesized, so pass a
+ * simple expression.
+ */
+#define MPIDI_CH4U_EPOCH_TARGET_CHECK(win, epoch_type, mpi_errno, stmt) \
+    do {                                                                \
+        MPID_BEGIN_ERROR_CHECKS;                                        \
+        if (MPIDI_CH4U_WIN(win, sync).target_epoch_type != epoch_type)    \
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC,            \
+                                stmt, "**rmasync");         \
+        MPID_END_ERROR_CHECKS;                                          \
+    } while (0)
+
+#else /* HAVE_ERROR_CHECKING */
+#define MPIDI_CH4U_EPOCH_CHECK_SYNC(win, mpi_errno, stmt)               if (0) goto fn_fail;
+#define MPIDI_CH4U_EPOCH_CHECK_TYPE(win, mpi_errno, stmt)               if (0) goto fn_fail;
+#define MPIDI_CH4U_EPOCH_START_CHECK(win, mpi_errno, stmt)              if (0) goto fn_fail;
+#define MPIDI_CH4U_EPOCH_START_CHECK2(win, mpi_errno, stmt)             if (0) goto fn_fail;
+#define MPIDI_CH4U_EPOCH_FENCE_CHECK(win, mpi_errno, stmt)              if (0) goto fn_fail;
+#define MPIDI_CH4U_EPOCH_POST_CHECK(win, mpi_errno, stmt)               if (0) goto fn_fail;
+#define MPIDI_CH4U_EPOCH_LOCK_CHECK(win, mpi_errno, stmt)               if (0) goto fn_fail;
+#define MPIDI_CH4U_EPOCH_FREE_CHECK(win, mpi_errno, stmt)               if (0) goto fn_fail;
+#define MPIDI_CH4U_EPOCH_ORIGIN_CHECK(win, epoch_type, mpi_errno, stmt) if (0) goto fn_fail;
+#define MPIDI_CH4U_EPOCH_TARGET_CHECK(win, epoch_type, mpi_errno, stmt) if (0) goto fn_fail;
+#endif /* HAVE_ERROR_CHECKING */
+
+/*
+ * Epoch transition at a fence: if MPI_MODE_NOSUCCEED is set in massert, both
+ * the origin and target epoch types of win become NONE; otherwise both become
+ * REFENCE.  massert is expanded unparenthesized, so pass a simple expression.
+ */
+#define MPIDI_CH4U_EPOCH_FENCE_EVENT(win, massert)                 \
+    do {                                                                \
+        if (massert & MPI_MODE_NOSUCCEED)                           \
+        {                                                               \
+            MPIDI_CH4U_WIN(win, sync).origin_epoch_type = MPIDI_CH4U_EPOTYPE_NONE; \
+            MPIDI_CH4U_WIN(win, sync).target_epoch_type = MPIDI_CH4U_EPOTYPE_NONE; \
+        }                                                               \
+        else                                                            \
+        {                                                               \
+            MPIDI_CH4U_WIN(win, sync).origin_epoch_type = MPIDI_CH4U_EPOTYPE_REFENCE; \
+            MPIDI_CH4U_WIN(win, sync).target_epoch_type = MPIDI_CH4U_EPOTYPE_REFENCE; \
+        }                                                               \
+    } while (0)
+
+/*
+ * Reset the origin epoch type of win based on its target epoch type: REFENCE
+ * if the target epoch type is REFENCE, NONE otherwise.
+ */
+#define MPIDI_CH4U_EPOCH_TARGET_EVENT(win)                              \
+    do {                                                            \
+        if (MPIDI_CH4U_WIN(win, sync).target_epoch_type == MPIDI_CH4U_EPOTYPE_REFENCE) \
+            MPIDI_CH4U_WIN(win, sync).origin_epoch_type = MPIDI_CH4U_EPOTYPE_REFENCE; \
+        else                                                            \
+            MPIDI_CH4U_WIN(win, sync).origin_epoch_type = MPIDI_CH4U_EPOTYPE_NONE; \
+    } while (0)
+
+/*
+ * Reset the target epoch type of win based on its origin epoch type: REFENCE
+ * if the origin epoch type is REFENCE, NONE otherwise.
+ * The parameter is spelled `win` to match the body; it used to be `Win`, which
+ * made the macro ignore its argument and pick up a caller variable named win.
+ */
+#define MPIDI_CH4U_EPOCH_ORIGIN_EVENT(win)                              \
+    do {                                                                \
+        if (MPIDI_CH4U_WIN(win, sync).origin_epoch_type == MPIDI_CH4U_EPOTYPE_REFENCE) \
+            MPIDI_CH4U_WIN(win, sync).target_epoch_type = MPIDI_CH4U_EPOTYPE_REFENCE; \
+        else                                                            \
+            MPIDI_CH4U_WIN(win, sync).target_epoch_type = MPIDI_CH4U_EPOTYPE_NONE; \
+    } while (0)
+
+/*
+ * Base address of the target's window as seen from the origin side.
+ *
+ * Currently always zero: only the offset from the window base is sent, and
+ * the target side adds its own base (see MPIDI_CH4I_win_base_at_target).
+ */
+static inline uintptr_t MPIDI_CH4I_win_base_at_origin(const MPIR_Win * win, int target_rank)
+{
+    /* TODO: In future we may want to calculate the full virtual address
+     * in the target at the origin side. It can be done by looking at
+     * MPIDI_CH4U_WINFO(win, target_rank)->base_addr */
+    const uintptr_t origin_side_base = 0;
+
+    return origin_side_base;
+}
+
+/*
+ * Base address of the window on the target side, i.e. win->base as an integer.
+ *
+ * This must return zero if MPIDI_CH4I_win_base_at_origin is ever changed to
+ * produce the full virtual address itself.
+ */
+static inline uintptr_t MPIDI_CH4I_win_base_at_target(const MPIR_Win * win)
+{
+    const uintptr_t target_side_base = (uintptr_t) win->base;
+
+    return target_side_base;
+}
+
+#endif /* CH4_IMPL_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4_init.h b/src/mpid/ch4/src/ch4_init.h
new file mode 100644
index 0000000..c26db6b
--- /dev/null
+++ b/src/mpid/ch4/src/ch4_init.h
@@ -0,0 +1,751 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4_INIT_H_INCLUDED
+#define CH4_INIT_H_INCLUDED
+
+#include "ch4_impl.h"
+#include "ch4r_proc.h"
+#include "ch4i_comm.h"
+
+/*
+=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
+
+categories:
+    - name        : CH4
+      description : cvars that control behavior of the CH4 device
+
+cvars:
+    - name        : MPIR_CVAR_CH4_NETMOD
+      category    : CH4
+      type        : string
+      default     : ""
+      class       : device
+      verbosity   : MPI_T_VERBOSITY_USER_BASIC
+      scope       : MPI_T_SCOPE_ALL_EQ
+      description : >-
+        If non-empty, this cvar specifies which network module to use
+
+    - name        : MPIR_CVAR_CH4_SHM
+      category    : CH4
+      type        : string
+      default     : ""
+      class       : device
+      verbosity   : MPI_T_VERBOSITY_USER_BASIC
+      scope       : MPI_T_SCOPE_ALL_EQ
+      description : >-
+        If non-empty, this cvar specifies which shm module to use
+
+=== END_MPI_T_CVAR_INFO_BLOCK ===
+*/
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_choose_netmod
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Select the network module named by MPIR_CVAR_CH4_NETMOD.
+ *
+ * An empty cvar selects entry 0 of the netmod tables.  Otherwise the cvar is
+ * compared case-insensitively against MPIDI_NM_strings[]; the first match
+ * wins.  On success MPIDI_NM_func and MPIDI_NM_native_func point at the
+ * chosen entry.  When nothing matches, neither is assigned and an
+ * MPI_ERR_OTHER ("**invalid_netmod") code is returned.
+ */
+static inline int MPIDI_choose_netmod(void)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int chosen = -1;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_CHOOSE_NETMOD);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_CHOOSE_NETMOD);
+
+    MPIR_Assert(MPIR_CVAR_CH4_NETMOD != NULL);
+
+    if (MPIR_CVAR_CH4_NETMOD[0] == '\0') {
+        /* netmod not specified, using the default */
+        chosen = 0;
+    }
+    else {
+        int idx;
+
+        /* use MPL variant of strncasecmp if we get one */
+        for (idx = 0; idx < MPIDI_num_netmods; ++idx) {
+            if (strncasecmp(MPIR_CVAR_CH4_NETMOD, MPIDI_NM_strings[idx],
+                            MPIDI_MAX_NETMOD_STRING_LEN) == 0) {
+                chosen = idx;
+                break;
+            }
+        }
+    }
+
+    if (chosen < 0) {
+        MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**invalid_netmod", "**invalid_netmod %s",
+                             MPIR_CVAR_CH4_NETMOD);
+    }
+
+    MPIDI_NM_func = MPIDI_NM_funcs[chosen];
+    MPIDI_NM_native_func = MPIDI_NM_native_funcs[chosen];
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_CHOOSE_NETMOD);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_choose_shm
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Select the shared-memory module named by MPIR_CVAR_CH4_SHM.
+ *
+ * Without MPIDI_BUILD_CH4_SHM this is a no-op that returns MPI_SUCCESS.
+ * Otherwise an empty cvar selects entry 0 of the shm tables, and a non-empty
+ * one is compared case-insensitively against MPIDI_SHM_strings[].  On success
+ * MPIDI_SHM_func and MPIDI_SHM_native_func point at the chosen entry; when
+ * nothing matches an MPI_ERR_OTHER ("**ch4|invalid_shm") code is returned.
+ */
+static inline int MPIDI_choose_shm(void)
+{
+    int mpi_errno = MPI_SUCCESS;
+#ifdef MPIDI_BUILD_CH4_SHM
+    int chosen = -1;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_CHOOSE_SHM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_CHOOSE_SHM);
+
+    MPIR_Assert(MPIR_CVAR_CH4_SHM != NULL);
+
+    if (MPIR_CVAR_CH4_SHM[0] == '\0') {
+        /* shm not specified, using the default */
+        chosen = 0;
+    }
+    else {
+        int idx;
+
+        /* use MPL variant of strncasecmp if we get one */
+        for (idx = 0; idx < MPIDI_num_shms; ++idx) {
+            if (strncasecmp(MPIR_CVAR_CH4_SHM, MPIDI_SHM_strings[idx],
+                            MPIDI_MAX_SHM_STRING_LEN) == 0) {
+                chosen = idx;
+                break;
+            }
+        }
+    }
+
+    if (chosen < 0) {
+        MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**ch4|invalid_shm", "**ch4|invalid_shm %s",
+                             MPIR_CVAR_CH4_SHM);
+    }
+
+    MPIDI_SHM_func = MPIDI_SHM_funcs[chosen];
+    MPIDI_SHM_native_func = MPIDI_SHM_native_funcs[chosen];
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_CHOOSE_SHM);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+#else
+    return mpi_errno;
+#endif
+}
+
+
+/* MAX_THREAD_MODE is chosen at compile time from MPICH_THREAD_GRANULARITY.
+ * MPIDI_Init reports it in *provided when the caller requests
+ * MPI_THREAD_MULTIPLE.  An unrecognized granularity is a build error. */
+#if (MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__POBJ)
+#define MAX_THREAD_MODE MPI_THREAD_MULTIPLE
+#elif  (MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__GLOBAL)
+#define MAX_THREAD_MODE MPI_THREAD_MULTIPLE
+#elif  (MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__SINGLE)
+#define MAX_THREAD_MODE MPI_THREAD_SERIALIZED
+#elif  (MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__LOCKFREE)
+#define MAX_THREAD_MODE MPI_THREAD_SERIALIZED
+#else
+#error "Thread Granularity:  Invalid"
+#endif
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Device-level initialization for CH4.
+ *
+ * Selects the netmod and shm modules, obtains rank/size/appnum from PMI,
+ * seeds MPI_COMM_SELF and MPI_COMM_WORLD, allocates address-vector table 0,
+ * initializes the netmod (and the shm module when it is built), and commits
+ * both predefined communicators.
+ *
+ *   argc, argv - not referenced by this function
+ *   requested  - thread level requested by the caller
+ *   provided   - [out] thread level granted; MPI_THREAD_MULTIPLE requests
+ *                receive MAX_THREAD_MODE, lower levels are granted as asked
+ *   has_args   - [out] set to TRUE
+ *   has_env    - [out] set to TRUE
+ *
+ * Returns MPI_SUCCESS, or an MPI error code when module selection, PMI,
+ * allocation, or module initialization fails.
+ */
+__CH4_INLINE__ int MPIDI_Init(int *argc,
+                              char ***argv,
+                              int requested, int *provided, int *has_args, int *has_env)
+{
+    int pmi_errno, mpi_errno = MPI_SUCCESS, rank, has_parent, size, appnum, thr_err;
+    void *netmod_contexts;
+    int avtid, max_n_avts, avt_idx;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_INIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_INIT);
+
+#ifdef MPL_USE_DBG_LOGGING
+    MPIDI_CH4_DBG_GENERAL = MPL_dbg_class_alloc("CH4", "ch4");
+    MPIDI_CH4_DBG_MAP = MPL_dbg_class_alloc("CH4_MAP", "ch4_map");
+    MPIDI_CH4_DBG_MEMORY = MPL_dbg_class_alloc("CH4_MEMORY", "ch4_memory");
+#endif
+
+    /* A failed lookup leaves MPIDI_NM_func unassigned, so the result has to
+     * be checked before the netmod is used further down. */
+    mpi_errno = MPIDI_choose_netmod();
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POPFATAL(mpi_errno);
+    }
+
+    /* PMI failures are recorded in mpi_errno because that is the variable
+     * this function returns; pmi_errno only feeds the message text. */
+    pmi_errno = PMI_Init(&has_parent);
+
+    if (pmi_errno != PMI_SUCCESS) {
+        MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**pmi_init", "**pmi_init %d", pmi_errno);
+    }
+
+    pmi_errno = PMI_Get_rank(&rank);
+
+    if (pmi_errno != PMI_SUCCESS) {
+        MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**pmi_get_rank",
+                             "**pmi_get_rank %d", pmi_errno);
+    }
+
+    pmi_errno = PMI_Get_size(&size);
+
+    if (pmi_errno != PMI_SUCCESS) {
+        MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**pmi_get_size",
+                             "**pmi_get_size %d", pmi_errno);
+    }
+
+    pmi_errno = PMI_Get_appnum(&appnum);
+
+    if (pmi_errno != PMI_SUCCESS) {
+        MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**pmi_get_appnum",
+                             "**pmi_get_appnum %d", pmi_errno);
+    }
+
+    MPID_Thread_mutex_create(&MPIDI_CH4I_THREAD_PROGRESS_MUTEX, &thr_err);
+    MPID_Thread_mutex_create(&MPIDI_CH4I_THREAD_PROGRESS_HOOK_MUTEX, &thr_err);
+
+    /* ---------------------------------- */
+    /* Initialize MPI_COMM_SELF           */
+    /* ---------------------------------- */
+    MPIR_Process.comm_self->rank = 0;
+    MPIR_Process.comm_self->remote_size = 1;
+    MPIR_Process.comm_self->local_size = 1;
+
+    /* ---------------------------------- */
+    /* Initialize MPI_COMM_WORLD          */
+    /* ---------------------------------- */
+    MPIR_Process.comm_world->rank = rank;
+    MPIR_Process.comm_world->remote_size = size;
+    MPIR_Process.comm_world->local_size = size;
+
+    mpi_errno = MPIDI_choose_shm();
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POPFATAL(mpi_errno);
+    }
+
+    MPIDI_CH4_Global.allocated_max_n_avts = 0;
+    MPIDIU_avt_init();
+    MPIDIU_get_next_avtid(&avtid);
+    MPIR_Assert(avtid == 0);
+    max_n_avts = MPIDIU_get_max_n_avts();
+
+    MPIDII_av_table = (MPIDII_av_table_t **)
+        MPL_malloc(max_n_avts * sizeof(MPIDII_av_table_t *));
+    MPIR_ERR_CHKANDJUMP(MPIDII_av_table == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
+
+    /* MPIDI_Finalize tests every slot against NULL, so unused slots must not
+     * be left holding whatever the allocator returned. */
+    for (avt_idx = 0; avt_idx < max_n_avts; avt_idx++) {
+        MPIDII_av_table[avt_idx] = NULL;
+    }
+
+    MPIDII_av_table[0] = (MPIDII_av_table_t *)
+        MPL_malloc(size * sizeof(MPIDII_av_entry_t)
+                   + sizeof(MPIDII_av_table_t));
+    MPIR_ERR_CHKANDJUMP(MPIDII_av_table[0] == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
+
+    MPIDII_av_table[0]->size = size;
+    MPIR_Object_set_ref(MPIDII_av_table[0], 1);
+
+    MPIDIU_alloc_globals_for_avtid(avtid);
+
+    MPIDII_av_table0 = MPIDII_av_table[0];
+
+    /* initialize rank_map */
+    MPIDII_COMM(MPIR_Process.comm_world, map).mode = MPIDII_RANK_MAP_DIRECT_INTRA;
+    MPIDII_COMM(MPIR_Process.comm_world, map).avtid = 0;
+    MPIDII_COMM(MPIR_Process.comm_world, map).size = size;
+    MPIDII_COMM(MPIR_Process.comm_world, local_map).mode = MPIDII_RANK_MAP_NONE;
+    MPIDIU_avt_add_ref(0);
+
+    /* COMM_SELF maps its single rank onto this process's world rank */
+    MPIDII_COMM(MPIR_Process.comm_self, map).mode = MPIDII_RANK_MAP_OFFSET_INTRA;
+    MPIDII_COMM(MPIR_Process.comm_self, map).avtid = 0;
+    MPIDII_COMM(MPIR_Process.comm_self, map).size = 1;
+    MPIDII_COMM(MPIR_Process.comm_self, map).reg.offset = rank;
+    MPIDII_COMM(MPIR_Process.comm_self, local_map).mode = MPIDII_RANK_MAP_NONE;
+    MPIDIU_avt_add_ref(0);
+
+    MPIR_Process.attrs.tag_ub = (1ULL << MPIDI_CH4U_TAG_SHIFT) - 1;
+    /* discuss */
+
+    mpi_errno = MPIDI_NM_init(rank, size, appnum, &MPIR_Process.attrs.tag_ub,
+                              MPIR_Process.comm_world,
+                              MPIR_Process.comm_self, has_parent, 1, &netmod_contexts);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POPFATAL(mpi_errno);
+    }
+
+#ifdef MPIDI_BUILD_CH4_LOCALITY_INFO
+    int i;
+    for (i = 0; i < MPIR_Process.comm_world->local_size; i++) {
+        MPIDII_av_table0->table[i].is_local = 0;
+    }
+    MPIDI_CH4U_build_nodemap(MPIR_Process.comm_world->rank,
+                             MPIR_Process.comm_world,
+                             MPIR_Process.comm_world->local_size,
+                             MPIDI_CH4_Global.node_map[0], &MPIDI_CH4_Global.max_node_id);
+
+    /* a world rank is local when its node id equals this process's node id */
+    for (i = 0; i < MPIR_Process.comm_world->local_size; i++) {
+        MPIDII_av_table0->table[i].is_local =
+            (MPIDI_CH4_Global.node_map[0][i] ==
+             MPIDI_CH4_Global.node_map[0][MPIR_Process.comm_world->rank]) ? 1 : 0;
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_GENERAL, VERBOSE,
+                        (MPL_DBG_FDEST, "WORLD RANK %d %s local", i,
+                         MPIDII_av_table0->table[i].is_local ? "is" : "is not"));
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_GENERAL, VERBOSE,
+                        (MPL_DBG_FDEST, "Node id (i) (me) %d %d", MPIDI_CH4_Global.node_map[0][i],
+                         MPIDI_CH4_Global.node_map[0][MPIR_Process.comm_world->rank]));
+    }
+#endif
+
+#ifdef MPIDI_BUILD_CH4_SHM
+    mpi_errno = MPIDI_SHM_init(rank, size);
+
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POPFATAL(mpi_errno);
+    }
+#endif
+
+    MPIR_Process.attrs.appnum = appnum;
+    MPIR_Process.attrs.wtime_is_global = 1;
+    MPIR_Process.attrs.io = MPI_ANY_SOURCE;
+
+    MPIR_Comm_commit(MPIR_Process.comm_self);
+    MPIR_Comm_commit(MPIR_Process.comm_world);
+
+    /* -------------------------------- */
+    /* Return MPICH Parameters          */
+    /* -------------------------------- */
+    switch (requested) {
+    case MPI_THREAD_SINGLE:
+    case MPI_THREAD_SERIALIZED:
+    case MPI_THREAD_FUNNELED:
+        *provided = requested;
+        break;
+
+    case MPI_THREAD_MULTIPLE:
+        *provided = MAX_THREAD_MODE;
+        break;
+    }
+
+    *has_args = TRUE;
+    *has_env = TRUE;
+    /* stays 0 until MPIDI_InitCompleted flips it */
+    MPIDI_CH4_Global.is_initialized = 0;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_INIT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_InitCompleted
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Mark the device as initialized by setting MPIDI_CH4_Global.is_initialized
+ * to 1.  Always returns MPI_SUCCESS. */
+__CH4_INLINE__ int MPIDI_InitCompleted(void)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_INITCOMPLETED);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_INITCOMPLETED);
+
+    MPIDI_CH4_Global.is_initialized = 1;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_INITCOMPLETED);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Finalize
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Tear down the CH4 device.
+ *
+ * Finalizes the netmod (and the shm module when built), drops one reference
+ * on every non-NULL address-vector table, frees the table array and the node
+ * map, destroys the avt bookkeeping and the two progress mutexes.
+ *
+ * Returns the result of the last module finalize call.
+ * NOTE(review): a failing module finalize appears to take the fn_fail path,
+ * which bypasses the cleanup that follows - confirm this is intended.
+ */
+__CH4_INLINE__ int MPIDI_Finalize(void)
+{
+    int mpi_errno, thr_err;
+    int avt_idx, n_avts;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_FINALIZE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_FINALIZE);
+
+    mpi_errno = MPIDI_NM_finalize();
+    if (mpi_errno) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+#ifdef MPIDI_BUILD_CH4_SHM
+    mpi_errno = MPIDI_SHM_finalize();
+    if (mpi_errno) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+#endif
+
+    /* release every address-vector table that is still present */
+    n_avts = MPIDIU_get_max_n_avts();
+    for (avt_idx = 0; avt_idx < n_avts; avt_idx++) {
+        if (MPIDII_av_table[avt_idx] == NULL)
+            continue;
+        MPIDIU_avt_release_ref(avt_idx);
+    }
+    MPL_free(MPIDII_av_table);
+    MPL_free(MPIDI_CH4_Global.node_map);
+
+    MPIDIU_avt_destroy();
+
+    MPID_Thread_mutex_destroy(&MPIDI_CH4I_THREAD_PROGRESS_MUTEX, &thr_err);
+    MPID_Thread_mutex_destroy(&MPIDI_CH4I_THREAD_PROGRESS_HOOK_MUTEX, &thr_err);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_FINALIZE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Get_universe_size
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Query PMI for the universe size.
+ *
+ *   universe_size - [out] value reported by PMI_Get_universe_size; a negative
+ *                   value is replaced by MPIR_UNIVERSE_SIZE_NOT_AVAILABLE
+ *
+ * Returns MPI_SUCCESS, or an MPI_ERR_OTHER code when the PMI call fails.
+ */
+__CH4_INLINE__ int MPIDI_Get_universe_size(int *universe_size)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int pmi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_GET_UNIVERSE_SIZE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_GET_UNIVERSE_SIZE);
+
+    pmi_errno = PMI_Get_universe_size(universe_size);
+    if (pmi_errno != PMI_SUCCESS) {
+        MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER,
+                             "**pmi_get_universe_size", "**pmi_get_universe_size %d", pmi_errno);
+    }
+
+    if (*universe_size < 0) {
+        *universe_size = MPIR_UNIVERSE_SIZE_NOT_AVAILABLE;
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_GET_UNIVERSE_SIZE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Get_processor_name
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Copy this process's processor name into a caller buffer.
+ *
+ * The name is looked up once and cached in MPIDI_CH4_Global.pname; later
+ * calls reuse the cached value.  The lookup uses gethostname() or sysinfo()
+ * when available and otherwise falls back to the MPI_COMM_WORLD rank printed
+ * as a decimal string.
+ *
+ *   name      - [out] destination buffer
+ *   namelen   - size limit handed to MPL_strncpy for the destination
+ *   resultlen - [out, optional] length of the cached name; may be NULL
+ *
+ * Returns MPI_SUCCESS, or an MPI_ERR_OTHER ("**procnamefailed") code when no
+ * name with positive length is cached.
+ */
+__CH4_INLINE__ int MPIDI_Get_processor_name(char *name, int namelen, int *resultlen)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_GET_PROCESSOR_NAME);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_GET_PROCESSOR_NAME);
+
+    if (!MPIDI_CH4_Global.pname_set) {
+#if defined(HAVE_GETHOSTNAME)
+
+        if (gethostname(MPIDI_CH4_Global.pname, MPI_MAX_PROCESSOR_NAME) == 0) {
+            /* POSIX leaves it unspecified whether a truncated host name is
+             * null-terminated, so terminate before measuring it. */
+            MPIDI_CH4_Global.pname[MPI_MAX_PROCESSOR_NAME - 1] = '\0';
+            MPIDI_CH4_Global.pname_len = (int) strlen(MPIDI_CH4_Global.pname);
+        }
+
+#elif defined(HAVE_SYSINFO)
+
+        /* NOTE(review): Solaris sysinfo(2) is documented to return a positive
+         * byte count on success and -1 on failure - confirm the == 0 test. */
+        if (sysinfo(SI_HOSTNAME, MPIDI_CH4_Global.pname, MPI_MAX_PROCESSOR_NAME) == 0)
+            MPIDI_CH4_Global.pname_len = (int) strlen(MPIDI_CH4_Global.pname);
+
+#else
+        MPL_snprintf(MPIDI_CH4_Global.pname, MPI_MAX_PROCESSOR_NAME, "%d",
+                     MPIR_Process.comm_world->rank);
+        MPIDI_CH4_Global.pname_len = (int) strlen(MPIDI_CH4_Global.pname);
+#endif
+        /* set even when the lookup failed, so the lookup is not retried */
+        MPIDI_CH4_Global.pname_set = 1;
+    }
+
+    MPIR_ERR_CHKANDJUMP(MPIDI_CH4_Global.pname_len <= 0,
+                        mpi_errno, MPI_ERR_OTHER, "**procnamefailed");
+    MPL_strncpy(name, MPIDI_CH4_Global.pname, namelen);
+
+    if (resultlen)
+        *resultlen = MPIDI_CH4_Global.pname_len;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_GET_PROCESSOR_NAME);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Abort
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Print an abort message and ask PMI to abort the job.
+ *
+ * The message has the form
+ *   "Abort(<exit_code>)<world part><comm part>: <error_msg><error string>"
+ * where the world part is emitted when MPI_COMM_WORLD exists, the comm part
+ * when comm is non-NULL, and the error string when mpi_errno is not
+ * MPI_SUCCESS.
+ *
+ *   comm      - communicator the abort was raised on; may be NULL
+ *   mpi_errno - MPI error code to describe, or MPI_SUCCESS for none
+ *   exit_code - value passed to PMI_Abort
+ *   error_msg - caller's message; NULL is replaced by "Internal error"
+ *
+ * Returns 0 if PMI_Abort returns.
+ */
+__CH4_INLINE__ int MPIDI_Abort(MPIR_Comm * comm,
+                               int mpi_errno, int exit_code, const char *error_msg)
+{
+    char sys_str[MPI_MAX_ERROR_STRING + 5] = "";
+    char comm_str[MPI_MAX_ERROR_STRING] = "";
+    char world_str[MPI_MAX_ERROR_STRING] = "";
+    char error_str[2 * MPI_MAX_ERROR_STRING + 128];
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_ABORT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_ABORT);
+
+    if (MPIR_Process.comm_world) {
+        int rank = MPIR_Process.comm_world->rank;
+        snprintf(world_str, sizeof(world_str), " on node %d", rank);
+    }
+
+    if (comm) {
+        int rank = comm->rank;
+        int context_id = comm->context_id;
+        snprintf(comm_str, sizeof(comm_str), " (rank %d in comm %d)", rank, context_id);
+    }
+
+    if (!error_msg)
+        error_msg = "Internal error";
+
+    if (mpi_errno != MPI_SUCCESS) {
+        char msg[MPI_MAX_ERROR_STRING] = "";
+        MPIR_Err_get_string(mpi_errno, msg, MPI_MAX_ERROR_STRING, NULL);
+        /* Bound by the destination: sys_str carries extra bytes for the
+         * " (" and ")" decoration, which a sizeof(msg) limit would cut off
+         * for a maximum-length error string. */
+        snprintf(sys_str, sizeof(sys_str), " (%s)", msg);
+    }
+    MPL_snprintf(error_str, sizeof(error_str), "Abort(%d)%s%s: %s%s\n",
+                 exit_code, world_str, comm_str, error_msg, sys_str);
+    MPL_error_printf("%s", error_str);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_ABORT);
+    fflush(stderr);
+    fflush(stdout);
+    PMI_Abort(exit_code, error_msg);
+    return 0;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Alloc_mem
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Forward an allocation request to the netmod and hand back whatever
+ * MPIDI_NM_alloc_mem returns.
+ *
+ *   size     - number of bytes requested
+ *   info_ptr - info object passed through unchanged
+ */
+__CH4_INLINE__ void *MPIDI_Alloc_mem(size_t size, MPIR_Info * info_ptr)
+{
+    void *mem;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_ALLOC_MEM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_ALLOC_MEM);
+
+    mem = MPIDI_NM_alloc_mem(size, info_ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_ALLOC_MEM);
+    return mem;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Free_mem
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Forward a free request to the netmod.
+ *
+ *   ptr - memory passed to MPIDI_NM_free_mem
+ *
+ * Returns the netmod's result code.
+ */
+__CH4_INLINE__ int MPIDI_Free_mem(void *ptr)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_FREE_MEM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_FREE_MEM);
+
+    mpi_errno = MPIDI_NM_free_mem(ptr);
+    if (MPI_SUCCESS != mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_FREE_MEM);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Comm_get_lpid
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Translate a communicator rank into a combined lpid value.
+ *
+ * The local-map lookup is used only when the communicator is not an
+ * intracommunicator and is_remote is false; every other case uses the
+ * regular rank map.  The resulting (avtid, lpid) pair is packed with
+ * MPIDIU_LPID_CREATE.
+ *
+ *   comm_ptr  - communicator to look the rank up in
+ *   idx       - rank within that communicator
+ *   lpid_ptr  - [out] packed lpid
+ *   is_remote - selects the remote group for non-intracommunicators
+ *
+ * Always returns MPI_SUCCESS.
+ */
+__CH4_INLINE__ int MPIDI_Comm_get_lpid(MPIR_Comm * comm_ptr,
+                                       int idx, int *lpid_ptr, MPL_bool is_remote)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int avtid = 0, lpid = 0;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_COMM_GET_LPID);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_COMM_GET_LPID);
+
+    if (comm_ptr->comm_kind != MPIR_COMM_KIND__INTRACOMM && !is_remote) {
+        MPIDIU_comm_rank_to_pid_local(comm_ptr, idx, &lpid, &avtid);
+    }
+    else {
+        MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid);
+    }
+
+    *lpid_ptr = MPIDIU_LPID_CREATE(avtid, lpid);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_COMM_GET_LPID);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_GPID_Get
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Fill in the global process id of a rank.
+ *
+ * The netmod supplies the gpid and CH4 then stores the rank's node id in it.
+ *
+ *   comm_ptr - communicator the rank belongs to
+ *   rank     - rank to describe
+ *   gpid     - [out] global process id
+ *
+ * Returns the netmod's result code.
+ */
+__CH4_INLINE__ int MPIDI_GPID_Get(MPIR_Comm * comm_ptr, int rank, MPIR_Gpid * gpid)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_GPID_GET);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_GPID_GET);
+
+    mpi_errno = MPIDI_NM_gpid_get(comm_ptr, rank, gpid);
+    /* the node id is written before the netmod result is examined */
+    MPIDI_CH4U_get_node_id(comm_ptr, rank, &MPIDII_GPID(gpid).node);
+
+    if (MPI_SUCCESS != mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_GPID_GET);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Get_node_id
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Look up the node id of a rank by delegating to MPIDI_CH4U_get_node_id.
+ *
+ *   comm - communicator the rank belongs to
+ *   rank - rank to look up
+ *   id_p - [out] node id
+ *
+ * Always returns MPI_SUCCESS.
+ */
+__CH4_INLINE__ int MPIDI_Get_node_id(MPIR_Comm * comm, int rank, MPID_Node_id_t * id_p)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_GET_NODE_ID);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_GET_NODE_ID);
+
+    MPIDI_CH4U_get_node_id(comm, rank, id_p);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_GET_NODE_ID);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Get_max_node_id
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Look up the maximum node id by delegating to MPIDI_CH4U_get_max_node_id.
+ *
+ *   comm     - communicator passed through to the CH4U helper
+ *   max_id_p - [out] maximum node id
+ *
+ * Always returns MPI_SUCCESS.
+ */
+__CH4_INLINE__ int MPIDI_Get_max_node_id(MPIR_Comm * comm, MPID_Node_id_t * max_id_p)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_GET_MAX_NODE_ID);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_GET_MAX_NODE_ID);
+
+    MPIDI_CH4U_get_max_node_id(comm, max_id_p);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_GET_MAX_NODE_ID);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_GPID_GetAllInComm
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Collect the global process ids of a communicator's members.
+ *
+ * The netmod fills local_gpids; CH4 then sets *singleAVT itself, replacing
+ * whatever the netmod stored there: FALSE when the communicator's rank map
+ * is in MLUT mode, TRUE otherwise.
+ *
+ *   comm_ptr    - communicator to enumerate
+ *   local_size  - number of entries in local_gpids
+ *   local_gpids - [out] global process ids
+ *   singleAVT   - [out] see above
+ *
+ * Returns the netmod's result code.
+ */
+__CH4_INLINE__ int MPIDI_GPID_GetAllInComm(MPIR_Comm * comm_ptr,
+                                           int local_size, MPIR_Gpid local_gpids[], int *singleAVT)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_GETALLINCOMM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_GETALLINCOMM);
+
+    mpi_errno = MPIDI_NM_getallincomm(comm_ptr, local_size, local_gpids, singleAVT);
+
+    if (MPIDII_COMM(comm_ptr, map).mode == MPIDII_RANK_MAP_MLUT) {
+        *singleAVT = FALSE;
+    }
+    else {
+        *singleAVT = TRUE;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_GETALLINCOMM);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_GPID_ToLpidArray
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Convert an array of global process ids into lpids.
+ *
+ * The netmod performs the conversion.  Every lpid it flags as belonging to a
+ * new address-vector table then has the flag cleared and its node id copied
+ * from the gpid into MPIDI_CH4_Global.node_map.
+ *
+ *   size - number of entries in gpid[] and lpid[]
+ *   gpid - global process ids to convert
+ *   lpid - [out] converted lpids, with the new-avt mark removed
+ *
+ * Returns MPI_SUCCESS or the netmod's error code; on error the node map is
+ * left untouched.
+ */
+__CH4_INLINE__ int MPIDI_GPID_ToLpidArray(int size, MPIR_Gpid gpid[], int lpid[])
+{
+    int mpi_errno = MPI_SUCCESS, i;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_GPID_TOLPIDARRAY);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_GPID_TOLPIDARRAY);
+
+    mpi_errno = MPIDI_NM_gpid_tolpidarray(size, gpid, lpid);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+    /* update node_map */
+    for (i = 0; i < size; i++) {
+        int _avtid, _lpid;
+
+        /* only processes the netmod marked as new need bookkeeping */
+        if (!MPIDIU_LPID_IS_NEW_AVT(lpid[i]))
+            continue;
+
+        MPIDIU_LPID_CLEAR_NEW_AVT_MARK(lpid[i]);
+        _avtid = MPIDIU_LPID_GET_AVTID(lpid[i]);
+        _lpid = MPIDIU_LPID_GET_LPID(lpid[i]);
+        MPIDI_CH4_Global.node_map[_avtid][_lpid] = MPIDII_GPID(&gpid[i]).node;
+        /* new process groups are always assumed to be remote */
+#ifdef MPIDI_BUILD_CH4_LOCALITY_INFO
+        if (_avtid != 0) {
+            MPIDII_av_table[_avtid]->table[_lpid].is_local = 0;
+        }
+#endif
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_GPID_TOLPIDARRAY);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Create_intercomm_from_lpids
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Build the rank map of a new communicator from an array of lpids.
+ *
+ * The map is put in MLUT mode with avtid -1, a lookup table of `size`
+ * entries is allocated, and each entry receives the avtid and lpid decoded
+ * from the corresponding element of lpids[].
+ *
+ *   newcomm_ptr - communicator whose rank map is set up
+ *   size        - number of entries in lpids[]
+ *   lpids       - packed lpids, one per rank
+ *
+ * Returns MPI_SUCCESS, or the error from MPIDII_alloc_mlut.
+ */
+__CH4_INLINE__ int MPIDI_Create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr,
+                                                     int size, const int lpids[])
+{
+    int mpi_errno = MPI_SUCCESS;
+    int r;
+    MPIDII_rank_map_mlut_t *mlut = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_CREATE_INTERCOMM_FROM_LPIDS);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_CREATE_INTERCOMM_FROM_LPIDS);
+
+    MPIDII_COMM(newcomm_ptr, map).mode = MPIDII_RANK_MAP_MLUT;
+    MPIDII_COMM(newcomm_ptr, map).avtid = -1;
+
+    mpi_errno = MPIDII_alloc_mlut(&mlut, size);
+    if (MPI_SUCCESS != mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    MPIDII_COMM(newcomm_ptr, map).size = size;
+    MPIDII_COMM(newcomm_ptr, map).irreg.mlut.t = mlut;
+    MPIDII_COMM(newcomm_ptr, map).irreg.mlut.gpid = mlut->gpid;
+
+    /* the map's gpid pointer aliases mlut->gpid, so fill the table directly */
+    for (r = 0; r < size; r++) {
+        mlut->gpid[r].avtid = MPIDIU_LPID_GET_AVTID(lpids[r]);
+        mlut->gpid[r].lpid = MPIDIU_LPID_GET_LPID(lpids[r]);
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, " remote rank=%d, avtid=%d, lpid=%d", r,
+                         mlut->gpid[r].avtid, mlut->gpid[r].lpid));
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_CREATE_INTERCOMM_FROM_LPIDS);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Aint_add
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Add a displacement to an address: base is converted to a byte pointer,
+ * advanced by disp, and converted back to MPI_Aint. */
+__CH4_INLINE__ MPI_Aint MPIDI_Aint_add(MPI_Aint base, MPI_Aint disp)
+{
+    MPI_Aint sum;
+    char *base_ptr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_AINT_ADD);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_AINT_ADD);
+
+    base_ptr = (char *) MPIR_AINT_CAST_TO_VOID_PTR(base);
+    sum = MPIR_VOID_PTR_CAST_TO_MPI_AINT(base_ptr + disp);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_AINT_ADD);
+    return sum;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Aint_diff
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Difference of two addresses: both are converted to byte pointers and the
+ * pointer difference addr1 - addr2 is converted back to MPI_Aint. */
+__CH4_INLINE__ MPI_Aint MPIDI_Aint_diff(MPI_Aint addr1, MPI_Aint addr2)
+{
+    MPI_Aint delta;
+    char *first, *second;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_AINT_DIFF);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_AINT_DIFF);
+
+    first = (char *) MPIR_AINT_CAST_TO_VOID_PTR(addr1);
+    second = (char *) MPIR_AINT_CAST_TO_VOID_PTR(addr2);
+    delta = MPIR_PTR_DISP_CAST_TO_MPI_AINT(first - second);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_AINT_DIFF);
+    return delta;
+}
+
+#endif /* CH4_INIT_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4_probe.h b/src/mpid/ch4/src/ch4_probe.h
new file mode 100644
index 0000000..89b7bc9
--- /dev/null
+++ b/src/mpid/ch4/src/ch4_probe.h
@@ -0,0 +1,220 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4_PROBE_H_INCLUDED
+#define CH4_PROBE_H_INCLUDED
+
+#include "ch4_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Probe
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Blocking probe: loop until a matching message is reported.
+ *
+ * Each iteration issues a non-blocking probe and then drives progress once.
+ * With MPIDI_CH4_EXCLUSIVE_SHM the shm module handles local sources, the
+ * netmod handles remote ones, and MPI_ANY_SOURCE consults shm first and the
+ * netmod second.  Without it, only the netmod is used.
+ *
+ *   source         - source rank, MPI_ANY_SOURCE, or MPI_PROC_NULL
+ *   tag            - tag to match
+ *   comm           - communicator to probe
+ *   context_offset - passed through to the probe call
+ *   status         - [out] status of the matched message; for MPI_PROC_NULL
+ *                    it is set by MPIR_Status_set_procnull
+ *
+ * Returns MPI_SUCCESS or the error of the failing probe call.
+ */
+__CH4_INLINE__ int MPIDI_Probe(int source,
+                               int tag, MPIR_Comm * comm, int context_offset, MPI_Status * status)
+{
+    int mpi_errno = MPI_SUCCESS, flag = 0;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_PROBE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_PROBE);
+
+    if (unlikely(source == MPI_PROC_NULL)) {
+        MPIR_Status_set_procnull(status);
+        mpi_errno = MPI_SUCCESS;
+        goto fn_exit;
+    }
+
+    while (!flag) {
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+        mpi_errno = MPIDI_NM_iprobe(source, tag, comm, context_offset, &flag, status);
+#else
+        if (unlikely(source == MPI_ANY_SOURCE)) {
+            mpi_errno = MPIDI_SHM_iprobe(source, tag, comm, context_offset, &flag, status);
+            if (!flag)
+                mpi_errno = MPIDI_NM_iprobe(source, tag, comm, context_offset, &flag, status);
+        }
+        else if (MPIDI_CH4_rank_is_local(source, comm))
+            mpi_errno = MPIDI_SHM_iprobe(source, tag, comm, context_offset, &flag, status);
+        else
+            mpi_errno = MPIDI_NM_iprobe(source, tag, comm, context_offset, &flag, status);
+#endif
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIR_ERR_POP(mpi_errno);
+        }
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+        MPIDI_NM_progress(MPIDI_CH4_Global.netmod_context[0], 0);
+#else
+        if (unlikely(source == MPI_ANY_SOURCE)) {
+            MPIDI_SHM_progress(0);
+            MPIDI_NM_progress(MPIDI_CH4_Global.netmod_context[0], 0);
+        }
+        /* must be "else if": a wildcard source is not a rank, so it must not
+         * reach the locality lookup, and both engines were already driven */
+        else if (MPIDI_CH4_rank_is_local(source, comm))
+            MPIDI_SHM_progress(0);
+        else
+            MPIDI_NM_progress(MPIDI_CH4_Global.netmod_context[0], 0);
+#endif
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_PROBE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Mprobe
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Blocking matched probe: loop until a matching message is reported.
+ *
+ * Each iteration issues a non-blocking matched probe and then drives
+ * progress once.  With MPIDI_CH4_EXCLUSIVE_SHM the shm module handles local
+ * sources, the netmod handles remote ones, and MPI_ANY_SOURCE consults shm
+ * first and the netmod second.  Without it, only the netmod is used.
+ *
+ *   source         - source rank, MPI_ANY_SOURCE, or MPI_PROC_NULL
+ *   tag            - tag to match
+ *   comm           - communicator to probe
+ *   context_offset - passed through to the probe call
+ *   message        - [out] matched message request; NULL for MPI_PROC_NULL
+ *   status         - [out] status of the matched message
+ *
+ * Returns MPI_SUCCESS or the error of the failing probe call.
+ */
+__CH4_INLINE__ int MPIDI_Mprobe(int source,
+                                int tag,
+                                MPIR_Comm * comm,
+                                int context_offset, MPIR_Request ** message, MPI_Status * status)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int found = 0;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_MPROBE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_MPROBE);
+
+    if (source == MPI_PROC_NULL) {
+        MPIR_Status_set_procnull(status);
+        *message = NULL;        /* should be interpreted as MPI_MESSAGE_NO_PROC */
+        mpi_errno = MPI_SUCCESS;
+        goto fn_exit;
+    }
+
+    /* found starts at 0, so the body always runs at least once */
+    do {
+#ifdef MPIDI_CH4_EXCLUSIVE_SHM
+        if (unlikely(source == MPI_ANY_SOURCE)) {
+            mpi_errno =
+                MPIDI_SHM_improbe(source, tag, comm, context_offset, &found, message, status);
+            if (!found) {
+                mpi_errno =
+                    MPIDI_NM_improbe(source, tag, comm, context_offset, &found, message, status);
+            }
+        }
+        else if (MPIDI_CH4_rank_is_local(source, comm)) {
+            mpi_errno =
+                MPIDI_SHM_improbe(source, tag, comm, context_offset, &found, message, status);
+        }
+        else {
+            mpi_errno =
+                MPIDI_NM_improbe(source, tag, comm, context_offset, &found, message, status);
+        }
+#else
+        mpi_errno = MPIDI_NM_improbe(source, tag, comm, context_offset, &found, message, status);
+#endif
+        if (MPI_SUCCESS != mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+
+#ifdef MPIDI_CH4_EXCLUSIVE_SHM
+        if (unlikely(source == MPI_ANY_SOURCE)) {
+            MPIDI_SHM_progress(0);
+            MPIDI_NM_progress(MPIDI_CH4_Global.netmod_context[0], 0);
+        }
+        else if (MPIDI_CH4_rank_is_local(source, comm)) {
+            MPIDI_SHM_progress(0);
+        }
+        else {
+            MPIDI_NM_progress(MPIDI_CH4_Global.netmod_context[0], 0);
+        }
+#else
+        MPIDI_NM_progress(MPIDI_CH4_Global.netmod_context[0], 0);
+#endif
+    } while (!found);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_MPROBE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Improbe
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Non-blocking matched probe: a single attempt, no progress loop.
+ *
+ * With MPIDI_CH4_EXCLUSIVE_SHM the shm module handles local sources, the
+ * netmod handles remote ones, and MPI_ANY_SOURCE consults shm first and the
+ * netmod second.  Without it, only the netmod is used.
+ *
+ *   source         - source rank, MPI_ANY_SOURCE, or MPI_PROC_NULL
+ *   tag            - tag to match
+ *   comm           - communicator to probe
+ *   context_offset - passed through to the probe call
+ *   flag           - [out] result flag written by the probe call; set to 1
+ *                    for MPI_PROC_NULL
+ *   message        - [out] matched message request; NULL for MPI_PROC_NULL
+ *   status         - [out] status of the matched message
+ *
+ * Returns MPI_SUCCESS or the error of the failing probe call.
+ */
+__CH4_INLINE__ int MPIDI_Improbe(int source,
+                                 int tag,
+                                 MPIR_Comm * comm,
+                                 int context_offset,
+                                 int *flag, MPIR_Request ** message, MPI_Status * status)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_IMPROBE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_IMPROBE);
+
+    if (source == MPI_PROC_NULL) {
+        MPIR_Status_set_procnull(status);
+        *flag = 1;
+        *message = NULL;        /* should be interpreted as MPI_MESSAGE_NO_PROC */
+        mpi_errno = MPI_SUCCESS;
+        goto fn_exit;
+    }
+
+#ifdef MPIDI_CH4_EXCLUSIVE_SHM
+    if (unlikely(source == MPI_ANY_SOURCE)) {
+        mpi_errno = MPIDI_SHM_improbe(source, tag, comm, context_offset, flag, message, status);
+        if (*flag == 0) {
+            mpi_errno = MPIDI_NM_improbe(source, tag, comm, context_offset, flag, message, status);
+        }
+    }
+    else if (MPIDI_CH4_rank_is_local(source, comm)) {
+        mpi_errno = MPIDI_SHM_improbe(source, tag, comm, context_offset, flag, message, status);
+    }
+    else {
+        mpi_errno = MPIDI_NM_improbe(source, tag, comm, context_offset, flag, message, status);
+    }
+#else
+    mpi_errno = MPIDI_NM_improbe(source, tag, comm, context_offset, flag, message, status);
+#endif
+    if (MPI_SUCCESS != mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_IMPROBE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Iprobe
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Non-blocking probe: a single attempt, no progress loop.
+ *
+ * With MPIDI_CH4_EXCLUSIVE_SHM the shm module handles local sources, the
+ * netmod handles remote ones, and MPI_ANY_SOURCE consults shm first and the
+ * netmod second.  Without it, only the netmod is used.
+ *
+ *   source         - source rank, MPI_ANY_SOURCE, or MPI_PROC_NULL
+ *   tag            - tag to match
+ *   comm           - communicator to probe
+ *   context_offset - passed through to the probe call
+ *   flag           - [out] result flag written by the probe call; set to 1
+ *                    for MPI_PROC_NULL
+ *   status         - [out] status of the matched message
+ *
+ * Returns MPI_SUCCESS or the error of the failing probe call.
+ */
+__CH4_INLINE__ int MPIDI_Iprobe(int source,
+                                int tag,
+                                MPIR_Comm * comm,
+                                int context_offset, int *flag, MPI_Status * status)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_IPROBE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_IPROBE);
+
+    if (unlikely(source == MPI_PROC_NULL)) {
+        MPIR_Status_set_procnull(status);
+        *flag = 1;
+        mpi_errno = MPI_SUCCESS;
+        goto fn_exit;
+    }
+
+#ifdef MPIDI_CH4_EXCLUSIVE_SHM
+    if (unlikely(source == MPI_ANY_SOURCE)) {
+        mpi_errno = MPIDI_SHM_iprobe(source, tag, comm, context_offset, flag, status);
+        if (*flag == 0) {
+            mpi_errno = MPIDI_NM_iprobe(source, tag, comm, context_offset, flag, status);
+        }
+    }
+    else if (MPIDI_CH4_rank_is_local(source, comm)) {
+        mpi_errno = MPIDI_SHM_iprobe(source, tag, comm, context_offset, flag, status);
+    }
+    else {
+        mpi_errno = MPIDI_NM_iprobe(source, tag, comm, context_offset, flag, status);
+    }
+#else
+    mpi_errno = MPIDI_NM_iprobe(source, tag, comm, context_offset, flag, status);
+#endif
+    if (MPI_SUCCESS != mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_IPROBE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#endif /* CH4_PROBE_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4_proc.h b/src/mpid/ch4/src/ch4_proc.h
new file mode 100644
index 0000000..4aefcf0
--- /dev/null
+++ b/src/mpid/ch4/src/ch4_proc.h
@@ -0,0 +1,37 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4_PROC_H_INCLUDED
+#define CH4_PROC_H_INCLUDED
+
+#include "ch4_impl.h"
+/* MPIDI_CH4_rank_is_local: returns the locality answer for `rank` in `comm`. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4_rank_is_local
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_CH4_rank_is_local(int rank, MPIR_Comm * comm)
+{
+    int ret;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPIDI_STATE_CH4_RANK_IS_LOCAL);
+    MPIR_FUNC_VERBOSE_ENTER(MPIDI_STATE_CH4_RANK_IS_LOCAL);
+    /* The answer comes from the netmod, or from CH4U when exclusive shm is enabled. */
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    /* Ask the netmod for locality information. If it decided not to build it,
+     * it will call back up to the CH4U function to get the information. */
+    ret = MPIDI_NM_rank_is_local(rank, comm);
+#else
+    ret = MPIDI_CH4U_rank_is_local(rank, comm);
+#endif
+    /* Callers such as MPIDI_Iprobe take the shm path when this value is nonzero. */
+    MPIR_FUNC_VERBOSE_EXIT(MPIDI_STATE_CH4_RANK_IS_LOCAL);
+    return ret;
+}
+#endif /* CH4_PROC_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4_progress.h b/src/mpid/ch4/src/ch4_progress.h
new file mode 100644
index 0000000..37bffef
--- /dev/null
+++ b/src/mpid/ch4/src/ch4_progress.h
@@ -0,0 +1,185 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4_PROGRESS_H_INCLUDED
+#define CH4_PROGRESS_H_INCLUDED
+
+#include "ch4_impl.h"
+/* MPIDI_Progress_test: runs active progress hooks, then polls the netmod (and shm if enabled). */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Progress_test
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Progress_test(void)
+{
+    int mpi_errno, made_progress, i;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_PROGRESS_TEST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_PROGRESS_TEST);
+    /* The hook table is scanned only while the registered-hook counter is nonzero. */
+    /* NOTE(review): hooks are read under PROGRESS_MUTEX but registered under PROGRESS_HOOK_MUTEX — confirm. */
+    if (OPA_load_int(&MPIDI_CH4_Global.active_progress_hooks)) {
+        MPID_THREAD_CS_ENTER(POBJ, MPIDI_CH4I_THREAD_PROGRESS_MUTEX);
+        for (i = 0; i < MAX_PROGRESS_HOOKS; i++) {
+            if (MPIDI_CH4_Global.progress_hooks[i].active == TRUE) {
+                MPIR_Assert(MPIDI_CH4_Global.progress_hooks[i].func_ptr != NULL);
+                mpi_errno = MPIDI_CH4_Global.progress_hooks[i].func_ptr(&made_progress);
+                if (mpi_errno) {
+                    MPID_THREAD_CS_EXIT(POBJ, MPIDI_CH4I_THREAD_PROGRESS_MUTEX);        /* unlock before bailing out */
+                    MPIR_ERR_POP(mpi_errno);
+                }
+            }
+        }
+        MPID_THREAD_CS_EXIT(POBJ, MPIDI_CH4I_THREAD_PROGRESS_MUTEX);
+    }
+    /* todo: progress unexp_list */
+    mpi_errno = MPIDI_NM_progress(MPIDI_CH4_Global.netmod_context[0], 0);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+#ifdef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_SHM_progress(0);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+#endif
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_PROGRESS_TEST);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+/* MPIDI_Progress_poke: one progress pass; returns whatever MPIDI_Progress_test returns. */
+__CH4_INLINE__ int MPIDI_Progress_poke(void)
+{
+    return MPIDI_Progress_test();
+}
+/* MPIDI_Progress_start: no-op in this device; `state` is not touched. */
+__CH4_INLINE__ void MPIDI_Progress_start(MPID_Progress_state * state)
+{
+    return;
+}
+/* MPIDI_Progress_end: no-op in this device; `state` is not touched. */
+__CH4_INLINE__ void MPIDI_Progress_end(MPID_Progress_state * state)
+{
+    return;
+}
+/* MPIDI_Progress_wait: performs a single MPIDI_Progress_test pass; `state` is ignored. */
+__CH4_INLINE__ int MPIDI_Progress_wait(MPID_Progress_state * state)
+{
+    return MPIDI_Progress_test();
+}
+
+/* MPIDI_Progress_register: stores progress_fn in the first free hook slot and returns its index in *id. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Progress_register
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Progress_register(int (*progress_fn) (int *), int *id)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int i;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_PROGRESS_REGISTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_PROGRESS_REGISTER);
+    /* A slot is free when its func_ptr is NULL; a new hook starts out inactive. */
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_CH4I_THREAD_PROGRESS_HOOK_MUTEX);
+    for (i = 0; i < MAX_PROGRESS_HOOKS; i++) {
+        if (MPIDI_CH4_Global.progress_hooks[i].func_ptr == NULL) {
+            MPIDI_CH4_Global.progress_hooks[i].func_ptr = progress_fn;
+            MPIDI_CH4_Global.progress_hooks[i].active = FALSE;
+            break;
+        }
+    }
+    /* The loop ran to completion without a break: no free slot, *id is left unwritten. */
+    if (i >= MAX_PROGRESS_HOOKS)
+        goto fn_fail;
+
+    OPA_incr_int(&MPIDI_CH4_Global.active_progress_hooks);
+
+    (*id) = i;
+
+  fn_exit:
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_CH4I_THREAD_PROGRESS_HOOK_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_PROGRESS_REGISTER);
+    return mpi_errno;
+  fn_fail:
+    mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+                                     "MPIDI_Progress_register", __LINE__,
+                                     MPI_ERR_INTERN, "**progresshookstoomany", 0);
+    goto fn_exit;
+}
+/* MPIDI_Progress_deregister: clears hook slot `id` and decrements the registered-hook counter. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Progress_deregister
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Progress_deregister(int id)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_PROGRESS_DEREGISTER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_PROGRESS_DEREGISTER);
+    /* `id` is validated with assertions only; the function always returns MPI_SUCCESS. */
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_CH4I_THREAD_PROGRESS_HOOK_MUTEX);
+    MPIR_Assert(id >= 0);
+    MPIR_Assert(id < MAX_PROGRESS_HOOKS);
+    MPIR_Assert(MPIDI_CH4_Global.progress_hooks[id].func_ptr != NULL);
+    MPIDI_CH4_Global.progress_hooks[id].func_ptr = NULL;
+    MPIDI_CH4_Global.progress_hooks[id].active = FALSE;
+
+    OPA_decr_int(&MPIDI_CH4_Global.active_progress_hooks);
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_CH4I_THREAD_PROGRESS_HOOK_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_PROGRESS_DEREGISTER);
+    return mpi_errno;
+}
+/* MPIDI_Progress_activate: marks the registered, currently inactive hook `id` as active. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Progress_activate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Progress_activate(int id)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_PROGRESS_ACTIVATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_PROGRESS_ACTIVATE);
+    /* Preconditions are asserted, not reported; the function always returns MPI_SUCCESS. */
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_CH4I_THREAD_PROGRESS_HOOK_MUTEX);
+    MPIR_Assert(id >= 0);
+    MPIR_Assert(id < MAX_PROGRESS_HOOKS);
+    MPIR_Assert(MPIDI_CH4_Global.progress_hooks[id].active == FALSE);
+    MPIR_Assert(MPIDI_CH4_Global.progress_hooks[id].func_ptr != NULL);
+    MPIDI_CH4_Global.progress_hooks[id].active = TRUE;
+
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_CH4I_THREAD_PROGRESS_HOOK_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_PROGRESS_ACTIVATE);
+    return mpi_errno;
+}
+/* MPIDI_Progress_deactivate: marks the registered, currently active hook `id` as inactive. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Progress_deactivate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Progress_deactivate(int id)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_PROGRESS_DEACTIVATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_PROGRESS_DEACTIVATE);
+    /* Preconditions are asserted, not reported; the function always returns MPI_SUCCESS. */
+    MPID_THREAD_CS_ENTER(POBJ, MPIDI_CH4I_THREAD_PROGRESS_HOOK_MUTEX);
+    MPIR_Assert(id >= 0);
+    MPIR_Assert(id < MAX_PROGRESS_HOOKS);
+    MPIR_Assert(MPIDI_CH4_Global.progress_hooks[id].active == TRUE);
+    MPIR_Assert(MPIDI_CH4_Global.progress_hooks[id].func_ptr != NULL);
+    MPIDI_CH4_Global.progress_hooks[id].active = FALSE;
+
+    MPID_THREAD_CS_EXIT(POBJ, MPIDI_CH4I_THREAD_PROGRESS_HOOK_MUTEX);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_PROGRESS_DEACTIVATE);
+    return mpi_errno;
+}
+
+#endif /* CH4_PROGRESS_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4_recv.h b/src/mpid/ch4/src/ch4_recv.h
new file mode 100644
index 0000000..71af9db
--- /dev/null
+++ b/src/mpid/ch4/src/ch4_recv.h
@@ -0,0 +1,411 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4_RECV_H_INCLUDED
+#define CH4_RECV_H_INCLUDED
+
+#include "ch4_impl.h"
+/* MPIDI_Recv: posts a receive via the netmod, the shm module, or both for MPI_ANY_SOURCE. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Recv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Recv(void *buf,
+                              int count,
+                              MPI_Datatype datatype,
+                              int rank,
+                              int tag,
+                              MPIR_Comm * comm,
+                              int context_offset, MPI_Status * status, MPIR_Request ** request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_RECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_RECV);
+    /* MPI_PROC_NULL: hand back a request that is completed here, carrying the given rank and tag. */
+    if (unlikely(rank == MPI_PROC_NULL)) {
+        MPIR_Request *rreq = MPIR_Request_create(MPIR_REQUEST_KIND__RECV);
+        *request = rreq;
+        MPIR_Request_add_ref(rreq);
+        rreq->status.MPI_SOURCE = rank;
+        rreq->status.MPI_TAG = tag;
+        MPIDI_CH4U_request_complete(rreq);
+        mpi_errno = MPI_SUCCESS;
+        goto fn_exit;
+    }
+    /* Without MPIDI_CH4_EXCLUSIVE_SHM the netmod handles every receive. */
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno =
+        MPIDI_NM_recv(buf, count, datatype, rank, tag, comm, context_offset, status, request);
+#else
+    if (unlikely(rank == MPI_ANY_SOURCE)) {
+        mpi_errno = MPIDI_SHM_irecv(buf, count, datatype, rank, tag, comm, context_offset, request);
+        /* NOTE(review): this path posts irecvs and never passes `status` down — confirm who fills it. */
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIR_ERR_POP(mpi_errno);
+        }
+        /* The netmod request is stored in the partner slot of the shm request. */
+        mpi_errno = MPIDI_NM_irecv(buf, count, datatype, rank, tag, comm, context_offset,
+                                   &(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request)));
+
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIR_ERR_POP(mpi_errno);
+        }
+        /* cancel the shm request if netmod/am handles the request from unexpected queue. */
+        else if (*request) {
+            if (MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request)->status.MPI_SOURCE != MPI_UNDEFINED) {
+                mpi_errno = MPIDI_SHM_cancel_recv(*request);
+                if (MPIR_STATUS_GET_CANCEL_BIT((*request)->status)) {
+                    (*request)->status = MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request)->status;
+                }
+                goto fn_exit;
+            }
+            MPIDI_CH4I_REQUEST(*request, is_local) = 1;
+            MPIDI_CH4I_REQUEST(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request), is_local) = 0;
+        }
+        /* NOTE(review): the back-link below dereferences *request outside the NULL guard above — confirm. */
+        MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request)) =
+            *request;
+    }
+    else {
+        int r;
+        if ((r = MPIDI_CH4_rank_is_local(rank, comm)))
+            mpi_errno =
+                MPIDI_SHM_recv(buf, count, datatype, rank, tag, comm, context_offset, status,
+                               request);
+        else
+            mpi_errno =
+                MPIDI_NM_recv(buf, count, datatype, rank, tag, comm, context_offset, status,
+                              request);
+        if (mpi_errno == MPI_SUCCESS && *request) {
+            MPIDI_CH4I_REQUEST(*request, is_local) = r;
+            MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request) = NULL;
+        }
+    }
+#endif
+
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_RECV);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+/* MPIDI_Recv_init: creates a receive-init request via the netmod, the shm module, or both. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Recv_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Recv_init(void *buf,
+                                   int count,
+                                   MPI_Datatype datatype,
+                                   int rank,
+                                   int tag,
+                                   MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_RECV_INIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_RECV_INIT);
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_recv_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_RECV_INIT);
+    return mpi_errno;           /* this configuration returns directly; the labels exist only in the #else part */
+#else
+    if (unlikely(rank == MPI_ANY_SOURCE)) {
+        mpi_errno =
+            MPIDI_SHM_recv_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIR_ERR_POP(mpi_errno);
+        }
+        /* The netmod request is stored in the partner slot of the shm request. */
+        mpi_errno = MPIDI_NM_recv_init(buf, count, datatype, rank, tag, comm, context_offset,
+                                       &(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request)));
+
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIR_ERR_POP(mpi_errno);
+        }
+        /* Flag the shm request as local and its netmod partner as non-local, then link back. */
+        MPIDI_CH4I_REQUEST(*request, is_local) = 1;
+        MPIDI_CH4I_REQUEST(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request), is_local) = 0;
+
+        MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request)) =
+            *request;
+    }
+    else {
+        int r;
+        if ((r = MPIDI_CH4_rank_is_local(rank, comm)))
+            mpi_errno = MPIDI_SHM_recv_init(buf, count, datatype, rank, tag,
+                                            comm, context_offset, request);
+        else
+            mpi_errno = MPIDI_NM_recv_init(buf, count, datatype, rank, tag,
+                                           comm, context_offset, request);
+        if (mpi_errno == MPI_SUCCESS) {
+            MPIDI_CH4I_REQUEST(*request, is_local) = r;
+            MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request) = NULL;
+        }
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_RECV_INIT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+#endif
+
+}
+
+/* MPIDI_Mrecv: receives a matched message and spins on progress until the request completes. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Mrecv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Mrecv(void *buf,
+                               int count,
+                               MPI_Datatype datatype, MPIR_Request * message, MPI_Status * status)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_MRECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_MRECV);
+
+    MPI_Request req_handle;
+    int active_flag;
+    MPIR_Request *rreq = NULL;
+
+    if (message == NULL) {
+        /* treat as though MPI_MESSAGE_NO_PROC was passed */
+        MPIR_Status_set_procnull(status);
+        mpi_errno = MPI_SUCCESS;
+        goto fn_exit;
+    }
+    /* The inner rreq aliases message, so the two status assignments below are self-assignments. */
+    if (unlikely(message->status.MPI_SOURCE == MPI_PROC_NULL)) {
+        MPIR_Request *rreq = message;
+        rreq->status.MPI_SOURCE = message->status.MPI_SOURCE;
+        rreq->status.MPI_TAG = message->status.MPI_TAG;
+        MPIDI_CH4U_request_complete(rreq);
+        mpi_errno = MPI_SUCCESS;
+        goto fn_exit;
+    }
+    /* Without MPIDI_CH4_EXCLUSIVE_SHM the netmod handles every matched receive. */
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_imrecv(buf, count, datatype, message, &rreq);
+#else
+
+    if (unlikely(message->status.MPI_SOURCE == MPI_ANY_SOURCE)) {
+        mpi_errno = MPIDI_SHM_imrecv(buf, count, datatype, message, &rreq);
+
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIR_ERR_POP(mpi_errno);
+        }
+        /* The netmod request is stored in the partner slot of the shm request. */
+        mpi_errno =
+            MPIDI_NM_imrecv(buf, count, datatype, message,
+                            &(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(rreq)));
+
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIR_ERR_POP(mpi_errno);
+        }
+
+        MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(rreq)) = rreq;
+        MPIDI_CH4I_REQUEST(rreq, is_local) = 1;
+        MPIDI_CH4I_REQUEST(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(rreq), is_local) = 0;
+    }
+    else {
+        int local = MPIDI_CH4_rank_is_local(message->status.MPI_SOURCE, message->comm);
+        if (local)
+            mpi_errno = MPIDI_SHM_imrecv(buf, count, datatype, message, &rreq);
+        else
+            mpi_errno = MPIDI_NM_imrecv(buf, count, datatype, message, &rreq);
+
+        if (mpi_errno == MPI_SUCCESS) {
+            MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(rreq) = NULL;
+            MPIDI_CH4I_REQUEST(rreq, is_local) = local;
+        }
+    }
+#endif
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+    while (!MPIR_Request_is_complete(rreq)) {
+        MPIDI_NM_progress(MPIDI_CH4_Global.netmod_context[0], 0);
+#ifdef MPIDI_CH4_EXCLUSIVE_SHM
+        MPIDI_SHM_progress(0);
+#endif
+    }
+    /* NOTE(review): the wait loop above discards the progress calls' return values — confirm. */
+    /* This should probably be moved to MPICH (above device) level */
+    /* Someone neglected to put the blocking at the MPICH level    */
+    MPIR_Request_extract_status(rreq, status);
+    MPIR_Request_complete(&req_handle, rreq, status, &active_flag);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_MRECV);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+/* MPIDI_Imrecv: starts a matched receive and returns its request through *rreqp. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Imrecv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Imrecv(void *buf,
+                                int count,
+                                MPI_Datatype datatype,
+                                MPIR_Request * message, MPIR_Request ** rreqp)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_IMRECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_IMRECV);
+    /* A NULL message yields a request built by MPIDI_Request_create_null_rreq. */
+    if (message == NULL) {
+        MPIR_Request *rreq;
+        MPIDI_Request_create_null_rreq(rreq, mpi_errno, goto fn_fail);
+        *rreqp = rreq;
+        goto fn_exit;
+    }
+    /* rreq aliases message below, so the two status assignments are self-assignments. */
+    if (unlikely(message->status.MPI_SOURCE == MPI_PROC_NULL)) {
+        MPIR_Request *rreq = message;
+        rreq->status.MPI_SOURCE = message->status.MPI_SOURCE;
+        rreq->status.MPI_TAG = message->status.MPI_TAG;
+        MPIDI_CH4U_request_complete(rreq);
+        *rreqp = rreq;
+        mpi_errno = MPI_SUCCESS;
+        goto fn_exit;
+    }
+    /* NOTE(review): unlike MPIDI_Mrecv there is no MPI_ANY_SOURCE branch in the shm case — confirm. */
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_imrecv(buf, count, datatype, message, rreqp);
+#else
+    {
+        int local = MPIDI_CH4_rank_is_local(message->status.MPI_SOURCE, message->comm);
+        if (local)
+            mpi_errno = MPIDI_SHM_imrecv(buf, count, datatype, message, rreqp);
+        else
+            mpi_errno = MPIDI_NM_imrecv(buf, count, datatype, message, rreqp);
+        if (mpi_errno == MPI_SUCCESS) {
+            MPIDI_CH4I_REQUEST(*rreqp, is_local) = local;
+            MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*rreqp) = NULL;
+        }
+    }
+#endif
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_IMRECV);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+/* MPIDI_Irecv: posts a nonblocking receive via the netmod, the shm module, or both. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Irecv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Irecv(void *buf,
+                               int count,
+                               MPI_Datatype datatype,
+                               int rank,
+                               int tag,
+                               MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_IRECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_IRECV);
+    /* MPI_PROC_NULL: hand back a request that is completed here, carrying the given rank and tag. */
+    if (unlikely(rank == MPI_PROC_NULL)) {
+        MPIR_Request *rreq = MPIR_Request_create(MPIR_REQUEST_KIND__RECV);
+        *request = rreq;
+        MPIR_Request_add_ref(rreq);
+        rreq->status.MPI_SOURCE = rank;
+        rreq->status.MPI_TAG = tag;
+        MPIDI_CH4U_request_complete(rreq);
+        mpi_errno = MPI_SUCCESS;
+        goto fn_exit;
+    }
+    /* Without MPIDI_CH4_EXCLUSIVE_SHM the netmod handles every receive. */
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_irecv(buf, count, datatype, rank, tag, comm, context_offset, request);
+#else
+    if (unlikely(rank == MPI_ANY_SOURCE)) {
+        mpi_errno = MPIDI_SHM_irecv(buf, count, datatype, rank, tag, comm, context_offset, request);
+
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIR_ERR_POP(mpi_errno);
+        }
+        /* The netmod request is stored in the partner slot of the shm request. */
+        mpi_errno = MPIDI_NM_irecv(buf, count, datatype, rank, tag, comm, context_offset,
+                                   &(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request)));
+
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIR_ERR_POP(mpi_errno);
+        }
+        else if (*request) {
+            MPIDI_CH4I_REQUEST(*request, is_local) = 1;
+            MPIDI_CH4I_REQUEST(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request), is_local) = 0;
+        }
+        /* NOTE(review): the back-link below dereferences *request outside the NULL guard above — confirm. */
+        MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request)) =
+            *request;
+    }
+    else {
+        int r;
+        if ((r = MPIDI_CH4_rank_is_local(rank, comm)))
+            mpi_errno =
+                MPIDI_SHM_irecv(buf, count, datatype, rank, tag, comm, context_offset, request);
+        else
+            mpi_errno =
+                MPIDI_NM_irecv(buf, count, datatype, rank, tag, comm, context_offset, request);
+        if (mpi_errno == MPI_SUCCESS && *request) {
+            MPIDI_CH4I_REQUEST(*request, is_local) = r;
+            MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request) = NULL;
+        }
+    }
+#endif
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_IRECV);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Cancel_Recv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Cancel_recv(MPIR_Request * rreq)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_CANCEL_RECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_CANCEL_RECV);
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_cancel_recv(rreq);
+#else
+    if (MPIDI_CH4I_REQUEST(rreq, is_local))
+        mpi_errno = MPIDI_SHM_cancel_recv(rreq);
+    else
+        mpi_errno = MPIDI_NM_cancel_recv(rreq);
+#endif
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_CANCEL_RECV);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#endif /* CH4_RECV_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4_request.h b/src/mpid/ch4/src/ch4_request.h
new file mode 100644
index 0000000..ac11dac
--- /dev/null
+++ b/src/mpid/ch4/src/ch4_request.h
@@ -0,0 +1,91 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4_REQUEST_H_INCLUDED
+#define CH4_REQUEST_H_INCLUDED
+
+#include "ch4_impl.h"
+#include "ch4r_buf.h"
+/* MPIDI_Request_is_anysource: unimplemented stub — it asserts unconditionally. */
+__CH4_INLINE__ int MPIDI_Request_is_anysource(MPIR_Request * req)
+{
+    MPIR_Assert(0);
+    return MPI_SUCCESS;         /* only reached if MPIR_Assert(0) returns */
+}
+/* MPIDI_Request_is_pending_failure: unimplemented stub — it asserts unconditionally. */
+__CH4_INLINE__ int MPIDI_Request_is_pending_failure(MPIR_Request * req)
+{
+    MPIR_Assert(0);
+    return MPI_SUCCESS;         /* only reached if MPIR_Assert(0) returns */
+}
+/* MPIDI_Request_set_completed: sets the request's own completion counter (req->cc) to 0. */
+__CH4_INLINE__ void MPIDI_Request_set_completed(MPIR_Request * req)
+{
+    MPIR_cc_set(&req->cc, 0);
+    return;
+}
+/* MPIDI_Request_add_ref: device-level wrapper that forwards to MPIR_Request_add_ref. */
+__CH4_INLINE__ void MPIDI_Request_add_ref(MPIR_Request * req)
+{
+    MPIR_Request_add_ref(req);
+    return;
+}
+/* MPIDI_Request_release_ref: forwards to MPIR_Request_release_ref and does nothing further. */
+__CH4_INLINE__ void MPIDI_Request_release_ref(MPIR_Request * req)
+{
+    int inuse;                  /* filled in by the release call; not examined here */
+    MPIR_Request_release_ref(req, &inuse);
+    return;
+}
+
+/* These request functions should be called by the MPI layer only
+   since they only do base initialization of the request object.
+   A few notes:
+
+   It is each layer's responsibility to initialize a request
+   properly.
+
+   The CH4I_request functions are even more bare bones.
+   They create request objects that are not useable by the
+   lower layers until further initialization takes place.
+
+   CH4R_request_xxx functions can be used to create and destroy
+   request objects at any CH4 layer, including shmmod and netmod.
+   These functions create and initialize a base request with
+   the appropriate "above device" fields initialized, and any
+   required CH4 layer fields initialized.
+
+   The net/shm mods can upcall to CH4R to create a request, or
+   they can initialize their own requests internally, but note
+   that all the fields from the upper layers must be initialized
+   properly.
+
+   Note that the request_release function is used by the MPI
+   layer to release the ref on a request object.  It is important
+   for the netmods to release any memory pointed to by the request
+   when the internal completion counters hits zero, NOT when the
+   ref hits zero or there will be a memory leak. The generic
+   release function will not release any memory pointed to by
+   the request because it does not know about the internals of
+   the ch4r/netmod/shmmod fields of the request.
+*/
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Request_complete
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Request_complete(MPIR_Request * req)
+{                               /* Drops one completion count, frees via MPIR_Request_free; always MPI_SUCCESS. */
+    int incomplete;             /* written by MPIR_cc_decr; not examined here */
+    MPIR_cc_decr(req->cc_ptr, &incomplete);
+    MPIR_Request_free(req);     /* called unconditionally, whatever the counter value */
+    return MPI_SUCCESS;
+}
+
+#endif /* CH4_REQUEST_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4_rma.h b/src/mpid/ch4/src/ch4_rma.h
new file mode 100644
index 0000000..a85cdb8
--- /dev/null
+++ b/src/mpid/ch4/src/ch4_rma.h
@@ -0,0 +1,297 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4_RMA_H_INCLUDED
+#define CH4_RMA_H_INCLUDED
+
+#include "ch4_impl.h"
+/* MPIDI_Put: forwards the arguments unchanged to MPIDI_NM_put; there is no shm branch here. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Put
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Put(const void *origin_addr,
+                             int origin_count,
+                             MPI_Datatype origin_datatype,
+                             int target_rank,
+                             MPI_Aint target_disp,
+                             int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    int mpi_errno;              /* result of the netmod call, returned to the caller */
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_PUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_PUT);
+    mpi_errno = MPIDI_NM_put(origin_addr, origin_count, origin_datatype,
+                             target_rank, target_disp, target_count, target_datatype, win);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_PUT);
+    return mpi_errno;
+  fn_fail:                      /* presumably the jump target of MPIR_ERR_POP — macro not shown here */
+    goto fn_exit;
+}
+/* MPIDI_Get: forwards the arguments unchanged to MPIDI_NM_get; there is no shm branch here. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Get
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Get(void *origin_addr,
+                             int origin_count,
+                             MPI_Datatype origin_datatype,
+                             int target_rank,
+                             MPI_Aint target_disp,
+                             int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    int mpi_errno;              /* result of the netmod call, returned to the caller */
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_GET);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_GET);
+    mpi_errno = MPIDI_NM_get(origin_addr, origin_count, origin_datatype,
+                             target_rank, target_disp, target_count, target_datatype, win);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_GET);
+    return mpi_errno;
+  fn_fail:                      /* presumably the jump target of MPIR_ERR_POP — macro not shown here */
+    goto fn_exit;
+}
+/* MPIDI_Accumulate: forwards the arguments unchanged to MPIDI_NM_accumulate; no shm branch here. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Accumulate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Accumulate(const void *origin_addr,
+                                    int origin_count,
+                                    MPI_Datatype origin_datatype,
+                                    int target_rank,
+                                    MPI_Aint target_disp,
+                                    int target_count,
+                                    MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win)
+{
+    int mpi_errno;              /* result of the netmod call, returned to the caller */
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_ACCUMULATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_ACCUMULATE);
+    mpi_errno = MPIDI_NM_accumulate(origin_addr, origin_count, origin_datatype,
+                                    target_rank, target_disp, target_count,
+                                    target_datatype, op, win);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_ACCUMULATE);
+    return mpi_errno;
+  fn_fail:                      /* presumably the jump target of MPIR_ERR_POP — macro not shown here */
+    goto fn_exit;
+}
+/* MPIDI_Compare_and_swap: forwards the arguments unchanged to MPIDI_NM_compare_and_swap. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Compare_and_swap
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Compare_and_swap(const void *origin_addr,
+                                          const void *compare_addr,
+                                          void *result_addr,
+                                          MPI_Datatype datatype,
+                                          int target_rank, MPI_Aint target_disp, MPIR_Win * win)
+{
+    int mpi_errno;              /* result of the netmod call, returned to the caller */
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_COMPARE_AND_SWAP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_COMPARE_AND_SWAP);
+    mpi_errno = MPIDI_NM_compare_and_swap(origin_addr, compare_addr, result_addr,
+                                          datatype, target_rank, target_disp, win);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_COMPARE_AND_SWAP);
+    return mpi_errno;
+  fn_fail:                      /* presumably the jump target of MPIR_ERR_POP — macro not shown here */
+    goto fn_exit;
+}
+/* MPIDI_Raccumulate: forwards the arguments, including `request`, to MPIDI_NM_raccumulate. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Raccumulate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Raccumulate(const void *origin_addr,
+                                     int origin_count,
+                                     MPI_Datatype origin_datatype,
+                                     int target_rank,
+                                     MPI_Aint target_disp,
+                                     int target_count,
+                                     MPI_Datatype target_datatype,
+                                     MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno;              /* result of the netmod call, returned to the caller */
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_RACCUMULATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_RACCUMULATE);
+    mpi_errno = MPIDI_NM_raccumulate(origin_addr, origin_count, origin_datatype,
+                                     target_rank, target_disp, target_count,
+                                     target_datatype, op, win, request);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_RACCUMULATE);
+    return mpi_errno;
+  fn_fail:                      /* presumably the jump target of MPIR_ERR_POP — macro not shown here */
+    goto fn_exit;
+}
+/* MPIDI_Rget_accumulate: forwards the arguments, including `request`, to MPIDI_NM_rget_accumulate. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Rget_accumulate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_Rget_accumulate(const void *origin_addr,
+                                         int origin_count,
+                                         MPI_Datatype origin_datatype,
+                                         void *result_addr,
+                                         int result_count,
+                                         MPI_Datatype result_datatype,
+                                         int target_rank,
+                                         MPI_Aint target_disp,
+                                         int target_count,
+                                         MPI_Datatype target_datatype,
+                                         MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno;              /* result of the netmod call, returned to the caller */
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_RGET_ACCUMULATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_RGET_ACCUMULATE);
+    mpi_errno = MPIDI_NM_rget_accumulate(origin_addr, origin_count, origin_datatype,
+                                         result_addr, result_count, result_datatype,
+                                         target_rank, target_disp, target_count,
+                                         target_datatype, op, win, request);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_RGET_ACCUMULATE);
+    return mpi_errno;
+  fn_fail:                      /* presumably the jump target of MPIR_ERR_POP — macro not shown here */
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Fetch_and_op
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Fetch_and_op - CH4 wrapper around MPIDI_NM_fetch_and_op().
+ *
+ * All arguments are forwarded unchanged and the netmod return code is
+ * returned; a non-success code is first routed through MPIR_ERR_POP.
+ */
+__CH4_INLINE__ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
+                                      MPI_Datatype datatype, int target_rank,
+                                      MPI_Aint target_disp, MPI_Op op, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_FETCH_AND_OP);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_FETCH_AND_OP);
+
+    mpi_errno = MPIDI_NM_fetch_and_op(origin_addr, result_addr, datatype,
+                                      target_rank, target_disp, op, win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_FETCH_AND_OP);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Rget
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Rget - CH4 wrapper around MPIDI_NM_rget().
+ *
+ * All arguments, including the output request pointer, are forwarded
+ * unchanged.  The netmod return code is returned; a non-success code is
+ * first routed through MPIR_ERR_POP.
+ */
+__CH4_INLINE__ int MPIDI_Rget(void *origin_addr, int origin_count,
+                              MPI_Datatype origin_datatype, int target_rank,
+                              MPI_Aint target_disp, int target_count,
+                              MPI_Datatype target_datatype, MPIR_Win * win,
+                              MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_RGET);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_RGET);
+
+    mpi_errno = MPIDI_NM_rget(origin_addr, origin_count, origin_datatype, target_rank,
+                              target_disp, target_count, target_datatype, win, request);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_RGET);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Rput
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Rput - CH4 wrapper around MPIDI_NM_rput().
+ *
+ * All arguments, including the output request pointer, are forwarded
+ * unchanged.  The netmod return code is returned; a non-success code is
+ * first routed through MPIR_ERR_POP.
+ */
+__CH4_INLINE__ int MPIDI_Rput(const void *origin_addr, int origin_count,
+                              MPI_Datatype origin_datatype, int target_rank,
+                              MPI_Aint target_disp, int target_count,
+                              MPI_Datatype target_datatype, MPIR_Win * win,
+                              MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_RPUT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_RPUT);
+
+    mpi_errno = MPIDI_NM_rput(origin_addr, origin_count, origin_datatype, target_rank,
+                              target_disp, target_count, target_datatype, win, request);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_RPUT);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Get_accumulate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Get_accumulate - CH4 wrapper around MPIDI_NM_get_accumulate().
+ *
+ * Origin, result and target descriptions, the operation and the window are
+ * forwarded unchanged.  The netmod return code is returned; a non-success
+ * code is first routed through MPIR_ERR_POP.
+ */
+__CH4_INLINE__ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
+                                        MPI_Datatype origin_datatype, void *result_addr,
+                                        int result_count, MPI_Datatype result_datatype,
+                                        int target_rank, MPI_Aint target_disp,
+                                        int target_count, MPI_Datatype target_datatype,
+                                        MPI_Op op, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_GET_ACCUMULATE);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_GET_ACCUMULATE);
+
+    mpi_errno = MPIDI_NM_get_accumulate(origin_addr, origin_count, origin_datatype,
+                                        result_addr, result_count, result_datatype,
+                                        target_rank, target_disp, target_count,
+                                        target_datatype, op, win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_GET_ACCUMULATE);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#endif /* CH4_RMA_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4_send.h b/src/mpid/ch4/src/ch4_send.h
new file mode 100644
index 0000000..318e79b
--- /dev/null
+++ b/src/mpid/ch4/src/ch4_send.h
@@ -0,0 +1,515 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4_SEND_H_INCLUDED
+#define CH4_SEND_H_INCLUDED
+
+#include "ch4_impl.h"
+#include "ch4r_proc.h"
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Send
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Send - CH4 send entry point.
+ *
+ * A send to MPI_PROC_NULL is answered locally with a newly created send
+ * request that is completed before returning.  Any other rank is dispatched
+ * to MPIDI_NM_send() or, when MPIDI_CH4_EXCLUSIVE_SHM is defined and
+ * MPIDI_CH4_rank_is_local() is non-zero, to MPIDI_SHM_send().  In that build
+ * the locality value is stored in the request if one was returned.
+ */
+__CH4_INLINE__ int MPIDI_Send(const void *buf, int count, MPI_Datatype datatype,
+                              int rank, int tag, MPIR_Comm * comm,
+                              int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_SEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_SEND);
+
+    if (unlikely(rank == MPI_PROC_NULL)) {
+        /* Nothing to transmit: hand back a request that is already complete. */
+        MPIR_Request *done_req = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
+
+        MPIR_Request_add_ref(done_req);
+        *request = done_req;
+        MPIDI_CH4U_request_complete(done_req);
+        goto fn_exit;
+    }
+
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_send(buf, count, datatype, rank, tag, comm, context_offset, request);
+#else
+    int locality;
+
+    locality = MPIDI_CH4_rank_is_local(rank, comm);
+    if (locality)
+        mpi_errno = MPIDI_SHM_send(buf, count, datatype, rank, tag, comm, context_offset, request);
+    else
+        mpi_errno = MPIDI_NM_send(buf, count, datatype, rank, tag, comm, context_offset, request);
+
+    /* *request is tested because it may be NULL after a successful send. */
+    if (mpi_errno == MPI_SUCCESS && *request)
+        MPIDI_CH4I_REQUEST(*request, is_local) = locality;
+#endif
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_SEND);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Isend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Isend - CH4 nonblocking send entry point.
+ *
+ * A send to MPI_PROC_NULL is answered locally with a newly created send
+ * request that is completed before returning.  Any other rank is dispatched
+ * to MPIDI_NM_isend() or, when MPIDI_CH4_EXCLUSIVE_SHM is defined and
+ * MPIDI_CH4_rank_is_local() is non-zero, to MPIDI_SHM_isend().  In that build
+ * the locality value is stored in the returned request on success.
+ */
+__CH4_INLINE__ int MPIDI_Isend(const void *buf, int count, MPI_Datatype datatype,
+                               int rank, int tag, MPIR_Comm * comm,
+                               int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_ISEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_ISEND);
+
+    if (unlikely(rank == MPI_PROC_NULL)) {
+        /* Nothing to transmit: hand back a request that is already complete. */
+        MPIR_Request *done_req = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
+
+        MPIR_Request_add_ref(done_req);
+        *request = done_req;
+        MPIDI_CH4U_request_complete(done_req);
+        goto fn_exit;
+    }
+
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_isend(buf, count, datatype, rank, tag, comm, context_offset, request);
+#else
+    int locality;
+
+    locality = MPIDI_CH4_rank_is_local(rank, comm);
+    if (locality)
+        mpi_errno = MPIDI_SHM_isend(buf, count, datatype, rank, tag, comm, context_offset, request);
+    else
+        mpi_errno = MPIDI_NM_isend(buf, count, datatype, rank, tag, comm, context_offset, request);
+
+    if (mpi_errno == MPI_SUCCESS)
+        MPIDI_CH4I_REQUEST(*request, is_local) = locality;
+#endif
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_ISEND);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Rsend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Rsend - CH4 ready-send entry point.
+ *
+ * Uses the same transport calls as MPIDI_Send: MPIDI_NM_send(), or
+ * MPIDI_SHM_send() when MPIDI_CH4_EXCLUSIVE_SHM is defined and
+ * MPIDI_CH4_rank_is_local() is non-zero.  A send to MPI_PROC_NULL is answered
+ * locally with a newly created send request that is completed before
+ * returning.
+ */
+__CH4_INLINE__ int MPIDI_Rsend(const void *buf, int count, MPI_Datatype datatype,
+                               int rank, int tag, MPIR_Comm * comm,
+                               int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_RSEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_RSEND);
+
+    if (unlikely(rank == MPI_PROC_NULL)) {
+        /* Nothing to transmit: hand back a request that is already complete. */
+        MPIR_Request *done_req = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
+
+        MPIR_Request_add_ref(done_req);
+        *request = done_req;
+        MPIDI_CH4U_request_complete(done_req);
+        goto fn_exit;
+    }
+
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_send(buf, count, datatype, rank, tag, comm, context_offset, request);
+#else
+    int locality;
+
+    locality = MPIDI_CH4_rank_is_local(rank, comm);
+    if (locality)
+        mpi_errno = MPIDI_SHM_send(buf, count, datatype, rank, tag, comm, context_offset, request);
+    else
+        mpi_errno = MPIDI_NM_send(buf, count, datatype, rank, tag, comm, context_offset, request);
+
+    /* *request is tested because it may be NULL after a successful send. */
+    if (mpi_errno == MPI_SUCCESS && *request)
+        MPIDI_CH4I_REQUEST(*request, is_local) = locality;
+#endif
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_RSEND);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Irsend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Irsend - CH4 nonblocking ready-send entry point.
+ *
+ * Uses the same transport calls as MPIDI_Isend: MPIDI_NM_isend(), or
+ * MPIDI_SHM_isend() when MPIDI_CH4_EXCLUSIVE_SHM is defined and
+ * MPIDI_CH4_rank_is_local() is non-zero.  A send to MPI_PROC_NULL is answered
+ * locally with a newly created send request that is completed before
+ * returning.
+ *
+ * Returns MPI_SUCCESS or the error code produced by the transport call.
+ */
+__CH4_INLINE__ int MPIDI_Irsend(const void *buf,
+                                int count,
+                                MPI_Datatype datatype,
+                                int rank,
+                                int tag,
+                                MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_IRSEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_IRSEND);
+
+    if (unlikely(rank == MPI_PROC_NULL)) {
+        /* Nothing to transmit: hand back a request that is already complete. */
+        MPIR_Request *rreq = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
+        MPIR_Request_add_ref(rreq);
+        *request = rreq;
+        MPIDI_CH4U_request_complete(rreq);
+        mpi_errno = MPI_SUCCESS;
+        goto fn_exit;
+    }
+
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_isend(buf, count, datatype, rank, tag, comm, context_offset, request);
+#else
+    int r;
+    if ((r = MPIDI_CH4_rank_is_local(rank, comm)))
+        mpi_errno = MPIDI_SHM_isend(buf, count, datatype, rank, tag, comm, context_offset, request);
+    else
+        mpi_errno = MPIDI_NM_isend(buf, count, datatype, rank, tag, comm, context_offset, request);
+    if (mpi_errno == MPI_SUCCESS)
+        MPIDI_CH4I_REQUEST(*request, is_local) = r;
+#endif
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+  fn_exit:
+    /* The exit state must match the state used for ENTER above. */
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_IRSEND);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Ssend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Ssend - CH4 synchronous-send entry point.
+ *
+ * A send to MPI_PROC_NULL is answered locally with a newly created send
+ * request that is completed before returning.  Any other rank is dispatched
+ * to MPIDI_NM_ssend() or, when MPIDI_CH4_EXCLUSIVE_SHM is defined and
+ * MPIDI_CH4_rank_is_local() is non-zero, to MPIDI_SHM_ssend().  In that build
+ * the locality value is stored in the request if one was returned.
+ */
+__CH4_INLINE__ int MPIDI_Ssend(const void *buf, int count, MPI_Datatype datatype,
+                               int rank, int tag, MPIR_Comm * comm,
+                               int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_SSEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_SSEND);
+
+    if (unlikely(rank == MPI_PROC_NULL)) {
+        /* Nothing to transmit: hand back a request that is already complete. */
+        MPIR_Request *done_req = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
+
+        MPIR_Request_add_ref(done_req);
+        *request = done_req;
+        MPIDI_CH4U_request_complete(done_req);
+        goto fn_exit;
+    }
+
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_ssend(buf, count, datatype, rank, tag, comm, context_offset, request);
+#else
+    int locality;
+
+    locality = MPIDI_CH4_rank_is_local(rank, comm);
+    if (locality)
+        mpi_errno = MPIDI_SHM_ssend(buf, count, datatype, rank, tag, comm, context_offset, request);
+    else
+        mpi_errno = MPIDI_NM_ssend(buf, count, datatype, rank, tag, comm, context_offset, request);
+
+    /* *request is tested because it may be NULL after a successful send. */
+    if (mpi_errno == MPI_SUCCESS && *request)
+        MPIDI_CH4I_REQUEST(*request, is_local) = locality;
+#endif
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_SSEND);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Issend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Issend - CH4 nonblocking synchronous-send entry point.
+ *
+ * A send to MPI_PROC_NULL is answered locally with a newly created send
+ * request that is completed before returning.  Any other rank is dispatched
+ * to MPIDI_NM_issend() or, when MPIDI_CH4_EXCLUSIVE_SHM is defined and
+ * MPIDI_CH4_rank_is_local() is non-zero, to MPIDI_SHM_issend().  In that
+ * build the locality value is stored in the returned request on success.
+ */
+__CH4_INLINE__ int MPIDI_Issend(const void *buf, int count, MPI_Datatype datatype,
+                                int rank, int tag, MPIR_Comm * comm,
+                                int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_ISSEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_ISSEND);
+
+    if (unlikely(rank == MPI_PROC_NULL)) {
+        /* Nothing to transmit: hand back a request that is already complete. */
+        MPIR_Request *done_req = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
+
+        MPIR_Request_add_ref(done_req);
+        *request = done_req;
+        MPIDI_CH4U_request_complete(done_req);
+        goto fn_exit;
+    }
+
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_issend(buf, count, datatype, rank, tag, comm, context_offset, request);
+#else
+    int locality;
+
+    locality = MPIDI_CH4_rank_is_local(rank, comm);
+    if (locality)
+        mpi_errno = MPIDI_SHM_issend(buf, count, datatype, rank, tag, comm,
+                                     context_offset, request);
+    else
+        mpi_errno = MPIDI_NM_issend(buf, count, datatype, rank, tag, comm,
+                                    context_offset, request);
+
+    if (mpi_errno == MPI_SUCCESS)
+        MPIDI_CH4I_REQUEST(*request, is_local) = locality;
+#endif
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_ISSEND);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Startall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Startall - start an array of persistent requests.
+ *
+ * Without MPIDI_CH4_EXCLUSIVE_SHM the whole array is handed to
+ * MPIDI_NM_startall().  With it, each request is started individually:
+ *   - a request with an any-source partner is started through
+ *     MPIDI_SHM_startall() and its partner through MPIDI_NM_startall(), after
+ *     which the two real_request objects are linked as each other's partner;
+ *   - otherwise the request's is_local field selects SHM or NM.
+ *
+ * Returns MPI_SUCCESS (also for count == 0) or the error code of the first
+ * start call that fails; requests after a failing one are not started.
+ */
+__CH4_INLINE__ int MPIDI_Startall(int count, MPIR_Request * requests[])
+{
+    /* Initialised so that an empty request array yields a defined result. */
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_STARTALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_STARTALL);
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_startall(count, requests);
+#else
+    int i;
+    for (i = 0; i < count; i++) {
+        /* This is sub-optimal, can we do better? */
+        if (MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(requests[i])) {
+            mpi_errno = MPIDI_SHM_startall(1, &requests[i]);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIR_ERR_POP(mpi_errno);
+
+            mpi_errno =
+                MPIDI_NM_startall(1, &MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(requests[i]));
+            /* Only link the real requests once both starts have succeeded. */
+            if (mpi_errno != MPI_SUCCESS)
+                MPIR_ERR_POP(mpi_errno);
+
+            MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(requests[i]->u.persist.real_request) =
+                MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(requests[i])->u.persist.real_request;
+            MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER
+                                                 (requests[i])->u.persist.real_request) =
+                requests[i]->u.persist.real_request;
+        }
+        else if (MPIDI_CH4I_REQUEST(requests[i], is_local))
+            mpi_errno = MPIDI_SHM_startall(1, &requests[i]);
+        else
+            mpi_errno = MPIDI_NM_startall(1, &requests[i]);
+
+        /* Stop at the first failure instead of letting a later
+         * iteration overwrite the error code. */
+        if (mpi_errno != MPI_SUCCESS)
+            MPIR_ERR_POP(mpi_errno);
+    }
+#endif
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_STARTALL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Send_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Send_init - create a persistent send request.
+ *
+ * Dispatches to MPIDI_NM_send_init() or, when MPIDI_CH4_EXCLUSIVE_SHM is
+ * defined and MPIDI_CH4_rank_is_local() is non-zero, to
+ * MPIDI_SHM_send_init().  In that build, on success and when a request was
+ * returned, the locality value is stored in the request and its any-source
+ * partner is cleared.
+ *
+ * Returns MPI_SUCCESS or the error code produced by the transport call.
+ */
+__CH4_INLINE__ int MPIDI_Send_init(const void *buf,
+                                   int count,
+                                   MPI_Datatype datatype,
+                                   int rank,
+                                   int tag,
+                                   MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_SEND_INIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_SEND_INIT);
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_send_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+#else
+    int r;
+    if ((r = MPIDI_CH4_rank_is_local(rank, comm)))
+        mpi_errno = MPIDI_SHM_send_init(buf, count, datatype, rank, tag,
+                                        comm, context_offset, request);
+    else
+        mpi_errno = MPIDI_NM_send_init(buf, count, datatype, rank, tag,
+                                       comm, context_offset, request);
+    /* Touch the request only when the call succeeded and produced one;
+     * both fields are set under the same guard, as in the other *_init
+     * routines of this file. */
+    if (mpi_errno == MPI_SUCCESS && *request) {
+        MPIDI_CH4I_REQUEST(*request, is_local) = r;
+        MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request) = NULL;
+    }
+#endif
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_SEND_INIT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Ssend_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Ssend_init - create a persistent synchronous-send request.
+ *
+ * Dispatches to MPIDI_NM_ssend_init() or, when MPIDI_CH4_EXCLUSIVE_SHM is
+ * defined and MPIDI_CH4_rank_is_local() is non-zero, to
+ * MPIDI_SHM_ssend_init().  In that build, on success and when a request was
+ * returned, the locality value is stored in the request and its any-source
+ * partner is cleared.
+ */
+__CH4_INLINE__ int MPIDI_Ssend_init(const void *buf, int count, MPI_Datatype datatype,
+                                    int rank, int tag, MPIR_Comm * comm,
+                                    int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_SSEND_INIT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_SSEND_INIT);
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_ssend_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+#else
+    int locality;
+
+    locality = MPIDI_CH4_rank_is_local(rank, comm);
+    if (locality)
+        mpi_errno = MPIDI_SHM_ssend_init(buf, count, datatype, rank, tag, comm,
+                                         context_offset, request);
+    else
+        mpi_errno = MPIDI_NM_ssend_init(buf, count, datatype, rank, tag, comm,
+                                        context_offset, request);
+
+    if (mpi_errno == MPI_SUCCESS && *request) {
+        MPIDI_CH4I_REQUEST(*request, is_local) = locality;
+        MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request) = NULL;
+    }
+#endif
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_SSEND_INIT);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Bsend_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Bsend_init - create a persistent buffered-send request.
+ *
+ * Dispatches to MPIDI_NM_bsend_init() or, when MPIDI_CH4_EXCLUSIVE_SHM is
+ * defined and MPIDI_CH4_rank_is_local() is non-zero, to
+ * MPIDI_SHM_bsend_init().  In that build, on success and when a request was
+ * returned, the locality value is stored in the request and its any-source
+ * partner is cleared.
+ */
+__CH4_INLINE__ int MPIDI_Bsend_init(const void *buf, int count, MPI_Datatype datatype,
+                                    int rank, int tag, MPIR_Comm * comm,
+                                    int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_BSEND_INIT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_BSEND_INIT);
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_bsend_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+#else
+    int locality;
+
+    locality = MPIDI_CH4_rank_is_local(rank, comm);
+    if (locality)
+        mpi_errno = MPIDI_SHM_bsend_init(buf, count, datatype, rank, tag, comm,
+                                         context_offset, request);
+    else
+        mpi_errno = MPIDI_NM_bsend_init(buf, count, datatype, rank, tag, comm,
+                                        context_offset, request);
+
+    if (mpi_errno == MPI_SUCCESS && *request) {
+        MPIDI_CH4I_REQUEST(*request, is_local) = locality;
+        MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request) = NULL;
+    }
+#endif
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_BSEND_INIT);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Rsend_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Rsend_init - create a persistent ready-send request.
+ *
+ * Dispatches to MPIDI_NM_rsend_init() or, when MPIDI_CH4_EXCLUSIVE_SHM is
+ * defined and MPIDI_CH4_rank_is_local() is non-zero, to
+ * MPIDI_SHM_rsend_init().  In that build, on success and when a request was
+ * returned, the locality value is stored in the request and its any-source
+ * partner is cleared.
+ */
+__CH4_INLINE__ int MPIDI_Rsend_init(const void *buf, int count, MPI_Datatype datatype,
+                                    int rank, int tag, MPIR_Comm * comm,
+                                    int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_RSEND_INIT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_RSEND_INIT);
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_rsend_init(buf, count, datatype, rank, tag, comm, context_offset, request);
+#else
+    int locality;
+
+    locality = MPIDI_CH4_rank_is_local(rank, comm);
+    if (locality)
+        mpi_errno = MPIDI_SHM_rsend_init(buf, count, datatype, rank, tag, comm,
+                                         context_offset, request);
+    else
+        mpi_errno = MPIDI_NM_rsend_init(buf, count, datatype, rank, tag, comm,
+                                        context_offset, request);
+
+    if (mpi_errno == MPI_SUCCESS && *request) {
+        MPIDI_CH4I_REQUEST(*request, is_local) = locality;
+        MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(*request) = NULL;
+    }
+#endif
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_RSEND_INIT);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Cancel_send
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Cancel_send - cancel a send request.
+ *
+ * Calls MPIDI_NM_cancel_send(); when MPIDI_CH4_EXCLUSIVE_SHM is defined the
+ * request's is_local field selects MPIDI_SHM_cancel_send() instead for a
+ * non-zero value.  The return code of the selected routine is returned.
+ */
+__CH4_INLINE__ int MPIDI_Cancel_send(MPIR_Request * sreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_CANCEL_SEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_CANCEL_SEND);
+#ifndef MPIDI_CH4_EXCLUSIVE_SHM
+    mpi_errno = MPIDI_NM_cancel_send(sreq);
+#else
+    if (!MPIDI_CH4I_REQUEST(sreq, is_local))
+        mpi_errno = MPIDI_NM_cancel_send(sreq);
+    else
+        mpi_errno = MPIDI_SHM_cancel_send(sreq);
+#endif
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_CANCEL_SEND);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#endif /* CH4_SEND_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4_spawn.h b/src/mpid/ch4/src/ch4_spawn.h
new file mode 100644
index 0000000..d4968a1
--- /dev/null
+++ b/src/mpid/ch4/src/ch4_spawn.h
@@ -0,0 +1,319 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4_SPAWN_H_INCLUDED
+#define CH4_SPAWN_H_INCLUDED
+
+#include "ch4_impl.h"
+
+/*
+ * MPIDI_mpi_to_pmi_keyvals - copy the entries of an MPIR_Info object into a
+ * newly allocated PMI_keyval_t array.
+ *
+ *   info_ptr  - info object to convert; NULL or an MPI_INFO_NULL handle
+ *               yields an empty result
+ *   kv_ptr    - out: allocated array (each key and val separately allocated),
+ *               or NULL when there are no keys or on failure
+ *   nkeys_ptr - out: number of entries in *kv_ptr (0 on failure)
+ *
+ * Returns MPI_SUCCESS or an MPI error code.  On failure everything allocated
+ * here is released again, so the caller never receives a partially filled
+ * array.
+ */
+static inline int MPIDI_mpi_to_pmi_keyvals(MPIR_Info * info_ptr,
+                                           PMI_keyval_t ** kv_ptr, int *nkeys_ptr)
+{
+    char key[MPI_MAX_INFO_KEY];
+    PMI_keyval_t *kv = 0;
+    int i, nkeys = 0, vallen, flag, mpi_errno = MPI_SUCCESS;
+
+    if (!info_ptr || info_ptr->handle == MPI_INFO_NULL)
+        goto fn_exit;
+
+    MPIR_Info_get_nkeys_impl(info_ptr, &nkeys);
+
+    if (nkeys == 0)
+        goto fn_exit;
+
+    kv = (PMI_keyval_t *) MPL_malloc(nkeys * sizeof(PMI_keyval_t));
+    MPIR_ERR_CHKANDJUMP(!kv, mpi_errno, MPI_ERR_OTHER, "**nomem");
+
+    /* Clear every slot up front so the failure path can tell which entries
+     * own memory. */
+    for (i = 0; i < nkeys; i++) {
+        kv[i].key = NULL;
+        kv[i].val = NULL;
+    }
+
+    for (i = 0; i < nkeys; i++) {
+        mpi_errno = MPIR_Info_get_nthkey_impl(info_ptr, i, key);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+        MPIR_Info_get_valuelen_impl(info_ptr, key, &vallen, &flag);
+        kv[i].key = (const char *) MPL_strdup(key);
+        MPIR_ERR_CHKANDJUMP(!kv[i].key, mpi_errno, MPI_ERR_OTHER, "**nomem");
+        kv[i].val = (char *) MPL_malloc(vallen + 1);
+        MPIR_ERR_CHKANDJUMP(!kv[i].val, mpi_errno, MPI_ERR_OTHER, "**nomem");
+        MPIR_Info_get_impl(info_ptr, key, vallen + 1, kv[i].val, &flag);
+    }
+
+  fn_exit:
+    *kv_ptr = kv;
+    *nkeys_ptr = nkeys;
+    return mpi_errno;
+  fn_fail:
+    /* Undo partial work and report an empty vector to the caller. */
+    if (kv) {
+        for (i = 0; i < nkeys; i++) {
+            if (kv[i].key != NULL)
+                MPL_free((char *) kv[i].key);
+            if (kv[i].val != NULL)
+                MPL_free(kv[i].val);
+        }
+        MPL_free(kv);
+        kv = 0;
+    }
+    nkeys = 0;
+    goto fn_exit;
+}
+
+/*
+ * MPIDI_free_pmi_keyvals - release an array of PMI key/value vectors.
+ *
+ *   kv     - array of 'size' vectors; a NULL vector is skipped
+ *   size   - number of vectors in kv
+ *   counts - counts[i] is the number of entries in kv[i]
+ *
+ * Frees every non-NULL key and val, then each vector.  The kv array itself
+ * is not freed here.
+ */
+static inline void MPIDI_free_pmi_keyvals(PMI_keyval_t ** kv, int size, int *counts)
+{
+    int i, j;
+
+    for (i = 0; i < size; i++) {
+        /* Test the vector before indexing into it. */
+        if (kv[i] == NULL)
+            continue;
+
+        for (j = 0; j < counts[i]; j++) {
+            if (kv[i][j].key != NULL)
+                MPL_free((char *) kv[i][j].key);
+
+            if (kv[i][j].val != NULL)
+                MPL_free(kv[i][j].val);
+        }
+
+        MPL_free(kv[i]);
+    }
+
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Comm_spawn_multiple
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Comm_spawn_multiple - spawn processes through PMI and connect to them.
+ *
+ * On the root rank: opens a port, converts each MPIR_Info object into a PMI
+ * key/value vector and calls PMI_Spawn_multiple() with the port name preput
+ * under MPIDI_PARENT_PORT_KVSKEY.  When errcodes is not MPI_ERRCODES_IGNORE,
+ * the per-process PMI error codes are copied into errcodes and broadcast from
+ * the root together with the accept decision, the PMI return code and the
+ * process count.  All ranks then call MPIDI_Comm_accept() unless every spawn
+ * reported a non-zero code, and the root closes the port.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.  All temporary arrays are freed
+ * on every path, and a port opened here is closed on the failure path too.
+ */
+__CH4_INLINE__ int MPIDI_Comm_spawn_multiple(int count,
+                                             char *commands[],
+                                             char **argvs[],
+                                             const int maxprocs[],
+                                             MPIR_Info * info_ptrs[],
+                                             int root,
+                                             MPIR_Comm * comm_ptr,
+                                             MPIR_Comm ** intercomm, int errcodes[])
+{
+    char port_name[MPI_MAX_PORT_NAME];
+    int *info_keyval_sizes = 0, i, mpi_errno = MPI_SUCCESS;
+    PMI_keyval_t **info_keyval_vectors = 0, preput_keyval_vector;
+    int *pmi_errcodes = 0, pmi_errno = 0;
+    int total_num_processes = 0, should_accept = 1;
+    int port_is_open = 0;       /* set while this call owns an open port */
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_COMM_SPAWN_MULTIPLE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_COMM_SPAWN_MULTIPLE);
+
+    memset(port_name, 0, sizeof(port_name));
+
+    if (comm_ptr->rank == root) {
+        for (i = 0; i < count; i++)
+            total_num_processes += maxprocs[i];
+
+        pmi_errcodes = (int *) MPL_malloc(sizeof(int) * total_num_processes);
+        MPIR_ERR_CHKANDJUMP(!pmi_errcodes, mpi_errno, MPI_ERR_OTHER, "**nomem");
+
+        for (i = 0; i < total_num_processes; i++)
+            pmi_errcodes[i] = 0;
+
+        mpi_errno = MPIDI_Open_port(NULL, port_name);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+        port_is_open = 1;
+
+        info_keyval_sizes = (int *) MPL_malloc(count * sizeof(int));
+        MPIR_ERR_CHKANDJUMP(!info_keyval_sizes, mpi_errno, MPI_ERR_OTHER, "**nomem");
+        info_keyval_vectors = (PMI_keyval_t **) MPL_malloc(count * sizeof(PMI_keyval_t *));
+        MPIR_ERR_CHKANDJUMP(!info_keyval_vectors, mpi_errno, MPI_ERR_OTHER, "**nomem");
+
+        /* Clear every slot before converting so that the cleanup in fn_exit
+         * never walks uninitialised entries if a conversion fails midway. */
+        for (i = 0; i < count; i++) {
+            info_keyval_vectors[i] = 0;
+            info_keyval_sizes[i] = 0;
+        }
+
+        if (info_ptrs)
+            for (i = 0; i < count; i++) {
+                mpi_errno = MPIDI_mpi_to_pmi_keyvals(info_ptrs[i],
+                                                     &info_keyval_vectors[i],
+                                                     &info_keyval_sizes[i]);
+                if (mpi_errno)
+                    MPIR_ERR_POP(mpi_errno);
+            }
+
+        preput_keyval_vector.key = MPIDI_PARENT_PORT_KVSKEY;
+        preput_keyval_vector.val = port_name;
+        pmi_errno = PMI_Spawn_multiple(count, (const char **)
+                                       commands,
+                                       (const char ** *) argvs,
+                                       maxprocs, info_keyval_sizes, (const PMI_keyval_t **)
+                                       info_keyval_vectors, 1, &preput_keyval_vector, pmi_errcodes);
+
+        if (pmi_errno != PMI_SUCCESS)
+            MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER,
+                                 "**pmi_spawn_multiple", "**pmi_spawn_multiple %d", pmi_errno);
+
+        if (errcodes != MPI_ERRCODES_IGNORE) {
+            /* Accept unless every process reported a non-zero code:
+             * should_accept ends up as NOT(AND of all codes). */
+            for (i = 0; i < total_num_processes; i++) {
+                errcodes[i] = pmi_errcodes[i];  /* one code per spawned process */
+                should_accept = should_accept && errcodes[i];
+            }
+
+            should_accept = !should_accept;
+        }
+    }
+
+    if (errcodes != MPI_ERRCODES_IGNORE) {
+        MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+        mpi_errno = MPIR_Bcast_impl(&should_accept, 1, MPI_INT, root, comm_ptr, &errflag);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+
+        mpi_errno = MPIR_Bcast_impl(&pmi_errno, 1, MPI_INT, root, comm_ptr, &errflag);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+
+        mpi_errno = MPIR_Bcast_impl(&total_num_processes, 1, MPI_INT, root, comm_ptr, &errflag);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+
+        mpi_errno = MPIR_Bcast_impl(errcodes, total_num_processes, MPI_INT,
+                                    root, comm_ptr, &errflag);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+
+    if (should_accept) {
+        mpi_errno = MPIDI_Comm_accept(port_name, NULL, root, comm_ptr, intercomm);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+    else {
+        /* should_accept is only ever cleared when errcodes is a real array,
+         * so errcodes[0] is valid here. */
+        if ((pmi_errno == PMI_SUCCESS) && (errcodes[0] != 0)) {
+            mpi_errno = MPIR_Comm_create(intercomm);
+            if (mpi_errno)
+                MPIR_ERR_POP(mpi_errno);
+        }
+    }
+
+    if (comm_ptr->rank == root) {
+        /* Cleared first so a failing close is not retried in fn_fail. */
+        port_is_open = 0;
+        mpi_errno = MPIDI_Close_port(port_name);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+
+    if (info_keyval_vectors) {
+        MPIDI_free_pmi_keyvals(info_keyval_vectors, count, info_keyval_sizes);
+        MPL_free(info_keyval_vectors);
+    }
+
+    if (info_keyval_sizes)
+        MPL_free(info_keyval_sizes);
+
+    if (pmi_errcodes)
+        MPL_free(pmi_errcodes);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_COMM_SPAWN_MULTIPLE);
+    return mpi_errno;
+  fn_fail:
+    /* Best-effort release of the port; the original error code is kept. */
+    if (port_is_open) {
+        port_is_open = 0;
+        MPIDI_Close_port(port_name);
+    }
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Comm_connect
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Comm_connect - CH4 wrapper around MPIDI_NM_comm_connect().
+ *
+ * All arguments are forwarded unchanged and the netmod return code is
+ * returned; a non-success code is first routed through MPIR_ERR_POP.
+ */
+__CH4_INLINE__ int MPIDI_Comm_connect(const char *port_name, MPIR_Info * info, int root,
+                                      MPIR_Comm * comm, MPIR_Comm ** newcomm_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_COMM_CONNECT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_COMM_CONNECT);
+
+    mpi_errno = MPIDI_NM_comm_connect(port_name, info, root, comm, newcomm_ptr);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_COMM_CONNECT);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Comm_disconnect
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Comm_disconnect - CH4 wrapper around MPIDI_NM_comm_disconnect().
+ *
+ * The communicator is forwarded unchanged and the netmod return code is
+ * returned; a non-success code is first routed through MPIR_ERR_POP.
+ */
+__CH4_INLINE__ int MPIDI_Comm_disconnect(MPIR_Comm * comm_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_COMM_DISCONNECT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_COMM_DISCONNECT);
+
+    mpi_errno = MPIDI_NM_comm_disconnect(comm_ptr);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_COMM_DISCONNECT);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Open_port
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Open_port - CH4 wrapper around MPIDI_NM_open_port().
+ *
+ * Both arguments are forwarded unchanged and the netmod return code is
+ * returned; a non-success code is first routed through MPIR_ERR_POP.
+ */
+__CH4_INLINE__ int MPIDI_Open_port(MPIR_Info * info_ptr, char *port_name)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_OPEN_PORT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_OPEN_PORT);
+
+    mpi_errno = MPIDI_NM_open_port(info_ptr, port_name);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_OPEN_PORT);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Close_port
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Close_port - CH4 wrapper around MPIDI_NM_close_port().
+ *
+ * The port name is forwarded unchanged and the netmod return code is
+ * returned; a non-success code is first routed through MPIR_ERR_POP.
+ */
+__CH4_INLINE__ int MPIDI_Close_port(const char *port_name)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_CLOSE_PORT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_CLOSE_PORT);
+
+    mpi_errno = MPIDI_NM_close_port(port_name);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_CLOSE_PORT);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Comm_accept
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_Comm_accept - CH4 wrapper around MPIDI_NM_comm_accept().
+ *
+ * All arguments are forwarded unchanged and the netmod return code is
+ * returned; a non-success code is first routed through MPIR_ERR_POP.
+ */
+__CH4_INLINE__ int MPIDI_Comm_accept(const char *port_name, MPIR_Info * info, int root,
+                                     MPIR_Comm * comm, MPIR_Comm ** newcomm_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_COMM_ACCEPT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_COMM_ACCEPT);
+
+    mpi_errno = MPIDI_NM_comm_accept(port_name, info, root, comm, newcomm_ptr);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_COMM_ACCEPT);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#endif /* CH4_SPAWN_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4_types.h b/src/mpid/ch4/src/ch4_types.h
new file mode 100644
index 0000000..061122e
--- /dev/null
+++ b/src/mpid/ch4/src/ch4_types.h
@@ -0,0 +1,285 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4_TYPES_H_INCLUDED
+#define CH4_TYPES_H_INCLUDED
+
+#include <mpidimpl.h>
+#include <stdio.h>
+#include "mpir_cvars.h"
+#include "pmi.h"
+
+/* Macros and inlines */
+/* match/ignore bit manipulation
+ *
+ * 0123 4567 01234567 0123 4567 01234567 0123 4567 01234567 01234567 01234567
+ *     |                  |                  |
+ * ^   |    context id    |       source     |       message tag
+ * |   |                  |                  |
+ * +---- protocol
+ */
+#define MPIDI_CH4U_PROTOCOL_MASK (0x9000000000000000ULL)        /* bits 63,60 -- NOTE(review): excludes bit 62 (DYNPROC_SEND); confirm */
+#define MPIDI_CH4U_CONTEXT_MASK  (0x0FFFF00000000000ULL)        /* bits 44..59 (16 bits) */
+#define MPIDI_CH4U_SOURCE_MASK   (0x00000FFFF0000000ULL)        /* bits 28..43 (16 bits) */
+#define MPIDI_CH4U_TAG_MASK      (0x000000000FFFFFFFULL)        /* bits 0..27 (28 bits) */
+#define MPIDI_CH4U_DYNPROC_SEND  (0x4000000000000000ULL)        /* bit 62 */
+#define MPIDI_CH4U_TAG_SHIFT     (28)   /* equals the width of the tag field */
+#define MPIDI_CH4U_SOURCE_SHIFT  (16)   /* equals the width of the source field */
+#define MPIDI_CH4U_SOURCE_SHIFT_UNPACK (sizeof(int)*8 - MPIDI_CH4U_SOURCE_SHIFT)        /* size_t-typed; hard-codes 8 bits per byte */
+#define MPIDI_CH4U_TAG_SHIFT_UNPACK (sizeof(int)*8 - MPIDI_CH4U_TAG_SHIFT)      /* size_t-typed; hard-codes 8 bits per byte */
+
+#define MPIDI_CH4I_MAP_NOT_FOUND      ((void*)(-1UL))   /* sentinel: all-ones unsigned long converted to a pointer */
+
+#define MAX_NETMOD_CONTEXTS 8   /* NOTE(review): presumably bounds MPIDI_CH4_Global_t.netmod_context - confirm */
+#define MAX_PROGRESS_HOOKS 4    /* length of MPIDI_CH4_Global_t.progress_hooks[] */
+
+#define MPIDI_CH4I_BUF_POOL_NUM (1024)  /* NOTE(review): presumably buffers per pool - confirm */
+#define MPIDI_CH4I_BUF_POOL_SZ (256)    /* NOTE(review): presumably bytes per buffer - confirm */
+
+typedef int (*progress_func_ptr_t) (int *made_progress);       /* hook signature: returns int, reports through *made_progress */
+typedef struct progress_hook_slot {     /* one entry of MPIDI_CH4_Global_t.progress_hooks[] */
+    progress_func_ptr_t func_ptr;       /* the registered hook */
+    int active;                 /* NOTE(review): presumably nonzero while the hook should be called - confirm */
+} progress_hook_slot_t;
+
+typedef enum {                  /* enumerators are numbered consecutively from 0 */
+    MPIDI_CH4U_SEND = 0,        /* Eager send */
+
+    MPIDI_CH4U_SEND_LONG_REQ,   /* Rendezvous send RTS (request to send) */
+    MPIDI_CH4U_SEND_LONG_ACK,   /* Rendezvous send CTS (clear to send) */
+    MPIDI_CH4U_SEND_LONG_LMT,   /* Rendezvous send LMT */
+
+    MPIDI_CH4U_SSEND_REQ,       /* NOTE(review): presumably pairs with MPIDI_CH4U_ssend_req_msg_t - confirm */
+    MPIDI_CH4U_SSEND_ACK,       /* NOTE(review): presumably pairs with MPIDI_CH4U_ssend_ack_msg_t - confirm */
+
+    MPIDI_CH4U_WIN_CTRL,        /* NOTE(review): presumably sub-typed by MPIDI_CH4U_WIN_CTRL_MSG_TYPE - confirm */
+
+    MPIDI_CH4U_PUT_REQ,
+    MPIDI_CH4U_PUT_ACK,
+    MPIDI_CH4U_PUT_IOV_REQ,
+    MPIDI_CH4U_PUT_DAT_REQ,
+    MPIDI_CH4U_PUT_IOV_ACK,
+
+    MPIDI_CH4U_GET_REQ,
+    MPIDI_CH4U_GET_ACK,
+
+    MPIDI_CH4U_ACC_REQ,
+    MPIDI_CH4U_ACC_ACK,
+    MPIDI_CH4U_ACC_IOV_REQ,
+    MPIDI_CH4U_ACC_DAT_REQ,
+    MPIDI_CH4U_ACC_IOV_ACK,
+    MPIDI_CH4U_GET_ACC_ACK,
+
+    MPIDI_CH4U_CSWAP_REQ,
+    MPIDI_CH4U_CSWAP_ACK,
+    MPIDI_CH4U_FETCH_OP         /* last enumerator; no count sentinel is defined */
+} MPIDI_CH4U_TYPE;
+
+typedef enum {                  /* NOTE(review): presumably the values stored in MPIDI_CH4U_win_cntrl_msg_t.type - confirm */
+    MPIDI_CH4U_WIN_COMPLETE,    /* first enumerator, value 0 */
+    MPIDI_CH4U_WIN_POST,
+    MPIDI_CH4U_WIN_LOCK,
+    MPIDI_CH4U_WIN_LOCK_ACK,
+    MPIDI_CH4U_WIN_UNLOCK,
+    MPIDI_CH4U_WIN_UNLOCK_ACK,
+    MPIDI_CH4U_WIN_LOCKALL,
+    MPIDI_CH4U_WIN_LOCKALL_ACK,
+    MPIDI_CH4U_WIN_UNLOCKALL,
+    MPIDI_CH4U_WIN_UNLOCKALL_ACK
+} MPIDI_CH4U_WIN_CTRL_MSG_TYPE;
+
+enum {
+    MPIDI_CH4U_EPOTYPE_NONE = 0,          /**< No epoch in effect */
+    MPIDI_CH4U_EPOTYPE_LOCK = 1,          /**< MPI_Win_lock access epoch */
+    MPIDI_CH4U_EPOTYPE_START = 2,         /**< MPI_Win_start access epoch */
+    MPIDI_CH4U_EPOTYPE_POST = 3,          /**< MPI_Win_post exposure epoch */
+    MPIDI_CH4U_EPOTYPE_FENCE = 4,         /**< MPI_Win_fence access/exposure epoch */
+    MPIDI_CH4U_EPOTYPE_REFENCE = 5,       /**< MPI_Win_fence possible access/exposure epoch */
+    MPIDI_CH4U_EPOTYPE_LOCK_ALL = 6       /**< MPI_Win_lock_all access epoch */
+};
+
+/* Enum for calling types between netmod and shm */
+enum {
+    MPIDI_CH4R_NETMOD = 0,      /* NOTE(review): presumably selects the network-module path - confirm */
+    MPIDI_CH4R_SHM = 1          /* NOTE(review): presumably selects the shared-memory path - confirm */
+};
+
+typedef struct MPIDI_CH4U_hdr_t {       /* embedded as 'hdr' in the send_long_req and ssend_req messages */
+    uint64_t msg_tag;           /* NOTE(review): presumably the packed match bits (MPIDI_CH4U_*_MASK layout) - confirm */
+    int src_rank;               /* NOTE(review): presumably the sender's rank - confirm */
+} MPIDI_CH4U_hdr_t;
+
+typedef struct MPIDI_CH4U_send_long_req_msg_t { /* NOTE(review): presumably sent with MPIDI_CH4U_SEND_LONG_REQ - confirm */
+    MPIDI_CH4U_hdr_t hdr;       /* common tag/rank header */
+    size_t data_sz;             /* Message size in bytes */
+    uint64_t sreq_ptr;          /* Pointer value of the request object at the sender side */
+} MPIDI_CH4U_send_long_req_msg_t;
+
+typedef struct MPIDI_CH4U_send_long_ack_msg_t { /* NOTE(review): presumably sent with MPIDI_CH4U_SEND_LONG_ACK - confirm */
+    uint64_t sreq_ptr;          /* NOTE(review): presumably the sender-side request pointer echoed back - confirm */
+    uint64_t rreq_ptr;          /* NOTE(review): presumably the receiver-side request pointer - confirm */
+} MPIDI_CH4U_send_long_ack_msg_t;
+
+typedef struct MPIDI_CH4U_send_long_lmt_msg_t { /* NOTE(review): presumably sent with MPIDI_CH4U_SEND_LONG_LMT - confirm */
+    uint64_t rreq_ptr;          /* NOTE(review): presumably the receiver-side request pointer - confirm */
+} MPIDI_CH4U_send_long_lmt_msg_t;
+
+typedef struct MPIDI_CH4U_ssend_req_msg_t {     /* NOTE(review): presumably sent with MPIDI_CH4U_SSEND_REQ - confirm */
+    MPIDI_CH4U_hdr_t hdr;       /* common tag/rank header */
+    uint64_t sreq_ptr;          /* NOTE(review): presumably the sender-side request pointer - confirm */
+} MPIDI_CH4U_ssend_req_msg_t;
+
+typedef struct MPIDI_CH4U_ssend_ack_msg_t {     /* NOTE(review): presumably sent with MPIDI_CH4U_SSEND_ACK - confirm */
+    uint64_t sreq_ptr;          /* NOTE(review): presumably the sender-side request pointer echoed back - confirm */
+} MPIDI_CH4U_ssend_ack_msg_t;
+
+typedef struct MPIDI_CH4U_win_cntrl_msg_t {     /* NOTE(review): presumably sent with MPIDI_CH4U_WIN_CTRL - confirm */
+    uint64_t win_id;            /* NOTE(review): presumably identifies the target window - confirm */
+    uint32_t origin_rank;       /* unsigned here, unlike the int src_rank used by the other messages */
+    int16_t lock_type;          /* NOTE(review): presumably the MPI lock type for LOCK messages - confirm */
+    int16_t type;               /* NOTE(review): presumably a MPIDI_CH4U_WIN_CTRL_MSG_TYPE value - confirm */
+} MPIDI_CH4U_win_cntrl_msg_t;
+
+typedef struct MPIDI_CH4U_put_msg_t {   /* same field list as MPIDI_CH4U_get_req_msg_t, with preq_ptr in place of greq_ptr */
+    int src_rank;               /* NOTE(review): presumably the origin rank - confirm */
+    uint64_t win_id;            /* NOTE(review): presumably identifies the target window - confirm */
+    uint64_t preq_ptr;          /* NOTE(review): presumably the origin-side put request pointer - confirm */
+    MPI_Aint target_disp;       /* NOTE(review): presumably displacement into the target window - confirm */
+    uint64_t count;             /* NOTE(review): presumably element count of 'datatype' - confirm */
+    MPI_Datatype datatype;      /* NOTE(review): presumably the target datatype handle - confirm */
+    int n_iov;                  /* NOTE(review): presumably number of iov entries that follow - confirm */
+} MPIDI_CH4U_put_msg_t;
+
+typedef struct MPIDI_CH4U_put_iov_ack_msg_t {   /* NOTE(review): presumably sent with MPIDI_CH4U_PUT_IOV_ACK - confirm */
+    int src_rank;               /* NOTE(review): presumably rank of the acknowledging side - confirm */
+    uint64_t target_preq_ptr;   /* NOTE(review): presumably the target-side request pointer - confirm */
+    uint64_t origin_preq_ptr;   /* NOTE(review): presumably the origin-side request pointer - confirm */
+} MPIDI_CH4U_put_iov_ack_msg_t;
+typedef MPIDI_CH4U_put_iov_ack_msg_t MPIDI_CH4U_acc_iov_ack_msg_t;     /* accumulate reuses the put layout */
+
+typedef struct MPIDI_CH4U_put_dat_msg_t {       /* NOTE(review): presumably sent with MPIDI_CH4U_PUT_DAT_REQ - confirm */
+    uint64_t preq_ptr;          /* NOTE(review): presumably the target-side request pointer from the iov ack - confirm */
+} MPIDI_CH4U_put_dat_msg_t;
+typedef MPIDI_CH4U_put_dat_msg_t MPIDI_CH4U_acc_dat_msg_t;     /* accumulate reuses the put layout */
+
+typedef struct MPIDI_CH4U_put_ack_msg_t {       /* NOTE(review): presumably sent with MPIDI_CH4U_PUT_ACK - confirm */
+    uint64_t preq_ptr;          /* NOTE(review): presumably the origin-side request pointer echoed back - confirm */
+} MPIDI_CH4U_put_ack_msg_t;
+
+typedef struct MPIDI_CH4U_get_req_msg_t {       /* same field list as MPIDI_CH4U_put_msg_t, with greq_ptr in place of preq_ptr */
+    int src_rank;               /* NOTE(review): presumably the origin rank - confirm */
+    uint64_t win_id;            /* NOTE(review): presumably identifies the target window - confirm */
+    uint64_t greq_ptr;          /* NOTE(review): presumably the origin-side get request pointer - confirm */
+    MPI_Aint target_disp;       /* NOTE(review): presumably displacement into the target window - confirm */
+    uint64_t count;             /* NOTE(review): presumably element count of 'datatype' - confirm */
+    MPI_Datatype datatype;      /* NOTE(review): presumably the target datatype handle - confirm */
+    int n_iov;                  /* NOTE(review): presumably number of iov entries that follow - confirm */
+} MPIDI_CH4U_get_req_msg_t;
+
+typedef struct MPIDI_CH4U_get_ack_msg_t {       /* NOTE(review): presumably sent with MPIDI_CH4U_GET_ACK - confirm */
+    uint64_t greq_ptr;          /* NOTE(review): presumably the origin-side request pointer echoed back - confirm */
+} MPIDI_CH4U_get_ack_msg_t;
+
+typedef struct MPIDI_CH4U_cswap_req_msg_t {     /* NOTE(review): presumably sent with MPIDI_CH4U_CSWAP_REQ - confirm */
+    int src_rank;               /* NOTE(review): presumably the origin rank - confirm */
+    uint64_t win_id;            /* NOTE(review): presumably identifies the target window - confirm */
+    uint64_t req_ptr;           /* NOTE(review): presumably the origin-side request pointer - confirm */
+    MPI_Aint target_disp;       /* NOTE(review): presumably displacement into the target window - confirm */
+    MPI_Datatype datatype;      /* no count field: unlike put/get, only a datatype is carried */
+} MPIDI_CH4U_cswap_req_msg_t;
+
+typedef struct MPIDI_CH4U_cswap_ack_msg_t {     /* NOTE(review): presumably sent with MPIDI_CH4U_CSWAP_ACK - confirm */
+    uint64_t req_ptr;           /* NOTE(review): presumably the origin-side request pointer echoed back - confirm */
+} MPIDI_CH4U_cswap_ack_msg_t;
+
+typedef struct MPIDI_CH4U_acc_req_msg_t {       /* NOTE(review): presumably sent with MPIDI_CH4U_ACC_REQ - confirm */
+    int src_rank;               /* NOTE(review): presumably the origin rank - confirm */
+    uint64_t win_id;            /* NOTE(review): presumably identifies the target window - confirm */
+    uint64_t req_ptr;           /* NOTE(review): presumably the origin-side request pointer - confirm */
+    int origin_count;           /* int here, whereas put/get carry a uint64_t count */
+    MPI_Datatype origin_datatype;
+    int target_count;
+    MPI_Datatype target_datatype;
+    MPI_Op op;                  /* NOTE(review): presumably the reduction applied at the target - confirm */
+    int do_get;                 /* NOTE(review): presumably nonzero for get-accumulate - confirm */
+    MPI_Aint target_disp;       /* NOTE(review): presumably displacement into the target window - confirm */
+    uint64_t result_data_sz;    /* NOTE(review): presumably result size in bytes when do_get is set - confirm */
+    int n_iov;                  /* NOTE(review): presumably number of iov entries that follow - confirm */
+} MPIDI_CH4U_acc_req_msg_t;
+
+typedef struct MPIDI_CH4U_acc_ack_msg_t {       /* NOTE(review): presumably sent with MPIDI_CH4U_ACC_ACK - confirm */
+    uint64_t req_ptr;           /* NOTE(review): presumably the origin-side request pointer echoed back - confirm */
+} MPIDI_CH4U_acc_ack_msg_t;
+
+typedef struct MPIDI_CH4U_comm_req_list_t {     /* MPIDI_CH4_Global_t.comm_req_lists points at objects of this type */
+    MPIR_Comm *comm[2][4];      /* NOTE(review): meaning of the [2][4] dimensions is not visible here - document */
+    MPIDI_CH4U_rreq_t *uelist[2][4];    /* same [2][4] shape as comm; NOTE(review): presumably unexpected-message lists - confirm */
+} MPIDI_CH4U_comm_req_list_t;
+
+typedef struct MPIU_buf_pool_t {        /* buffer pool; pools chain through 'next' */
+    int size;                   /* NOTE(review): presumably bytes per buffer - confirm */
+    int num;                    /* NOTE(review): presumably number of buffers in this pool - confirm */
+    void *memory_region;        /* NOTE(review): presumably the backing allocation for the buffers - confirm */
+    struct MPIU_buf_pool_t *next;       /* next pool in the chain */
+    struct MPIU_buf_t *head;    /* NOTE(review): presumably head of the free-buffer list - confirm */
+    pthread_mutex_t lock;       /* NOTE(review): presumably guards 'head' - confirm; needs <pthread.h> in scope */
+} MPIU_buf_pool_t;
+
+typedef struct MPIU_buf_t {     /* one pool buffer: link fields followed by the payload */
+    struct MPIU_buf_t *next;    /* singly-linked list link */
+    MPIU_buf_pool_t *pool;      /* NOTE(review): presumably the owning pool - confirm */
+    char data[];                /* C99 flexible array member: payload bytes start here */
+} MPIU_buf_t;
+
+typedef struct {                /* embedded as MPIDI_CH4_Global_t.avt_mgr */
+    int max_n_avts;             /* NOTE(review): presumably capacity of the avt table - confirm */
+    int n_avts;                 /* NOTE(review): presumably number of avts in use - confirm */
+    int next_avtid;             /* NOTE(review): presumably the next id to hand out - confirm */
+    int *free_avtid;            /* NOTE(review): presumably a free-list of avt ids - confirm */
+} MPIDI_CH4_avt_manager;
+
+typedef struct MPIDI_CH4_Global_t {     /* type of the single MPIDI_CH4_Global object */
+    MPIR_Request *request_test;
+    MPIR_Comm *comm_test;
+    int pname_set;
+    int pname_len;
+    char pname[MPI_MAX_PROCESSOR_NAME];
+    int is_initialized;
+    int allocated_max_n_avts;
+    MPIDI_CH4_avt_manager avt_mgr;
+    int is_ch4u_initialized;
+    MPID_Node_id_t **node_map, max_node_id;     /* note: max_node_id is a plain MPID_Node_id_t, not a pointer */
+    MPIDI_CH4U_comm_req_list_t *comm_req_lists;
+    OPA_int_t active_progress_hooks;
+    MPIR_Commops MPIR_Comm_fns_store;
+    progress_hook_slot_t progress_hooks[MAX_PROGRESS_HOOKS];
+    MPID_Thread_mutex_t m[2];   /* m[0]/m[1] are aliased by the MPIDI_CH4I_THREAD_PROGRESS*_MUTEX macros */
+    MPIR_Win *win_hash;
+    int jobid;
+#ifndef MPIDI_CH4U_USE_PER_COMM_QUEUE
+    MPIDI_CH4U_rreq_t *posted_list;     /* only present without per-communicator queues */
+    MPIDI_CH4U_rreq_t *unexp_list;      /* only present without per-communicator queues */
+#endif
+    MPIDI_CH4U_req_ext_t *cmpl_list;
+    OPA_int_t exp_seq_no;
+    OPA_int_t nxt_seq_no;
+    void *netmod_context[MAX_NETMOD_CONTEXTS];  /* sized by the named bound instead of a literal 8 */
+    MPIU_buf_pool_t *buf_pool;
+} MPIDI_CH4_Global_t;
+extern MPIDI_CH4_Global_t MPIDI_CH4_Global;    /* declaration only; the definition lives outside this header */
+#ifdef MPL_USE_DBG_LOGGING
+extern MPL_dbg_class MPIDI_CH4_DBG_GENERAL;
+extern MPL_dbg_class MPIDI_CH4_DBG_MAP;
+extern MPL_dbg_class MPIDI_CH4_DBG_MEMORY;
+#endif /* MPL_USE_DBG_LOGGING */
+#define MPIDI_CH4I_THREAD_PROGRESS_MUTEX  MPIDI_CH4_Global.m[0]        /* first of the two global mutexes */
+#define MPIDI_CH4I_THREAD_PROGRESS_HOOK_MUTEX  MPIDI_CH4_Global.m[1]   /* second of the two global mutexes */
+
+#endif /* CH4_TYPES_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4_win.h b/src/mpid/ch4/src/ch4_win.h
new file mode 100644
index 0000000..222f7fe
--- /dev/null
+++ b/src/mpid/ch4/src/ch4_win.h
@@ -0,0 +1,530 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4_WIN_H_INCLUDED
+#define CH4_WIN_H_INCLUDED
+
+#include "ch4_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_set_info
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_set_info; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_set_info(MPIR_Win * win, MPIR_Info * info)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_SET_INFO);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_SET_INFO);
+    mpi_errno = MPIDI_NM_win_set_info(win, info);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_SET_INFO);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_start
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_start; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_start(MPIR_Group * group, int assert, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_START);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_START);
+    mpi_errno = MPIDI_NM_win_start(group, assert, win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_START);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_complete
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_complete; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_complete(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_COMPLETE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_COMPLETE);
+    mpi_errno = MPIDI_NM_win_complete(win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_COMPLETE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_post
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_post; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_post(MPIR_Group * group, int assert, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_POST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_POST);
+    mpi_errno = MPIDI_NM_win_post(group, assert, win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_POST);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_wait
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_wait; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_wait(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_WAIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_WAIT);
+    mpi_errno = MPIDI_NM_win_wait(win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_WAIT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_test
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_test; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_test(MPIR_Win * win, int *flag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_TEST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_TEST);
+    mpi_errno = MPIDI_NM_win_test(win, flag);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_TEST);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_lock
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_lock; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_lock(int lock_type, int rank, int assert, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_LOCK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_LOCK);
+    mpi_errno = MPIDI_NM_win_lock(lock_type, rank, assert, win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_LOCK);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_unlock
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_unlock; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_unlock(int rank, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_UNLOCK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_UNLOCK);
+    mpi_errno = MPIDI_NM_win_unlock(rank, win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_UNLOCK);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_get_info
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_get_info; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_get_info(MPIR_Win * win, MPIR_Info ** info_p_p)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_GET_INFO);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_GET_INFO);
+    mpi_errno = MPIDI_NM_win_get_info(win, info_p_p);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_GET_INFO);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_free
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_free; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_free(MPIR_Win ** win_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_FREE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_FREE);
+    mpi_errno = MPIDI_NM_win_free(win_ptr);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_FREE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_fence
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_fence; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_fence(int assert, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_FENCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_FENCE);
+    mpi_errno = MPIDI_NM_win_fence(assert, win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_FENCE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_create
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_create; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_create(void *base,
+                                    MPI_Aint length,
+                                    int disp_unit,
+                                    MPIR_Info * info, MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_CREATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_CREATE);
+    mpi_errno = MPIDI_NM_win_create(base, length, disp_unit, info, comm_ptr, win_ptr);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_CREATE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_attach
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_attach; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_attach(MPIR_Win * win, void *base, MPI_Aint size)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_ATTACH);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_ATTACH);
+    mpi_errno = MPIDI_NM_win_attach(win, base, size);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_ATTACH);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_allocate_shared
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_allocate_shared; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_allocate_shared(MPI_Aint size,
+                                             int disp_unit,
+                                             MPIR_Info * info_ptr,
+                                             MPIR_Comm * comm_ptr,
+                                             void **base_ptr, MPIR_Win ** win_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_ALLOCATE_SHARED);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_ALLOCATE_SHARED);
+    mpi_errno = MPIDI_NM_win_allocate_shared(size, disp_unit,
+                                             info_ptr, comm_ptr, base_ptr, win_ptr);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_ALLOCATE_SHARED);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_flush_local
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_flush_local; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_flush_local(int rank, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_FLUSH_LOCAL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_FLUSH_LOCAL);
+    mpi_errno = MPIDI_NM_win_flush_local(rank, win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_FLUSH_LOCAL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_detach
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_detach; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_detach(MPIR_Win * win, const void *base)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_DETACH);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_DETACH);
+    mpi_errno = MPIDI_NM_win_detach(win, base);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_DETACH);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_shared_query
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_shared_query; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_shared_query(MPIR_Win * win,
+                                          int rank, MPI_Aint * size, int *disp_unit, void *baseptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_SHARED_QUERY);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_SHARED_QUERY);
+    mpi_errno = MPIDI_NM_win_shared_query(win, rank, size, disp_unit, baseptr);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_SHARED_QUERY);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_allocate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_allocate; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_allocate(MPI_Aint size,
+                                      int disp_unit,
+                                      MPIR_Info * info,
+                                      MPIR_Comm * comm, void *baseptr, MPIR_Win ** win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_ALLOCATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_ALLOCATE);
+    mpi_errno = MPIDI_NM_win_allocate(size, disp_unit, info, comm, baseptr, win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_ALLOCATE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_flush
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_flush; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_flush(int rank, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_FLUSH);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_FLUSH);
+    mpi_errno = MPIDI_NM_win_flush(rank, win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_FLUSH);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_flush_local_all
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_flush_local_all; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_flush_local_all(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_FLUSH_LOCAL_ALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_FLUSH_LOCAL_ALL);
+    mpi_errno = MPIDI_NM_win_flush_local_all(win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_FLUSH_LOCAL_ALL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_unlock_all
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_unlock_all; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_unlock_all(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_UNLOCK_ALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_UNLOCK_ALL);
+    mpi_errno = MPIDI_NM_win_unlock_all(win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_UNLOCK_ALL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_create_dynamic
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_create_dynamic; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_create_dynamic(MPIR_Info * info, MPIR_Comm * comm, MPIR_Win ** win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_CREATE_DYNAMIC);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_CREATE_DYNAMIC);
+    mpi_errno = MPIDI_NM_win_create_dynamic(info, comm, win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_CREATE_DYNAMIC);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_sync
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_sync; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_sync(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_SYNC);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_SYNC);
+    mpi_errno = MPIDI_NM_win_sync(win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_SYNC);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_flush_all
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_flush_all; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_flush_all(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_FLUSH_ALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_FLUSH_ALL);
+    mpi_errno = MPIDI_NM_win_flush_all(win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_FLUSH_ALL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_lock_all
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Delegates to MPIDI_NM_win_lock_all; a failure code is run through MPIR_ERR_POP. */
+__CH4_INLINE__ int MPIDI_Win_lock_all(int assert, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_WIN_LOCK_ALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_WIN_LOCK_ALL);
+    mpi_errno = MPIDI_NM_win_lock_all(assert, win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_WIN_LOCK_ALL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#endif /* CH4_WIN_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4i_comm.h b/src/mpid/ch4/src/ch4i_comm.h
new file mode 100644
index 0000000..b34405c
--- /dev/null
+++ b/src/mpid/ch4/src/ch4i_comm.h
@@ -0,0 +1,1161 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#ifndef CH4I_COMM_H_INCLUDED
+#define CH4I_COMM_H_INCLUDED
+
+#include "ch4_types.h"
+#include "mpl_utlist.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDII_alloc_lut
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Allocate a LUT with room for 'size' lpids.  *lut receives it with refcount 1,
+ * or NULL (and a "**nomem" MPI_ERR_OTHER error is raised) if MPL_malloc fails. */
+static inline int MPIDII_alloc_lut(MPIDII_rank_map_lut_t ** lut, int size)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDII_rank_map_lut_t *new_lut = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDIU_ALLOC_LUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDIU_ALLOC_LUT);
+
+    new_lut = (MPIDII_rank_map_lut_t *) MPL_malloc(sizeof(MPIDII_rank_map_lut_t)
+                                                   + size * sizeof(MPIDII_lpid_t));
+    if (new_lut == NULL) {
+        *lut = NULL;
+        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
+    }
+    MPIR_Object_set_ref(new_lut, 1);
+    *lut = new_lut;
+    /* the byte count is a size_t; cast it so the argument really matches %ld */
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE,
+                    (MPL_DBG_FDEST, "alloc lut %p, size %ld, refcount=%d", new_lut,
+                     (long) (size * sizeof(MPIDII_lpid_t)), MPIR_Object_get_ref(new_lut)));
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDIU_ALLOC_LUT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDIU_release_lut
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Release one reference on 'lut', freeing it at a count of zero; always returns MPI_SUCCESS. */
+static inline int MPIDIU_release_lut(MPIDII_rank_map_lut_t * lut)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int count = 0;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDIU_RELEASE_LUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDIU_RELEASE_LUT);
+    MPIR_Object_release_ref(lut, &count);
+    if (count == 0) {
+        /* log first: a pointer's value must not be read once it has been freed */
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, (MPL_DBG_FDEST, "free lut %p", lut));
+        MPL_free(lut);
+    }
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDIU_RELEASE_LUT);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDII_alloc_mlut
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Allocate an MLUT with room for 'size' gpids.  *mlut receives it with refcount 1,
+ * or NULL (and a "**nomem" MPI_ERR_OTHER error is raised) if MPL_malloc fails. */
+static inline int MPIDII_alloc_mlut(MPIDII_rank_map_mlut_t ** mlut, int size)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDII_rank_map_mlut_t *new_mlut = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDIU_ALLOC_MLUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDIU_ALLOC_MLUT);
+
+    new_mlut = (MPIDII_rank_map_mlut_t *) MPL_malloc(sizeof(MPIDII_rank_map_mlut_t)
+                                                     + size * sizeof(MPIDII_gpid_t));
+    if (new_mlut == NULL) {
+        *mlut = NULL;
+        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
+    }
+    MPIR_Object_set_ref(new_mlut, 1);
+    *mlut = new_mlut;
+    /* the byte count is a size_t; cast it so the argument really matches %ld */
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE,
+                    (MPL_DBG_FDEST, "alloc mlut %p, size %ld, refcount=%d", new_mlut,
+                     (long) (size * sizeof(MPIDII_gpid_t)), MPIR_Object_get_ref(new_mlut)));
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDIU_ALLOC_MLUT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDIU_release_mlut
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Release one reference on 'mlut' and free it when MPIR_Object_release_ref
+ * reports a count of zero.  Always returns MPI_SUCCESS. */
+static inline int MPIDIU_release_mlut(MPIDII_rank_map_mlut_t * mlut)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int count = 0;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDIU_RELEASE_MLUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDIU_RELEASE_MLUT);
+    MPIR_Object_release_ref(mlut, &count);
+    if (count == 0) {
+        /* log first: a pointer's value must not be read once it has been freed */
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, (MPL_DBG_FDEST, "free mlut %p", mlut));
+        MPL_free(mlut);
+    }
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDIU_RELEASE_MLUT);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDII_map_size
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Number of ranks a mapper covers: the explicit count for irregular maps, else
+ * the source communicator's local size (L2L/L2R) or remote size (otherwise). */
+static inline int MPIDII_map_size(MPIR_Comm_map_t map)
+{
+    if (map.type == MPIR_COMM_MAP_TYPE__IRREGULAR)
+        return map.src_mapping_size;
+    return (map.dir == MPIR_COMM_MAP_DIR__L2L || map.dir == MPIR_COMM_MAP_DIR__L2R) ?
+        map.src_comm->local_size : map.src_comm->remote_size;
+}
+
+/*
+ * This enum is used exclusively in this header file
+ */
+enum MPIDII_src_mapper_models { /* result codes of MPIDII_detect_regular_model */
+    MPIDII_SRC_MAPPER_IRREGULAR = 0,    /* no regular pattern matched */
+    MPIDII_SRC_MAPPER_DIRECT = 1,       /* lpid[i] == i for every i */
+    MPIDII_SRC_MAPPER_OFFSET = 2,       /* lpid[i] == i + offset, offset != 0 */
+    MPIDII_SRC_MAPPER_STRIDE = 3        /* lpid[i] matches MPIDII_CALC_STRIDE(i, stride, blocksize, offset) */
+};
+
+#undef FUNCNAME
+#define FUNCNAME MPIDII_detect_regular_model
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Classify lpid[0..size-1] as direct, offset, block-strided or irregular.
+ * *offset is written for OFFSET and STRIDE results, *blocksize and *stride only
+ * for STRIDE.  Assumes size >= 1, since lpid[0] is read unconditionally. */
+static inline int MPIDII_detect_regular_model(int *lpid, int size,
+                                              int *offset, int *blocksize, int *stride)
+{
+    int off = 0, bs = 0, st = 0;
+    int i;
+
+    off = lpid[0];
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, "\tdetect model: offset %d", off));
+    /* bs = length of the leading run in which lpid[i] == off + i */
+    for (i = 0; i < size; i++) {
+        if (lpid[i] != i + off) {
+            break;
+        }
+        bs++;
+    }
+
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                    (MPL_DBG_FDEST, "\tdetect model: blocksize %d", bs));
+    if (bs == size) {
+        if (off == 0)
+            return MPIDII_SRC_MAPPER_DIRECT;
+        *offset = off;
+        return MPIDII_SRC_MAPPER_OFFSET;
+    }
+
+    /* blocksize less than total size, try if this is stride */
+    st = lpid[bs] - lpid[0];
+    if (st < 0 || st <= bs) {
+        return MPIDII_SRC_MAPPER_IRREGULAR;
+    }
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, "\tdetect model: stride %d", st));
+    for (i = bs; i < size; i++) {
+        if (lpid[i] != MPIDII_CALC_STRIDE(i, st, bs, off)) {
+            return MPIDII_SRC_MAPPER_IRREGULAR;
+        }
+    }
+    *offset = off;
+    *blocksize = bs;
+    *stride = st;
+    return MPIDII_SRC_MAPPER_STRIDE;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDII_src_comm_to_lut
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Expand the first `size` ranks of `src` into explicit lpid entries of `dest`.
+ *
+ * Entries land in dest slots mapper_offset .. mapper_offset + size - 1.  When
+ * mapper_offset is 0 this call also allocates a table of total_mapper_size
+ * entries and marks dest as MPIDII_RANK_MAP_LUT; for any other offset the table
+ * already attached to dest is written in place.  MLUT and NONE sources are
+ * rejected with MPIR_Assert(0).  Returns MPI_SUCCESS unless the allocation
+ * fails. */
+static inline int MPIDII_src_comm_to_lut(MPIDII_rank_map_t * src,
+                                         MPIDII_rank_map_t * dest,
+                                         int size, int total_mapper_size, int mapper_offset)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int rank, slot;
+    MPIDII_rank_map_lut_t *table = NULL;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDII_SRC_COMM_TO_LUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDII_SRC_COMM_TO_LUT);
+
+    /* Only the first mapper (offset 0) creates the destination table. */
+    if (mapper_offset == 0) {
+        mpi_errno = MPIDII_alloc_lut(&table, total_mapper_size);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+        dest->irreg.lut.t = table;
+        dest->irreg.lut.lpid = table->lpid;
+        dest->avtid = src->avtid;
+        dest->mode = MPIDII_RANK_MAP_LUT;
+        dest->size = total_mapper_size;
+    }
+
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                    (MPL_DBG_FDEST, " source mode %d", (int) src->mode));
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, " size %d", size));
+
+    /* `rank` walks the source ranks, `slot` the matching destination index. */
+    switch (src->mode) {
+    case MPIDII_RANK_MAP_DIRECT:
+    case MPIDII_RANK_MAP_DIRECT_INTRA:
+        for (rank = 0, slot = mapper_offset; rank < size; rank++, slot++) {
+            dest->irreg.lut.lpid[slot] = rank;
+        }
+        break;
+
+    case MPIDII_RANK_MAP_OFFSET:
+    case MPIDII_RANK_MAP_OFFSET_INTRA:
+        for (rank = 0, slot = mapper_offset; rank < size; rank++, slot++) {
+            dest->irreg.lut.lpid[slot] = rank + src->reg.offset;
+        }
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, " source offset %d", src->reg.offset));
+        break;
+
+    case MPIDII_RANK_MAP_STRIDE:
+    case MPIDII_RANK_MAP_STRIDE_INTRA:
+        for (rank = 0, slot = mapper_offset; rank < size; rank++, slot++) {
+            dest->irreg.lut.lpid[slot] =
+                MPIDII_CALC_STRIDE_SIMPLE(rank, src->reg.stride.stride, src->reg.stride.offset);
+        }
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, " source stride %d blocksize %d offset %d",
+                         src->reg.stride.stride, src->reg.stride.blocksize,
+                         src->reg.stride.offset));
+        break;
+
+    case MPIDII_RANK_MAP_STRIDE_BLOCK:
+    case MPIDII_RANK_MAP_STRIDE_BLOCK_INTRA:
+        for (rank = 0, slot = mapper_offset; rank < size; rank++, slot++) {
+            dest->irreg.lut.lpid[slot] =
+                MPIDII_CALC_STRIDE(rank, src->reg.stride.stride,
+                                   src->reg.stride.blocksize, src->reg.stride.offset);
+        }
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, " source stride %d blocksize %d offset %d",
+                         src->reg.stride.stride, src->reg.stride.blocksize,
+                         src->reg.stride.offset));
+        break;
+
+    case MPIDII_RANK_MAP_LUT:
+    case MPIDII_RANK_MAP_LUT_INTRA:
+        for (rank = 0, slot = mapper_offset; rank < size; rank++, slot++) {
+            dest->irreg.lut.lpid[slot] = src->irreg.lut.lpid[rank];
+        }
+        break;
+
+    case MPIDII_RANK_MAP_NONE:
+    case MPIDII_RANK_MAP_MLUT:
+        /* these source modes are not handled by this routine */
+        MPIR_Assert(0);
+        break;
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDII_SRC_COMM_TO_LUT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDII_src_comm_to_mlut
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Expand the first `size` ranks of `src` into explicit (avtid, lpid) entries of
+ * `dest`.
+ *
+ * Entries land in dest slots mapper_offset .. mapper_offset + size - 1.  When
+ * mapper_offset is 0 this call also allocates a table of total_mapper_size
+ * entries and marks dest as MPIDII_RANK_MAP_MLUT with avtid -1; for any other
+ * offset the table already attached to dest is written in place.  Every
+ * non-MLUT source contributes src->avtid for all of its entries; an MLUT source
+ * is copied entry by entry.  A NONE source hits MPIR_Assert(0).  Returns
+ * MPI_SUCCESS unless the allocation fails. */
+static inline int MPIDII_src_comm_to_mlut(MPIDII_rank_map_t * src,
+                                          MPIDII_rank_map_t * dest,
+                                          int size, int total_mapper_size, int mapper_offset)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int rank, slot;
+    MPIDII_rank_map_mlut_t *table = NULL;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDII_SRC_COMM_TO_MLUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDII_SRC_COMM_TO_MLUT);
+
+    /* Only the first mapper (offset 0) creates the destination table. */
+    if (mapper_offset == 0) {
+        mpi_errno = MPIDII_alloc_mlut(&table, total_mapper_size);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+        dest->irreg.mlut.t = table;
+        dest->irreg.mlut.gpid = table->gpid;
+        dest->avtid = -1;
+        dest->mode = MPIDII_RANK_MAP_MLUT;
+        dest->size = total_mapper_size;
+    }
+
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                    (MPL_DBG_FDEST, " source mode %d", (int) src->mode));
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, " size %d", size));
+
+    /* `rank` walks the source ranks, `slot` the matching destination index. */
+    switch (src->mode) {
+    case MPIDII_RANK_MAP_DIRECT:
+    case MPIDII_RANK_MAP_DIRECT_INTRA:
+        for (rank = 0, slot = mapper_offset; rank < size; rank++, slot++) {
+            dest->irreg.mlut.gpid[slot].avtid = src->avtid;
+            dest->irreg.mlut.gpid[slot].lpid = rank;
+        }
+        break;
+
+    case MPIDII_RANK_MAP_OFFSET:
+    case MPIDII_RANK_MAP_OFFSET_INTRA:
+        for (rank = 0, slot = mapper_offset; rank < size; rank++, slot++) {
+            dest->irreg.mlut.gpid[slot].avtid = src->avtid;
+            dest->irreg.mlut.gpid[slot].lpid = rank + src->reg.offset;
+        }
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, " source offset %d", src->reg.offset));
+        break;
+
+    case MPIDII_RANK_MAP_STRIDE:
+    case MPIDII_RANK_MAP_STRIDE_INTRA:
+        for (rank = 0, slot = mapper_offset; rank < size; rank++, slot++) {
+            dest->irreg.mlut.gpid[slot].avtid = src->avtid;
+            dest->irreg.mlut.gpid[slot].lpid =
+                MPIDII_CALC_STRIDE_SIMPLE(rank, src->reg.stride.stride, src->reg.stride.offset);
+        }
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, " source stride %d blocksize %d offset %d",
+                         src->reg.stride.stride, src->reg.stride.blocksize,
+                         src->reg.stride.offset));
+        break;
+
+    case MPIDII_RANK_MAP_STRIDE_BLOCK:
+    case MPIDII_RANK_MAP_STRIDE_BLOCK_INTRA:
+        for (rank = 0, slot = mapper_offset; rank < size; rank++, slot++) {
+            dest->irreg.mlut.gpid[slot].avtid = src->avtid;
+            dest->irreg.mlut.gpid[slot].lpid =
+                MPIDII_CALC_STRIDE(rank, src->reg.stride.stride,
+                                   src->reg.stride.blocksize, src->reg.stride.offset);
+        }
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, " source stride %d blocksize %d offset %d",
+                         src->reg.stride.stride, src->reg.stride.blocksize,
+                         src->reg.stride.offset));
+        break;
+
+    case MPIDII_RANK_MAP_LUT:
+    case MPIDII_RANK_MAP_LUT_INTRA:
+        for (rank = 0, slot = mapper_offset; rank < size; rank++, slot++) {
+            dest->irreg.mlut.gpid[slot].avtid = src->avtid;
+            dest->irreg.mlut.gpid[slot].lpid = src->irreg.lut.lpid[rank];
+        }
+        break;
+
+    case MPIDII_RANK_MAP_MLUT:
+        /* source entries already carry their own avtid */
+        for (rank = 0, slot = mapper_offset; rank < size; rank++, slot++) {
+            dest->irreg.mlut.gpid[slot].avtid = src->irreg.mlut.gpid[rank].avtid;
+            dest->irreg.mlut.gpid[slot].lpid = src->irreg.mlut.gpid[rank].lpid;
+        }
+        break;
+
+    case MPIDII_RANK_MAP_NONE:
+        MPIR_Assert(0);
+        break;
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDII_SRC_COMM_TO_MLUT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDII_src_mlut_to_mlut
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Fill (part of) dest's (avtid, lpid) table by selecting entries of the MLUT
+ * map `src` through mapper->src_mapping.
+ *
+ * MPIDII_map_size(*mapper) entries are written to dest slots starting at
+ * mapper_offset.  When mapper_offset is 0 a table of total_mapper_size entries
+ * is allocated and attached to dest, and dest->mode is copied from src (the
+ * visible callers only reach this routine with an MLUT source).  For any other
+ * offset the table already attached to dest is extended in place.
+ *
+ * Returns MPI_SUCCESS unless the allocation fails. */
+static inline int MPIDII_src_mlut_to_mlut(MPIDII_rank_map_t * src,
+                                          MPIDII_rank_map_t * dest,
+                                          MPIR_Comm_map_t * mapper,
+                                          int total_mapper_size, int mapper_offset)
+{
+    int mpi_errno = MPI_SUCCESS, i;
+    int size = MPIDII_map_size(*mapper);
+    MPIDII_rank_map_mlut_t *mlut = NULL;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDII_MLUT_TO_MLUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDII_MLUT_TO_MLUT);
+
+    if (!mapper_offset) {
+        mpi_errno = MPIDII_alloc_mlut(&mlut, total_mapper_size);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+        dest->size = total_mapper_size;
+        dest->mode = src->mode;
+        /* The table must only be attached here: for later mappers `mlut` is
+         * NULL, and re-pointing dest at it would dereference NULL and drop the
+         * table the first mapper created (same scheme as
+         * MPIDII_src_comm_to_mlut). */
+        dest->irreg.mlut.t = mlut;
+        dest->irreg.mlut.gpid = mlut->gpid;
+    }
+
+    for (i = 0; i < size; i++) {
+        dest->irreg.mlut.gpid[i + mapper_offset].avtid =
+            src->irreg.mlut.gpid[mapper->src_mapping[i]].avtid;
+        dest->irreg.mlut.gpid[i + mapper_offset].lpid =
+            src->irreg.mlut.gpid[mapper->src_mapping[i]].lpid;
+    }
+  fn_exit:
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                    (MPL_DBG_FDEST, " src mode %d, dest mode %d",
+                     (int) src->mode, (int) dest->mode));
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDII_MLUT_TO_MLUT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDII_src_map_to_lut
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Fill (part of) dest's lpid table by translating each source rank listed in
+ * mapper->src_mapping through the rank map `src`.
+ *
+ * MPIDII_map_size(*mapper) entries are written to dest slots starting at
+ * mapper_offset.  When mapper_offset is 0 a table of total_mapper_size entries
+ * is allocated and dest is set up as a MPIDII_RANK_MAP_LUT map carrying src's
+ * avtid.  For any other offset the table already attached to dest is extended
+ * in place.
+ *
+ * Returns MPI_SUCCESS on success, the allocation error if the table cannot be
+ * allocated, or 1 when src->mode cannot be expressed as a lut (any mode not
+ * listed in the switch, e.g. MLUT). */
+static inline int MPIDII_src_map_to_lut(MPIDII_rank_map_t * src,
+                                        MPIDII_rank_map_t * dest,
+                                        MPIR_Comm_map_t * mapper,
+                                        int total_mapper_size, int mapper_offset)
+{
+    int mpi_errno = MPI_SUCCESS, i;
+    int size = MPIDII_map_size(*mapper);
+    MPIDII_rank_map_lut_t *lut = NULL;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDII_MAP_TO_LUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDII_MAP_TO_LUT);
+
+    if (!mapper_offset) {
+        mpi_errno = MPIDII_alloc_lut(&lut, total_mapper_size);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+        dest->size = total_mapper_size;
+        dest->mode = MPIDII_RANK_MAP_LUT;
+        dest->avtid = src->avtid;
+        /* The table must only be attached here: for later mappers `lut` is
+         * NULL, and re-pointing dest at it would dereference NULL and drop the
+         * table the first mapper created (same scheme as
+         * MPIDII_src_comm_to_lut). */
+        dest->irreg.lut.t = lut;
+        dest->irreg.lut.lpid = lut->lpid;
+    }
+
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                    (MPL_DBG_FDEST, " source mode %d", (int) src->mode));
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                    (MPL_DBG_FDEST, " size %d, mapper->src_mapping_size %d",
+                     size, mapper->src_mapping_size));
+    switch (src->mode) {
+    case MPIDII_RANK_MAP_DIRECT:
+    case MPIDII_RANK_MAP_DIRECT_INTRA:
+        for (i = 0; i < size; i++) {
+            dest->irreg.lut.lpid[i + mapper_offset] = mapper->src_mapping[i];
+        }
+        break;
+    case MPIDII_RANK_MAP_OFFSET:
+    case MPIDII_RANK_MAP_OFFSET_INTRA:
+        for (i = 0; i < size; i++) {
+            dest->irreg.lut.lpid[i + mapper_offset] = mapper->src_mapping[i] + src->reg.offset;
+        }
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, " source offset %d", src->reg.offset));
+        break;
+    case MPIDII_RANK_MAP_STRIDE:
+    case MPIDII_RANK_MAP_STRIDE_INTRA:
+        for (i = 0; i < size; i++) {
+            dest->irreg.lut.lpid[i + mapper_offset] =
+                MPIDII_CALC_STRIDE_SIMPLE(mapper->src_mapping[i], src->reg.stride.stride,
+                                          src->reg.stride.offset);
+        }
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, " source stride %d blocksize %d offset %d",
+                         src->reg.stride.stride, src->reg.stride.blocksize,
+                         src->reg.stride.offset));
+        break;
+    case MPIDII_RANK_MAP_STRIDE_BLOCK:
+    case MPIDII_RANK_MAP_STRIDE_BLOCK_INTRA:
+        for (i = 0; i < size; i++) {
+            dest->irreg.lut.lpid[i + mapper_offset] = MPIDII_CALC_STRIDE(mapper->src_mapping[i],
+                                                                         src->reg.stride.stride,
+                                                                         src->reg.stride.blocksize,
+                                                                         src->reg.stride.offset);
+        }
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, " source stride %d blocksize %d offset %d",
+                         src->reg.stride.stride, src->reg.stride.blocksize,
+                         src->reg.stride.offset));
+        break;
+    case MPIDII_RANK_MAP_LUT:
+    case MPIDII_RANK_MAP_LUT_INTRA:
+        for (i = 0; i < size; i++) {
+            dest->irreg.lut.lpid[i + mapper_offset] = src->irreg.lut.lpid[mapper->src_mapping[i]];
+        }
+        break;
+    default:
+        /* source mode has no single-avtid lpid representation */
+        mpi_errno = 1;
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, " cannot convert mode %d to lut", (int) src->mode));
+        goto fn_fail;
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDII_MAP_TO_LUT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDII_direct_of_src_rmap
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Copy the mapping description of `src` into `dest` unchanged.
+ *
+ * mode and avtid are taken from src and the size from MPIDII_map_size(*mapper).
+ * Regular modes copy their offset/stride parameters; LUT and MLUT modes share
+ * src's table and take an additional reference on it.  A NONE source hits
+ * MPIR_Assert(0). */
+static inline void MPIDII_direct_of_src_rmap(MPIDII_rank_map_t * src,
+                                             MPIDII_rank_map_t * dest, MPIR_Comm_map_t * mapper)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDII_DIRECT_OF_SRC_RMAP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDII_DIRECT_OF_SRC_RMAP);
+
+    dest->avtid = src->avtid;
+    dest->size = MPIDII_map_size(*mapper);
+    dest->mode = src->mode;
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                    (MPL_DBG_FDEST, " source mode %d", (int) src->mode));
+
+    switch (src->mode) {
+    case MPIDII_RANK_MAP_NONE:
+        MPIR_Assert(0);
+        break;
+
+    case MPIDII_RANK_MAP_DIRECT:
+    case MPIDII_RANK_MAP_DIRECT_INTRA:
+        /* no parameters beyond mode/size/avtid */
+        break;
+
+    case MPIDII_RANK_MAP_OFFSET:
+    case MPIDII_RANK_MAP_OFFSET_INTRA:
+        dest->reg.offset = src->reg.offset;
+        break;
+
+    case MPIDII_RANK_MAP_STRIDE:
+    case MPIDII_RANK_MAP_STRIDE_INTRA:
+    case MPIDII_RANK_MAP_STRIDE_BLOCK:
+    case MPIDII_RANK_MAP_STRIDE_BLOCK_INTRA:
+        dest->reg.stride.offset = src->reg.stride.offset;
+        dest->reg.stride.blocksize = src->reg.stride.blocksize;
+        dest->reg.stride.stride = src->reg.stride.stride;
+        break;
+
+    case MPIDII_RANK_MAP_LUT:
+    case MPIDII_RANK_MAP_LUT_INTRA:
+        /* share the table; the count is logged before it is bumped */
+        dest->irreg.lut.lpid = src->irreg.lut.lpid;
+        dest->irreg.lut.t = src->irreg.lut.t;
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, "\tref count %d", MPIR_Object_get_ref(src->irreg.lut.t)));
+        MPIR_Object_add_ref(src->irreg.lut.t);
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, "\tadd ref to src lut"));
+        break;
+
+    case MPIDII_RANK_MAP_MLUT:
+        dest->irreg.mlut.gpid = src->irreg.mlut.gpid;
+        dest->irreg.mlut.t = src->irreg.mlut.t;
+        MPIR_Object_add_ref(src->irreg.mlut.t);
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, "\tadd ref to src mlut"));
+        break;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDII_DIRECT_OF_SRC_RMAP);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDII_offset_of_src_rmap
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Derive `dest` from `src` for a mapper that selects a contiguous run of source
+ * ranks starting at `offset`.
+ *
+ * DIRECT sources become OFFSET maps, OFFSET sources add `offset` to their own
+ * offset, and STRIDE sources advance their offset by offset * stride.
+ * STRIDE_BLOCK sources fall back to an explicit lut built by
+ * MPIDII_src_map_to_lut.  LUT and MLUT sources share src's table (taking a
+ * reference) and point dest at the entry `offset` positions into src's view.
+ * The _INTRA flavour of the source mode is carried over where one exists.  A
+ * NONE source hits MPIR_Assert(0). */
+static inline void MPIDII_offset_of_src_rmap(MPIDII_rank_map_t * src,
+                                             MPIDII_rank_map_t * dest,
+                                             MPIR_Comm_map_t * mapper, int offset)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDII_OFFSET_OF_SRC_RMAP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDII_OFFSET_OF_SRC_RMAP);
+
+    dest->size = MPIDII_map_size(*mapper);
+    dest->avtid = src->avtid;
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                    (MPL_DBG_FDEST, " source mode %d", (int) src->mode));
+
+    switch (src->mode) {
+    case MPIDII_RANK_MAP_NONE:
+        MPIR_Assert(0);
+        break;
+
+    case MPIDII_RANK_MAP_DIRECT:
+        dest->reg.offset = offset;
+        dest->mode = MPIDII_RANK_MAP_OFFSET;
+        break;
+
+    case MPIDII_RANK_MAP_DIRECT_INTRA:
+        dest->reg.offset = offset;
+        dest->mode = MPIDII_RANK_MAP_OFFSET_INTRA;
+        break;
+
+    case MPIDII_RANK_MAP_OFFSET:
+    case MPIDII_RANK_MAP_OFFSET_INTRA:
+        /* result keeps the source's (intra or not) offset mode */
+        dest->mode = src->mode;
+        dest->reg.offset = src->reg.offset + offset;
+        break;
+
+    case MPIDII_RANK_MAP_STRIDE:
+    case MPIDII_RANK_MAP_STRIDE_INTRA:
+        /* result keeps the source's (intra or not) stride mode */
+        dest->mode = src->mode;
+        dest->reg.stride.stride = src->reg.stride.stride;
+        dest->reg.stride.blocksize = src->reg.stride.blocksize;
+        dest->reg.stride.offset = src->reg.stride.offset + offset * src->reg.stride.stride;
+        break;
+
+    case MPIDII_RANK_MAP_STRIDE_BLOCK:
+    case MPIDII_RANK_MAP_STRIDE_BLOCK_INTRA:
+        /* no closed form kept for this combination: materialize a lut */
+        MPIDII_src_map_to_lut(src, dest, mapper, mapper->src_mapping_size, 0);
+        break;
+
+    case MPIDII_RANK_MAP_LUT:
+    case MPIDII_RANK_MAP_LUT_INTRA:
+        dest->mode = src->mode;
+        dest->irreg.lut.t = src->irreg.lut.t;
+        dest->irreg.lut.lpid = src->irreg.lut.lpid + offset;
+        /* the count is logged before it is bumped */
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, "\tref count %d", MPIR_Object_get_ref(src->irreg.lut.t)));
+        MPIR_Object_add_ref(src->irreg.lut.t);
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, "\tadd ref to src lut"));
+        break;
+
+    case MPIDII_RANK_MAP_MLUT:
+        dest->mode = src->mode;
+        dest->irreg.mlut.t = src->irreg.mlut.t;
+        dest->irreg.mlut.gpid = src->irreg.mlut.gpid + offset;
+        MPIR_Object_add_ref(src->irreg.mlut.t);
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, "\tadd ref to src mlut"));
+        break;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDII_OFFSET_OF_SRC_RMAP);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDII_stride_of_src_rmap
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Derive `dest` from `src` for a mapper whose source ranks follow the strided
+ * pattern (stride, blocksize, offset).
+ *
+ * DIRECT and OFFSET sources become STRIDE (blocksize == 1) or STRIDE_BLOCK
+ * maps; an OFFSET source adds its own offset.  A STRIDE source composed with a
+ * blocksize-1 stride stays a STRIDE map with the strides multiplied.  Every
+ * other combination is materialized as a table via MPIDII_src_map_to_lut
+ * (or MPIDII_src_mlut_to_mlut for an MLUT source).  The _INTRA flavour of the
+ * source mode is carried over.  A NONE source hits MPIR_Assert(0). */
+static inline void MPIDII_stride_of_src_rmap(MPIDII_rank_map_t * src,
+                                             MPIDII_rank_map_t * dest,
+                                             MPIR_Comm_map_t * mapper,
+                                             int stride, int blocksize, int offset)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDII_STRIDE_OF_SRC_RMAP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDII_STRIDE_OF_SRC_RMAP);
+    dest->avtid = src->avtid;
+    /* Set the size once for every path, as MPIDII_offset_of_src_rmap does.
+     * Previously the STRIDE/STRIDE_INTRA blocksize == 1 path left dest->size
+     * unset.  The table-building fallbacks below assign dest->size again
+     * themselves. */
+    dest->size = MPIDII_map_size(*mapper);
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                    (MPL_DBG_FDEST, " source mode %d", (int) src->mode));
+    switch (src->mode) {
+    case MPIDII_RANK_MAP_DIRECT:
+    case MPIDII_RANK_MAP_DIRECT_INTRA:
+        if (src->mode == MPIDII_RANK_MAP_DIRECT_INTRA) {
+            dest->mode = (blocksize == 1) ? MPIDII_RANK_MAP_STRIDE_INTRA :
+                MPIDII_RANK_MAP_STRIDE_BLOCK_INTRA;
+        }
+        else {
+            dest->mode = (blocksize == 1) ? MPIDII_RANK_MAP_STRIDE : MPIDII_RANK_MAP_STRIDE_BLOCK;
+        }
+        dest->reg.stride.stride = stride;
+        dest->reg.stride.blocksize = blocksize;
+        dest->reg.stride.offset = offset;
+        MPIR_Assert(stride > 0);
+        MPIR_Assert(blocksize > 0);
+        break;
+    case MPIDII_RANK_MAP_OFFSET:
+    case MPIDII_RANK_MAP_OFFSET_INTRA:
+        if (src->mode == MPIDII_RANK_MAP_OFFSET_INTRA) {
+            dest->mode = (blocksize == 1) ? MPIDII_RANK_MAP_STRIDE_INTRA :
+                MPIDII_RANK_MAP_STRIDE_BLOCK_INTRA;
+        }
+        else {
+            dest->mode = (blocksize == 1) ? MPIDII_RANK_MAP_STRIDE : MPIDII_RANK_MAP_STRIDE_BLOCK;
+        }
+        dest->reg.stride.stride = stride;
+        dest->reg.stride.blocksize = blocksize;
+        /* the source's own offset shifts the whole pattern */
+        dest->reg.stride.offset = offset + src->reg.offset;
+        break;
+    case MPIDII_RANK_MAP_STRIDE:
+    case MPIDII_RANK_MAP_STRIDE_INTRA:
+        if (blocksize == 1) {
+            /* stride of a stride: rank i -> (i * stride + offset) * s + o,
+             * i.e. stride s * stride and offset s * offset + o */
+            dest->mode = src->mode;
+            dest->reg.stride.stride = src->reg.stride.stride * stride;
+            dest->reg.stride.blocksize = blocksize;
+            dest->reg.stride.offset = src->reg.stride.stride * offset + src->reg.stride.offset;
+        }
+        else {
+            MPIDII_src_map_to_lut(src, dest, mapper, mapper->src_mapping_size, 0);
+        }
+        break;
+    case MPIDII_RANK_MAP_STRIDE_BLOCK:
+    case MPIDII_RANK_MAP_STRIDE_BLOCK_INTRA:
+    case MPIDII_RANK_MAP_LUT:
+    case MPIDII_RANK_MAP_LUT_INTRA:
+        /* no closed form kept for these combinations: materialize a lut */
+        MPIDII_src_map_to_lut(src, dest, mapper, mapper->src_mapping_size, 0);
+        break;
+    case MPIDII_RANK_MAP_MLUT:
+        MPIDII_src_mlut_to_mlut(src, dest, mapper, mapper->src_mapping_size, 0);
+        break;
+    case MPIDII_RANK_MAP_NONE:
+        MPIR_Assert(0);
+        break;
+    }
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDII_STRIDE_OF_SRC_RMAP);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDII_check_convert_mlut_to_lut
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* If `src` is an MLUT map whose entries all carry the same avtid, replace it
+ * in place by an equivalent LUT map (LUT_INTRA when that avtid is 0).
+ *
+ * Maps in any other mode, and MLUT maps with more than one avtid, are left
+ * untouched.  On conversion the reference held on the old mlut table is
+ * dropped through MPIDIU_release_mlut.
+ *
+ * Returns MPI_SUCCESS, or the allocation error; in the error case `src` is
+ * not modified. */
+static inline int MPIDII_check_convert_mlut_to_lut(MPIDII_rank_map_t * src)
+{
+    int mpi_errno = MPI_SUCCESS, i;
+    int flag = 1;
+    int avtid;
+    MPIDII_rank_map_mlut_t *mlut = NULL;
+    MPIDII_rank_map_lut_t *lut = NULL;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDII_CONVERT_MLUT_TO_LUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDII_CONVERT_MLUT_TO_LUT);
+
+    if (src->mode != MPIDII_RANK_MAP_MLUT) {
+        goto fn_exit;
+    }
+
+    /* check if all mlut items have the same avtid */
+    avtid = src->irreg.mlut.gpid[0].avtid;
+    for (i = 1; i < src->size; i++) {
+        if (src->irreg.mlut.gpid[i].avtid != avtid) {
+            flag = 0;
+            break;
+        }
+    }
+    if (!flag) {        /* multiple avtid */
+        goto fn_exit;
+    }
+
+    /* Build the replacement table completely before touching src, so that an
+     * allocation failure leaves the map consistent. */
+    mpi_errno = MPIDII_alloc_lut(&lut, src->size);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+    /* Copy through the map's own gpid view, not the table base: a map made by
+     * MPIDII_offset_of_src_rmap shares a table but points gpid into the middle
+     * of it.  This must happen before irreg.lut is written below.
+     * NOTE(review): irreg.lut and irreg.mlut presumably share storage -
+     * confirm against the MPIDII_rank_map_t definition. */
+    for (i = 0; i < src->size; i++) {
+        lut->lpid[i] = src->irreg.mlut.gpid[i].lpid;
+    }
+
+    /* Commit: switch the map over to the new lut and drop the old table. */
+    mlut = src->irreg.mlut.t;
+    src->avtid = avtid;
+    if (avtid == 0) {
+        src->mode = MPIDII_RANK_MAP_LUT_INTRA;
+    }
+    else {
+        src->mode = MPIDII_RANK_MAP_LUT;
+    }
+    src->irreg.lut.t = lut;
+    src->irreg.lut.lpid = lut->lpid;
+    MPIDIU_release_mlut(mlut);
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, " avtid %d", src->avtid));
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDII_CONVERT_MLUT_TO_LUT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDII_check_convert_lut_to_regular
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* If `src` is a LUT (or LUT_INTRA) map whose entries follow a regular pattern,
+ * replace it in place by the matching DIRECT, OFFSET, STRIDE or STRIDE_BLOCK
+ * map and release the lookup table.
+ *
+ * The pattern is detected by MPIDII_detect_regular_model over src's lpid
+ * entries.  The _INTRA flavour of the new mode is chosen when src->avtid is 0.
+ * Maps in any other mode, and luts for which none of the three patterns is
+ * reported, are left untouched.  Always returns MPI_SUCCESS. */
+static inline int MPIDII_check_convert_lut_to_regular(MPIDII_rank_map_t * src)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int mode_detected, offset, blocksize, stride;
+    MPIDII_rank_map_lut_t *lut = NULL;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDII_CONVERT_LUT_TO_REGULAR);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDII_CONVERT_LUT_TO_REGULAR);
+
+    if (src->mode != MPIDII_RANK_MAP_LUT && src->mode != MPIDII_RANK_MAP_LUT_INTRA) {
+        goto fn_exit;
+    }
+
+    /* keep the table handle: the map's pointers are cleared before release */
+    lut = src->irreg.lut.t;
+    mode_detected = MPIDII_detect_regular_model(src->irreg.lut.lpid, src->size,
+                                                &offset, &blocksize, &stride);
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                    (MPL_DBG_FDEST, " detected mode: %d", mode_detected));
+
+
+    switch (mode_detected) {
+    case MPIDII_SRC_MAPPER_DIRECT:
+        src->mode = MPIDII_RANK_MAP_DIRECT;
+        if (src->avtid == 0) {
+            src->mode = MPIDII_RANK_MAP_DIRECT_INTRA;
+        }
+        src->irreg.lut.t = NULL;
+        src->irreg.lut.lpid = NULL;
+        MPIDIU_release_lut(lut);
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, "\tlut to mode %d", (int) src->mode));
+        break;
+    case MPIDII_SRC_MAPPER_OFFSET:
+        src->mode = MPIDII_RANK_MAP_OFFSET;
+        if (src->avtid == 0) {
+            src->mode = MPIDII_RANK_MAP_OFFSET_INTRA;
+        }
+        src->reg.offset = offset;
+        src->irreg.lut.t = NULL;
+        src->irreg.lut.lpid = NULL;
+        MPIDIU_release_lut(lut);
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, "  lut to mode %d", (int) src->mode));
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, "\toffset: %d", src->reg.offset));
+        break;
+    case MPIDII_SRC_MAPPER_STRIDE:
+        if (blocksize == 1) {
+            src->mode = MPIDII_RANK_MAP_STRIDE;
+            if (src->avtid == 0) {
+                src->mode = MPIDII_RANK_MAP_STRIDE_INTRA;
+            }
+        }
+        else {
+            src->mode = MPIDII_RANK_MAP_STRIDE_BLOCK;
+            if (src->avtid == 0) {
+                src->mode = MPIDII_RANK_MAP_STRIDE_BLOCK_INTRA;
+            }
+        }
+        src->reg.stride.stride = stride;
+        src->reg.stride.blocksize = blocksize;
+        src->reg.stride.offset = offset;
+        src->irreg.lut.t = NULL;
+        src->irreg.lut.lpid = NULL;
+        MPIDIU_release_lut(lut);
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, "  lut to mode %d", (int) src->mode));
+        /* each parameter is logged under its own name (blocksize and stride
+         * used to be printed with an "offset" label) */
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, "\toffset: %d", src->reg.stride.offset));
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, "\tblocksize: %d", src->reg.stride.blocksize));
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, "\tstride: %d", src->reg.stride.stride));
+        break;
+    default:
+        /* no regular pattern reported: keep the lut as it is */
+        break;
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDII_CONVERT_LUT_TO_REGULAR);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDII_set_map
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Apply one communicator mapper to build (part of) `dest_rmap` from `src_rmap`.
+ *
+ * Three situations are distinguished:
+ *   - a DUP mapper covering the whole destination: dest is a copy of src;
+ *   - a single IRREGULAR mapper covering the whole destination: the mapping
+ *     array is checked for a direct/offset/stride pattern so that dest can stay
+ *     in a compact mode, otherwise a lut/mlut is built;
+ *   - anything else (several mappers): entries are written into a lut, or an
+ *     mlut when src is an MLUT map, at position mapper_offset.
+ *
+ * Returns MPI_SUCCESS, or the error code reported by the table-building helper
+ * that was called.  The void pattern helpers cannot report failures. */
+static inline int MPIDII_set_map(MPIDII_rank_map_t * src_rmap,
+                                 MPIDII_rank_map_t * dest_rmap,
+                                 MPIR_Comm_map_t * mapper,
+                                 int src_comm_size, int total_mapper_size, int mapper_offset)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDII_rank_map_mode src_mode;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDII_SET_MAP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDII_SET_MAP);
+
+    src_mode = src_rmap->mode;
+
+    /* Simplest case: MAP_DUP, exact duplication of src_comm */
+    if (mapper->type == MPIR_COMM_MAP_TYPE__DUP && src_comm_size == total_mapper_size) {
+        MPIDII_direct_of_src_rmap(src_rmap, dest_rmap, mapper);
+        goto fn_exit;
+    }
+    /* single src_comm, newcomm is smaller than src_comm, only one mapper */
+    else if (mapper->type == MPIR_COMM_MAP_TYPE__IRREGULAR &&
+             mapper->src_mapping_size == total_mapper_size) {
+        /* check if new comm has the same mapping as src_comm */
+        /* detect src_mapping_offset for direct_to_direct and offset_to_offset */
+        int mode_detected, offset, blocksize, stride;
+        mode_detected = MPIDII_detect_regular_model(mapper->src_mapping, mapper->src_mapping_size,
+                                                    &offset, &blocksize, &stride);
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, "\tdetected mode: %d", mode_detected));
+
+        switch (mode_detected) {
+        case MPIDII_SRC_MAPPER_DIRECT:
+            MPIDII_direct_of_src_rmap(src_rmap, dest_rmap, mapper);
+            break;
+        case MPIDII_SRC_MAPPER_OFFSET:
+            MPIDII_offset_of_src_rmap(src_rmap, dest_rmap, mapper, offset);
+            break;
+        case MPIDII_SRC_MAPPER_STRIDE:
+            MPIDII_stride_of_src_rmap(src_rmap, dest_rmap, mapper, stride, blocksize, offset);
+            break;
+        default:
+            /* irregular mapping: keep the helper's status instead of dropping it */
+            if (src_rmap->mode == MPIDII_RANK_MAP_MLUT) {
+                mpi_errno = MPIDII_src_mlut_to_mlut(src_rmap, dest_rmap, mapper,
+                                                    total_mapper_size, mapper_offset);
+            }
+            else {      /* src_mode != MPIDII_RANK_MAP_MLUT */
+                mpi_errno = MPIDII_src_map_to_lut(src_rmap, dest_rmap, mapper,
+                                                  mapper->src_mapping_size, mapper_offset);
+            }
+            break;
+        }
+        goto fn_exit;
+    }
+
+    /* more complex case: multiple mappers
+     * We always alloc lut (or mlut if src_rmap is mlut). We will check if a
+     * lut mapping can be converted to something simpler after all the mappers
+     * are processed
+     */
+
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, " multiple mapper"));
+    if (mapper->type == MPIR_COMM_MAP_TYPE__DUP) {
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, " check map_size %d, src_comm_size %d",
+                         MPIDII_map_size(*mapper), src_comm_size));
+        if (src_mode == MPIDII_RANK_MAP_MLUT) {
+            mpi_errno = MPIDII_src_comm_to_mlut(src_rmap, dest_rmap, src_comm_size,
+                                                total_mapper_size, mapper_offset);
+        }
+        else {  /* src_mode != MPIDII_RANK_MAP_MLUT */
+            mpi_errno = MPIDII_src_comm_to_lut(src_rmap, dest_rmap, src_comm_size,
+                                               total_mapper_size, mapper_offset);
+        }
+    }
+    else {      /* mapper->type == MPIR_COMM_MAP_TYPE__IRREGULAR */
+        if (src_mode == MPIDII_RANK_MAP_MLUT) {
+            mpi_errno = MPIDII_src_mlut_to_mlut(src_rmap, dest_rmap, mapper,
+                                                total_mapper_size, mapper_offset);
+        }
+        else {  /* src_mode != MPIDII_RANK_MAP_MLUT */
+            mpi_errno = MPIDII_src_map_to_lut(src_rmap, dest_rmap, mapper,
+                                              total_mapper_size, mapper_offset);
+        }
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDII_SET_MAP);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDII_comm_create_rank_map
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Build the rank map(s) of communicator `comm` from its list of mappers
+ * (comm->mapper_head).
+ *
+ * The mapper list is processed in two phases:
+ *   1. mappers whose direction is L2L or R2L (they feed the local part);
+ *   2. mappers whose direction is L2R or R2R (they feed the remote part;
+ *      comm is asserted to be an intercomm when such a mapper is found).
+ * In each phase the sizes of the selected mappers are first summed into
+ * total_mapper_size, then every selected mapper is applied through
+ * MPIDII_set_map() at a running mapper_offset.
+ *
+ * Destination map, as coded below:
+ *   - phase 1 writes comm's "map" when comm is an intracomm and comm's
+ *     "local_map" when it is an intercomm;
+ *   - phase 2 always writes comm's "map".
+ *
+ * After both phases the resulting maps go through the mlut->lut and
+ * lut->regular conversion checks.  For an intercomm that has a local_comm,
+ * the local_comm's map is then derived from comm's local_map; for an
+ * intracomm, local_map.mode is set to MPIDII_RANK_MAP_NONE.
+ *
+ * Returns mpi_errno.  Nothing in this function assigns mpi_errno after its
+ * initialization and the results of the MPIDII_set_map() calls are not
+ * checked, so the returned value is always MPI_SUCCESS.
+ */
+__CH4_INLINE__ int MPIDII_comm_create_rank_map(MPIR_Comm * comm)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Comm_map_t *mapper;
+    MPIR_Comm *src_comm;
+    int total_mapper_size, mapper_offset;
+
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDII_COMM_CREATE_RANK_MAP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDII_COMM_CREATE_RANK_MAP);
+
+    /* do some sanity checks */
+    /* A mapper whose source is an intracomm can only start from the local
+     * group (L2*); a mapper feeding an intracomm can only end in the local
+     * group (*2L). */
+    MPL_LL_FOREACH(comm->mapper_head, mapper) {
+        if (mapper->src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM)
+            MPIR_Assert(mapper->dir == MPIR_COMM_MAP_DIR__L2L ||
+                        mapper->dir == MPIR_COMM_MAP_DIR__L2R);
+
+        if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM)
+            MPIR_Assert(mapper->dir == MPIR_COMM_MAP_DIR__L2L ||
+                        mapper->dir == MPIR_COMM_MAP_DIR__R2L);
+    }
+
+    /* First, handle all the mappers that contribute to the local part
+     * of the comm */
+    /* Pass 1a: total number of entries contributed by L2L/R2L mappers. */
+    total_mapper_size = 0;
+    MPL_LL_FOREACH(comm->mapper_head, mapper) {
+        if (mapper->dir == MPIR_COMM_MAP_DIR__L2R || mapper->dir == MPIR_COMM_MAP_DIR__R2R)
+            continue;
+
+        total_mapper_size += MPIDII_map_size(*mapper);
+    }
+    /* Pass 1b: apply each L2L/R2L mapper; mapper_offset is the sum of the
+     * sizes of the mappers already applied in this pass. */
+    mapper_offset = 0;
+    MPL_LL_FOREACH(comm->mapper_head, mapper) {
+        src_comm = mapper->src_comm;
+
+        if (mapper->dir == MPIR_COMM_MAP_DIR__L2R || mapper->dir == MPIR_COMM_MAP_DIR__R2R)
+            continue;
+
+        if (mapper->dir == MPIR_COMM_MAP_DIR__L2L) {
+            /* Source is src_comm's "map" for an intracomm source and its
+             * "local_map" for an intercomm source; the source size passed
+             * is always src_comm->local_size. */
+            if (src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM &&
+                comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) {
+                MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                                (MPL_DBG_FDEST,
+                                 " intra->intra, L2L, size=%d, total_mapper_size=%d, mapper_offset=%d",
+                                 src_comm->local_size, total_mapper_size, mapper_offset));
+                MPIDII_set_map(&MPIDII_COMM(src_comm, map), &MPIDII_COMM(comm, map), mapper,
+                               src_comm->local_size, total_mapper_size, mapper_offset);
+            }
+            else if (src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM &&
+                     comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) {
+                MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                                (MPL_DBG_FDEST,
+                                 " intra->inter, L2L, size=%d, total_mapper_size=%d, mapper_offset=%d",
+                                 src_comm->local_size, total_mapper_size, mapper_offset));
+                MPIDII_set_map(&MPIDII_COMM(src_comm, map), &MPIDII_COMM(comm, local_map), mapper,
+                               src_comm->local_size, total_mapper_size, mapper_offset);
+            }
+            else if (src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM &&
+                     comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) {
+                MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                                (MPL_DBG_FDEST,
+                                 " inter->intra, L2L, size=%d, total_mapper_size=%d, mapper_offset=%d",
+                                 src_comm->local_size, total_mapper_size, mapper_offset));
+                MPIDII_set_map(&MPIDII_COMM(src_comm, local_map), &MPIDII_COMM(comm, map), mapper,
+                               src_comm->local_size, total_mapper_size, mapper_offset);
+            }
+            else {      /* src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM && comm->comm_kind == MPIR_COMM_KIND__INTERCOMM */
+                MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                                (MPL_DBG_FDEST,
+                                 " inter->inter, L2L, size=%d, total_mapper_size=%d, mapper_offset=%d",
+                                 src_comm->local_size, total_mapper_size, mapper_offset));
+                MPIDII_set_map(&MPIDII_COMM(src_comm, local_map), &MPIDII_COMM(comm, local_map),
+                               mapper, src_comm->local_size, total_mapper_size, mapper_offset);
+            }
+        }
+        else {  /* mapper->dir == MPIR_COMM_MAP_DIR__R2L */
+            /* The source must be an intercomm; its "map" is read together
+             * with src_comm->remote_size. */
+            MPIR_Assert(src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM);
+
+            if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) {
+                MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                                (MPL_DBG_FDEST,
+                                 " ->intra, R2L, size=%d, total_mapper_size=%d, mapper_offset=%d",
+                                 src_comm->remote_size, total_mapper_size, mapper_offset));
+                MPIDII_set_map(&MPIDII_COMM(src_comm, map), &MPIDII_COMM(comm, map), mapper,
+                               src_comm->remote_size, total_mapper_size, mapper_offset);
+            }
+            else {
+                MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                                (MPL_DBG_FDEST,
+                                 " ->inter, R2L, size=%d, total_mapper_size=%d, mapper_offset=%d",
+                                 src_comm->remote_size, total_mapper_size, mapper_offset));
+                MPIDII_set_map(&MPIDII_COMM(src_comm, map), &MPIDII_COMM(comm, local_map), mapper,
+                               src_comm->remote_size, total_mapper_size, mapper_offset);
+            }
+        }
+
+        mapper_offset += MPIDII_map_size(*mapper);
+    }
+
+    /* Next, handle all the mappers that contribute to the remote part
+     * of the comm (only valid for intercomms)
+     */
+    /* Pass 2a: total number of entries contributed by L2R/R2R mappers. */
+    total_mapper_size = 0;
+    MPL_LL_FOREACH(comm->mapper_head, mapper) {
+        if (mapper->dir == MPIR_COMM_MAP_DIR__L2L || mapper->dir == MPIR_COMM_MAP_DIR__R2L)
+            continue;
+
+        total_mapper_size += MPIDII_map_size(*mapper);
+    }
+    /* Pass 2b: apply each L2R/R2R mapper into comm's "map". */
+    mapper_offset = 0;
+    MPL_LL_FOREACH(comm->mapper_head, mapper) {
+        src_comm = mapper->src_comm;
+
+        if (mapper->dir == MPIR_COMM_MAP_DIR__L2L || mapper->dir == MPIR_COMM_MAP_DIR__R2L)
+            continue;
+
+        MPIR_Assert(comm->comm_kind == MPIR_COMM_KIND__INTERCOMM);
+
+        if (mapper->dir == MPIR_COMM_MAP_DIR__L2R) {
+            if (src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) {
+                MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                                (MPL_DBG_FDEST,
+                                 " intra->, L2R, size=%d, total_mapper_size=%d, mapper_offset=%d",
+                                 src_comm->local_size, total_mapper_size, mapper_offset));
+                MPIDII_set_map(&MPIDII_COMM(src_comm, map), &MPIDII_COMM(comm, map), mapper,
+                               src_comm->local_size, total_mapper_size, mapper_offset);
+            }
+            else {      /* src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM */
+                MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                                (MPL_DBG_FDEST,
+                                 " inter->, L2R, size=%d, total_mapper_size=%d, mapper_offset=%d",
+                                 src_comm->local_size, total_mapper_size, mapper_offset));
+                MPIDII_set_map(&MPIDII_COMM(src_comm, local_map), &MPIDII_COMM(comm, map), mapper,
+                               src_comm->local_size, total_mapper_size, mapper_offset);
+            }
+        }
+        else {  /* mapper->dir == MPIR_COMM_MAP_DIR__R2R */
+            MPIR_Assert(src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM);
+            MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                            (MPL_DBG_FDEST,
+                             " inter->, R2R, size=%d, total_mapper_size=%d, mapper_offset=%d",
+                             src_comm->remote_size, total_mapper_size, mapper_offset));
+            MPIDII_set_map(&MPIDII_COMM(src_comm, map), &MPIDII_COMM(comm, map), mapper,
+                           src_comm->remote_size, total_mapper_size, mapper_offset);
+        }
+
+        mapper_offset += MPIDII_map_size(*mapper);
+    }
+
+    /* check before finishing
+     * 1. if mlut can be converted to lut: all avtids are the same
+     * 2. if lut can be converted to regular modes: direct, offset, and more
+     */
+    MPIDII_check_convert_mlut_to_lut(&MPIDII_COMM(comm, map));
+    MPIDII_check_convert_lut_to_regular(&MPIDII_COMM(comm, map));
+    if (comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) {
+        MPIDII_check_convert_mlut_to_lut(&MPIDII_COMM(comm, local_map));
+        MPIDII_check_convert_lut_to_regular(&MPIDII_COMM(comm, local_map));
+    }
+
+    if (comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) {
+        /* setup the lut for the local_comm in the intercomm */
+        if (comm->local_comm) {
+            MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                            (MPL_DBG_FDEST, "\t create local_comm using src_comm"));
+            /* NOTE(review): `mapper` is whatever the last MPL_LL_FOREACH left
+             * in it; none of the loops above uses `break`, so it is presumably
+             * NULL here -- confirm that MPIDII_direct_of_src_rmap() expects
+             * that. */
+            MPIDII_direct_of_src_rmap(&MPIDII_COMM(comm, local_map),
+                                      &MPIDII_COMM(comm->local_comm, map), mapper);
+
+            /* Same text as the message above, but logged under the MEMORY
+             * debug class instead of MAP. */
+            MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE,
+                            (MPL_DBG_FDEST, "create local_comm using src_comm"));
+        }
+    }
+
+    if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) {
+        MPIDII_COMM(comm, local_map).mode = MPIDII_RANK_MAP_NONE;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDII_COMM_CREATE_RANK_MAP);
+    return mpi_errno;
+}
+
+#endif /* ifndef CH4I_COMM_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4i_util.h b/src/mpid/ch4/src/ch4i_util.h
new file mode 100644
index 0000000..e6a636d
--- /dev/null
+++ b/src/mpid/ch4/src/ch4i_util.h
@@ -0,0 +1,20 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4I_UTIL_H_INCLUDED
+#define CH4I_UTIL_H_INCLUDED
+
+/*
+ * Helpers for an opaque map keyed by a 64-bit id and holding void* values.
+ * Only the prototypes live here.  This header includes nothing itself, so
+ * uint64_t must already be declared by whoever includes it.
+ * NOTE(review): the per-function notes below are inferred from names and
+ * signatures only -- confirm against the implementation.
+ */
+/* presumably creates a map and stores its handle in *_map */
+void MPIDI_CH4I_map_create(void **_map);
+/* presumably releases a map obtained from MPIDI_CH4I_map_create */
+void MPIDI_CH4I_map_destroy(void *_map);
+/* presumably associates `val` with `id` in the map */
+void MPIDI_CH4I_map_set(void *_map, uint64_t id, void *val);
+/* presumably removes the entry stored under `id` */
+void MPIDI_CH4I_map_erase(void *_map, uint64_t id);
+/* presumably returns the value stored under `id`; the not-found result is
+ * not visible from this header -- TODO confirm */
+void *MPIDI_CH4I_map_lookup(void *_map, uint64_t id);
+
+#endif /* CH4I_UTIL_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4r_buf.h b/src/mpid/ch4/src/ch4r_buf.h
new file mode 100644
index 0000000..4b8ea31
--- /dev/null
+++ b/src/mpid/ch4/src/ch4r_buf.h
@@ -0,0 +1,168 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4R_BUF_H_INCLUDED
+#define CH4R_BUF_H_INCLUDED
+
+#include "ch4_impl.h"
+#include "ch4i_util.h"
+#include <pthread.h>
+
+/*
+   initial prototype of buffer pool.
+
+   TODO:
+   - align buffer region
+   - add garbage collection
+   - use huge pages
+*/
+
+/*
+ * Allocate one pool segment: a MPIU_buf_pool_t descriptor plus a single
+ * memory region holding `num` elements, each made of a MPIU_buf_t header
+ * followed by `size` payload bytes.  The elements are chained into a
+ * NULL-terminated singly linked free list reachable from the segment's head.
+ *
+ *   num          number of buffers in the segment; must be > 0
+ *   size         payload bytes per buffer; must be >= 0
+ *   parent_pool  pool recorded as the owner of every buffer, or NULL to make
+ *                the new segment its own owner
+ *
+ * Returns the new segment, or NULL when num/size are invalid (nothing is
+ * allocated in that case).  Allocation failures are caught by MPIR_Assert,
+ * as before.
+ */
+static inline MPIU_buf_pool_t *create_buf_pool(int num, int size, MPIU_buf_pool_t * parent_pool)
+{
+    int i;
+    size_t stride;
+    MPIU_buf_pool_t *buf_pool = NULL;
+    MPIU_buf_pool_t *owner;
+    MPIU_buf_t *curr, *next;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_CREATE_BUF_POOL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_CREATE_BUF_POOL);
+
+    /* The list construction below unconditionally writes the last element,
+     * so an empty pool would write outside the region; a negative size
+     * would corrupt the stride computation. */
+    MPIR_Assert(num > 0 && size >= 0);
+    if (num <= 0 || size < 0)
+        goto fn_exit;
+
+    buf_pool = (MPIU_buf_pool_t *) MPL_malloc(sizeof(*buf_pool));
+    MPIR_Assert(buf_pool);
+    pthread_mutex_init(&buf_pool->lock, NULL);
+
+    /* distance in bytes between two consecutive element headers */
+    stride = sizeof(MPIU_buf_t) + (size_t) size;
+
+    buf_pool->size = size;
+    buf_pool->num = num;
+    buf_pool->next = NULL;
+    buf_pool->memory_region = MPL_malloc((size_t) num * stride);
+    MPIR_Assert(buf_pool->memory_region);
+
+    owner = parent_pool ? parent_pool : buf_pool;
+
+    curr = (MPIU_buf_t *) buf_pool->memory_region;
+    buf_pool->head = curr;
+    for (i = 0; i < num - 1; i++) {
+        next = (MPIU_buf_t *) ((char *) curr + stride);
+        curr->next = next;
+        curr->pool = owner;
+        curr = next;
+    }
+    /* last element terminates the free list */
+    curr->next = NULL;
+    curr->pool = owner;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_CREATE_BUF_POOL);
+    return buf_pool;
+}
+
+/* Create a top-level buffer pool of `num` buffers with `size` payload bytes
+ * each; the pool owns its own buffers (no parent pool is passed). */
+static inline MPIU_buf_pool_t *MPIDI_CH4U_create_buf_pool(int num, int size)
+{
+    MPIU_buf_pool_t *root_pool;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_CREATE_BUF_POOL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_CREATE_BUF_POOL);
+
+    root_pool = create_buf_pool(num, size, NULL);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_CREATE_BUF_POOL);
+
+    return root_pool;
+}
+
+/* Pop the first element off pool's free list and return its data area.
+ * pool->head is dereferenced without a check, so it must not be NULL. */
+static inline void *MPIDI_CH4U_get_head_buf(MPIU_buf_pool_t * pool)
+{
+    MPIU_buf_t *first;
+    void *payload;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_GET_HEAD_BUF);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_GET_HEAD_BUF);
+
+    first = pool->head;
+    payload = first->data;
+    pool->head = first->next;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_GET_HEAD_BUF);
+
+    return payload;
+}
+
+/* Take one buffer from `pool` without touching pool->lock.  When the free
+ * list is empty, a new segment of pool->num buffers of pool->size bytes is
+ * created with `pool` as owner, linked after the last segment of the chain,
+ * and its buffers become the pool's free list. */
+static inline void *MPIDI_CH4R_get_buf_safe(MPIU_buf_pool_t * pool)
+{
+    MPIU_buf_pool_t *tail;
+    void *payload;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_GET_BUF_SAFE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_GET_BUF_SAFE);
+
+    if (!pool->head) {
+        /* find the last segment of the chain */
+        for (tail = pool; tail->next; tail = tail->next)
+            ;
+
+        tail->next = create_buf_pool(pool->num, pool->size, pool);
+        MPIR_Assert(tail->next);
+        pool->head = tail->next->head;
+    }
+
+    payload = MPIDI_CH4U_get_head_buf(pool);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_GET_BUF_SAFE);
+    return payload;
+}
+
+
+/* Take one buffer from `pool`, holding pool->lock around the call to
+ * MPIDI_CH4R_get_buf_safe(). */
+static inline void *MPIDI_CH4R_get_buf(MPIU_buf_pool_t * pool)
+{
+    void *payload;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_GET_BUF);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_GET_BUF);
+
+    pthread_mutex_lock(&pool->lock);
+    payload = MPIDI_CH4R_get_buf_safe(pool);
+    pthread_mutex_unlock(&pool->lock);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_GET_BUF);
+
+    return payload;
+}
+
+/* Give `buf` back to its owning pool without taking the pool lock: the
+ * element header is recovered from the data pointer and pushed on the front
+ * of the owner's free list. */
+static inline void MPIDI_CH4R_release_buf_safe(void *buf)
+{
+    MPIU_buf_t *elem;
+    MPIU_buf_pool_t *owner;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_RELEASE_BUF_SAFE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_RELEASE_BUF_SAFE);
+
+    elem = container_of(buf, MPIU_buf_t, data);
+    owner = elem->pool;
+
+    elem->next = owner->head;
+    owner->head = elem;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_RELEASE_BUF_SAFE);
+}
+
+/* Give `buf` back to its owning pool: same push-front as
+ * MPIDI_CH4R_release_buf_safe(), performed while holding the owner's lock. */
+static inline void MPIDI_CH4R_release_buf(void *buf)
+{
+    MPIU_buf_t *elem;
+    MPIU_buf_pool_t *owner;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_RELEASE_BUF);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_RELEASE_BUF);
+
+    elem = container_of(buf, MPIU_buf_t, data);
+    owner = elem->pool;
+
+    pthread_mutex_lock(&owner->lock);
+    elem->next = owner->head;
+    owner->head = elem;
+    pthread_mutex_unlock(&owner->lock);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_RELEASE_BUF);
+}
+
+
+/*
+ * Destroy `pool` and every segment chained after it through ->next.
+ *
+ * For each segment the mutex initialized in create_buf_pool() is destroyed,
+ * then the buffer region and the descriptor are freed.  Passing NULL is a
+ * no-op.  No buffer of the pool may be used afterwards, and the lock must
+ * not be held by anyone when this is called.
+ */
+static inline void MPIDI_CH4R_destroy_buf_pool(MPIU_buf_pool_t * pool)
+{
+    MPIU_buf_pool_t *next;
+
+    while (pool) {
+        /* read the link before the descriptor is freed */
+        next = pool->next;
+
+        pthread_mutex_destroy(&pool->lock);
+        MPL_free(pool->memory_region);
+        MPL_free(pool);
+
+        pool = next;
+    }
+}
+
+#endif /* CH4R_BUF_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4r_callbacks.h b/src/mpid/ch4/src/ch4r_callbacks.h
new file mode 100644
index 0000000..db0ec20
--- /dev/null
+++ b/src/mpid/ch4/src/ch4r_callbacks.h
@@ -0,0 +1,2836 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4R_CALLBACKS_H_INCLUDED
+#define CH4R_CALLBACKS_H_INCLUDED
+
+#include "ch4r_request.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_progress_cmpl_list
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Run deferred completion handlers in sequence order.  The global
+ * completion list is scanned for an entry whose seq_no equals the currently
+ * expected sequence number; that entry is unlinked, its stored handler is
+ * called on its stored request, and the scan restarts from the list head.
+ * The function returns once a full scan finds no matching entry. */
+static inline void MPIDI_CH4U_progress_cmpl_list(void)
+{
+    MPIDI_CH4U_req_ext_t *entry, *lookahead;
+    MPIDI_NM_am_completion_handler_fn handler;
+    MPIR_Request *pending;
+    int fired;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_PROGRESS_CMPL_LIST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_PROGRESS_CMPL_LIST);
+
+    /* MPIDI_CS_ENTER(); */
+    do {
+        fired = 0;
+        MPL_DL_FOREACH_SAFE(MPIDI_CH4_Global.cmpl_list, entry, lookahead) {
+            if (entry->seq_no != (uint64_t) OPA_load_int(&MPIDI_CH4_Global.exp_seq_no))
+                continue;
+
+            MPL_DL_DELETE(MPIDI_CH4_Global.cmpl_list, entry);
+            pending = (MPIR_Request *) entry->request;
+            handler = (MPIDI_NM_am_completion_handler_fn) entry->cmpl_handler_fn;
+            handler(pending);
+
+            /* rescan from the head after every handler invocation */
+            fired = 1;
+            break;
+        }
+    } while (fired);
+    /* MPIDI_CS_EXIT(); */
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_PROGRESS_CMPL_LIST);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_check_cmpl_order
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Decide whether the completion handler of `req` may run now.
+ *
+ * If req's sequence number equals the globally expected one, the expected
+ * number is incremented and 1 is returned: the caller goes on with its
+ * completion work.  Otherwise `cmpl_handler_fn` and `req` are recorded in the
+ * request's CH4U extension, the extension is appended to the global
+ * completion list, and 0 is returned: the caller must return without
+ * completing; MPIDI_CH4U_progress_cmpl_list() invokes stored handlers later.
+ *
+ * Both paths leave through fn_exit so that the function enter/exit tracing
+ * stays balanced.
+ */
+static inline int MPIDI_CH4U_check_cmpl_order(MPIR_Request * req,
+                                              MPIDI_NM_am_completion_handler_fn cmpl_handler_fn)
+{
+    int in_order = 0;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_CHECK_CMPL_ORDER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_CHECK_CMPL_ORDER);
+
+    if (MPIDI_CH4U_REQUEST(req, req->seq_no) ==
+        (uint64_t) OPA_load_int(&MPIDI_CH4_Global.exp_seq_no)) {
+        OPA_incr_int(&MPIDI_CH4_Global.exp_seq_no);
+        in_order = 1;
+        goto fn_exit;
+    }
+
+    MPIDI_CH4U_REQUEST(req, req->cmpl_handler_fn) = (void *) cmpl_handler_fn;
+    /* go through uintptr_t so the pointer-to-integer conversion is well
+     * defined whatever the pointer width */
+    MPIDI_CH4U_REQUEST(req, req->request) = (uint64_t) (uintptr_t) req;
+    /* MPIDI_CS_ENTER(); */
+    MPL_DL_APPEND(MPIDI_CH4_Global.cmpl_list, req->dev.ch4.ch4u.req);
+    /* MPIDI_CS_EXIT(); */
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_CHECK_CMPL_ORDER);
+    return in_order;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_send_origin_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler: passes sreq to MPIDI_CH4I_am_request_complete().
+ * Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_send_origin_cmpl_handler(MPIR_Request * sreq)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_SEND_TX_HANDLER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_SEND_TX_HANDLER);
+    MPIDI_CH4I_am_request_complete(sreq);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_SEND_TX_HANDLER);
+
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_send_long_lmt_origin_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler: releases the datatype recorded in the request's lreq
+ * state (unless it is builtin), then passes sreq to
+ * MPIDI_CH4I_am_request_complete().  Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_send_long_lmt_origin_cmpl_handler(MPIR_Request * sreq)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_SEND_LONG_LMT_TX_HANDLER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_SEND_LONG_LMT_TX_HANDLER);
+
+    dtype_release_if_not_builtin(MPIDI_CH4U_REQUEST(sreq, req->lreq).datatype);
+    MPIDI_CH4I_am_request_complete(sreq);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_SEND_LONG_LMT_TX_HANDLER);
+
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_ssend_ack_origin_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler: passes req to MPIDI_CH4I_am_request_complete().
+ * Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_ssend_ack_origin_cmpl_handler(MPIR_Request * req)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_SSEND_ACK_TX_HANDLER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_SSEND_ACK_TX_HANDLER);
+    MPIDI_CH4I_am_request_complete(req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_SSEND_ACK_TX_HANDLER);
+
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_put_ack_origin_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler: passes req to MPIDI_CH4I_am_request_complete().
+ * Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_put_ack_origin_cmpl_handler(MPIR_Request * req)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_PUT_ACK_TX_HANDLER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_PUT_ACK_TX_HANDLER);
+    MPIDI_CH4I_am_request_complete(req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_PUT_ACK_TX_HANDLER);
+
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_get_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Completion handler for a Get request: sends the requested data back to
+ * the request's src_rank in a MPIDI_CH4U_GET_ACK reply.
+ *
+ * The handler first goes through MPIDI_CH4U_check_cmpl_order(); when the
+ * request is out of order it has been queued there and this call returns
+ * MPI_SUCCESS without doing anything else.
+ *
+ * Two reply paths:
+ *   - greq.n_iov == 0: the reply is sent straight from greq.addr using
+ *     greq.count and greq.datatype;
+ *   - otherwise: every iov entry (its iov_base is an offset that gets the
+ *     window base greq.addr added) is copied into one temporary buffer, the
+ *     iov array is freed, greq.dt_iov is pointed at the temporary buffer and
+ *     the buffer is sent as MPI_BYTE.  Nothing in this function frees that
+ *     buffer.
+ *
+ * Returns MPI_SUCCESS or the error reported by MPIDI_NM_send_am_reply().
+ */
+static inline int MPIDI_CH4U_get_cmpl_handler(MPIR_Request * req)
+{
+    int mpi_errno = MPI_SUCCESS, i, c;
+    size_t data_sz, offset;
+    MPIDI_CH4U_get_ack_msg_t get_ack;
+    struct iovec *iov;
+    char *p_data;
+    uintptr_t base;
+    MPIR_Win *win;
+    MPIR_Context_id_t context_id;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_GET_CMPL_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_GET_CMPL_HANDLER);
+
+    /* Out of order: the request was queued for later.  Leave through fn_exit
+     * (not a bare return) so the enter/exit tracing stays balanced. */
+    if (!MPIDI_CH4U_check_cmpl_order(req, MPIDI_CH4U_get_cmpl_handler))
+        goto fn_exit;
+
+    base = MPIDI_CH4U_REQUEST(req, req->greq.addr);
+
+    MPIR_cc_incr(req->cc_ptr, &c);
+    get_ack.greq_ptr = MPIDI_CH4U_REQUEST(req, req->greq.greq_ptr);
+    win = MPIDI_CH4U_REQUEST(req, req->greq.win_ptr);
+    context_id = MPIDI_CH4U_win_to_context(win);
+    if (MPIDI_CH4U_REQUEST(req, req->greq.n_iov) == 0) {
+        mpi_errno = MPIDI_NM_send_am_reply(context_id,
+                                           MPIDI_CH4U_REQUEST(req, src_rank),
+                                           MPIDI_CH4U_GET_ACK,
+                                           &get_ack, sizeof(get_ack),
+                                           (void *) MPIDI_CH4U_REQUEST(req, req->greq.addr),
+                                           MPIDI_CH4U_REQUEST(req, req->greq.count),
+                                           MPIDI_CH4U_REQUEST(req, req->greq.datatype), req);
+        MPIDI_CH4I_am_request_complete(req);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+        /* NOTE(review): unlike the iov path below, this path does not call
+         * MPIDI_CH4U_progress_cmpl_list() although the expected sequence
+         * number was advanced -- confirm this is intended. */
+        goto fn_exit;
+    }
+
+    iov = (struct iovec *) MPIDI_CH4U_REQUEST(req, req->greq.dt_iov);
+
+    /* total number of bytes described by the iov array */
+    data_sz = 0;
+    for (i = 0; i < MPIDI_CH4U_REQUEST(req, req->greq.n_iov); i++) {
+        data_sz += iov[i].iov_len;
+    }
+
+    p_data = (char *) MPL_malloc(data_sz);
+    MPIR_Assert(p_data);
+
+    /* gather the segments into the contiguous temporary buffer */
+    offset = 0;
+    for (i = 0; i < MPIDI_CH4U_REQUEST(req, req->greq.n_iov); i++) {
+        /* Adjust a window base address */
+        iov[i].iov_base = (char *) iov[i].iov_base + base;
+        MPIR_Memcpy(p_data + offset, iov[i].iov_base, iov[i].iov_len);
+        offset += iov[i].iov_len;
+    }
+
+    /* the iov array is no longer needed; dt_iov now carries the packed data */
+    MPL_free(MPIDI_CH4U_REQUEST(req, req->greq.dt_iov));
+    MPIDI_CH4U_REQUEST(req, req->greq.dt_iov) = (void *) p_data;
+
+    mpi_errno = MPIDI_NM_send_am_reply(context_id,
+                                       MPIDI_CH4U_REQUEST(req, src_rank),
+                                       MPIDI_CH4U_GET_ACK,
+                                       &get_ack, sizeof(get_ack), p_data, data_sz, MPI_BYTE, req);
+    MPIDI_CH4I_am_request_complete(req);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+    MPIDI_CH4U_progress_cmpl_list();
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_GET_CMPL_HANDLER);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_acc_ack_origin_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler: passes req to MPIDI_CH4I_am_request_complete().
+ * Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_acc_ack_origin_cmpl_handler(MPIR_Request * req)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ACC_ACK_TX_HANDLER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ACC_ACK_TX_HANDLER);
+    MPIDI_CH4I_am_request_complete(req);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ACC_ACK_TX_HANDLER);
+
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_get_acc_ack_origin_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler: frees the areq.data buffer attached to the request,
+ * decrements the outstanding_ops counter of the request's window, then
+ * passes req to MPIDI_CH4I_am_request_complete().  Always returns
+ * MPI_SUCCESS. */
+static inline int MPIDI_CH4U_get_acc_ack_origin_cmpl_handler(MPIR_Request * req)
+{
+    MPIR_Win *target_win;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_GET_ACC_ACK_TX_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_GET_ACC_ACK_TX_HANDLER);
+
+    MPL_free(MPIDI_CH4U_REQUEST(req, req->areq.data));
+    target_win = MPIDI_CH4U_REQUEST(req, req->areq.win_ptr);
+
+    /* MPIDI_CS_ENTER(); */
+    OPA_decr_int(&MPIDI_CH4U_WIN(target_win, outstanding_ops));
+    /* MPIDI_CS_EXIT(); */
+
+    MPIDI_CH4I_am_request_complete(req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_GET_ACC_ACK_TX_HANDLER);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_cswap_ack_origin_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler: frees the creq.data buffer attached to the request,
+ * decrements the outstanding_ops counter of the request's window, then
+ * passes req to MPIDI_CH4I_am_request_complete().  Always returns
+ * MPI_SUCCESS. */
+static inline int MPIDI_CH4U_cswap_ack_origin_cmpl_handler(MPIR_Request * req)
+{
+    MPIR_Win *target_win;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_CSWAP_ACK_TX_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_CSWAP_ACK_TX_HANDLER);
+
+    MPL_free(MPIDI_CH4U_REQUEST(req, req->creq.data));
+    target_win = MPIDI_CH4U_REQUEST(req, req->creq.win_ptr);
+
+    /* MPIDI_CS_ENTER(); */
+    OPA_decr_int(&MPIDI_CH4U_WIN(target_win, outstanding_ops));
+    /* MPIDI_CS_EXIT(); */
+
+    MPIDI_CH4I_am_request_complete(req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_CSWAP_ACK_TX_HANDLER);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_get_ack_origin_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler: frees greq.dt_iov when it is set, decrements the
+ * outstanding_ops counter of the request's window, then passes req to
+ * MPIDI_CH4I_am_request_complete().  Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_get_ack_origin_cmpl_handler(MPIR_Request * req)
+{
+    MPIR_Win *target_win;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_GET_ACK_TX_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_GET_ACK_TX_HANDLER);
+
+    if (MPIDI_CH4U_REQUEST(req, req->greq.dt_iov))
+        MPL_free(MPIDI_CH4U_REQUEST(req, req->greq.dt_iov));
+
+    target_win = MPIDI_CH4U_REQUEST(req, req->greq.win_ptr);
+
+    /* MPIDI_CS_ENTER(); */
+    OPA_decr_int(&MPIDI_CH4U_WIN(target_win, outstanding_ops));
+    /* MPIDI_CS_EXIT(); */
+
+    MPIDI_CH4I_am_request_complete(req);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_GET_ACK_TX_HANDLER);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_put_origin_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler: passes sreq to MPIDI_CH4I_am_request_complete().
+ * Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_put_origin_cmpl_handler(MPIR_Request * sreq)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_PUT_TX_HANDLER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_PUT_TX_HANDLER);
+    MPIDI_CH4I_am_request_complete(sreq);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_PUT_TX_HANDLER);
+
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_cswap_origin_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler: passes sreq to MPIDI_CH4I_am_request_complete().
+ * Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_cswap_origin_cmpl_handler(MPIR_Request * sreq)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_CSWAP_TX_HANDLER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_CSWAP_TX_HANDLER);
+    MPIDI_CH4I_am_request_complete(sreq);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_CSWAP_TX_HANDLER);
+
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_acc_origin_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler: passes sreq to MPIDI_CH4I_am_request_complete().
+ * Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_acc_origin_cmpl_handler(MPIR_Request * sreq)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ACC_TX_HANDLER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ACC_TX_HANDLER);
+    MPIDI_CH4I_am_request_complete(sreq);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ACC_TX_HANDLER);
+
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_put_data_origin_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler: passes sreq to MPIDI_CH4I_am_request_complete().
+ * Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_put_data_origin_cmpl_handler(MPIR_Request * sreq)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_PUT_DATA_TX_HANDLER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_PUT_DATA_TX_HANDLER);
+    MPIDI_CH4I_am_request_complete(sreq);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_PUT_DATA_TX_HANDLER);
+
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_acc_data_origin_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler: passes sreq to MPIDI_CH4I_am_request_complete().
+ * Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_acc_data_origin_cmpl_handler(MPIR_Request * sreq)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ACC_DATA_TX_HANDLER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ACC_DATA_TX_HANDLER);
+    MPIDI_CH4I_am_request_complete(sreq);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ACC_DATA_TX_HANDLER);
+
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_put_iov_origin_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler: passes sreq to MPIDI_CH4I_am_request_complete().
+ * Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_put_iov_origin_cmpl_handler(MPIR_Request * sreq)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_PUT_IOV_TX_HANDLER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_PUT_IOV_TX_HANDLER);
+    MPIDI_CH4I_am_request_complete(sreq);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_PUT_IOV_TX_HANDLER);
+
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_acc_iov_origin_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler: passes sreq to MPIDI_CH4I_am_request_complete().
+ * Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_acc_iov_origin_cmpl_handler(MPIR_Request * sreq)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ACC_IOV_TX_HANDLER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ACC_IOV_TX_HANDLER);
+    MPIDI_CH4I_am_request_complete(sreq);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ACC_IOV_TX_HANDLER);
+
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_get_origin_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler: passes sreq to MPIDI_CH4I_am_request_complete().
+ * Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_get_origin_cmpl_handler(MPIR_Request * sreq)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_GET_TX_HANDLER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_GET_TX_HANDLER);
+    MPIDI_CH4I_am_request_complete(sreq);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_GET_TX_HANDLER);
+
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_reply_ssend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Send a MPIDI_CH4U_SSEND_ACK header-only reply for rreq to the request's
+ * src_rank.  The completion counter of rreq is incremented first, and the
+ * ack carries the peer request pointer stored in rreq.  Returns MPI_SUCCESS
+ * or the error reported by MPIDI_NM_send_am_hdr_reply(). */
+static inline int MPIDI_CH4U_reply_ssend(MPIR_Request * rreq)
+{
+    MPIDI_CH4U_ssend_ack_msg_t ssend_ack;
+    int mpi_errno = MPI_SUCCESS;
+    int cc_out;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_REPLY_SSEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_REPLY_SSEND);
+
+    MPIR_cc_incr(rreq->cc_ptr, &cc_out);
+    ssend_ack.sreq_ptr = MPIDI_CH4U_REQUEST(rreq, req->rreq.peer_req_ptr);
+
+    mpi_errno =
+        MPIDI_NM_send_am_hdr_reply(MPIDI_CH4U_get_context(MPIDI_CH4U_REQUEST(rreq, tag)),
+                                   MPIDI_CH4U_REQUEST(rreq, src_rank),
+                                   MPIDI_CH4U_SSEND_ACK, &ssend_ack, sizeof(ssend_ack), rreq);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_REPLY_SSEND);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_ack_put
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Inject a MPIDI_CH4U_PUT_ACK header-only reply to the request's src_rank,
+ * on the context derived from the request's window.  The ack carries the
+ * preq pointer stored in rreq.  Returns MPI_SUCCESS or the error reported by
+ * MPIDI_NM_inject_am_hdr_reply(). */
+static inline int MPIDI_CH4U_ack_put(MPIR_Request * rreq)
+{
+    MPIDI_CH4U_put_ack_msg_t put_ack;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ACK_PUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ACK_PUT);
+
+    put_ack.preq_ptr = MPIDI_CH4U_REQUEST(rreq, req->preq.preq_ptr);
+
+    mpi_errno = MPIDI_NM_inject_am_hdr_reply(MPIDI_CH4U_win_to_context
+                                             (MPIDI_CH4U_REQUEST(rreq, req->preq.win_ptr)),
+                                             MPIDI_CH4U_REQUEST(rreq, src_rank),
+                                             MPIDI_CH4U_PUT_ACK, &put_ack, sizeof(put_ack));
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ACK_PUT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_ack_cswap
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Send a MPIDI_CH4U_CSWAP_ACK reply for rreq to the request's src_rank.
+ * The payload is one element of creq.datatype located data_sz bytes into
+ * creq.data, where data_sz is the size of one such element.  The completion
+ * counter of rreq is incremented before sending.  Returns MPI_SUCCESS or the
+ * error reported by MPIDI_NM_send_am_reply(). */
+static inline int MPIDI_CH4U_ack_cswap(MPIR_Request * rreq)
+{
+    MPIDI_CH4U_cswap_ack_msg_t cswap_ack;
+    int mpi_errno = MPI_SUCCESS;
+    int cc_out;
+    size_t data_sz;
+    void *result_addr;
+    MPIR_Win *win;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ACK_CSWAP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ACK_CSWAP);
+
+    MPIDI_Datatype_check_size(MPIDI_CH4U_REQUEST(rreq, req->creq.datatype), 1, data_sz);
+    /* the reply payload starts one element past the start of creq.data */
+    result_addr = ((char *) MPIDI_CH4U_REQUEST(rreq, req->creq.data)) + data_sz;
+
+    MPIR_cc_incr(rreq->cc_ptr, &cc_out);
+    cswap_ack.req_ptr = MPIDI_CH4U_REQUEST(rreq, req->creq.creq_ptr);
+    win = MPIDI_CH4U_REQUEST(rreq, req->creq.win_ptr);
+
+    mpi_errno = MPIDI_NM_send_am_reply(MPIDI_CH4U_win_to_context(win),
+                                       MPIDI_CH4U_REQUEST(rreq, src_rank),
+                                       MPIDI_CH4U_CSWAP_ACK,
+                                       &cswap_ack, sizeof(cswap_ack),
+                                       result_addr, 1,
+                                       MPIDI_CH4U_REQUEST(rreq, req->creq.datatype), rreq);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ACK_CSWAP);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_ack_acc
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Acknowledge an accumulate with a header-only MPIDI_CH4U_ACC_ACK message
+ * and drop one outstanding operation from the request's window.
+ *
+ * The window's outstanding_ops counter is decremented whether or not the
+ * inject call succeeded; the inject result is only examined afterwards.
+ *
+ * Returns MPI_SUCCESS, or an MPI error code if the inject call fails. */
+static inline int MPIDI_CH4U_ack_acc(MPIR_Request * rreq)
+{
+    MPIR_Win *target_win;
+    MPIDI_CH4U_acc_ack_msg_t ack_msg;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ACK_ACC);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ACK_ACC);
+
+    ack_msg.req_ptr = MPIDI_CH4U_REQUEST(rreq, req->areq.req_ptr);
+
+    mpi_errno = MPIDI_NM_inject_am_hdr_reply(MPIDI_CH4U_win_to_context
+                                             (MPIDI_CH4U_REQUEST(rreq, req->areq.win_ptr)),
+                                             MPIDI_CH4U_REQUEST(rreq, src_rank),
+                                             MPIDI_CH4U_ACC_ACK,
+                                             &ack_msg,
+                                             sizeof(ack_msg));
+
+    /* Bookkeeping happens before the error check, so it runs on failure too.
+     * (The critical-section enter/exit around it is disabled in this code.) */
+    target_win = MPIDI_CH4U_REQUEST(rreq, req->areq.win_ptr);
+    OPA_decr_int(&MPIDI_CH4U_WIN(target_win, outstanding_ops));
+
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ACK_ACC);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_ack_get_acc
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Send the MPIDI_CH4U_GET_ACC_ACK reply for a get-accumulate request.
+ *
+ * The reply carries areq.data_sz bytes (typed as MPI_BYTE) starting at
+ * areq.data. The request's completion counter is incremented before the
+ * send, and the request itself is passed to the netmod as the send request.
+ *
+ * Returns MPI_SUCCESS, or an MPI error code if the reply cannot be sent. */
+static inline int MPIDI_CH4U_ack_get_acc(MPIR_Request * rreq)
+{
+    MPIDI_CH4U_acc_ack_msg_t ack_msg;
+    int cc_out;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ACK_GET_ACC);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ACK_GET_ACC);
+
+    MPIR_cc_incr(rreq->cc_ptr, &cc_out);
+    ack_msg.req_ptr = MPIDI_CH4U_REQUEST(rreq, req->areq.req_ptr);
+
+    mpi_errno = MPIDI_NM_send_am_reply(MPIDI_CH4U_win_to_context
+                                       (MPIDI_CH4U_REQUEST(rreq, req->areq.win_ptr)),
+                                       MPIDI_CH4U_REQUEST(rreq, src_rank),
+                                       MPIDI_CH4U_GET_ACC_ACK,
+                                       &ack_msg, sizeof(ack_msg),
+                                       MPIDI_CH4U_REQUEST(rreq, req->areq.data),
+                                       MPIDI_CH4U_REQUEST(rreq, req->areq.data_sz),
+                                       MPI_BYTE,
+                                       rreq);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ACK_GET_ACC);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_unexp_mrecv_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Deliver an unexpected message into the buffer supplied for a matched
+ * receive (the rreq.mrcv_buffer / mrcv_count / mrcv_datatype fields).
+ *
+ * Steps performed on rreq:
+ *   - fill status.MPI_SOURCE / MPI_TAG from the stored src_rank and tag;
+ *   - compare the stored message size (the request's count field, in bytes)
+ *     with mrcv_count * sizeof(mrcv_datatype): a larger message sets
+ *     status.MPI_ERROR to MPI_ERR_TRUNCATE, otherwise the element count is
+ *     reduced to what actually arrived;
+ *   - copy (contiguous) or unpack (non-contiguous) from the request's
+ *     temporary buffer into the user buffer, then free the temporary buffer;
+ *   - switch the request kind to MPIR_REQUEST_KIND__RECV, send the
+ *     synchronous-send reply if MPIDI_CH4U_REQ_PEER_SSEND is set, and
+ *     complete the request.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ *
+ * NOTE(review): when the segment allocation or the ssend reply fails the
+ * function leaves through fn_fail without freeing the temporary buffer or
+ * completing the request; that behavior is unchanged here. */
+static inline int MPIDI_CH4U_unexp_mrecv_cmpl_handler(MPIR_Request * rreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    uint64_t msg_tag;
+    size_t message_sz;
+    MPI_Aint last;
+    int dt_contig;
+    MPI_Aint dt_true_lb;
+    MPIR_Datatype *dt_ptr;
+    size_t data_sz, dt_sz;
+    MPID_Segment *segment_ptr;
+    void *buf;
+    int count;
+    MPI_Datatype datatype;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_UNEXP_MRECV_CMPL_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_UNEXP_MRECV_CMPL_HANDLER);
+
+    msg_tag = MPIDI_CH4U_REQUEST(rreq, tag);
+    rreq->status.MPI_SOURCE = MPIDI_CH4U_REQUEST(rreq, src_rank);
+    rreq->status.MPI_TAG = MPIDI_CH4U_get_tag(msg_tag);
+
+    /* Destination described by the matched-receive call. */
+    buf = MPIDI_CH4U_REQUEST(rreq, req->rreq.mrcv_buffer);
+    count = MPIDI_CH4U_REQUEST(rreq, req->rreq.mrcv_count);
+    datatype = MPIDI_CH4U_REQUEST(rreq, req->rreq.mrcv_datatype);
+
+    /* For an unexpected request the count field holds the message size in bytes. */
+    message_sz = MPIDI_CH4U_REQUEST(rreq, count);
+    MPID_Datatype_get_size_macro(datatype, dt_sz);
+
+    if (message_sz > count * dt_sz) {
+        rreq->status.MPI_ERROR = MPI_ERR_TRUNCATE;
+    }
+    else {
+        rreq->status.MPI_ERROR = MPI_SUCCESS;
+        /* A zero-size datatype combined with an empty message reaches this
+         * branch with dt_sz == 0; avoid dividing by zero and receive no
+         * elements in that case. */
+        count = (dt_sz > 0) ? (message_sz / dt_sz) : 0;
+    }
+
+    MPIR_STATUS_SET_COUNT(rreq->status, count * dt_sz);
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
+
+    if (!dt_contig) {
+        /* Non-contiguous destination: unpack through a segment. */
+        segment_ptr = MPIDU_Segment_alloc();
+        MPIR_ERR_CHKANDJUMP1(segment_ptr == NULL, mpi_errno,
+                             MPI_ERR_OTHER, "**nomem", "**nomem %s", "Recv MPIDU_Segment_alloc");
+        MPIDU_Segment_init(buf, count, datatype, segment_ptr, 0);
+
+        last = count * dt_sz;
+        MPIDU_Segment_unpack(segment_ptr, 0, &last, MPIDI_CH4U_REQUEST(rreq, buffer));
+        MPIDU_Segment_free(segment_ptr);
+        /* Unpack reports through 'last' how far it got; a short unpack is
+         * recorded as a datatype mismatch in the status. */
+        if (last != (MPI_Aint) (count * dt_sz)) {
+            mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+                                             __FUNCTION__, __LINE__,
+                                             MPI_ERR_TYPE, "**dtypemismatch", 0);
+            rreq->status.MPI_ERROR = mpi_errno;
+        }
+    }
+    else {
+        MPIR_Memcpy((char *) buf + dt_true_lb, MPIDI_CH4U_REQUEST(rreq, buffer), data_sz);
+    }
+
+    MPL_free(MPIDI_CH4U_REQUEST(rreq, buffer));
+    rreq->kind = MPIR_REQUEST_KIND__RECV;
+
+    if (MPIDI_CH4U_REQUEST(rreq, req->status) & MPIDI_CH4U_REQ_PEER_SSEND) {
+        mpi_errno = MPIDI_CH4U_reply_ssend(rreq);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+    MPIDI_CH4I_am_request_complete(rreq);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_UNEXP_MRECV_CMPL_HANDLER);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_unexp_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handling for a request that arrived as an unexpected message.
+ *
+ * Flow:
+ *   1. If the request has MPIDI_CH4U_REQ_UNEXP_DQUED set, nothing is matched
+ *      here; when MPIDI_CH4U_REQ_UNEXP_CLAIMED is also set the data is
+ *      delivered through MPIDI_CH4U_unexp_mrecv_cmpl_handler().
+ *   2. Otherwise a matching receive is located: either the one stored in
+ *      rreq.match_req (MPIDI_CH4U_REQ_MATCHED), or one dequeued from the
+ *      communicator's posted_list. In the latter case rreq is removed from
+ *      the unexp_list and the communicator is released twice.
+ *   3. With no match, MPIDI_CH4U_REQ_BUSY is cleared and the function returns.
+ *   4. With a match, the payload is copied/unpacked from rreq's temporary
+ *      buffer into the receive buffer, the receive's status is filled in,
+ *      the synchronous-send reply is sent if requested, and both requests
+ *      are completed.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ *
+ * NOTE(review): a failed segment allocation or ssend reply still leaves via
+ * fn_fail without freeing the temporary buffer or completing the requests;
+ * that behavior is unchanged here. */
+static inline int MPIDI_CH4U_unexp_cmpl_handler(MPIR_Request * rreq)
+{
+    int mpi_errno = MPI_SUCCESS, c;
+    MPIR_Comm *root_comm;
+    MPIR_Request *match_req = NULL;
+    uint64_t msg_tag;
+    size_t count;
+    MPI_Aint last;
+    int dt_contig;
+    MPI_Aint dt_true_lb;
+    MPIR_Datatype *dt_ptr;
+    size_t dt_sz;
+    MPID_Segment *segment_ptr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_UNEXP_CMPL_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_UNEXP_CMPL_HANDLER);
+
+    /* MPIDI_CS_ENTER(); */
+    if (MPIDI_CH4U_REQUEST(rreq, req->status) & MPIDI_CH4U_REQ_UNEXP_DQUED) {
+        if (MPIDI_CH4U_REQUEST(rreq, req->status) & MPIDI_CH4U_REQ_UNEXP_CLAIMED) {
+            /* Propagate a failure instead of silently dropping it. */
+            mpi_errno = MPIDI_CH4U_unexp_mrecv_cmpl_handler(rreq);
+            if (mpi_errno)
+                MPIR_ERR_POP(mpi_errno);
+        }
+        /* MPIDI_CS_EXIT(); */
+        goto fn_exit;
+    }
+    /* MPIDI_CS_EXIT(); */
+
+    msg_tag = MPIDI_CH4U_REQUEST(rreq, tag);
+    root_comm = MPIDI_CH4U_context_id_to_comm(MPIDI_CH4U_get_context(msg_tag));
+
+    if (MPIDI_CH4U_REQUEST(rreq, req->status) & MPIDI_CH4U_REQ_MATCHED) {
+        match_req = (MPIR_Request *) MPIDI_CH4U_REQUEST(rreq, req->rreq.match_req);
+    }
+    else {
+        /* MPIDI_CS_ENTER(); */
+        if (root_comm)
+            match_req =
+                MPIDI_CH4U_dequeue_posted(msg_tag, &MPIDI_CH4U_COMM(root_comm, posted_list));
+
+        if (match_req) {
+            MPIDI_CH4U_delete_unexp(rreq, &MPIDI_CH4U_COMM(root_comm, unexp_list));
+            /* Decrement the counter twice, one for posted_list and the other for unexp_list */
+            MPIR_Comm_release(root_comm);
+            MPIR_Comm_release(root_comm);
+        }
+        /* MPIDI_CS_EXIT(); */
+    }
+
+    if (!match_req) {
+        /* Nothing posted yet: leave the request queued and mark it idle. */
+        MPIDI_CH4U_REQUEST(rreq, req->status) &= ~MPIDI_CH4U_REQ_BUSY;
+        goto fn_exit;
+    }
+
+    match_req->status.MPI_SOURCE = MPIDI_CH4U_REQUEST(rreq, src_rank);
+    match_req->status.MPI_TAG = MPIDI_CH4U_get_tag(msg_tag);
+
+    /* get_info stores the total byte size in dt_sz; the following macro then
+     * overwrites it with the size of a single element, which is what the
+     * code below works with. */
+    MPIDI_Datatype_get_info(MPIDI_CH4U_REQUEST(match_req, count),
+                            MPIDI_CH4U_REQUEST(match_req, datatype),
+                            dt_contig, dt_sz, dt_ptr, dt_true_lb);
+    MPID_Datatype_get_size_macro(MPIDI_CH4U_REQUEST(match_req, datatype), dt_sz);
+
+    if (MPIDI_CH4U_REQUEST(rreq, count) > dt_sz * MPIDI_CH4U_REQUEST(match_req, count)) {
+        rreq->status.MPI_ERROR = MPI_ERR_TRUNCATE;
+        /* The matched receive is the request whose status is reported, so
+         * the truncation has to be recorded there as well; rreq is released
+         * at the end of this function. */
+        match_req->status.MPI_ERROR = MPI_ERR_TRUNCATE;
+        count = MPIDI_CH4U_REQUEST(match_req, count);
+    }
+    else {
+        rreq->status.MPI_ERROR = MPI_SUCCESS;
+        /* dt_sz can be zero for a zero-size datatype (with an empty
+         * message); receive no elements rather than divide by zero. */
+        count = (dt_sz > 0) ? (MPIDI_CH4U_REQUEST(rreq, count) / dt_sz) : 0;
+    }
+
+    MPIR_STATUS_SET_COUNT(match_req->status, count * dt_sz);
+    MPIDI_CH4U_REQUEST(rreq, count) = count;
+
+    if (!dt_contig) {
+        segment_ptr = MPIDU_Segment_alloc();
+        MPIR_ERR_CHKANDJUMP1(segment_ptr == NULL, mpi_errno,
+                             MPI_ERR_OTHER, "**nomem", "**nomem %s", "Recv MPIDU_Segment_alloc");
+        MPIDU_Segment_init(MPIDI_CH4U_REQUEST(match_req, buffer), count,
+                           MPIDI_CH4U_REQUEST(match_req, datatype), segment_ptr, 0);
+
+        last = count * dt_sz;
+        MPIDU_Segment_unpack(segment_ptr, 0, &last, MPIDI_CH4U_REQUEST(rreq, buffer));
+        MPIDU_Segment_free(segment_ptr);
+        /* A short unpack is reported as a datatype mismatch on the receive. */
+        if (last != (MPI_Aint) (count * dt_sz)) {
+            mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+                                             __FUNCTION__, __LINE__,
+                                             MPI_ERR_TYPE, "**dtypemismatch", 0);
+            match_req->status.MPI_ERROR = mpi_errno;
+        }
+    }
+    else {
+        MPIR_Memcpy((char *) MPIDI_CH4U_REQUEST(match_req, buffer) + dt_true_lb,
+                    MPIDI_CH4U_REQUEST(rreq, buffer), count * dt_sz);
+    }
+
+    MPIDI_CH4U_REQUEST(rreq, req->status) &= ~MPIDI_CH4U_REQ_UNEXPECTED;
+    if (MPIDI_CH4U_REQUEST(rreq, req->status) & MPIDI_CH4U_REQ_PEER_SSEND) {
+        mpi_errno = MPIDI_CH4U_reply_ssend(rreq);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+
+    dtype_release_if_not_builtin(MPIDI_CH4U_REQUEST(match_req, datatype));
+    MPL_free(MPIDI_CH4U_REQUEST(rreq, buffer));
+    MPIR_Object_release_ref(rreq, &c);
+    MPIDI_CH4I_am_request_complete(rreq);
+    MPIDI_CH4I_am_request_complete(match_req);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_UNEXP_CMPL_HANDLER);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_recv_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler installed for incoming sends.
+ *
+ * If MPIDI_CH4U_check_cmpl_order() returns zero the handler returns at once
+ * (presumably the request is deferred and replayed later through
+ * MPIDI_CH4U_progress_cmpl_list() -- confirm against that helper).
+ * Otherwise it frees the receive iov of a non-contiguous receive, hands
+ * unexpected requests to MPIDI_CH4U_unexp_cmpl_handler(), and for posted
+ * receives fills in the status, sends the synchronous-send reply if
+ * requested, releases the datatype and completes the request. Every path
+ * past the ordering check ends by calling MPIDI_CH4U_progress_cmpl_list().
+ *
+ * NOTE(review): the early return skips MPIR_FUNC_VERBOSE_EXIT.
+ *
+ * Returns MPI_SUCCESS or an MPI error code. */
+static inline int MPIDI_CH4U_recv_cmpl_handler(MPIR_Request * rreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_RECV_CMPL_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_RECV_CMPL_HANDLER);
+
+    if (MPIDI_CH4U_check_cmpl_order(rreq, MPIDI_CH4U_recv_cmpl_handler) == 0) {
+        return mpi_errno;
+    }
+
+    if ((MPIDI_CH4U_REQUEST(rreq, req->status) & MPIDI_CH4U_REQ_RCV_NON_CONTIG) != 0) {
+        MPL_free(MPIDI_CH4U_REQUEST(rreq, req->iov));
+    }
+
+    if ((MPIDI_CH4U_REQUEST(rreq, req->status) & MPIDI_CH4U_REQ_UNEXPECTED) != 0) {
+        /* Unexpected messages are finished by their own handler. */
+        mpi_errno = MPIDI_CH4U_unexp_cmpl_handler(rreq);
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIR_ERR_POP(mpi_errno);
+        }
+        goto fn_exit;
+    }
+
+    /* Posted receive: publish where the message came from. */
+    rreq->status.MPI_SOURCE = MPIDI_CH4U_REQUEST(rreq, src_rank);
+    rreq->status.MPI_TAG = MPIDI_CH4U_get_tag(MPIDI_CH4U_REQUEST(rreq, tag));
+
+    if ((MPIDI_CH4U_REQUEST(rreq, req->status) & MPIDI_CH4U_REQ_PEER_SSEND) != 0) {
+        mpi_errno = MPIDI_CH4U_reply_ssend(rreq);
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIR_ERR_POP(mpi_errno);
+        }
+    }
+
+#ifdef MPIDI_BUILD_CH4_SHM
+    if (MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(rreq)) {
+        int continue_matching = 1;
+
+        MPIDI_CH4R_anysource_matched(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(rreq),
+                                     MPIDI_CH4R_NETMOD, &continue_matching);
+
+        /* If the partner link is still set after the call, unlink both ends. */
+        if (unlikely(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(rreq))) {
+            MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(rreq)) = NULL;
+            MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(rreq) = NULL;
+        }
+    }
+#endif
+
+    dtype_release_if_not_builtin(MPIDI_CH4U_REQUEST(rreq, datatype));
+    MPIDI_CH4I_am_request_complete(rreq);
+
+  fn_exit:
+    MPIDI_CH4U_progress_cmpl_list();
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_RECV_CMPL_HANDLER);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_get_acc_ack_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler for the acknowledgement of a get-accumulate.
+ *
+ * Returns immediately when MPIDI_CH4U_check_cmpl_order() yields zero.
+ * Otherwise it frees the receive iov of a non-contiguous receive, decrements
+ * the window's outstanding_ops counter, releases areq.result_datatype if it
+ * is not builtin, completes the request and calls
+ * MPIDI_CH4U_progress_cmpl_list().
+ *
+ * NOTE(review): the early return skips MPIR_FUNC_VERBOSE_EXIT.
+ *
+ * Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_get_acc_ack_cmpl_handler(MPIR_Request * areq)
+{
+    MPIR_Win *origin_win;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_GET_ACC_ACK_CMPL_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_GET_ACC_ACK_CMPL_HANDLER);
+
+    if (MPIDI_CH4U_check_cmpl_order(areq, MPIDI_CH4U_get_acc_ack_cmpl_handler) == 0) {
+        return mpi_errno;
+    }
+
+    if ((MPIDI_CH4U_REQUEST(areq, req->status) & MPIDI_CH4U_REQ_RCV_NON_CONTIG) != 0) {
+        MPL_free(MPIDI_CH4U_REQUEST(areq, req->iov));
+    }
+
+    /* One fewer operation in flight on this window
+     * (critical-section enter/exit is disabled in this code). */
+    origin_win = MPIDI_CH4U_REQUEST(areq, req->areq.win_ptr);
+    OPA_decr_int(&MPIDI_CH4U_WIN(origin_win, outstanding_ops));
+
+    dtype_release_if_not_builtin(MPIDI_CH4U_REQUEST(areq, req->areq.result_datatype));
+    MPIDI_CH4I_am_request_complete(areq);
+
+    MPIDI_CH4U_progress_cmpl_list();
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_GET_ACC_ACK_CMPL_HANDLER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_put_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler for an incoming PUT.
+ *
+ * Returns immediately when MPIDI_CH4U_check_cmpl_order() yields zero.
+ * Otherwise it frees the receive iov (non-contiguous receive) and the
+ * preq.dt_iov array when present, sends the PUT ack, decrements the
+ * window's outstanding_ops counter, completes the request and calls
+ * MPIDI_CH4U_progress_cmpl_list().
+ *
+ * If the ack fails, the counter decrement, request completion and progress
+ * call are all skipped.
+ *
+ * NOTE(review): the early return skips MPIR_FUNC_VERBOSE_EXIT.
+ *
+ * Returns MPI_SUCCESS or an MPI error code from the ack. */
+static inline int MPIDI_CH4U_put_cmpl_handler(MPIR_Request * rreq)
+{
+    MPIR_Win *target_win;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_PUT_CMPL_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_PUT_CMPL_HANDLER);
+
+    if (MPIDI_CH4U_check_cmpl_order(rreq, MPIDI_CH4U_put_cmpl_handler) == 0) {
+        return mpi_errno;
+    }
+
+    /* Release the scatter lists that were built for the receive. */
+    if ((MPIDI_CH4U_REQUEST(rreq, req->status) & MPIDI_CH4U_REQ_RCV_NON_CONTIG) != 0) {
+        MPL_free(MPIDI_CH4U_REQUEST(rreq, req->iov));
+    }
+    if (MPIDI_CH4U_REQUEST(rreq, req->preq.dt_iov)) {
+        MPL_free(MPIDI_CH4U_REQUEST(rreq, req->preq.dt_iov));
+    }
+
+    mpi_errno = MPIDI_CH4U_ack_put(rreq);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+    /* Critical-section enter/exit around the decrement is disabled here. */
+    target_win = (MPIR_Win *) MPIDI_CH4U_REQUEST(rreq, req->preq.win_ptr);
+    OPA_decr_int(&MPIDI_CH4U_WIN(target_win, outstanding_ops));
+
+    MPIDI_CH4I_am_request_complete(rreq);
+    MPIDI_CH4U_progress_cmpl_list();
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_PUT_CMPL_HANDLER);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_put_iov_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler for the iov part of a PUT: answer with a header-only
+ * MPIDI_CH4U_PUT_IOV_ACK message.
+ *
+ * The ack carries the request's src_rank, the stored preq.preq_ptr (as
+ * origin_preq_ptr) and the address of this request (as target_preq_ptr),
+ * both pointers converted to uint64_t.
+ *
+ * Returns MPI_SUCCESS, or an MPI error code if the inject call fails. */
+static inline int MPIDI_CH4U_put_iov_cmpl_handler(MPIR_Request * rreq)
+{
+    MPIDI_CH4U_put_iov_ack_msg_t ack_msg;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_PUT_IOV_CMPL_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_PUT_IOV_CMPL_HANDLER);
+
+    /* Build the ack header. */
+    ack_msg.src_rank = MPIDI_CH4U_REQUEST(rreq, src_rank);
+    ack_msg.origin_preq_ptr = (uint64_t) MPIDI_CH4U_REQUEST(rreq, req->preq.preq_ptr);
+    ack_msg.target_preq_ptr = (uint64_t) rreq;
+
+    mpi_errno = MPIDI_NM_inject_am_hdr_reply(MPIDI_CH4U_win_to_context
+                                             (MPIDI_CH4U_REQUEST(rreq, req->preq.win_ptr)),
+                                             MPIDI_CH4U_REQUEST(rreq, src_rank),
+                                             MPIDI_CH4U_PUT_IOV_ACK,
+                                             &ack_msg,
+                                             sizeof(ack_msg));
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_PUT_IOV_CMPL_HANDLER);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_acc_iov_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler for the iov part of an accumulate: answer with a
+ * header-only MPIDI_CH4U_ACC_IOV_ACK message.
+ *
+ * The ack carries the stored areq.req_ptr (as origin_preq_ptr) and the
+ * address of this request (as target_preq_ptr), both converted to uint64_t.
+ *
+ * Returns MPI_SUCCESS, or an MPI error code if the inject call fails. */
+static inline int MPIDI_CH4U_acc_iov_cmpl_handler(MPIR_Request * rreq)
+{
+    MPIDI_CH4U_acc_iov_ack_msg_t ack_msg;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ACC_IOV_CMPL_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ACC_IOV_CMPL_HANDLER);
+
+    /* Build the ack header. */
+    ack_msg.origin_preq_ptr = (uint64_t) MPIDI_CH4U_REQUEST(rreq, req->areq.req_ptr);
+    ack_msg.target_preq_ptr = (uint64_t) rreq;
+
+    mpi_errno = MPIDI_NM_inject_am_hdr_reply(MPIDI_CH4U_win_to_context
+                                             (MPIDI_CH4U_REQUEST(rreq, req->areq.win_ptr)),
+                                             MPIDI_CH4U_REQUEST(rreq, src_rank),
+                                             MPIDI_CH4U_ACC_IOV_ACK,
+                                             &ack_msg,
+                                             sizeof(ack_msg));
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ACC_IOV_CMPL_HANDLER);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_cswap_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler that performs a compare-and-swap and sends the ack.
+ *
+ * creq.data holds two elements of creq.datatype back to back: the first is
+ * the value to store, the second is the value to compare against. The
+ * element at creq.addr is compared with the second slot; the current
+ * contents of creq.addr are then copied into that second slot (which
+ * MPIDI_CH4U_ack_cswap() sends back), and only when the comparison matched
+ * is creq.addr overwritten with the first slot.
+ *
+ * Returns immediately when MPIDI_CH4U_check_cmpl_order() yields zero.
+ * If the ack fails, the request is not completed and
+ * MPIDI_CH4U_progress_cmpl_list() is not called.
+ *
+ * NOTE(review): the early return skips MPIR_FUNC_VERBOSE_EXIT.
+ *
+ * Returns MPI_SUCCESS or an MPI error code from the ack. */
+static inline int MPIDI_CH4U_cswap_cmpl_handler(MPIR_Request * rreq)
+{
+    void *new_value;
+    void *cmp_slot;
+    void *target;
+    size_t elem_sz;
+    int values_match;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_CSWAP_CMPL_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_CSWAP_CMPL_HANDLER);
+
+    if (MPIDI_CH4U_check_cmpl_order(rreq, MPIDI_CH4U_cswap_cmpl_handler) == 0) {
+        return mpi_errno;
+    }
+
+    MPIDI_Datatype_check_size(MPIDI_CH4U_REQUEST(rreq, req->creq.datatype), 1, elem_sz);
+    new_value = MPIDI_CH4U_REQUEST(rreq, req->creq.data);
+    cmp_slot = ((char *) MPIDI_CH4U_REQUEST(rreq, req->creq.data)) + elem_sz;
+    target = (void *) MPIDI_CH4U_REQUEST(rreq, req->creq.addr);
+
+    /* Critical-section enter/exit around the swap is disabled in this code. */
+
+    /* Decide first: the comparison operand is about to be overwritten. */
+    values_match = MPIR_Compare_equal(target, cmp_slot,
+                                      MPIDI_CH4U_REQUEST(rreq, req->creq.datatype)) ? 1 : 0;
+
+    /* The old target value is always what gets returned to the requester. */
+    MPIR_Memcpy(cmp_slot, target, elem_sz);
+
+    if (values_match) {
+        MPIR_Memcpy(target, new_value, elem_sz);
+    }
+
+    mpi_errno = MPIDI_CH4U_ack_cswap(rreq);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+    MPIDI_CH4I_am_request_complete(rreq);
+    MPIDI_CH4U_progress_cmpl_list();
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_CSWAP_CMPL_HANDLER);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_do_accumulate_op
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Apply a builtin reduction operation element-wise from a packed source
+ * buffer onto a target buffer.
+ *
+ * Parameters:
+ *   source_buf / source_count / source_dtp
+ *       the incoming data; unless acc_op is MPI_NO_OP, source_dtp is
+ *       asserted to be a predefined datatype.
+ *   target_buf / target_count / target_dtp
+ *       the destination; target_dtp may be predefined or derived.
+ *   stream_offset
+ *       byte offset into the packed stream of the target at which this
+ *       chunk of source data starts.
+ *   acc_op
+ *       the operation; only builtin op handles are accepted.
+ *
+ * Behavior:
+ *   - acc_op == MPI_NO_OP is treated as "empty source": the op function is
+ *     called once directly on target_buf.
+ *   - predefined target_dtp: the op is applied once at target_buf plus the
+ *     stream offset converted from packed bytes to extent bytes.
+ *   - derived target_dtp: the target layout is flattened into a vector of
+ *     (offset, length) pieces and the op is applied piece by piece; pieces
+ *     shorter than one basic element are merged with the following piece.
+ *
+ * Returns MPI_SUCCESS, an MPI_ERR_OP code for a non-builtin op, or an
+ * MPI_ERR_OTHER ("**nomem") code when an allocation fails. */
+static inline int MPIDI_CH4U_do_accumulate_op(void *source_buf, int source_count,
+                                              MPI_Datatype source_dtp, void *target_buf,
+                                              int target_count, MPI_Datatype target_dtp,
+                                              MPI_Aint stream_offset, MPI_Op acc_op)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPI_User_function *uop = NULL;
+    MPI_Aint source_dtp_size = 0, source_dtp_extent = 0;
+    int is_empty_source = FALSE;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_DO_ACCUMULATE_OP);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_DO_ACCUMULATE_OP);
+
+    /* first Judge if source buffer is empty */
+    if (acc_op == MPI_NO_OP)
+        is_empty_source = TRUE;
+
+    if (is_empty_source == FALSE) {
+        MPIR_Assert(MPIR_DATATYPE_IS_PREDEFINED(source_dtp));
+        MPID_Datatype_get_size_macro(source_dtp, source_dtp_size);
+        MPID_Datatype_get_extent_macro(source_dtp, source_dtp_extent);
+    }
+
+    if (HANDLE_GET_KIND(acc_op) == HANDLE_KIND_BUILTIN) {
+        /* get the function by indexing into the op table */
+        uop = MPIR_OP_HDL_TO_FN(acc_op);
+    }
+    else {
+        /* --BEGIN ERROR HANDLING-- */
+        mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+                                         FCNAME, __LINE__, MPI_ERR_OP,
+                                         "**opnotpredefined", "**opnotpredefined %d", acc_op);
+        /* Keep enter/exit tracing balanced, as the other error returns do. */
+        MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_DO_ACCUMULATE_OP);
+        return mpi_errno;
+        /* --END ERROR HANDLING-- */
+    }
+
+
+    if (is_empty_source == TRUE || MPIR_DATATYPE_IS_PREDEFINED(target_dtp)) {
+        /* directly apply op if target dtp is predefined dtp OR source buffer is empty */
+        MPI_Aint real_stream_offset;
+        void *curr_target_buf;
+
+        if (is_empty_source == FALSE) {
+            MPIR_Assert(source_dtp == target_dtp);
+            /* Convert the packed-byte offset into an offset in extent units. */
+            real_stream_offset = (stream_offset / source_dtp_size) * source_dtp_extent;
+            curr_target_buf = (void *) ((char *) target_buf + real_stream_offset);
+        }
+        else {
+            curr_target_buf = target_buf;
+        }
+
+        (*uop) (source_buf, curr_target_buf, &source_count, &source_dtp);
+    }
+    else {
+        /* derived datatype */
+        MPID_Segment *segp;
+        DLOOP_VECTOR *dloop_vec;
+        MPI_Aint first, last;
+        int vec_len, i, count;
+        MPI_Aint type_extent, type_size;
+        MPI_Datatype type;
+        MPIR_Datatype *dtp;
+        MPI_Aint curr_len;
+        void *curr_loc;
+        int accumulated_count;
+
+        segp = MPIDU_Segment_alloc();
+        /* --BEGIN ERROR HANDLING-- */
+        if (!segp) {
+            mpi_errno =
+                MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__,
+                                     MPI_ERR_OTHER, "**nomem", 0);
+            MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_DO_ACCUMULATE_OP);
+            return mpi_errno;
+        }
+        /* --END ERROR HANDLING-- */
+        /* The segment is built on a NULL base, so the vector entries produced
+         * below hold offsets that are later added to target_buf. */
+        MPIDU_Segment_init(NULL, target_count, target_dtp, segp, 0);
+        first = stream_offset;
+        last = first + source_count * source_dtp_size;
+
+        MPID_Datatype_get_ptr(target_dtp, dtp);
+        vec_len = dtp->max_contig_blocks * target_count + 1;
+        /* +1 needed because Rob says so */
+        dloop_vec = (DLOOP_VECTOR *)
+            MPL_malloc(vec_len * sizeof(DLOOP_VECTOR));
+        /* --BEGIN ERROR HANDLING-- */
+        if (!dloop_vec) {
+            mpi_errno =
+                MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__,
+                                     MPI_ERR_OTHER, "**nomem", 0);
+            /* Do not leak the segment allocated above. */
+            MPIDU_Segment_free(segp);
+            MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_DO_ACCUMULATE_OP);
+            return mpi_errno;
+        }
+        /* --END ERROR HANDLING-- */
+
+        MPIDU_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);
+
+        type = dtp->basic_type;
+        MPIR_Assert(type != MPI_DATATYPE_NULL);
+
+        MPIR_Assert(type == source_dtp);
+        type_size = source_dtp_size;
+        type_extent = source_dtp_extent;
+
+        i = 0;
+        curr_loc = dloop_vec[0].DLOOP_VECTOR_BUF;
+        curr_len = dloop_vec[0].DLOOP_VECTOR_LEN;
+        accumulated_count = 0;
+        while (i != vec_len) {
+            if (curr_len < type_size) {
+                /* Piece holds less than one basic element: fold the next
+                 * piece's length into it and retry.
+                 * NOTE(review): after i++ the index is not re-checked
+                 * against vec_len before dloop_vec[i] is read -- confirm
+                 * this cannot be reached with i == vec_len. */
+                MPIR_Assert(i != vec_len);
+                i++;
+                curr_len += dloop_vec[i].DLOOP_VECTOR_LEN;
+                continue;
+            }
+
+            MPIR_Assign_trunc(count, curr_len / type_size, int);
+
+            (*uop) ((char *) source_buf + type_extent * accumulated_count,
+                    (char *) target_buf + MPIR_Ptr_to_aint(curr_loc), &count, &type);
+
+            if (curr_len % type_size == 0) {
+                /* Piece fully consumed: move on to the next one. */
+                i++;
+                if (i != vec_len) {
+                    curr_loc = dloop_vec[i].DLOOP_VECTOR_BUF;
+                    curr_len = dloop_vec[i].DLOOP_VECTOR_LEN;
+                }
+            }
+            else {
+                /* Leftover bytes remain: advance past the whole elements
+                 * just processed and keep the remainder for the next pass. */
+                curr_loc = (void *) ((char *) curr_loc + type_extent * count);
+                curr_len -= type_size * count;
+            }
+
+            accumulated_count += count;
+        }
+
+        MPIDU_Segment_free(segp);
+        MPL_free(dloop_vec);
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_DO_ACCUMULATE_OP);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_handle_acc_cmpl
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Shared body of the accumulate and get-accumulate completion handlers.
+ *
+ * Applies the request's operation (areq.op) to the target memory using the
+ * data received in areq.data, then sends the matching acknowledgement and
+ * completes the request.
+ *
+ * rreq   - the request describing the accumulate (areq.* fields).
+ * do_get - non-zero for get-accumulate: the target contents from before the
+ *          operation are saved in a freshly allocated buffer and sent back
+ *          through MPIDI_CH4U_ack_get_acc(); zero sends a header-only ack
+ *          through MPIDI_CH4U_ack_acc().
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ *
+ * NOTE(review): on the error paths (MPIR_ERR_POP) the function leaves
+ * without completing the request, and the 'original' buffer, the iov array
+ * and areq.data are not released here if the accumulate itself failed --
+ * confirm whether a caller cleans these up. */
+static inline int MPIDI_CH4U_handle_acc_cmpl(MPIR_Request * rreq, int do_get)
+{
+    int mpi_errno = MPI_SUCCESS, i;
+    MPI_Aint basic_sz, count, offset = 0;
+    struct iovec *iov;
+    char *src_ptr, *original = NULL;
+    size_t data_sz;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_HANDLE_ACC_CMPL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_HANDLE_ACC_CMPL);
+
+    /* basic_sz is the size reported for areq.target_datatype; data_sz is the
+     * byte count stored in the request. */
+    MPID_Datatype_get_size_macro(MPIDI_CH4U_REQUEST(rreq, req->areq.target_datatype), basic_sz);
+    data_sz = MPIDI_CH4U_REQUEST(rreq, req->areq.data_sz);
+
+    /* MPIDI_CS_ENTER(); */
+
+    if (do_get) {
+        /* Scratch buffer for the pre-operation target contents. */
+        original = (char *) MPL_malloc(data_sz);
+        MPIR_Assert(original);
+    }
+
+    if (MPIDI_CH4U_REQUEST(rreq, req->areq.op) == MPI_NO_OP) {
+        /* For MPI_NO_OP the origin count is taken from the target count.
+         * The data_sz store below writes back the value read above. */
+        MPIDI_CH4U_REQUEST(rreq, req->areq.origin_count) =
+            MPIDI_CH4U_REQUEST(rreq, req->areq.target_count);
+        MPIDI_CH4U_REQUEST(rreq, req->areq.data_sz) = data_sz;
+    }
+
+    if (MPIDI_CH4U_REQUEST(rreq, req->areq.dt_iov) == NULL) {
+        /* No iov description: operate on target_addr in a single call. */
+
+        if (original) {
+            MPIR_Memcpy(original, MPIDI_CH4U_REQUEST(rreq, req->areq.target_addr),
+                        basic_sz * MPIDI_CH4U_REQUEST(rreq, req->areq.target_count));
+        }
+
+        mpi_errno = MPIDI_CH4U_do_accumulate_op(MPIDI_CH4U_REQUEST(rreq, req->areq.data),
+                                                MPIDI_CH4U_REQUEST(rreq, req->areq.origin_count),
+                                                MPIDI_CH4U_REQUEST(rreq, req->areq.origin_datatype),
+                                                MPIDI_CH4U_REQUEST(rreq, req->areq.target_addr),
+                                                MPIDI_CH4U_REQUEST(rreq, req->areq.target_count),
+                                                MPIDI_CH4U_REQUEST(rreq, req->areq.target_datatype),
+                                                0, MPIDI_CH4U_REQUEST(rreq, req->areq.op));
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+    else {
+        /* Target described by an iov array: walk it entry by entry,
+         * consuming the source data sequentially. */
+        iov = (struct iovec *) MPIDI_CH4U_REQUEST(rreq, req->areq.dt_iov);
+        src_ptr = (char *) MPIDI_CH4U_REQUEST(rreq, req->areq.data);
+        for (i = 0; i < MPIDI_CH4U_REQUEST(rreq, req->areq.n_iov); i++) {
+            /* Number of basic_sz-sized elements in this iov entry. */
+            count = iov[i].iov_len / basic_sz;
+            MPIR_Assert(count > 0);
+
+            if (original) {
+                /* Save this piece of the target before it is modified. */
+                MPIR_Memcpy(original + offset, iov[i].iov_base, count * basic_sz);
+                offset += count * basic_sz;
+            }
+
+            mpi_errno = MPIDI_CH4U_do_accumulate_op(src_ptr, count,
+                                                    MPIDI_CH4U_REQUEST(rreq,
+                                                                       req->areq.origin_datatype),
+                                                    iov[i].iov_base, count, MPIDI_CH4U_REQUEST(rreq,
+                                                                                               req->
+                                                                                               areq.
+                                                                                               target_datatype),
+                                                    0, MPIDI_CH4U_REQUEST(rreq, req->areq.op));
+            if (mpi_errno)
+                MPIR_ERR_POP(mpi_errno);
+            src_ptr += count * basic_sz;
+        }
+        /* The iov array is released here; areq.dt_iov itself is not reset. */
+        MPL_free(iov);
+    }
+
+    /* MPIDI_CS_EXIT(); */
+    /* The received source data is no longer needed. */
+    if (MPIDI_CH4U_REQUEST(rreq, req->areq.data))
+        MPL_free(MPIDI_CH4U_REQUEST(rreq, req->areq.data));
+
+    if (original) {
+        /* get-accumulate: areq.data now points at the saved target contents,
+         * which MPIDI_CH4U_ack_get_acc() sends back as the reply payload. */
+        MPIDI_CH4U_REQUEST(rreq, req->areq.data) = original;
+        mpi_errno = MPIDI_CH4U_ack_get_acc(rreq);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+    else {
+        /* plain accumulate: clear the freed pointer and send a header-only ack. */
+        MPIDI_CH4U_REQUEST(rreq, req->areq.data) = NULL;
+        mpi_errno = MPIDI_CH4U_ack_acc(rreq);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+    MPIDI_CH4I_am_request_complete(rreq);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_HANDLE_ACC_CMPL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_acc_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler for an incoming accumulate.
+ *
+ * Returns immediately when MPIDI_CH4U_check_cmpl_order() yields zero;
+ * otherwise runs MPIDI_CH4U_handle_acc_cmpl() without the "get" part and,
+ * on success, calls MPIDI_CH4U_progress_cmpl_list().
+ *
+ * NOTE(review): the early return skips MPIR_FUNC_VERBOSE_EXIT.
+ *
+ * Returns MPI_SUCCESS or an MPI error code. */
+static inline int MPIDI_CH4U_acc_cmpl_handler(MPIR_Request * rreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ACC_HANDLER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ACC_HANDLER);
+
+    if (MPIDI_CH4U_check_cmpl_order(rreq, MPIDI_CH4U_acc_cmpl_handler) == 0) {
+        return mpi_errno;
+    }
+
+    /* do_get = 0: accumulate only, nothing is sent back but the ack. */
+    mpi_errno = MPIDI_CH4U_handle_acc_cmpl(rreq, 0);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+    MPIDI_CH4U_progress_cmpl_list();
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ACC_HANDLER);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_get_acc_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler for an incoming get-accumulate.
+ *
+ * Returns immediately when MPIDI_CH4U_check_cmpl_order() yields zero;
+ * otherwise runs MPIDI_CH4U_handle_acc_cmpl() with the "get" part enabled
+ * and, on success, calls MPIDI_CH4U_progress_cmpl_list().
+ *
+ * NOTE(review): the early return skips MPIR_FUNC_VERBOSE_EXIT.
+ *
+ * Returns MPI_SUCCESS or an MPI error code. */
+static inline int MPIDI_CH4U_get_acc_cmpl_handler(MPIR_Request * rreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_GET_ACC_HANDLER);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_GET_ACC_HANDLER);
+
+    if (MPIDI_CH4U_check_cmpl_order(rreq, MPIDI_CH4U_get_acc_cmpl_handler) == 0) {
+        return mpi_errno;
+    }
+
+    /* do_get = 1: the previous target contents are returned with the ack. */
+    mpi_errno = MPIDI_CH4U_handle_acc_cmpl(rreq, 1);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+    MPIDI_CH4U_progress_cmpl_list();
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_GET_ACC_HANDLER);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_get_ack_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler for the acknowledgement of a GET.
+ *
+ * rreq refers, through greq.greq_ptr, to the GET request it answers.
+ * Returns immediately when MPIDI_CH4U_check_cmpl_order() yields zero.
+ * Otherwise it frees that GET request's receive iov (non-contiguous
+ * receive), decrements the outstanding_ops counter of the GET request's
+ * window, completes both requests and calls
+ * MPIDI_CH4U_progress_cmpl_list().
+ *
+ * NOTE(review): the early return skips MPIR_FUNC_VERBOSE_EXIT.
+ *
+ * Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_get_ack_cmpl_handler(MPIR_Request * rreq)
+{
+    MPIR_Request *get_req;
+    MPIR_Win *origin_win;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_GET_ACK_CMPL_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_GET_ACK_CMPL_HANDLER);
+
+    if (MPIDI_CH4U_check_cmpl_order(rreq, MPIDI_CH4U_get_ack_cmpl_handler) == 0) {
+        return mpi_errno;
+    }
+
+    get_req = (MPIR_Request *) MPIDI_CH4U_REQUEST(rreq, req->greq.greq_ptr);
+
+    if ((MPIDI_CH4U_REQUEST(get_req, req->status) & MPIDI_CH4U_REQ_RCV_NON_CONTIG) != 0) {
+        MPL_free(MPIDI_CH4U_REQUEST(get_req, req->iov));
+    }
+
+    /* Critical-section enter/exit around the decrement is disabled here. */
+    origin_win = MPIDI_CH4U_REQUEST(get_req, req->greq.win_ptr);
+    OPA_decr_int(&MPIDI_CH4U_WIN(origin_win, outstanding_ops));
+
+    /* Finish the GET request first, then the ack request. */
+    MPIDI_CH4I_am_request_complete(get_req);
+    MPIDI_CH4I_am_request_complete(rreq);
+
+    MPIDI_CH4U_progress_cmpl_list();
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_GET_ACK_CMPL_HANDLER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_cswap_ack_cmpl_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Completion handler for the acknowledgement of a compare-and-swap.
+ *
+ * Returns immediately when MPIDI_CH4U_check_cmpl_order() yields zero.
+ * Otherwise it decrements the window's outstanding_ops counter, frees the
+ * creq.data buffer, completes the request and calls
+ * MPIDI_CH4U_progress_cmpl_list().
+ *
+ * NOTE(review): the early return skips MPIR_FUNC_VERBOSE_EXIT.
+ *
+ * Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4U_cswap_ack_cmpl_handler(MPIR_Request * rreq)
+{
+    MPIR_Win *origin_win;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_CSWAP_ACK_CMPL_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_CSWAP_ACK_CMPL_HANDLER);
+
+    if (MPIDI_CH4U_check_cmpl_order(rreq, MPIDI_CH4U_cswap_ack_cmpl_handler) == 0) {
+        return mpi_errno;
+    }
+
+    /* Critical-section enter/exit around the decrement is disabled here. */
+    origin_win = MPIDI_CH4U_REQUEST(rreq, req->creq.win_ptr);
+    OPA_decr_int(&MPIDI_CH4U_WIN(origin_win, outstanding_ops));
+
+    MPL_free(MPIDI_CH4U_REQUEST(rreq, req->creq.data));
+    MPIDI_CH4I_am_request_complete(rreq);
+
+    MPIDI_CH4U_progress_cmpl_list();
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_CSWAP_ACK_CMPL_HANDLER);
+    return mpi_errno;
+}
+
+/*
+ * Main body of the send target handler, shared by the eager send handler and
+ * the long-send payload (LMT) handler.
+ *
+ * Describes the receive buffer of rreq to the caller:
+ *   - contiguous datatype:     *data is the buffer address adjusted by the true
+ *                              lower bound and *p_data_sz is the buffer size in
+ *                              bytes;
+ *   - non-contiguous datatype: *data is an iovec array allocated here and
+ *                              stored in the request, and *p_data_sz is the
+ *                              number of entries in that array.
+ * *is_contig tells the caller which of the two forms was produced.  The value
+ * of *p_data_sz on entry is read only on the non-contiguous path, where it is
+ * compared against the size of the receive buffer.
+ *
+ * Also installs MPIDI_CH4U_recv_cmpl_handler as the completion handler and
+ * stamps the request with the next global sequence number.  Always returns
+ * MPI_SUCCESS; truncation and datatype problems are recorded in
+ * rreq->status.MPI_ERROR instead.
+ */
+static inline int MPIDI_CH4I_do_send_target_handler(void **data,
+                                                    size_t * p_data_sz,
+                                                    int *is_contig,
+                                                    MPIDI_NM_am_completion_handler_fn *
+                                                    cmpl_handler_fn, MPIR_Request * rreq)
+{
+    int dt_contig, n_iov;
+    MPI_Aint dt_true_lb, last, num_iov;
+    MPIR_Datatype *dt_ptr;
+    MPID_Segment *segment_ptr;
+    size_t data_sz;
+
+    *cmpl_handler_fn = MPIDI_CH4U_recv_cmpl_handler;
+    MPIDI_CH4U_REQUEST(rreq, req->seq_no) = OPA_fetch_and_add_int(&MPIDI_CH4_Global.nxt_seq_no, 1);
+
+    /* Fills dt_contig, data_sz, dt_ptr and dt_true_lb for the receive buffer. */
+    MPIDI_Datatype_get_info(MPIDI_CH4U_REQUEST(rreq, count),
+                            MPIDI_CH4U_REQUEST(rreq, datatype),
+                            dt_contig, data_sz, dt_ptr, dt_true_lb);
+    *is_contig = dt_contig;
+
+    if (dt_contig) {
+        /* Contiguous: hand back the buffer address and its capacity in bytes. */
+        *p_data_sz = data_sz;
+        *data = (char *) MPIDI_CH4U_REQUEST(rreq, buffer) + dt_true_lb;
+    }
+    else {
+        segment_ptr = MPIDU_Segment_alloc();
+        MPIR_Assert(segment_ptr);
+
+        MPIDU_Segment_init(MPIDI_CH4U_REQUEST(rreq, buffer),
+                           MPIDI_CH4U_REQUEST(rreq, count),
+                           MPIDI_CH4U_REQUEST(rreq, datatype), segment_ptr, 0);
+
+        /* Incoming size exceeds the receive buffer: flag truncation and clamp. */
+        if (*p_data_sz > data_sz) {
+            rreq->status.MPI_ERROR = MPI_ERR_TRUNCATE;
+            *p_data_sz = data_sz;
+        }
+        /* Size the iovec array for the whole receive buffer. */
+        last = data_sz;
+        MPIDU_Segment_count_contig_blocks(segment_ptr, 0, &last, &num_iov);
+        n_iov = (int) num_iov;
+        MPIR_Assert(n_iov > 0);
+        MPIDI_CH4U_REQUEST(rreq, req->iov) =
+            (struct iovec *) MPL_malloc(n_iov * sizeof(struct iovec));
+        MPIR_Assert(MPIDI_CH4U_REQUEST(rreq, req->iov));
+
+        /* Fill the iovec for the (possibly clamped) incoming size; "last" is
+         * updated by the call and is compared with the requested size below. */
+        last = *p_data_sz;
+        MPIDU_Segment_pack_vector(segment_ptr, 0, &last, MPIDI_CH4U_REQUEST(rreq, req->iov),
+                                  &n_iov);
+        if (last != (MPI_Aint) * p_data_sz) {
+            rreq->status.MPI_ERROR = MPI_ERR_TYPE;
+        }
+        /* From here on *p_data_sz carries an iovec entry count, not bytes. */
+        *data = MPIDI_CH4U_REQUEST(rreq, req->iov);
+        *p_data_sz = n_iov;
+        /* Flag the request as a non-contiguous receive that owns req->iov. */
+        MPIDI_CH4U_REQUEST(rreq, req->status) |= MPIDI_CH4U_REQ_RCV_NON_CONTIG;
+        MPL_free(segment_ptr);
+    }
+
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_send_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Target-side handler for an eager send message.
+ *
+ * Derives the context id from the header's msg_tag, looks up the communicator
+ * for it and, if one is found, tries to dequeue a matching posted receive.
+ *   - No match: a new receive request is created with a temporary byte buffer
+ *     of *p_data_sz bytes, flagged BUSY and UNEXPECTED, and queued either on the
+ *     communicator's unexpected list (taking a communicator reference) or, when
+ *     no communicator is known, on the list returned by
+ *     MPIDI_CH4U_context_id_to_uelist().
+ *   - Match: the communicator reference held for the posted-list entry is
+ *     released and the request is updated with the sender's rank and tag.
+ *
+ * The request is returned through *req; the receive buffer description is then
+ * produced by MPIDI_CH4I_do_send_target_handler(), whose result is returned.
+ */
+static inline int MPIDI_CH4U_send_target_handler(void *am_hdr,
+                                                 void **data,
+                                                 size_t * p_data_sz,
+                                                 int *is_contig,
+                                                 MPIDI_NM_am_completion_handler_fn *
+                                                 cmpl_handler_fn, MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *rreq = NULL;
+    MPIR_Comm *root_comm;
+    MPIDI_CH4U_hdr_t *hdr = (MPIDI_CH4U_hdr_t *) am_hdr;
+    MPIR_Context_id_t context_id;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_SEND_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_SEND_HANDLER);
+    context_id = MPIDI_CH4U_get_context(hdr->msg_tag);
+    root_comm = MPIDI_CH4U_context_id_to_comm(context_id);
+    if (root_comm) {
+        /* MPIDI_CS_ENTER(); */
+        rreq = MPIDI_CH4U_dequeue_posted(hdr->msg_tag, &MPIDI_CH4U_COMM(root_comm, posted_list));
+        /* MPIDI_CS_EXIT(); */
+    }
+
+    if (rreq == NULL) {
+        /* No posted receive matched: buffer the message as unexpected. */
+        rreq = MPIDI_CH4I_am_request_create(MPIR_REQUEST_KIND__RECV);
+        /* Fail loudly if no request could be created, as
+         * MPIDI_CH4U_put_target_handler does, instead of dereferencing NULL. */
+        MPIR_Assert(rreq);
+        MPIDI_CH4U_REQUEST(rreq, buffer) = (char *) MPL_malloc(*p_data_sz);
+        MPIDI_CH4U_REQUEST(rreq, datatype) = MPI_BYTE;
+        MPIDI_CH4U_REQUEST(rreq, count) = *p_data_sz;
+        MPIDI_CH4U_REQUEST(rreq, tag) = hdr->msg_tag;
+        MPIDI_CH4U_REQUEST(rreq, src_rank) = hdr->src_rank;
+        MPIDI_CH4U_REQUEST(rreq, req->status) |= MPIDI_CH4U_REQ_BUSY;
+        MPIDI_CH4U_REQUEST(rreq, req->status) |= MPIDI_CH4U_REQ_UNEXPECTED;
+        /* MPIDI_CS_ENTER(); */
+        if (root_comm) {
+            /* The unexpected-list entry holds a reference on the communicator. */
+            MPIR_Comm_add_ref(root_comm);
+            MPIDI_CH4U_enqueue_unexp(rreq, &MPIDI_CH4U_COMM(root_comm, unexp_list));
+        }
+        else {
+            /* Communicator not known here: park the request by context id. */
+            MPIDI_CH4U_enqueue_unexp(rreq, MPIDI_CH4U_context_id_to_uelist(context_id));
+        }
+        /* MPIDI_CS_EXIT(); */
+    }
+    else {
+        /* rreq != NULL <=> root_comm != NULL */
+        MPIR_Assert(root_comm);
+        /* Decrement the refcnt when popping a request out from posted_list */
+        MPIR_Comm_release(root_comm);
+        MPIDI_CH4U_REQUEST(rreq, src_rank) = hdr->src_rank;
+        MPIDI_CH4U_REQUEST(rreq, tag) = hdr->msg_tag;
+    }
+
+    *req = rreq;
+
+    mpi_errno = MPIDI_CH4I_do_send_target_handler(data, p_data_sz, is_contig,
+                                                  cmpl_handler_fn, rreq);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_SEND_HANDLER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_send_long_req_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Target-side handler for the request message that opens a long send.
+ *
+ * The header is read both as the common MPIDI_CH4U_hdr_t (tag, source rank) and
+ * as MPIDI_CH4U_send_long_req_msg_t (data size, sender's request pointer).
+ *   - No matching posted receive: a new receive request without a buffer is
+ *     created, flagged MPIDI_CH4U_REQ_LONG_RTS, given the announced size and
+ *     the sender's request pointer, and queued as unexpected.
+ *   - Matching posted receive: the communicator reference held for the
+ *     posted-list entry is released, the request is updated the same way, and
+ *     MPIDI_NM_am_recv() is called on it.
+ *
+ * This handler does not write *data, *p_data_sz, *is_contig, *cmpl_handler_fn
+ * or *req.  Returns MPI_SUCCESS or the error from MPIDI_NM_am_recv().
+ */
+static inline int MPIDI_CH4U_send_long_req_target_handler(void *am_hdr,
+                                                          void **data,
+                                                          size_t * p_data_sz,
+                                                          int *is_contig,
+                                                          MPIDI_NM_am_completion_handler_fn *
+                                                          cmpl_handler_fn, MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *rreq = NULL;
+    MPIR_Comm *root_comm;
+    MPIDI_CH4U_hdr_t *hdr = (MPIDI_CH4U_hdr_t *) am_hdr;
+    MPIDI_CH4U_send_long_req_msg_t *lreq_hdr = (MPIDI_CH4U_send_long_req_msg_t *) am_hdr;
+    MPIR_Context_id_t context_id;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_SEND_LONG_REQ_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_SEND_LONG_REQ_HANDLER);
+
+    context_id = MPIDI_CH4U_get_context(hdr->msg_tag);
+    root_comm = MPIDI_CH4U_context_id_to_comm(context_id);
+    if (root_comm) {
+        /* MPIDI_CS_ENTER(); */
+        rreq = MPIDI_CH4U_dequeue_posted(hdr->msg_tag, &MPIDI_CH4U_COMM(root_comm, posted_list));
+        /* MPIDI_CS_EXIT(); */
+    }
+
+    if (rreq == NULL) {
+        /* No posted receive matched: record the announcement as unexpected. */
+        rreq = MPIDI_CH4I_am_request_create(MPIR_REQUEST_KIND__RECV);
+        /* Fail loudly if no request could be created, as
+         * MPIDI_CH4U_put_target_handler does, instead of dereferencing NULL. */
+        MPIR_Assert(rreq);
+
+        /* No payload has arrived yet, so there is no buffer to attach. */
+        MPIDI_CH4U_REQUEST(rreq, buffer) = NULL;
+        MPIDI_CH4U_REQUEST(rreq, datatype) = MPI_BYTE;
+        MPIDI_CH4U_REQUEST(rreq, count) = lreq_hdr->data_sz;
+        MPIDI_CH4U_REQUEST(rreq, req->status) |= MPIDI_CH4U_REQ_LONG_RTS;
+        MPIDI_CH4U_REQUEST(rreq, req->rreq.peer_req_ptr) = lreq_hdr->sreq_ptr;
+        MPIDI_CH4U_REQUEST(rreq, tag) = hdr->msg_tag;
+        MPIDI_CH4U_REQUEST(rreq, src_rank) = hdr->src_rank;
+
+        /* MPIDI_CS_ENTER(); */
+        if (root_comm) {
+            /* The unexpected-list entry holds a reference on the communicator. */
+            MPIR_Comm_add_ref(root_comm);
+            MPIDI_CH4U_enqueue_unexp(rreq, &MPIDI_CH4U_COMM(root_comm, unexp_list));
+        }
+        else {
+            MPIDI_CH4U_enqueue_unexp(rreq, MPIDI_CH4U_context_id_to_uelist(context_id));
+        }
+        /* MPIDI_CS_EXIT(); */
+    }
+    else {
+        /* Matching receive was posted, tell the netmod */
+        MPIR_Comm_release(root_comm);   /* -1 for posted_list */
+        MPIDI_CH4U_REQUEST(rreq, req->status) |= MPIDI_CH4U_REQ_LONG_RTS;
+        MPIDI_CH4U_REQUEST(rreq, req->rreq.peer_req_ptr) = lreq_hdr->sreq_ptr;
+        MPIDI_CH4U_REQUEST(rreq, tag) = hdr->msg_tag;
+        MPIDI_CH4U_REQUEST(rreq, src_rank) = hdr->src_rank;
+        mpi_errno = MPIDI_NM_am_recv(rreq);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_SEND_LONG_REQ_HANDLER);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_send_long_lmt_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Target-side handler for the payload message of a long send.
+ *
+ * The receive request is named directly by the header's rreq_ptr field, so no
+ * matching is performed.  The receive buffer description is produced by
+ * MPIDI_CH4I_do_send_target_handler() and the request is returned via *req.
+ */
+static inline int MPIDI_CH4U_send_long_lmt_target_handler(void *am_hdr,
+                                                          void **data,
+                                                          size_t * p_data_sz,
+                                                          int *is_contig,
+                                                          MPIDI_NM_am_completion_handler_fn *
+                                                          cmpl_handler_fn, MPIR_Request ** req)
+{
+    MPIDI_CH4U_send_long_lmt_msg_t *payload_hdr = (MPIDI_CH4U_send_long_lmt_msg_t *) am_hdr;
+    MPIR_Request *recv_req;
+    int rc;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_SEND_LONG_LMT_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_SEND_LONG_LMT_HANDLER);
+
+    /* The header carries the receive request pointer itself. */
+    recv_req = (MPIR_Request *) payload_hdr->rreq_ptr;
+    MPIR_Assert(recv_req);
+
+    rc = MPIDI_CH4I_do_send_target_handler(data, p_data_sz, is_contig, cmpl_handler_fn,
+                                           recv_req);
+    *req = recv_req;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_SEND_LONG_LMT_HANDLER);
+
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_ssend_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Target-side handler for a synchronous send.
+ *
+ * Delegates all matching and buffer setup to MPIDI_CH4U_send_target_handler(),
+ * then records the sender's request pointer on the resulting receive request
+ * and flags it MPIDI_CH4U_REQ_PEER_SSEND.
+ */
+static inline int MPIDI_CH4U_ssend_target_handler(void *am_hdr,
+                                                  void **data,
+                                                  size_t * p_data_sz,
+                                                  int *is_contig,
+                                                  MPIDI_NM_am_completion_handler_fn *
+                                                  cmpl_handler_fn, MPIR_Request ** req)
+{
+    MPIDI_CH4U_ssend_req_msg_t *ssend_hdr = (MPIDI_CH4U_ssend_req_msg_t *) am_hdr;
+    MPIR_Request *matched;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_SSEND_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_SSEND_HANDLER);
+
+    /* Same receive-side processing as a plain eager send. */
+    mpi_errno = MPIDI_CH4U_send_target_handler(am_hdr, data, p_data_sz, is_contig,
+                                               cmpl_handler_fn, req);
+    if (mpi_errno) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+    MPIR_Assert(req);
+    matched = *req;
+
+    /* Remember the sender's request and mark the receive as answering an ssend. */
+    MPIDI_CH4U_REQUEST(matched, req->rreq.peer_req_ptr) = ssend_hdr->sreq_ptr;
+    MPIDI_CH4U_REQUEST(matched, req->status) |= MPIDI_CH4U_REQ_PEER_SSEND;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_SSEND_HANDLER);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_ssend_ack_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Handler for the acknowledgement of a synchronous send.
+ *
+ * Completes the send request named by the header's sreq_ptr.  The message has
+ * no payload phase: *req and *cmpl_handler_fn are cleared when the
+ * corresponding pointers are supplied.  Always returns MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_ssend_ack_target_handler(void *am_hdr,
+                                                      void **data,
+                                                      size_t * p_data_sz, int *is_contig,
+                                                      MPIDI_NM_am_completion_handler_fn *
+                                                      cmpl_handler_fn, MPIR_Request ** req)
+{
+    MPIDI_CH4U_ssend_ack_msg_t *ack_hdr = (MPIDI_CH4U_ssend_ack_msg_t *) am_hdr;
+    MPIR_Request *acked_req;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_SSEND_ACK_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_SSEND_ACK_HANDLER);
+
+    acked_req = (MPIR_Request *) ack_hdr->sreq_ptr;
+    MPIDI_CH4I_am_request_complete(acked_req);
+
+    /* Nothing further is expected for this message. */
+    if (cmpl_handler_fn != NULL) {
+        *cmpl_handler_fn = NULL;
+    }
+    if (req != NULL) {
+        *req = NULL;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_SSEND_ACK_HANDLER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_send_long_ack_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Handler for the acknowledgement of a long-send request.
+ *
+ * The header names the local send request (sreq_ptr) and the peer's receive
+ * request (rreq_ptr).  The handler starts the main data transfer by sending a
+ * MPIDI_CH4U_SEND_LONG_LMT reply that carries rreq_ptr as its header and the
+ * buffer, count and datatype stored in the send request's req->lreq fields.
+ *
+ * On success *cmpl_handler_fn (if supplied) is set to
+ * MPIDI_CH4U_send_origin_cmpl_handler.  Returns MPI_SUCCESS or the error from
+ * MPIDI_NM_send_am_reply().
+ */
+static inline int MPIDI_CH4U_send_long_ack_target_handler(void *am_hdr,
+                                                          void **data,
+                                                          size_t * p_data_sz, int *is_contig,
+                                                          MPIDI_NM_am_completion_handler_fn *
+                                                          cmpl_handler_fn, MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *sreq;
+    MPIDI_CH4U_send_long_ack_msg_t *msg_hdr = (MPIDI_CH4U_send_long_ack_msg_t *) am_hdr;
+    MPIDI_CH4U_send_long_lmt_msg_t send_hdr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_SEND_LONG_ACK_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_SEND_LONG_ACK_HANDLER);
+
+    sreq = (MPIR_Request *) msg_hdr->sreq_ptr;
+    MPIR_Assert(sreq != NULL);
+
+    /* Start the main data transfer */
+    send_hdr.rreq_ptr = msg_hdr->rreq_ptr;
+    mpi_errno =
+        MPIDI_NM_send_am_reply(MPIDI_CH4U_get_context(MPIDI_CH4U_REQUEST(sreq, req->lreq).msg_tag),
+                               MPIDI_CH4U_REQUEST(sreq, src_rank),
+                               MPIDI_CH4U_SEND_LONG_LMT,
+                               &send_hdr, sizeof(send_hdr),
+                               MPIDI_CH4U_REQUEST(sreq, req->lreq).src_buf,
+                               MPIDI_CH4U_REQUEST(sreq, req->lreq).count,
+                               MPIDI_CH4U_REQUEST(sreq, req->lreq).datatype, sreq);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    if (cmpl_handler_fn)
+        *cmpl_handler_fn = MPIDI_CH4U_send_origin_cmpl_handler;
+
+    /* The EXIT marker lives after fn_exit so the error path reaches it too. */
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_SEND_LONG_ACK_HANDLER);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_put_ack_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Handler for the acknowledgement of a PUT.
+ *
+ * Frees the iovec copy kept in the put request (if any), completes the
+ * request named by the header's preq_ptr and decrements the window's
+ * outstanding-operation counter.  There is no payload phase: *req and
+ * *cmpl_handler_fn are cleared when supplied.  Always returns MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_put_ack_target_handler(void *am_hdr,
+                                                    void **data,
+                                                    size_t * p_data_sz, int *is_contig,
+                                                    MPIDI_NM_am_completion_handler_fn *
+                                                    cmpl_handler_fn, MPIR_Request ** req)
+{
+    MPIDI_CH4U_put_ack_msg_t *ack_hdr = (MPIDI_CH4U_put_ack_msg_t *) am_hdr;
+    MPIR_Request *put_req;
+    MPIR_Win *target_win;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_PUT_ACK_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_PUT_ACK_HANDLER);
+
+    put_req = (MPIR_Request *) ack_hdr->preq_ptr;
+    /* Read the window before the request is completed below. */
+    target_win = MPIDI_CH4U_REQUEST(put_req, req->preq.win_ptr);
+
+    if (MPIDI_CH4U_REQUEST(put_req, req->preq.dt_iov) != NULL) {
+        MPL_free(MPIDI_CH4U_REQUEST(put_req, req->preq.dt_iov));
+    }
+
+    MPIDI_CH4I_am_request_complete(put_req);
+
+    /* MPIDI_CS_ENTER(); */
+    OPA_decr_int(&MPIDI_CH4U_WIN(target_win, outstanding_ops));
+    /* MPIDI_CS_EXIT(); */
+
+    /* Nothing further is expected for this message. */
+    if (cmpl_handler_fn != NULL) {
+        *cmpl_handler_fn = NULL;
+    }
+    if (req != NULL) {
+        *req = NULL;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_PUT_ACK_HANDLER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_acc_ack_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Handler for the acknowledgement of an ACCUMULATE.
+ *
+ * Frees the iovec copy kept in the accumulate request (if any), completes the
+ * request named by the header's req_ptr and decrements the window's
+ * outstanding-operation counter.  There is no payload phase: *req and
+ * *cmpl_handler_fn are cleared when supplied.  Always returns MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_acc_ack_target_handler(void *am_hdr,
+                                                    void **data,
+                                                    size_t * p_data_sz, int *is_contig,
+                                                    MPIDI_NM_am_completion_handler_fn *
+                                                    cmpl_handler_fn, MPIR_Request ** req)
+{
+    MPIDI_CH4U_acc_ack_msg_t *ack_hdr = (MPIDI_CH4U_acc_ack_msg_t *) am_hdr;
+    MPIR_Request *acc_req;
+    MPIR_Win *target_win;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ACC_ACK_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ACC_ACK_HANDLER);
+
+    acc_req = (MPIR_Request *) ack_hdr->req_ptr;
+    /* Read the window before the request is completed below. */
+    target_win = MPIDI_CH4U_REQUEST(acc_req, req->areq.win_ptr);
+
+    if (MPIDI_CH4U_REQUEST(acc_req, req->areq.dt_iov) != NULL) {
+        MPL_free(MPIDI_CH4U_REQUEST(acc_req, req->areq.dt_iov));
+    }
+
+    MPIDI_CH4I_am_request_complete(acc_req);
+
+    /* MPIDI_CS_ENTER(); */
+    OPA_decr_int(&MPIDI_CH4U_WIN(target_win, outstanding_ops));
+    /* MPIDI_CS_EXIT(); */
+
+    /* Nothing further is expected for this message. */
+    if (cmpl_handler_fn != NULL) {
+        *cmpl_handler_fn = NULL;
+    }
+    if (req != NULL) {
+        *req = NULL;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ACC_ACK_HANDLER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_get_acc_ack_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Handler for the acknowledgement of a GET_ACCUMULATE, which carries the
+ * result data back.
+ *
+ * Frees the iovec copy kept in the request (if any) and then describes the
+ * request's result buffer (result_addr / result_count / result_datatype) to
+ * the caller:
+ *   - contiguous datatype:     *data is the result address adjusted by the true
+ *                              lower bound and *p_data_sz is its size in bytes;
+ *   - non-contiguous datatype: *data is an iovec array allocated here and
+ *                              stored in the request's req->iov, and
+ *                              *p_data_sz is the number of entries.
+ * The request is returned via *req with MPIDI_CH4U_get_acc_ack_cmpl_handler as
+ * its completion handler and a fresh sequence number.  Always returns
+ * MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_get_acc_ack_target_handler(void *am_hdr,
+                                                        void **data,
+                                                        size_t * p_data_sz, int *is_contig,
+                                                        MPIDI_NM_am_completion_handler_fn *
+                                                        cmpl_handler_fn, MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_CH4U_acc_ack_msg_t *msg_hdr = (MPIDI_CH4U_acc_ack_msg_t *) am_hdr;
+    MPIR_Request *areq;
+
+    size_t data_sz;
+    int dt_contig, n_iov;
+    MPI_Aint dt_true_lb, last, num_iov;
+    MPIR_Datatype *dt_ptr;
+    MPID_Segment *segment_ptr;
+
+    /* NOTE(review): the state name is the one used by the plain ACC ack
+     * handler; looks like a copy/paste leftover - confirm whether a dedicated
+     * state is wanted. */
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ACC_ACK_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ACC_ACK_HANDLER);
+
+    areq = (MPIR_Request *) msg_hdr->req_ptr;
+
+    if (MPIDI_CH4U_REQUEST(areq, req->areq.dt_iov)) {
+        MPL_free(MPIDI_CH4U_REQUEST(areq, req->areq.dt_iov));
+    }
+
+    /* Fills dt_contig, data_sz, dt_ptr and dt_true_lb for the result buffer. */
+    MPIDI_Datatype_get_info(MPIDI_CH4U_REQUEST(areq, req->areq.result_count),
+                            MPIDI_CH4U_REQUEST(areq, req->areq.result_datatype),
+                            dt_contig, data_sz, dt_ptr, dt_true_lb);
+    *is_contig = dt_contig;
+
+    if (dt_contig) {
+        /* Contiguous: hand back the result address and its size in bytes. */
+        *p_data_sz = data_sz;
+        *data = (char *) MPIDI_CH4U_REQUEST(areq, req->areq.result_addr) + dt_true_lb;
+    }
+    else {
+        segment_ptr = MPIDU_Segment_alloc();
+        MPIR_Assert(segment_ptr);
+
+        MPIDU_Segment_init(MPIDI_CH4U_REQUEST(areq, req->areq.result_addr),
+                           MPIDI_CH4U_REQUEST(areq, req->areq.result_count),
+                           MPIDI_CH4U_REQUEST(areq, req->areq.result_datatype), segment_ptr, 0);
+
+        /* Size the iovec array for the whole result buffer. */
+        last = data_sz;
+        MPIDU_Segment_count_contig_blocks(segment_ptr, 0, &last, &num_iov);
+        n_iov = (int) num_iov;
+        MPIR_Assert(n_iov > 0);
+        MPIDI_CH4U_REQUEST(areq, req->iov) =
+            (struct iovec *) MPL_malloc(n_iov * sizeof(struct iovec));
+        MPIR_Assert(MPIDI_CH4U_REQUEST(areq, req->iov));
+
+        /* Fill the iovec; the whole buffer must be covered. */
+        last = data_sz;
+        MPIDU_Segment_pack_vector(segment_ptr, 0, &last, MPIDI_CH4U_REQUEST(areq, req->iov),
+                                  &n_iov);
+        MPIR_Assert(last == (MPI_Aint) data_sz);
+        /* From here on *p_data_sz carries an iovec entry count, not bytes. */
+        *data = MPIDI_CH4U_REQUEST(areq, req->iov);
+        *p_data_sz = n_iov;
+        /* Flag the request as a non-contiguous receive that owns req->iov. */
+        MPIDI_CH4U_REQUEST(areq, req->status) |= MPIDI_CH4U_REQ_RCV_NON_CONTIG;
+        MPL_free(segment_ptr);
+    }
+
+    *req = areq;
+    *cmpl_handler_fn = MPIDI_CH4U_get_acc_ack_cmpl_handler;
+    MPIDI_CH4U_REQUEST(areq, req->seq_no) = OPA_fetch_and_add_int(&MPIDI_CH4_Global.nxt_seq_no, 1);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ACC_ACK_HANDLER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_cswap_ack_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Handler for the acknowledgement of a COMPARE_AND_SWAP, which carries the
+ * result value back.
+ *
+ * Points the caller at the request's result address as a contiguous buffer
+ * sized for one element of the request's datatype, returns the request via
+ * *req with MPIDI_CH4U_cswap_ack_cmpl_handler as its completion handler, and
+ * stamps it with a fresh sequence number.  Always returns MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_cswap_ack_target_handler(void *am_hdr,
+                                                      void **data,
+                                                      size_t * p_data_sz, int *is_contig,
+                                                      MPIDI_NM_am_completion_handler_fn *
+                                                      cmpl_handler_fn, MPIR_Request ** req)
+{
+    MPIDI_CH4U_cswap_ack_msg_t *ack_hdr = (MPIDI_CH4U_cswap_ack_msg_t *) am_hdr;
+    MPIR_Request *cswap_req;
+    uint64_t result_sz;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_CSWAP_ACK_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_CSWAP_ACK_HANDLER);
+
+    cswap_req = (MPIR_Request *) ack_hdr->req_ptr;
+
+    /* Size of a single element of the request's datatype. */
+    MPIDI_Datatype_check_size(MPIDI_CH4U_REQUEST(cswap_req, req->creq.datatype), 1, result_sz);
+
+    /* Receive target: the request's result buffer, always contiguous. */
+    *is_contig = 1;
+    *p_data_sz = result_sz;
+    *data = MPIDI_CH4U_REQUEST(cswap_req, req->creq.result_addr);
+
+    /* Completion bookkeeping for the payload phase. */
+    *cmpl_handler_fn = MPIDI_CH4U_cswap_ack_cmpl_handler;
+    *req = cswap_req;
+    MPIDI_CH4U_REQUEST(cswap_req, req->seq_no) =
+        OPA_fetch_and_add_int(&MPIDI_CH4_Global.nxt_seq_no, 1);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_CSWAP_ACK_HANDLER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_win_lock_advance
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Grant as many queued lock requests on this window as the current lock state
+ * allows.
+ *
+ * A request at the head of the window's "requested" queue is granted when the
+ * window is not locked at all (local.count == 0), or when both the lock
+ * currently held and the requested one are MPI_LOCK_SHARED.  Granting a request
+ * dequeues it, bumps local.count, records its lock type and sends a LOCK_ACK
+ * or LOCKALL_ACK control message to the requesting rank.  The loop stops at the
+ * first request that cannot be granted.
+ *
+ * Returns MPI_SUCCESS, an MPI_ERR_OTHER-class error for a queued record with an
+ * unknown message type, or the error from MPIDI_NM_inject_am_hdr_reply().  The
+ * dequeued record is released on the error paths as well.
+ */
+static inline int MPIDI_CH4U_win_lock_advance(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    struct MPIDI_CH4U_win_sync_lock *slock = &MPIDI_CH4U_WIN(win, sync).lock;
+    struct MPIDI_CH4U_win_queue *q = &slock->local.requested;
+    struct MPIDI_CH4U_win_lock *lock = NULL;    /* record currently being granted */
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_WIN_LOCK_ADVANCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_WIN_LOCK_ADVANCE);
+
+    /* Iterative form of "grant the head, then try again". */
+    while ((q->head != NULL) && ((slock->local.count == 0) ||
+                                 ((slock->local.type == MPI_LOCK_SHARED) &&
+                                  (q->head->type == MPI_LOCK_SHARED)))) {
+        MPIDI_CH4U_win_cntrl_msg_t msg;
+
+        /* Pop the head of the request queue. */
+        lock = q->head;
+        q->head = lock->next;
+
+        if (q->head == NULL)
+            q->tail = NULL;
+
+        ++slock->local.count;
+        slock->local.type = lock->type;
+
+        msg.win_id = MPIDI_CH4U_WIN(win, win_id);
+        msg.origin_rank = win->comm_ptr->rank;
+
+        /* Answer with the ack flavour matching the request flavour. */
+        if (lock->mtype == MPIDI_CH4U_WIN_LOCK) {
+            msg.type = MPIDI_CH4U_WIN_LOCK_ACK;
+        }
+        else if (lock->mtype == MPIDI_CH4U_WIN_LOCKALL) {
+            msg.type = MPIDI_CH4U_WIN_LOCKALL_ACK;
+        }
+        else {
+            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**rmasync");
+        }
+
+        mpi_errno = MPIDI_NM_inject_am_hdr_reply(MPIDI_CH4U_win_to_context(win),
+                                                 lock->rank,
+                                                 MPIDI_CH4U_WIN_CTRL, &msg, sizeof(msg));
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+
+        MPL_free(lock);
+        lock = NULL;
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_WIN_LOCK_ADVANCE);
+    return mpi_errno;
+  fn_fail:
+    /* The record was already unlinked from the queue; do not leak it. */
+    if (lock != NULL)
+        MPL_free(lock);
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_win_lock_req_proc
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Process an incoming LOCK or LOCKALL request for a window.
+ *
+ * Allocates a zero-initialised lock record, fills it from the control message
+ * (message type, origin rank, lock type), appends it to the tail of the
+ * window's "requested" queue and then calls MPIDI_CH4U_win_lock_advance() to
+ * grant whatever can be granted.  The result of that call is not propagated
+ * because this function returns void.
+ */
+static inline void MPIDI_CH4U_win_lock_req_proc(const MPIDI_CH4U_win_cntrl_msg_t * info,
+                                                MPIR_Win * win)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_WIN_LOCK_REQ_PROC);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_WIN_LOCK_REQ_PROC);
+
+    struct MPIDI_CH4U_win_lock *lock = (struct MPIDI_CH4U_win_lock *)
+        MPL_calloc(1, sizeof(struct MPIDI_CH4U_win_lock));
+    /* Same allocation check as the other handlers in this file; without it an
+     * allocation failure is a NULL dereference on the next line. */
+    MPIR_Assert(lock);
+
+    if (info->type == MPIDI_CH4U_WIN_LOCK)
+        lock->mtype = MPIDI_CH4U_WIN_LOCK;
+    else if (info->type == MPIDI_CH4U_WIN_LOCKALL)
+        lock->mtype = MPIDI_CH4U_WIN_LOCKALL;
+
+    lock->rank = info->origin_rank;
+    lock->type = info->lock_type;
+    struct MPIDI_CH4U_win_queue *q = &MPIDI_CH4U_WIN(win, sync).lock.local.requested;
+    /* head and tail must be both NULL (empty queue) or both non-NULL. */
+    MPIR_Assert((q->head != NULL) ^ (q->tail == NULL));
+
+    /* Append at the tail; lock->next is already NULL thanks to calloc. */
+    if (q->tail == NULL)
+        q->head = lock;
+    else
+        q->tail->next = lock;
+
+    q->tail = lock;
+
+    MPIDI_CH4U_win_lock_advance(win);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_WIN_LOCK_REQ_PROC);
+    return;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_win_lock_ack_proc
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Process an incoming LOCK_ACK or LOCKALL_ACK for a window by incrementing the
+ * matching counter (remote.locked or remote.allLocked) in the window's lock
+ * state.  Any other message type is ignored.
+ */
+static inline void MPIDI_CH4U_win_lock_ack_proc(const MPIDI_CH4U_win_cntrl_msg_t * info,
+                                                MPIR_Win * win)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_WIN_LOCK_ACK_PROC);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_WIN_LOCK_ACK_PROC);
+
+    struct MPIDI_CH4U_win_sync_lock *lock_state = &MPIDI_CH4U_WIN(win, sync).lock;
+
+    if (info->type == MPIDI_CH4U_WIN_LOCK_ACK) {
+        lock_state->remote.locked++;
+    }
+    else if (info->type == MPIDI_CH4U_WIN_LOCKALL_ACK) {
+        lock_state->remote.allLocked++;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_WIN_LOCK_ACK_PROC);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_win_unlock_proc
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Process an incoming UNLOCK or UNLOCKALL request for a window.
+ *
+ * Drops one holder from the window's local lock count, gives queued lock
+ * requests a chance to be granted via MPIDI_CH4U_win_lock_advance(), and sends
+ * an MPIDI_CH4U_WIN_UNLOCK_ACK control message back to the requesting rank.
+ * Errors are not propagated because this function returns void.
+ *
+ * FUNCNAME above names this function; it previously carried the name
+ * MPIDI_CH4U_win_unlock_ack_proc, which does not exist in this file.
+ */
+static inline void MPIDI_CH4U_win_unlock_proc(const MPIDI_CH4U_win_cntrl_msg_t * info,
+                                              MPIR_Win * win)
+{
+
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_WIN_UNLOCK_ACK_PROC);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_WIN_UNLOCK_ACK_PROC);
+
+    /* One holder less; the count must never go negative. */
+    --MPIDI_CH4U_WIN(win, sync).lock.local.count;
+    MPIR_Assert((int) MPIDI_CH4U_WIN(win, sync).lock.local.count >= 0);
+    MPIDI_CH4U_win_lock_advance(win);
+
+    MPIDI_CH4U_win_cntrl_msg_t msg;
+    msg.win_id = MPIDI_CH4U_WIN(win, win_id);
+    msg.origin_rank = win->comm_ptr->rank;
+    msg.type = MPIDI_CH4U_WIN_UNLOCK_ACK;
+
+    mpi_errno = MPIDI_NM_inject_am_hdr_reply(MPIDI_CH4U_win_to_context(win),
+                                             info->origin_rank,
+                                             MPIDI_CH4U_WIN_CTRL, &msg, sizeof(msg));
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_WIN_UNLOCK_ACK_PROC);
+    return;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_win_complete_proc
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Process an incoming WIN_COMPLETE control message: bump the window's
+ * sync.sc.count counter by one.  The message contents are not inspected.
+ */
+static inline void MPIDI_CH4U_win_complete_proc(const MPIDI_CH4U_win_cntrl_msg_t * info,
+                                                MPIR_Win * win)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_WIN_COMPLETE_PROC);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_WIN_COMPLETE_PROC);
+
+    MPIDI_CH4U_WIN(win, sync).sc.count += 1;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_WIN_COMPLETE_PROC);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_win_post_proc
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Process an incoming WIN_POST control message: bump the window's
+ * sync.pw.count counter by one.  The message contents are not inspected.
+ */
+static inline void MPIDI_CH4U_win_post_proc(const MPIDI_CH4U_win_cntrl_msg_t * info, MPIR_Win * win)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_WIN_POST_PROC);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_WIN_POST_PROC);
+
+    MPIDI_CH4U_WIN(win, sync).pw.count += 1;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_WIN_POST_PROC);
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_win_unlock_done_cb
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Process an incoming UNLOCK_ACK / UNLOCKALL_ACK control message.
+ *
+ * Which counter is decremented depends on the window's current origin epoch
+ * type, not on the message: remote.locked for a LOCK epoch, remote.allLocked
+ * (asserted to be positive first) for a LOCK_ALL epoch.  Any other epoch type
+ * trips an assertion.
+ */
+static inline void MPIDI_CH4U_win_unlock_done_cb(const MPIDI_CH4U_win_cntrl_msg_t * info,
+                                                 MPIR_Win * win)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_WIN_UNLOCK_DONE_CB);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_WIN_UNLOCK_DONE_CB);
+
+    struct MPIDI_CH4U_win_sync_lock *lock_state = &MPIDI_CH4U_WIN(win, sync).lock;
+
+    if (MPIDI_CH4U_WIN(win, sync).origin_epoch_type == MPIDI_CH4U_EPOTYPE_LOCK) {
+        lock_state->remote.locked -= 1;
+    }
+    else if (MPIDI_CH4U_WIN(win, sync).origin_epoch_type == MPIDI_CH4U_EPOTYPE_LOCK_ALL) {
+        MPIR_Assert((int) lock_state->remote.allLocked > 0);
+        lock_state->remote.allLocked--;
+    }
+    else {
+        /* An unlock ack outside a lock epoch is not expected. */
+        MPIR_Assert(0);
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_WIN_UNLOCK_DONE_CB);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_win_ctrl_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Target-side handler for window control messages.
+ *
+ * Looks up the window by the header's win_id in the global window hash and
+ * dispatches on the message type to the lock / lock-ack / unlock /
+ * unlock-ack / complete / post processing routine.  An unknown message type
+ * aborts the job.  Control messages have no payload phase: *req and
+ * *cmpl_handler_fn are cleared when supplied.  Always returns MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_win_ctrl_target_handler(void *am_hdr,
+                                                     void **data,
+                                                     size_t * p_data_sz, int *is_contig,
+                                                     MPIDI_NM_am_completion_handler_fn *
+                                                     cmpl_handler_fn, MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_CH4U_win_cntrl_msg_t *msg_hdr = (MPIDI_CH4U_win_cntrl_msg_t *) am_hdr;
+    MPIR_Win *win;
+    /* "Invalid message type: " (22) + "%d" (up to 11) + "\n" + NUL needs 35
+     * bytes, so the buffer is sized with headroom to avoid truncation. */
+    char buff[64];
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_WIN_CTRL_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_WIN_CTRL_HANDLER);
+
+    MPL_HASH_FIND(dev.ch4u.hash_handle, MPIDI_CH4_Global.win_hash,
+                  &msg_hdr->win_id, sizeof(uint64_t), win);
+    /* Every routine below dereferences win; check the lookup the same way
+     * MPIDI_CH4U_put_target_handler does. */
+    MPIR_Assert(win);
+
+    switch (msg_hdr->type) {
+    case MPIDI_CH4U_WIN_LOCK:
+    case MPIDI_CH4U_WIN_LOCKALL:
+        MPIDI_CH4U_win_lock_req_proc(msg_hdr, win);
+        break;
+
+    case MPIDI_CH4U_WIN_LOCK_ACK:
+    case MPIDI_CH4U_WIN_LOCKALL_ACK:
+        MPIDI_CH4U_win_lock_ack_proc(msg_hdr, win);
+        break;
+
+    case MPIDI_CH4U_WIN_UNLOCK:
+    case MPIDI_CH4U_WIN_UNLOCKALL:
+        MPIDI_CH4U_win_unlock_proc(msg_hdr, win);
+        break;
+
+    case MPIDI_CH4U_WIN_UNLOCK_ACK:
+    case MPIDI_CH4U_WIN_UNLOCKALL_ACK:
+        MPIDI_CH4U_win_unlock_done_cb(msg_hdr, win);
+        break;
+
+    case MPIDI_CH4U_WIN_COMPLETE:
+        MPIDI_CH4U_win_complete_proc(msg_hdr, win);
+        break;
+
+    case MPIDI_CH4U_WIN_POST:
+        MPIDI_CH4U_win_post_proc(msg_hdr, win);
+        break;
+
+    default:
+        MPL_snprintf(buff, sizeof(buff), "Invalid message type: %d\n", msg_hdr->type);
+        MPID_Abort(NULL, MPI_ERR_INTERN, 1, buff);
+    }
+
+    /* Nothing further is expected for a control message. */
+    if (req)
+        *req = NULL;
+    if (cmpl_handler_fn)
+        *cmpl_handler_fn = NULL;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_WIN_CTRL_HANDLER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_put_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Target-side handler for a PUT.
+ *
+ * Creates a window request (returned via *req), records the origin's request
+ * pointer and rank on it, looks up the target window by the header's win_id,
+ * increments the window's outstanding-operation counter, and installs
+ * MPIDI_CH4U_put_cmpl_handler with a fresh sequence number.
+ *
+ * The destination inside the window is then described to the caller in one of
+ * three ways:
+ *   - header carries an iovec list (n_iov != 0): the entries that follow the
+ *     header are rebased onto the window and handed back with *is_contig = 0
+ *     and *p_data_sz = n_iov;
+ *   - contiguous datatype: *data is the destination address, *p_data_sz its
+ *     size in bytes;
+ *   - non-contiguous datatype: *data is an iovec array allocated here and
+ *     stored in the request's req->iov, *p_data_sz the number of entries.
+ *
+ * Always returns MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_put_target_handler(void *am_hdr,
+                                                void **data,
+                                                size_t * p_data_sz,
+                                                int *is_contig,
+                                                MPIDI_NM_am_completion_handler_fn *
+                                                cmpl_handler_fn, MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *rreq = NULL;
+    size_t data_sz;
+    struct iovec *iov, *dt_iov;
+    uintptr_t base;             /* Base address of the window */
+    size_t offset;
+
+    int dt_contig, n_iov;
+    MPI_Aint dt_true_lb, last, num_iov;
+    MPIR_Datatype *dt_ptr;
+    MPID_Segment *segment_ptr;
+    MPIR_Win *win;
+    MPIDI_CH4U_put_msg_t *msg_hdr = (MPIDI_CH4U_put_msg_t *) am_hdr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_PUT_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_PUT_HANDLER);
+
+    rreq = MPIDI_CH4I_am_win_request_create();
+    MPIR_Assert(rreq);
+    rreq->kind = MPIR_REQUEST_KIND__RMA;
+    *req = rreq;
+
+    /* Remember who to acknowledge: the origin's request pointer and rank. */
+    MPIDI_CH4U_REQUEST(*req, req->preq.preq_ptr) = msg_hdr->preq_ptr;
+    MPIDI_CH4U_REQUEST(*req, src_rank) = msg_hdr->src_rank;
+
+    MPL_HASH_FIND(dev.ch4u.hash_handle, MPIDI_CH4_Global.win_hash,
+                  &msg_hdr->win_id, sizeof(uint64_t), win);
+    MPIR_Assert(win);
+
+    base = MPIDI_CH4I_win_base_at_target(win);
+
+    /* MPIDI_CS_ENTER(); */
+    OPA_incr_int(&MPIDI_CH4U_WIN(win, outstanding_ops));
+    /* MPIDI_CS_EXIT(); */
+    MPIDI_CH4U_REQUEST(rreq, req->preq.win_ptr) = win;
+
+    *cmpl_handler_fn = MPIDI_CH4U_put_cmpl_handler;
+    MPIDI_CH4U_REQUEST(rreq, req->seq_no) = OPA_fetch_and_add_int(&MPIDI_CH4_Global.nxt_seq_no, 1);
+
+    /* Byte offset of the target location from the window base. */
+    offset = win->disp_unit * msg_hdr->target_disp;
+    if (msg_hdr->n_iov) {
+        int i;
+        dt_iov = (struct iovec *) MPL_malloc(sizeof(struct iovec) * msg_hdr->n_iov);
+        MPIR_Assert(dt_iov);
+
+        /* The iovec list sits directly after the fixed-size header.  Its
+         * iov_base values are rebased in place by adding the window base and
+         * the target offset. */
+        iov = (struct iovec *) ((char *) am_hdr + sizeof(*msg_hdr));
+        for (i = 0; i < msg_hdr->n_iov; i++)
+            iov[i].iov_base = (char *) iov[i].iov_base + base + offset;
+        /* Keep a private copy of the rebased list in the request. */
+        MPIR_Memcpy(dt_iov, iov, sizeof(struct iovec) * msg_hdr->n_iov);
+        MPIDI_CH4U_REQUEST(rreq, req->preq.dt_iov) = dt_iov;
+        MPIDI_CH4U_REQUEST(rreq, req->preq.n_iov) = msg_hdr->n_iov;
+        *is_contig = 0;
+        /* NOTE(review): *data is set to the list inside am_hdr, not to the
+         * dt_iov copy - confirm the header outlives the data transfer. */
+        *data = iov;
+        *p_data_sz = msg_hdr->n_iov;
+        goto fn_exit;
+    }
+
+    /* No iovec list in the header: describe the target via count/datatype. */
+    MPIDI_CH4U_REQUEST(rreq, req->preq.dt_iov) = NULL;
+    MPIDI_Datatype_get_info(msg_hdr->count, msg_hdr->datatype,
+                            dt_contig, data_sz, dt_ptr, dt_true_lb);
+    *is_contig = dt_contig;
+
+    if (dt_contig) {
+        /* Contiguous: hand back the destination address and size in bytes. */
+        *p_data_sz = data_sz;
+        *data = (char *) (offset + base + dt_true_lb);
+    }
+    else {
+        segment_ptr = MPIDU_Segment_alloc();
+        MPIR_Assert(segment_ptr);
+
+        MPIDU_Segment_init((void *) (offset + base), msg_hdr->count, msg_hdr->datatype,
+                           segment_ptr, 0);
+        /* Size the iovec array for the whole target region. */
+        last = data_sz;
+        MPIDU_Segment_count_contig_blocks(segment_ptr, 0, &last, &num_iov);
+        n_iov = (int) num_iov;
+        MPIR_Assert(n_iov > 0);
+        MPIDI_CH4U_REQUEST(rreq, req->iov) =
+            (struct iovec *) MPL_malloc(n_iov * sizeof(struct iovec));
+        MPIR_Assert(MPIDI_CH4U_REQUEST(rreq, req->iov));
+
+        /* Fill the iovec; the whole region must be covered. */
+        last = data_sz;
+        MPIDU_Segment_pack_vector(segment_ptr, 0, &last, MPIDI_CH4U_REQUEST(rreq, req->iov),
+                                  &n_iov);
+        MPIR_Assert(last == (MPI_Aint) data_sz);
+        /* From here on *p_data_sz carries an iovec entry count, not bytes. */
+        *data = MPIDI_CH4U_REQUEST(rreq, req->iov);
+        *p_data_sz = n_iov;
+        /* Flag the request as a non-contiguous receive that owns req->iov. */
+        MPIDI_CH4U_REQUEST(rreq, req->status) |= MPIDI_CH4U_REQ_RCV_NON_CONTIG;
+        MPL_free(segment_ptr);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_PUT_HANDLER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_put_iov_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Target-side handler for a PUT header whose iovec description is delivered
+ * as the message payload rather than inline after the header.
+ *
+ * A window request is created and a buffer for n_iov iovec entries is
+ * allocated and stored in it; that buffer is reported through data as one
+ * contiguous region of n_iov * sizeof(struct iovec) bytes.  The entries are
+ * not rebased here; that happens in MPIDI_CH4U_put_data_target_handler.
+ *
+ * cmpl_handler_fn is set to MPIDI_CH4U_put_iov_cmpl_handler and req to the
+ * new request.  The return value is always MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_put_iov_target_handler(void *am_hdr,
+                                                    void **data,
+                                                    size_t * p_data_sz,
+                                                    int *is_contig,
+                                                    MPIDI_NM_am_completion_handler_fn *
+                                                    cmpl_handler_fn, MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    size_t iov_bytes;
+    struct iovec *iov_buf;
+    MPIR_Win *target_win;
+    MPIR_Request *put_req = NULL;
+    MPIDI_CH4U_put_msg_t *hdr = (MPIDI_CH4U_put_msg_t *) am_hdr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_PUT_IOV_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_PUT_IOV_HANDLER);
+
+    /* Fresh RMA request, handed back to the caller */
+    put_req = MPIDI_CH4I_am_win_request_create();
+    MPIR_Assert(put_req);
+    put_req->kind = MPIR_REQUEST_KIND__RMA;
+    *req = put_req;
+
+    MPIDI_CH4U_REQUEST(put_req, src_rank) = hdr->src_rank;
+    MPIDI_CH4U_REQUEST(put_req, req->preq.preq_ptr) = hdr->preq_ptr;
+
+    /* Resolve the window from the id carried in the header */
+    MPL_HASH_FIND(dev.ch4u.hash_handle, MPIDI_CH4_Global.win_hash,
+                  &hdr->win_id, sizeof(uint64_t), target_win);
+    MPIR_Assert(target_win);
+
+    /* MPIDI_CS_ENTER(); */
+    OPA_incr_int(&MPIDI_CH4U_WIN(target_win, outstanding_ops));
+    /* MPIDI_CS_EXIT(); */
+    MPIDI_CH4U_REQUEST(put_req, req->preq.win_ptr) = target_win;
+
+    MPIDI_CH4U_REQUEST(put_req, req->seq_no) =
+        OPA_fetch_and_add_int(&MPIDI_CH4_Global.nxt_seq_no, 1);
+    *cmpl_handler_fn = MPIDI_CH4U_put_iov_cmpl_handler;
+
+    /* Room for the iovec array; its entries are rebased onto the window
+     * later, in MPIDI_CH4U_put_data_target_handler, once all have arrived */
+    MPIR_Assert(hdr->n_iov);
+    iov_bytes = sizeof(struct iovec) * hdr->n_iov;
+    iov_buf = (struct iovec *) MPL_malloc(iov_bytes);
+    MPIR_Assert(iov_buf);
+
+    MPIDI_CH4U_REQUEST(put_req, req->preq.n_iov) = hdr->n_iov;
+    MPIDI_CH4U_REQUEST(put_req, req->preq.dt_iov) = iov_buf;
+
+    /* The iovec array itself is reported as one flat buffer */
+    *data = iov_buf;
+    *p_data_sz = iov_bytes;
+    *is_contig = 1;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_PUT_IOV_HANDLER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_put_iov_ack_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Handler for the acknowledgement of a PUT iovec message.
+ *
+ * The header names the PUT request on this side (origin_preq_ptr) and the
+ * peer's request (target_preq_ptr).  The handler answers with an
+ * MPIDI_CH4U_PUT_DAT_REQ message that carries target_preq_ptr in its header
+ * and the buffer recorded in this side's request (origin_addr,
+ * origin_count, origin_datatype) as data, then drops the datatype reference
+ * if the type is not builtin.
+ *
+ * No request and no completion handler are handed back: *req and
+ * *cmpl_handler_fn are set to NULL.  They are assigned before the send so
+ * that they are also well defined when the send fails.  data, p_data_sz and
+ * is_contig are not written.
+ *
+ * Returns MPI_SUCCESS, or the error reported by MPIDI_NM_send_am_reply.
+ */
+static inline int MPIDI_CH4U_put_iov_ack_target_handler(void *am_hdr,
+                                                        void **data,
+                                                        size_t * p_data_sz,
+                                                        int *is_contig,
+                                                        MPIDI_NM_am_completion_handler_fn *
+                                                        cmpl_handler_fn, MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *rreq, *origin_req;
+    MPIDI_CH4U_put_iov_ack_msg_t *msg_hdr = (MPIDI_CH4U_put_iov_ack_msg_t *) am_hdr;
+    MPIDI_CH4U_put_dat_msg_t dat_msg;
+    MPIR_Win *win;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_PUT_IOV_ACK_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_PUT_IOV_ACK_HANDLER);
+
+    /* Nothing is returned to the caller; set the outputs up front so the
+     * fn_fail path does not leave them indeterminate. */
+    *cmpl_handler_fn = NULL;
+    *req = NULL;
+
+    /* Request that tracks the outgoing data message */
+    rreq = MPIDI_CH4I_am_win_request_create();
+    MPIR_Assert(rreq);
+    rreq->kind = MPIR_REQUEST_KIND__RMA;
+
+    origin_req = (MPIR_Request *) msg_hdr->origin_preq_ptr;
+    dat_msg.preq_ptr = msg_hdr->target_preq_ptr;
+    win = MPIDI_CH4U_REQUEST(origin_req, req->preq.win_ptr);
+    /* NOTE(review): if the send fails, rreq is not released here and the
+     * datatype reference below is not dropped - confirm who owns cleanup. */
+    mpi_errno = MPIDI_NM_send_am_reply(MPIDI_CH4U_win_to_context(win),
+                                       MPIDI_CH4U_REQUEST(origin_req, src_rank),
+                                       MPIDI_CH4U_PUT_DAT_REQ,
+                                       &dat_msg, sizeof(dat_msg),
+                                       MPIDI_CH4U_REQUEST(origin_req, req->preq.origin_addr),
+                                       MPIDI_CH4U_REQUEST(origin_req, req->preq.origin_count),
+                                       MPIDI_CH4U_REQUEST(origin_req, req->preq.origin_datatype),
+                                       rreq);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+    dtype_release_if_not_builtin(MPIDI_CH4U_REQUEST(origin_req, req->preq.origin_datatype));
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_PUT_IOV_ACK_HANDLER);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_acc_iov_ack_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Handler for the acknowledgement of an accumulate iovec message.
+ *
+ * The header names the accumulate request on this side (origin_preq_ptr)
+ * and the peer's request (target_preq_ptr).  The handler answers with an
+ * MPIDI_CH4U_ACC_DAT_REQ message that carries target_preq_ptr in its header
+ * and the buffer recorded in this side's request (origin_addr,
+ * origin_count, origin_datatype) as data, then drops the datatype reference
+ * if the type is not builtin.
+ *
+ * No request and no completion handler are handed back: *req and
+ * *cmpl_handler_fn are set to NULL.  They are assigned before the send so
+ * that they are also well defined when the send fails.  data, p_data_sz and
+ * is_contig are not written.
+ *
+ * Returns MPI_SUCCESS, or the error reported by MPIDI_NM_send_am_reply.
+ */
+static inline int MPIDI_CH4U_acc_iov_ack_target_handler(void *am_hdr,
+                                                        void **data,
+                                                        size_t * p_data_sz,
+                                                        int *is_contig,
+                                                        MPIDI_NM_am_completion_handler_fn *
+                                                        cmpl_handler_fn, MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *rreq, *origin_req;
+    MPIDI_CH4U_acc_iov_ack_msg_t *msg_hdr = (MPIDI_CH4U_acc_iov_ack_msg_t *) am_hdr;
+    MPIDI_CH4U_acc_dat_msg_t dat_msg;
+    MPIR_Win *win;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ACC_IOV_ACK_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ACC_IOV_ACK_HANDLER);
+
+    /* Nothing is returned to the caller; set the outputs up front so the
+     * fn_fail path does not leave them indeterminate. */
+    *cmpl_handler_fn = NULL;
+    *req = NULL;
+
+    /* Request that tracks the outgoing data message */
+    rreq = MPIDI_CH4I_am_win_request_create();
+    MPIR_Assert(rreq);
+    rreq->kind = MPIR_REQUEST_KIND__RMA;
+
+    origin_req = (MPIR_Request *) msg_hdr->origin_preq_ptr;
+    dat_msg.preq_ptr = msg_hdr->target_preq_ptr;
+    win = MPIDI_CH4U_REQUEST(origin_req, req->areq.win_ptr);
+    /* NOTE(review): if the send fails, rreq is not released here and the
+     * datatype reference below is not dropped - confirm who owns cleanup. */
+    mpi_errno = MPIDI_NM_send_am_reply(MPIDI_CH4U_win_to_context(win),
+                                       MPIDI_CH4U_REQUEST(origin_req, src_rank),
+                                       MPIDI_CH4U_ACC_DAT_REQ,
+                                       &dat_msg, sizeof(dat_msg),
+                                       MPIDI_CH4U_REQUEST(origin_req, req->areq.origin_addr),
+                                       MPIDI_CH4U_REQUEST(origin_req, req->areq.origin_count),
+                                       MPIDI_CH4U_REQUEST(origin_req, req->areq.origin_datatype),
+                                       rreq);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+    dtype_release_if_not_builtin(MPIDI_CH4U_REQUEST(origin_req, req->areq.origin_datatype));
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ACC_IOV_ACK_HANDLER);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_put_data_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Target-side handler for the data message of a PUT whose iovec description
+ * was received earlier.
+ *
+ * The header's preq_ptr is the request that already holds the iovec array
+ * (preq.dt_iov, preq.n_iov) and the window.  Every iovec entry is rebased
+ * onto the window base, and the array is reported through data as a
+ * non-contiguous destination of n_iov entries.
+ *
+ * cmpl_handler_fn is set to MPIDI_CH4U_put_cmpl_handler and req to that
+ * existing request.  The return value is always MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_put_data_target_handler(void *am_hdr,
+                                                     void **data,
+                                                     size_t * p_data_sz,
+                                                     int *is_contig,
+                                                     MPIDI_NM_am_completion_handler_fn *
+                                                     cmpl_handler_fn, MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int idx;
+    uintptr_t win_base;
+    struct iovec *target_iov;
+    MPIR_Win *target_win;
+    MPIR_Request *put_req;
+    MPIDI_CH4U_put_dat_msg_t *hdr = (MPIDI_CH4U_put_dat_msg_t *) am_hdr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_PUT_DATA_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_PUT_DATA_HANDLER);
+
+    /* The header carries the address of the request created earlier */
+    put_req = (MPIR_Request *) hdr->preq_ptr;
+    target_win = MPIDI_CH4U_REQUEST(put_req, req->preq.win_ptr);
+    win_base = MPIDI_CH4I_win_base_at_target(target_win);
+
+    /* Turn the received entries into addresses by adding the window base.
+     * NOTE(review): only the base is added here, whereas the single-message
+     * put path also adds disp_unit * target_disp - confirm the displacement
+     * is accounted for elsewhere. */
+    target_iov = (struct iovec *) MPIDI_CH4U_REQUEST(put_req, req->preq.dt_iov);
+    for (idx = 0; idx < MPIDI_CH4U_REQUEST(put_req, req->preq.n_iov); ++idx) {
+        target_iov[idx].iov_base = (char *) target_iov[idx].iov_base + win_base;
+    }
+
+    *req = put_req;
+    *cmpl_handler_fn = MPIDI_CH4U_put_cmpl_handler;
+
+    /* Scatter destination: an iovec array, sized in entries */
+    *is_contig = 0;
+    *p_data_sz = MPIDI_CH4U_REQUEST(put_req, req->preq.n_iov);
+    *data = MPIDI_CH4U_REQUEST(put_req, req->preq.dt_iov);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_PUT_DATA_HANDLER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_acc_data_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Target-side handler for the data message of an accumulate whose iovec
+ * description was received earlier.
+ *
+ * The header's preq_ptr is the request that already holds the iovec array
+ * and the operation parameters.  A staging buffer sized from
+ * origin_datatype and origin_count is allocated (none when the size is
+ * zero), stored in areq.data and reported through data as one contiguous
+ * region.  Every iovec entry is rebased onto areq.target_addr.
+ *
+ * cmpl_handler_fn is MPIDI_CH4U_get_acc_cmpl_handler when areq.do_get is
+ * set and MPIDI_CH4U_acc_cmpl_handler otherwise; req is that existing
+ * request.  The return value is always MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_acc_data_target_handler(void *am_hdr,
+                                                     void **data,
+                                                     size_t * p_data_sz,
+                                                     int *is_contig,
+                                                     MPIDI_NM_am_completion_handler_fn *
+                                                     cmpl_handler_fn, MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int idx;
+    size_t data_sz;
+    uintptr_t target_base;
+    void *staging = NULL;
+    struct iovec *target_iov;
+    MPIR_Request *acc_req;
+    MPIDI_CH4U_acc_dat_msg_t *hdr = (MPIDI_CH4U_acc_dat_msg_t *) am_hdr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ACC_DATA_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ACC_DATA_HANDLER);
+
+    /* The header carries the address of the request created earlier */
+    acc_req = (MPIR_Request *) hdr->preq_ptr;
+    target_base = (uintptr_t) MPIDI_CH4U_REQUEST(acc_req, req->areq.target_addr);
+
+    /* Staging buffer for the incoming operand data */
+    MPIDI_Datatype_check_size(MPIDI_CH4U_REQUEST(acc_req, req->areq.origin_datatype),
+                              MPIDI_CH4U_REQUEST(acc_req, req->areq.origin_count), data_sz);
+    if (data_sz) {
+        staging = MPL_malloc(data_sz);
+        MPIR_Assert(staging);
+    }
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.data) = staging;
+
+    /* Turn the received entries into addresses by adding the target
+     * address recorded in the request */
+    target_iov = (struct iovec *) MPIDI_CH4U_REQUEST(acc_req, req->areq.dt_iov);
+    for (idx = 0; idx < MPIDI_CH4U_REQUEST(acc_req, req->areq.n_iov); ++idx) {
+        target_iov[idx].iov_base = (char *) target_iov[idx].iov_base + target_base;
+    }
+
+    *req = acc_req;
+    if (MPIDI_CH4U_REQUEST(acc_req, req->areq.do_get))
+        *cmpl_handler_fn = MPIDI_CH4U_get_acc_cmpl_handler;
+    else
+        *cmpl_handler_fn = MPIDI_CH4U_acc_cmpl_handler;
+
+    *is_contig = 1;
+    *p_data_sz = data_sz;
+    *data = staging;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ACC_DATA_HANDLER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_cswap_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Target-side handler for a compare-and-swap request header.
+ *
+ * Creates a window request, resolves the window from the header's win_id
+ * and records the operation parameters (window, requester handle, datatype,
+ * target address, source rank).  A staging buffer of twice the size of one
+ * element of the datatype is allocated, stored in creq.data and reported
+ * through data / p_data_sz.  The datatype is asserted to be contiguous.
+ *
+ * cmpl_handler_fn is set to MPIDI_CH4U_cswap_cmpl_handler and req to the new
+ * request.  The return value is always MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_cswap_target_handler(void *am_hdr,
+                                                  void **data,
+                                                  size_t * p_data_sz,
+                                                  int *is_contig,
+                                                  MPIDI_NM_am_completion_handler_fn *
+                                                  cmpl_handler_fn, MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int dt_contig;
+    size_t data_sz;
+    size_t disp_bytes;
+    size_t staging_bytes;
+    uintptr_t win_base;
+    void *staging;
+    MPIR_Win *target_win;
+    MPIR_Request *cswap_req = NULL;
+
+    MPIDI_CH4U_cswap_req_msg_t *hdr = (MPIDI_CH4U_cswap_req_msg_t *) am_hdr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_CSWAP_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_CSWAP_HANDLER);
+
+    /* Fresh RMA request, handed back to the caller */
+    cswap_req = MPIDI_CH4I_am_win_request_create();
+    MPIR_Assert(cswap_req);
+    cswap_req->kind = MPIR_REQUEST_KIND__RMA;
+    *req = cswap_req;
+
+    *cmpl_handler_fn = MPIDI_CH4U_cswap_cmpl_handler;
+    MPIDI_CH4U_REQUEST(cswap_req, req->seq_no) =
+        OPA_fetch_and_add_int(&MPIDI_CH4_Global.nxt_seq_no, 1);
+
+    /* Size and contiguity of a single element of the datatype */
+    MPIDI_Datatype_check_contig_size(hdr->datatype, 1, dt_contig, data_sz);
+    *is_contig = dt_contig;
+
+    /* Resolve the window from the id carried in the header */
+    MPL_HASH_FIND(dev.ch4u.hash_handle, MPIDI_CH4_Global.win_hash,
+                  &hdr->win_id, sizeof(uint64_t), target_win);
+    MPIR_Assert(target_win);
+
+    win_base = MPIDI_CH4I_win_base_at_target(target_win);
+
+    /* MPIDI_CS_ENTER(); */
+    OPA_incr_int(&MPIDI_CH4U_WIN(target_win, outstanding_ops));
+    /* MPIDI_CS_EXIT(); */
+    disp_bytes = target_win->disp_unit * hdr->target_disp;
+
+    MPIDI_CH4U_REQUEST(cswap_req, req->creq.win_ptr) = target_win;
+    MPIDI_CH4U_REQUEST(cswap_req, req->creq.creq_ptr) = hdr->req_ptr;
+    MPIDI_CH4U_REQUEST(cswap_req, req->creq.datatype) = hdr->datatype;
+    MPIDI_CH4U_REQUEST(cswap_req, req->creq.addr) = disp_bytes + win_base;
+    MPIDI_CH4U_REQUEST(cswap_req, src_rank) = hdr->src_rank;
+
+    /* Staging buffer with room for two elements */
+    MPIR_Assert(dt_contig == 1);
+    staging_bytes = data_sz * 2;
+    staging = MPL_malloc(staging_bytes);
+    MPIR_Assert(staging);
+
+    MPIDI_CH4U_REQUEST(cswap_req, req->creq.data) = staging;
+    *data = staging;
+    *p_data_sz = staging_bytes;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_CSWAP_HANDLER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_handle_acc_request
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Target-side handler for an accumulate request header that may carry an
+ * inline iovec array.
+ *
+ * Creates a window request and allocates a staging buffer sized from
+ * origin_datatype and origin_count (none when the size is zero); the buffer
+ * is reported through data as one contiguous region.  The window is
+ * resolved from win_id and the operation parameters from the header are
+ * recorded in the request.  When n_iov is non-zero, the iovec array that
+ * follows the header is copied into a new array with every entry rebased
+ * onto the window base plus the target displacement; otherwise areq.dt_iov
+ * is set to NULL.
+ *
+ * cmpl_handler_fn is MPIDI_CH4U_get_acc_cmpl_handler when the header's
+ * do_get is set and MPIDI_CH4U_acc_cmpl_handler otherwise; req is the new
+ * request.  The return value is always MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_handle_acc_request(void *am_hdr,
+                                                void **data,
+                                                size_t * p_data_sz,
+                                                int *is_contig,
+                                                MPIDI_NM_am_completion_handler_fn *
+                                                cmpl_handler_fn, MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int idx;
+    size_t data_sz;
+    size_t disp_bytes;
+    uintptr_t win_base;
+    void *staging = NULL;
+    struct iovec *hdr_iov, *rebased_iov;
+    MPIR_Win *target_win;
+    MPIR_Request *acc_req = NULL;
+
+    MPIDI_CH4U_acc_req_msg_t *hdr = (MPIDI_CH4U_acc_req_msg_t *) am_hdr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_HANDLE_ACC_REQ);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_HANDLE_ACC_REQ);
+
+    /* Fresh RMA request, handed back to the caller */
+    acc_req = MPIDI_CH4I_am_win_request_create();
+    MPIR_Assert(acc_req);
+    acc_req->kind = MPIR_REQUEST_KIND__RMA;
+    *req = acc_req;
+
+    /* Staging buffer for the incoming operand data */
+    MPIDI_Datatype_check_size(hdr->origin_datatype, hdr->origin_count, data_sz);
+    if (data_sz) {
+        staging = MPL_malloc(data_sz);
+        MPIR_Assert(staging);
+    }
+
+    if (hdr->do_get)
+        *cmpl_handler_fn = MPIDI_CH4U_get_acc_cmpl_handler;
+    else
+        *cmpl_handler_fn = MPIDI_CH4U_acc_cmpl_handler;
+    MPIDI_CH4U_REQUEST(acc_req, req->seq_no) =
+        OPA_fetch_and_add_int(&MPIDI_CH4_Global.nxt_seq_no, 1);
+
+    *data = staging;
+    *p_data_sz = data_sz;
+    *is_contig = 1;
+
+    /* Resolve the window from the id carried in the header */
+    MPL_HASH_FIND(dev.ch4u.hash_handle, MPIDI_CH4_Global.win_hash,
+                  &hdr->win_id, sizeof(uint64_t), target_win);
+    MPIR_Assert(target_win);
+
+    win_base = MPIDI_CH4I_win_base_at_target(target_win);
+    disp_bytes = target_win->disp_unit * hdr->target_disp;
+    /* MPIDI_CS_ENTER(); */
+    OPA_incr_int(&MPIDI_CH4U_WIN(target_win, outstanding_ops));
+    /* MPIDI_CS_EXIT(); */
+
+    /* Record everything the completion handler needs */
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.win_ptr) = target_win;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.req_ptr) = hdr->req_ptr;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.origin_datatype) = hdr->origin_datatype;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.target_datatype) = hdr->target_datatype;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.origin_count) = hdr->origin_count;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.target_count) = hdr->target_count;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.target_addr) = (void *) (disp_bytes + win_base);
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.op) = hdr->op;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.data) = staging;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.n_iov) = hdr->n_iov;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.data_sz) = hdr->result_data_sz;
+    MPIDI_CH4U_REQUEST(acc_req, src_rank) = hdr->src_rank;
+
+    if (!hdr->n_iov) {
+        /* No iovec description was sent along */
+        MPIDI_CH4U_REQUEST(acc_req, req->areq.dt_iov) = NULL;
+    }
+    else {
+        /* Copy the iovec array that follows the header, rebasing every
+         * entry onto the window base plus the target displacement */
+        rebased_iov = (struct iovec *) MPL_malloc(sizeof(struct iovec) * hdr->n_iov);
+        MPIR_Assert(rebased_iov);
+
+        hdr_iov = (struct iovec *) ((char *) hdr + sizeof(*hdr));
+        for (idx = 0; idx < hdr->n_iov; ++idx) {
+            rebased_iov[idx].iov_base = (char *) hdr_iov[idx].iov_base + win_base + disp_bytes;
+            rebased_iov[idx].iov_len = hdr_iov[idx].iov_len;
+        }
+        MPIDI_CH4U_REQUEST(acc_req, req->areq.dt_iov) = rebased_iov;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_HANDLE_ACC_REQ);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_acc_iov_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Target-side handler for an accumulate header whose iovec description is
+ * delivered as the message payload rather than inline after the header.
+ *
+ * Creates a window request, resolves the window from win_id and records the
+ * operation parameters from the header, including do_get and the target
+ * address (window base plus displacement).  A buffer for n_iov iovec
+ * entries is allocated, stored in areq.dt_iov and reported through data as
+ * one contiguous region.  The entries are not rebased here; that happens in
+ * MPIDI_CH4U_acc_data_target_handler.
+ *
+ * cmpl_handler_fn is set to MPIDI_CH4U_acc_iov_cmpl_handler and req to the
+ * new request.  The return value is always MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_acc_iov_target_handler(void *am_hdr,
+                                                    void **data,
+                                                    size_t * p_data_sz,
+                                                    int *is_contig,
+                                                    MPIDI_NM_am_completion_handler_fn *
+                                                    cmpl_handler_fn, MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    size_t disp_bytes;
+    uintptr_t win_base;
+    struct iovec *iov_buf;
+    MPIR_Win *target_win;
+    MPIR_Request *acc_req = NULL;
+
+    MPIDI_CH4U_acc_req_msg_t *hdr = (MPIDI_CH4U_acc_req_msg_t *) am_hdr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_HANDLE_ACC_IOV_REQ);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_HANDLE_ACC_IOV_REQ);
+
+    /* Fresh RMA request, handed back to the caller */
+    acc_req = MPIDI_CH4I_am_win_request_create();
+    MPIR_Assert(acc_req);
+    acc_req->kind = MPIR_REQUEST_KIND__RMA;
+    *req = acc_req;
+
+    /* Resolve the window from the id carried in the header */
+    MPL_HASH_FIND(dev.ch4u.hash_handle, MPIDI_CH4_Global.win_hash,
+                  &hdr->win_id, sizeof(uint64_t), target_win);
+    MPIR_Assert(target_win);
+
+    win_base = MPIDI_CH4I_win_base_at_target(target_win);
+    disp_bytes = target_win->disp_unit * hdr->target_disp;
+
+    /* MPIDI_CS_ENTER(); */
+    OPA_incr_int(&MPIDI_CH4U_WIN(target_win, outstanding_ops));
+    /* MPIDI_CS_EXIT(); */
+
+    /* Record everything the later handlers need */
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.win_ptr) = target_win;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.req_ptr) = hdr->req_ptr;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.origin_datatype) = hdr->origin_datatype;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.target_datatype) = hdr->target_datatype;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.origin_count) = hdr->origin_count;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.target_count) = hdr->target_count;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.target_addr) = (void *) (disp_bytes + win_base);
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.op) = hdr->op;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.n_iov) = hdr->n_iov;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.data_sz) = hdr->result_data_sz;
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.do_get) = hdr->do_get;
+    MPIDI_CH4U_REQUEST(acc_req, src_rank) = hdr->src_rank;
+
+    /* Room for the iovec array; its entries are rebased later, in
+     * MPIDI_CH4U_acc_data_target_handler, once all have arrived */
+    iov_buf = (struct iovec *) MPL_malloc(sizeof(struct iovec) * hdr->n_iov);
+    MPIDI_CH4U_REQUEST(acc_req, req->areq.dt_iov) = iov_buf;
+    MPIR_Assert(iov_buf);
+
+    /* The iovec array itself is reported as one flat buffer */
+    *data = (void *) iov_buf;
+    *p_data_sz = sizeof(struct iovec) * hdr->n_iov;
+    *is_contig = 1;
+
+    *cmpl_handler_fn = MPIDI_CH4U_acc_iov_cmpl_handler;
+    MPIDI_CH4U_REQUEST(acc_req, req->seq_no) =
+        OPA_fetch_and_add_int(&MPIDI_CH4_Global.nxt_seq_no, 1);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_HANDLE_ACC_IOV_REQ);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_get_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Target-side handler for a GET request header.
+ *
+ * Creates a window request, resolves the window from win_id and records the
+ * operation parameters from the header (address = window base plus
+ * displacement, count, datatype, requester handle, source rank).
+ *
+ * When n_iov is non-zero a buffer for n_iov iovec entries is allocated,
+ * stored in greq.dt_iov and reported through data as one contiguous region.
+ * When n_iov is zero, greq.dt_iov stays NULL and data, p_data_sz and
+ * is_contig are left untouched.
+ *
+ * cmpl_handler_fn is set to MPIDI_CH4U_get_cmpl_handler and req to the new
+ * request.  The return value is always MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_get_target_handler(void *am_hdr,
+                                                void **data,
+                                                size_t * p_data_sz,
+                                                int *is_contig,
+                                                MPIDI_NM_am_completion_handler_fn *
+                                                cmpl_handler_fn, MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    size_t disp_bytes;
+    uintptr_t win_base;
+    struct iovec *iov_buf;
+    MPIR_Win *target_win;
+    MPIR_Request *get_req = NULL;
+    MPIDI_CH4U_get_req_msg_t *hdr = (MPIDI_CH4U_get_req_msg_t *) am_hdr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_GET_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_GET_HANDLER);
+
+    /* Fresh RMA request, handed back to the caller */
+    get_req = MPIDI_CH4I_am_win_request_create();
+    MPIR_Assert(get_req);
+    get_req->kind = MPIR_REQUEST_KIND__RMA;
+    *req = get_req;
+
+    *cmpl_handler_fn = MPIDI_CH4U_get_cmpl_handler;
+    MPIDI_CH4U_REQUEST(get_req, req->seq_no) =
+        OPA_fetch_and_add_int(&MPIDI_CH4_Global.nxt_seq_no, 1);
+
+    /* Resolve the window from the id carried in the header */
+    MPL_HASH_FIND(dev.ch4u.hash_handle, MPIDI_CH4_Global.win_hash,
+                  &hdr->win_id, sizeof(uint64_t), target_win);
+    MPIR_Assert(target_win);
+
+    win_base = MPIDI_CH4I_win_base_at_target(target_win);
+
+    /* MPIDI_CS_ENTER(); */
+    OPA_incr_int(&MPIDI_CH4U_WIN(target_win, outstanding_ops));
+    /* MPIDI_CS_EXIT(); */
+
+    disp_bytes = target_win->disp_unit * hdr->target_disp;
+
+    /* Record everything the completion handler needs */
+    MPIDI_CH4U_REQUEST(get_req, req->greq.win_ptr) = target_win;
+    MPIDI_CH4U_REQUEST(get_req, req->greq.n_iov) = hdr->n_iov;
+    MPIDI_CH4U_REQUEST(get_req, req->greq.addr) = disp_bytes + win_base;
+    MPIDI_CH4U_REQUEST(get_req, req->greq.count) = hdr->count;
+    MPIDI_CH4U_REQUEST(get_req, req->greq.datatype) = hdr->datatype;
+    MPIDI_CH4U_REQUEST(get_req, req->greq.dt_iov) = NULL;
+    MPIDI_CH4U_REQUEST(get_req, req->greq.greq_ptr) = hdr->greq_ptr;
+    MPIDI_CH4U_REQUEST(get_req, src_rank) = hdr->src_rank;
+
+    if (hdr->n_iov) {
+        /* An iovec description follows as payload: provide room for it */
+        iov_buf = (struct iovec *) MPL_malloc(hdr->n_iov * sizeof(*iov_buf));
+        MPIR_Assert(iov_buf);
+
+        MPIDI_CH4U_REQUEST(get_req, req->greq.dt_iov) = iov_buf;
+        *data = (void *) iov_buf;
+        *p_data_sz = hdr->n_iov * sizeof(*iov_buf);
+        *is_contig = 1;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_GET_HANDLER);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_get_ack_target_handler
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Handler for the acknowledgement of a GET.
+ *
+ * Two requests are involved: 'rreq' is the existing GET request whose
+ * address arrives in the header (greq_ptr), and 'greq' is a request created
+ * here, returned through req, that points back at rreq.
+ *
+ * The destination reported through data / p_data_sz / is_contig is derived
+ * from the address, count and datatype recorded in rreq:
+ *   - contiguous datatype: one flat region of data_sz bytes;
+ *   - otherwise: an iovec array stored in rreq's req->iov, with *p_data_sz
+ *     holding the number of entries.
+ *
+ * cmpl_handler_fn is set to MPIDI_CH4U_get_ack_cmpl_handler.  Returns
+ * mpi_errno, which this function never changes from MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_get_ack_target_handler(void *am_hdr,
+                                                    void **data,
+                                                    size_t * p_data_sz,
+                                                    int *is_contig,
+                                                    MPIDI_NM_am_completion_handler_fn *
+                                                    cmpl_handler_fn, MPIR_Request ** req)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *rreq = NULL, *greq;
+    size_t data_sz;
+
+    int dt_contig, n_iov;
+    MPI_Aint dt_true_lb, last, num_iov;
+    MPIR_Datatype *dt_ptr;
+    MPID_Segment *segment_ptr;
+
+    MPIDI_CH4U_get_ack_msg_t *msg_hdr = (MPIDI_CH4U_get_ack_msg_t *) am_hdr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_GET_ACK_HANDLER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_GET_ACK_HANDLER);
+
+    greq = MPIDI_CH4I_am_win_request_create();
+    MPIR_Assert(greq);
+    greq->kind = MPIR_REQUEST_KIND__RMA;
+    *req = greq;
+
+    /* The header carries the address of the original GET request */
+    rreq = (MPIR_Request *) msg_hdr->greq_ptr;
+    MPIR_Assert(rreq->kind == MPIR_REQUEST_KIND__RMA);
+    MPIDI_CH4U_REQUEST(greq, req->greq.greq_ptr) = (uint64_t) rreq;
+
+    /* Release the iovec description kept in the original request.
+     * NOTE(review): the dt_iov field is not cleared after the free, so it
+     * keeps a stale pointer - confirm nothing reads or frees it later. */
+    if (MPIDI_CH4U_REQUEST(rreq, req->greq.dt_iov)) {
+        MPL_free(MPIDI_CH4U_REQUEST(rreq, req->greq.dt_iov));
+    }
+
+    *cmpl_handler_fn = MPIDI_CH4U_get_ack_cmpl_handler;
+    MPIDI_CH4U_REQUEST(greq, req->seq_no) = OPA_fetch_and_add_int(&MPIDI_CH4_Global.nxt_seq_no, 1);
+
+    MPIDI_Datatype_get_info(MPIDI_CH4U_REQUEST(rreq, req->greq.count),
+                            MPIDI_CH4U_REQUEST(rreq, req->greq.datatype),
+                            dt_contig, data_sz, dt_ptr, dt_true_lb);
+
+    *is_contig = dt_contig;
+
+    if (dt_contig) {
+        /* One flat region starting at the datatype's true lower bound */
+        *p_data_sz = data_sz;
+        *data = (char *) (MPIDI_CH4U_REQUEST(rreq, req->greq.addr) + dt_true_lb);
+    }
+    else {
+        /* Build an iovec for the non-contiguous datatype: first count the
+         * contiguous blocks, then fill an array of that many entries. */
+        segment_ptr = MPIDU_Segment_alloc();
+        MPIR_Assert(segment_ptr);
+
+        MPIDU_Segment_init((void *) MPIDI_CH4U_REQUEST(rreq, req->greq.addr),
+                           MPIDI_CH4U_REQUEST(rreq, req->greq.count),
+                           MPIDI_CH4U_REQUEST(rreq, req->greq.datatype), segment_ptr, 0);
+        last = data_sz;
+        MPIDU_Segment_count_contig_blocks(segment_ptr, 0, &last, &num_iov);
+        n_iov = (int) num_iov;
+        MPIR_Assert(n_iov > 0);
+        MPIDI_CH4U_REQUEST(rreq, req->iov) =
+            (struct iovec *) MPL_malloc(n_iov * sizeof(struct iovec));
+        MPIR_Assert(MPIDI_CH4U_REQUEST(rreq, req->iov));
+
+        /* 'last' is an in/out argument of the segment calls, so reset it */
+        last = data_sz;
+        MPIDU_Segment_pack_vector(segment_ptr, 0, &last, MPIDI_CH4U_REQUEST(rreq, req->iov),
+                                  &n_iov);
+        MPIR_Assert(last == (MPI_Aint) data_sz);
+        *data = MPIDI_CH4U_REQUEST(rreq, req->iov);
+        *p_data_sz = n_iov;
+        /* The iovec and the flag are stored on rreq, not on the new greq */
+        MPIDI_CH4U_REQUEST(rreq, req->status) |= MPIDI_CH4U_REQ_RCV_NON_CONTIG;
+        MPL_free(segment_ptr);
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_GET_ACK_HANDLER);
+    return mpi_errno;
+}
+
+#endif /* CH4R_CALLBACKS_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4r_init.h b/src/mpid/ch4/src/ch4r_init.h
new file mode 100644
index 0000000..124ac49
--- /dev/null
+++ b/src/mpid/ch4/src/ch4r_init.h
@@ -0,0 +1,333 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4R_INIT_H_INCLUDED
+#define CH4R_INIT_H_INCLUDED
+
+#include "ch4_impl.h"
+#include "ch4i_util.h"
+#include "ch4r_buf.h"
+#include "ch4r_callbacks.h"
+#include "mpl_uthash.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_init_comm
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Register a communicator with the CH4U layer.
+ *
+ * Stores comm in the global comm_req_lists table (indexed by context index,
+ * is_localcomm and subcomm_type, all derived from comm->recvcontext_id),
+ * resets the communicator's posted and unexpected lists, and then moves every
+ * entry found on the list returned by MPIDI_CH4U_context_id_to_uelist() onto
+ * the communicator's own unexp_list.
+ *
+ * Returns mpi_errno, which this body never changes from MPI_SUCCESS.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_init_comm(MPIR_Comm * comm)
+{
+    int mpi_errno = MPI_SUCCESS, comm_idx, subcomm_type, is_localcomm;
+    MPIDI_CH4U_rreq_t **uelist;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_INIT_COMM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_INIT_COMM);
+
+    /*
+     * Prevents double initialization of some special communicators.
+     *
+     * comm_world and comm_self may enter this function twice: first during
+     * MPIDI_CH4U_init and a second time during MPIR_Comm_commit in MPIDI_Init.
+     * If an unexpected message arrives early, before the second visit,
+     * the following code would wipe out the unexpected queue and the message
+     * would be lost forever.
+     */
+    if (unlikely(MPIDI_CH4_Global.is_ch4u_initialized &&
+                 (comm == MPIR_Process.comm_world || comm == MPIR_Process.comm_self)))
+        goto fn_exit;
+
+    /* All three table coordinates are decoded from the receive context id. */
+    comm_idx = MPIDI_CH4U_get_context_index(comm->recvcontext_id);
+    subcomm_type = MPIR_CONTEXT_READ_FIELD(SUBCOMM, comm->recvcontext_id);
+    is_localcomm = MPIR_CONTEXT_READ_FIELD(IS_LOCALCOMM, comm->recvcontext_id);
+
+    MPIR_Assert(subcomm_type <= 3);
+    MPIR_Assert(is_localcomm <= 1);
+    MPIDI_CH4_Global.comm_req_lists[comm_idx].comm[is_localcomm][subcomm_type] = comm;
+    MPIDI_CH4U_COMM(comm, posted_list) = NULL;
+    MPIDI_CH4U_COMM(comm, unexp_list) = NULL;
+
+    /* Adopt any entries already queued for this context id. */
+    uelist = MPIDI_CH4U_context_id_to_uelist(comm->context_id);
+    if (*uelist) {
+        MPIDI_CH4U_rreq_t *curr, *tmp;
+        MPL_DL_FOREACH_SAFE(*uelist, curr, tmp) {
+            MPL_DL_DELETE(*uelist, curr);
+            MPIR_Comm_add_ref(comm);    /* +1 for each entry in unexp_list */
+            MPL_DL_APPEND(MPIDI_CH4U_COMM(comm, unexp_list), curr);
+        }
+        *uelist = NULL;
+    }
+
+    MPIDI_CH4U_COMM(comm, window_instance) = 0;
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_INIT_COMM);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_destroy_comm
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Unregister a communicator from the CH4U layer.
+ *
+ * Looks up the slot in the global comm_req_lists table that corresponds to
+ * comm->recvcontext_id, asserts that the registered communicator has no
+ * pending posted or unexpected requests, and clears the slot.
+ *
+ * Returns mpi_errno, which this body never changes from MPI_SUCCESS.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_destroy_comm(MPIR_Comm * comm)
+{
+    int mpi_errno = MPI_SUCCESS, comm_idx, subcomm_type, is_localcomm;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_DESTROY_COMM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_DESTROY_COMM);
+
+    comm_idx = MPIDI_CH4U_get_context_index(comm->recvcontext_id);
+    subcomm_type = MPIR_CONTEXT_READ_FIELD(SUBCOMM, comm->recvcontext_id);
+    is_localcomm = MPIR_CONTEXT_READ_FIELD(IS_LOCALCOMM, comm->recvcontext_id);
+
+    MPIR_Assert(subcomm_type <= 3);
+    MPIR_Assert(is_localcomm <= 1);
+    MPIR_Assert(MPIDI_CH4_Global.comm_req_lists[comm_idx].comm[is_localcomm][subcomm_type] != NULL);
+
+    /* The slot must be addressed with both indices; indexing with
+     * subcomm_type alone yields a row of the table, not a communicator
+     * pointer, so the test would always be true and could index past the
+     * first dimension. */
+    if (MPIDI_CH4_Global.comm_req_lists[comm_idx].comm[is_localcomm][subcomm_type]) {
+        MPIR_Assert(MPIDI_CH4_Global.comm_req_lists[comm_idx].comm[is_localcomm][subcomm_type]->dev.
+                    ch4.ch4u.posted_list == NULL);
+        MPIR_Assert(MPIDI_CH4_Global.comm_req_lists[comm_idx].comm[is_localcomm][subcomm_type]->dev.
+                    ch4.ch4u.unexp_list == NULL);
+    }
+    MPIDI_CH4_Global.comm_req_lists[comm_idx].comm[is_localcomm][subcomm_type] = NULL;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_DESTROY_COMM);
+    return mpi_errno;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Initialize the CH4U (active-message fallback) layer.
+ *
+ * Allocates the global per-context communicator table and the buffer pool,
+ * resets the global queues and sequence counters, registers an
+ * origin-completion / target handler pair with the netmod for every CH4U
+ * message type, and finally registers comm_world and comm_self.
+ *
+ * num_contexts and netmod_contexts are not used by this body.
+ *
+ * Returns MPI_SUCCESS, or the first error reported by a registration or by
+ * MPIDI_CH4U_init_comm.  The function-exit trace is emitted on both the
+ * success and the failure path.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_init(MPIR_Comm * comm_world, MPIR_Comm * comm_self,
+                                   int num_contexts, void **netmod_contexts)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_INIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_INIT);
+
+    /* Stays 0 until everything below succeeded, so that the two
+     * MPIDI_CH4U_init_comm calls at the end are not short-circuited. */
+    MPIDI_CH4_Global.is_ch4u_initialized = 0;
+
+    MPIDI_CH4_Global.comm_req_lists = (MPIDI_CH4U_comm_req_list_t *)
+        MPL_calloc(MPIR_MAX_CONTEXT_MASK * MPIR_CONTEXT_INT_BITS,
+                   sizeof(MPIDI_CH4U_comm_req_list_t));
+    MPIR_Assert(MPIDI_CH4_Global.comm_req_lists);
+#ifndef MPIDI_CH4U_USE_PER_COMM_QUEUE
+    MPIDI_CH4_Global.posted_list = NULL;
+    MPIDI_CH4_Global.unexp_list = NULL;
+#endif
+
+    MPIDI_CH4_Global.cmpl_list = NULL;
+    OPA_store_int(&MPIDI_CH4_Global.exp_seq_no, 0);
+    OPA_store_int(&MPIDI_CH4_Global.nxt_seq_no, 0);
+
+    MPIDI_CH4_Global.buf_pool = MPIDI_CH4U_create_buf_pool(MPIDI_CH4I_BUF_POOL_NUM,
+                                                           MPIDI_CH4I_BUF_POOL_SZ);
+    MPIR_Assert(MPIDI_CH4_Global.buf_pool);
+
+    /* Point-to-point send protocols */
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_SEND,
+                                         &MPIDI_CH4U_send_origin_cmpl_handler,
+                                         &MPIDI_CH4U_send_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_SEND_LONG_REQ, NULL /* Injection only */ ,
+                                         &MPIDI_CH4U_send_long_req_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_SEND_LONG_ACK, NULL /* Injection only */ ,
+                                         &MPIDI_CH4U_send_long_ack_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_SEND_LONG_LMT,
+                                         &MPIDI_CH4U_send_long_lmt_origin_cmpl_handler,
+                                         &MPIDI_CH4U_send_long_lmt_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_SSEND_REQ,
+                                         &MPIDI_CH4U_send_origin_cmpl_handler,
+                                         &MPIDI_CH4U_ssend_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_SSEND_ACK,
+                                         &MPIDI_CH4U_ssend_ack_origin_cmpl_handler,
+                                         &MPIDI_CH4U_ssend_ack_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* One-sided (RMA) protocols */
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_PUT_REQ,
+                                         &MPIDI_CH4U_put_origin_cmpl_handler,
+                                         &MPIDI_CH4U_put_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_PUT_ACK,
+                                         NULL, &MPIDI_CH4U_put_ack_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_GET_REQ,
+                                         &MPIDI_CH4U_get_origin_cmpl_handler,
+                                         &MPIDI_CH4U_get_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_GET_ACK,
+                                         &MPIDI_CH4U_get_ack_origin_cmpl_handler,
+                                         &MPIDI_CH4U_get_ack_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_CSWAP_REQ,
+                                         &MPIDI_CH4U_cswap_origin_cmpl_handler,
+                                         &MPIDI_CH4U_cswap_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_CSWAP_ACK,
+                                         &MPIDI_CH4U_cswap_ack_origin_cmpl_handler,
+                                         &MPIDI_CH4U_cswap_ack_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_ACC_REQ,
+                                         &MPIDI_CH4U_acc_origin_cmpl_handler,
+                                         &MPIDI_CH4U_handle_acc_request);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_ACC_ACK,
+                                         NULL, &MPIDI_CH4U_acc_ack_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_GET_ACC_ACK,
+                                         &MPIDI_CH4U_get_acc_ack_origin_cmpl_handler,
+                                         &MPIDI_CH4U_get_acc_ack_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_WIN_CTRL,
+                                         NULL, &MPIDI_CH4U_win_ctrl_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_PUT_IOV_REQ,
+                                         &MPIDI_CH4U_put_iov_origin_cmpl_handler,
+                                         &MPIDI_CH4U_put_iov_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_PUT_IOV_ACK,
+                                         NULL, &MPIDI_CH4U_put_iov_ack_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_PUT_DAT_REQ,
+                                         &MPIDI_CH4U_put_data_origin_cmpl_handler,
+                                         &MPIDI_CH4U_put_data_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_ACC_IOV_REQ,
+                                         &MPIDI_CH4U_acc_iov_origin_cmpl_handler,
+                                         &MPIDI_CH4U_acc_iov_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_ACC_IOV_ACK,
+                                         NULL, &MPIDI_CH4U_acc_iov_ack_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_NM_reg_hdr_handler(MPIDI_CH4U_ACC_DAT_REQ,
+                                         &MPIDI_CH4U_acc_data_origin_cmpl_handler,
+                                         &MPIDI_CH4U_acc_data_target_handler);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    /* Register the two predefined communicators. */
+    mpi_errno = MPIDI_CH4U_init_comm(comm_world);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIDI_CH4U_init_comm(comm_self);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    MPIDI_CH4_Global.win_hash = NULL;
+
+    MPIDI_CH4_Global.is_ch4u_initialized = 1;
+
+  fn_exit:
+    /* Emitted here, after the label, so that error paths also log the exit. */
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_INIT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_finalize
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Tear down the CH4U layer: mark it uninitialized, clear the window hash,
+ * destroy the buffer pool and free the per-context communicator table.
+ */
+__CH4_INLINE__ void MPIDI_CH4U_finalize(void)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_FINALIZE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_FINALIZE);
+    MPIDI_CH4_Global.is_ch4u_initialized = 0;
+    MPL_HASH_CLEAR(dev.ch4u.hash_handle, MPIDI_CH4_Global.win_hash);
+    MPIDI_CH4R_destroy_buf_pool(MPIDI_CH4_Global.buf_pool);
+    MPL_free(MPIDI_CH4_Global.comm_req_lists);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_FINALIZE);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_alloc_mem
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Allocate size bytes with MPL_malloc and return the result unchanged.
+ * info_ptr is accepted for interface compatibility and is not consulted.
+ */
+__CH4_INLINE__ void *MPIDI_CH4U_alloc_mem(size_t size, MPIR_Info * info_ptr)
+{
+    void *mem;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ALLOC_MEM);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ALLOC_MEM);
+    mem = MPL_malloc(size);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ALLOC_MEM);
+
+    return mem;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_free_mem
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Release memory through MPL_free.  Always returns MPI_SUCCESS.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_free_mem(void *ptr)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_FREE_MEM);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_FREE_MEM);
+
+    MPL_free(ptr);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_FREE_MEM);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDIU_update_node_map
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Copy the first size entries of node_map into the global node map row
+ * selected by avtid.  Always returns MPI_SUCCESS.
+ */
+static inline int MPIDIU_update_node_map(int avtid, int size, MPID_Node_id_t node_map[])
+{
+    int pos = 0;
+
+    while (pos < size) {
+        MPIDI_CH4_Global.node_map[avtid][pos] = node_map[pos];
+        ++pos;
+    }
+
+    return MPI_SUCCESS;
+}
+
+#endif /* CH4R_INIT_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4r_probe.h b/src/mpid/ch4/src/ch4r_probe.h
new file mode 100644
index 0000000..20753db
--- /dev/null
+++ b/src/mpid/ch4/src/ch4r_probe.h
@@ -0,0 +1,176 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4R_PROBE_H_INCLUDED
+#define CH4R_PROBE_H_INCLUDED
+
+#include "ch4_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_iprobe
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Non-blocking probe of the unexpected queue.
+ *
+ * source, tag, comm and context_offset select the message to look for;
+ * *flag is set to 1 when a matching unexpected request exists and to 0
+ * otherwise.  On a match the source, tag and count are recorded in the
+ * request's status and copied to *status unless status is
+ * MPI_STATUS_IGNORE.  When nothing matches, one progress pass is made
+ * before returning.
+ *
+ * Returns mpi_errno, which this body never changes from MPI_SUCCESS.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_iprobe(int source,
+                                     int tag,
+                                     MPIR_Comm * comm,
+                                     int context_offset, int *flag, MPI_Status * status)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Comm *root_comm;
+    MPIR_Request *unexp_req;
+    uint64_t match_bits, mask_bits;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_IPROBE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_IPROBE);
+
+    if (unlikely(source == MPI_PROC_NULL)) {
+        MPIR_Status_set_procnull(status);
+        *flag = 1;
+        goto fn_exit;
+    }
+
+    root_comm = MPIDI_CH4U_context_id_to_comm(comm->context_id);
+    match_bits = MPIDI_CH4U_init_recvtag(&mask_bits, root_comm->recvcontext_id +
+                                         context_offset, source, tag);
+
+    /* MPIDI_CS_ENTER(); */
+    unexp_req = MPIDI_CH4U_find_unexp(match_bits, mask_bits,
+                                      &MPIDI_CH4U_COMM(root_comm, unexp_list));
+
+    if (unexp_req) {
+        *flag = 1;
+        unexp_req->status.MPI_ERROR = MPI_SUCCESS;
+        unexp_req->status.MPI_SOURCE = MPIDI_CH4U_REQUEST(unexp_req, src_rank);
+        unexp_req->status.MPI_TAG = MPIDI_CH4U_get_tag(MPIDI_CH4U_REQUEST(unexp_req, tag));
+        MPIR_STATUS_SET_COUNT(unexp_req->status, MPIDI_CH4U_REQUEST(unexp_req, count));
+
+        /* The caller may legitimately pass MPI_STATUS_IGNORE, which must
+         * never be dereferenced. */
+        if (status != MPI_STATUS_IGNORE) {
+            status->MPI_TAG = unexp_req->status.MPI_TAG;
+            status->MPI_SOURCE = unexp_req->status.MPI_SOURCE;
+            MPIR_STATUS_SET_COUNT(*status, MPIDI_CH4U_REQUEST(unexp_req, count));
+        }
+    }
+    else {
+        *flag = 0;
+        MPIDI_Progress_test();
+    }
+    /* MPIDI_CS_EXIT(); */
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_IPROBE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_probe
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Blocking probe: repeatedly calls MPIDI_CH4U_iprobe until it reports a
+ * match through its flag output, or until it returns an error.
+ *
+ * Returns MPI_SUCCESS or the error reported by MPIDI_CH4U_iprobe.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_probe(int source,
+                                    int tag,
+                                    MPIR_Comm * comm, int context_offset, MPI_Status * status)
+{
+    int mpi_errno = MPI_SUCCESS, flag = 0;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_PROBE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_PROBE);
+
+    while (!flag) {
+        mpi_errno = MPIDI_CH4U_iprobe(source, tag, comm, context_offset, &flag, status);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_PROBE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_improbe
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Non-blocking matched probe.
+ *
+ * Like MPIDI_CH4U_iprobe, but a matching request is removed from the
+ * unexpected queue and handed back through *message, marked as an MPROBE
+ * request that belongs to comm.  *flag is 1 on a match and 0 otherwise.
+ * Source, tag and count are copied to *status unless status is
+ * MPI_STATUS_IGNORE.  When nothing matches, one progress pass is made
+ * before returning.
+ *
+ * Returns mpi_errno, which this body never changes from MPI_SUCCESS.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_improbe(int source,
+                                      int tag,
+                                      MPIR_Comm * comm,
+                                      int context_offset,
+                                      int *flag, MPIR_Request ** message, MPI_Status * status)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Comm *root_comm;
+    MPIR_Request *unexp_req;
+    uint64_t match_bits, mask_bits;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_IMPROBE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_IMPROBE);
+
+    if (unlikely(source == MPI_PROC_NULL)) {
+        MPIR_Status_set_procnull(status);
+        *flag = 1;
+        goto fn_exit;
+    }
+
+    root_comm = MPIDI_CH4U_context_id_to_comm(comm->context_id);
+    match_bits = MPIDI_CH4U_init_recvtag(&mask_bits, root_comm->recvcontext_id +
+                                         context_offset, source, tag);
+
+    /* MPIDI_CS_ENTER(); */
+    unexp_req = MPIDI_CH4U_dequeue_unexp(match_bits, mask_bits,
+                                         &MPIDI_CH4U_COMM(root_comm, unexp_list));
+
+    if (unexp_req) {
+        *flag = 1;
+        *message = unexp_req;
+
+        (*message)->kind = MPIR_REQUEST_KIND__MPROBE;
+        (*message)->comm = comm;
+        /* Notes on refcounting comm:
+         * We intentionally do nothing here because what we are supposed to do here
+         * is -1 for dequeue(unexp_list) and +1 for (*message)->comm */
+
+        unexp_req->status.MPI_ERROR = MPI_SUCCESS;
+        unexp_req->status.MPI_SOURCE = MPIDI_CH4U_REQUEST(unexp_req, src_rank);
+        unexp_req->status.MPI_TAG = MPIDI_CH4U_get_tag(MPIDI_CH4U_REQUEST(unexp_req, tag));
+        MPIR_STATUS_SET_COUNT(unexp_req->status, MPIDI_CH4U_REQUEST(unexp_req, count));
+        MPIDI_CH4U_REQUEST(unexp_req, req->status) |= MPIDI_CH4U_REQ_UNEXP_DQUED;
+
+        /* The caller may legitimately pass MPI_STATUS_IGNORE, which must
+         * never be dereferenced. */
+        if (status != MPI_STATUS_IGNORE) {
+            status->MPI_TAG = unexp_req->status.MPI_TAG;
+            status->MPI_SOURCE = unexp_req->status.MPI_SOURCE;
+            MPIR_STATUS_SET_COUNT(*status, MPIDI_CH4U_REQUEST(unexp_req, count));
+        }
+    }
+    else {
+        *flag = 0;
+        MPIDI_Progress_test();
+    }
+    /* MPIDI_CS_EXIT(); */
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_IMPROBE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_mprobe
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Blocking matched probe: repeatedly calls MPIDI_CH4U_improbe until it
+ * reports a match through its flag output, or until it returns an error.
+ *
+ * Returns MPI_SUCCESS or the error reported by MPIDI_CH4U_improbe.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_mprobe(int source,
+                                     int tag,
+                                     MPIR_Comm * comm,
+                                     int context_offset,
+                                     MPIR_Request ** message, MPI_Status * status)
+{
+    int mpi_errno = MPI_SUCCESS, flag = 0;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_MPROBE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_MPROBE);
+
+    while (!flag) {
+        mpi_errno = MPIDI_CH4U_improbe(source, tag, comm, context_offset, &flag, message, status);
+        /* Stop on error instead of retrying, as MPIDI_CH4U_probe does. */
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_MPROBE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#endif /* CH4R_PROBE_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4r_proc.h b/src/mpid/ch4/src/ch4r_proc.h
new file mode 100644
index 0000000..65c75bf
--- /dev/null
+++ b/src/mpid/ch4/src/ch4r_proc.h
@@ -0,0 +1,411 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4R_PROC_H_INCLUDED
+#define CH4R_PROC_H_INCLUDED
+
+#include "ch4_types.h"
+#include "build_nodemap.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDIU_comm_rank_to_pid
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Translate a communicator rank into a process index using the
+ * communicator's rank map.
+ *
+ *   comm   communicator whose `map` is consulted
+ *   rank   rank within comm
+ *   index  out: computed process index
+ *   avtid  out: address-vector-table id; written only for the non-INTRA
+ *          map modes and left untouched for the *_INTRA modes, so the
+ *          caller has to initialize it beforehand
+ *
+ * Returns *index.  For MPIDII_RANK_MAP_NONE the function asserts and
+ * neither output is written.
+ */
+static inline int MPIDIU_comm_rank_to_pid(MPIR_Comm * comm, int rank, int *index, int *avtid)
+{
+    switch (MPIDII_COMM(comm, map).mode) {
+    case MPIDII_RANK_MAP_DIRECT:
+        *avtid = MPIDII_COMM(comm, map).avtid;
+        *index = rank;
+        break;
+    case MPIDII_RANK_MAP_DIRECT_INTRA:
+        *index = rank;
+        break;
+    case MPIDII_RANK_MAP_OFFSET:
+        *avtid = MPIDII_COMM(comm, map).avtid;
+        *index = rank + MPIDII_COMM(comm, map).reg.offset;
+        break;
+    case MPIDII_RANK_MAP_OFFSET_INTRA:
+        *index = rank + MPIDII_COMM(comm, map).reg.offset;
+        break;
+    case MPIDII_RANK_MAP_STRIDE:
+        *avtid = MPIDII_COMM(comm, map).avtid;
+        *index = MPIDII_CALC_STRIDE_SIMPLE(rank, MPIDII_COMM(comm, map).reg.stride.stride,
+                                           MPIDII_COMM(comm, map).reg.stride.offset);
+        break;
+    case MPIDII_RANK_MAP_STRIDE_INTRA:
+        *index = MPIDII_CALC_STRIDE_SIMPLE(rank, MPIDII_COMM(comm, map).reg.stride.stride,
+                                           MPIDII_COMM(comm, map).reg.stride.offset);
+        break;
+    case MPIDII_RANK_MAP_STRIDE_BLOCK:
+        *avtid = MPIDII_COMM(comm, map).avtid;
+        *index = MPIDII_CALC_STRIDE(rank, MPIDII_COMM(comm, map).reg.stride.stride,
+                                    MPIDII_COMM(comm, map).reg.stride.blocksize,
+                                    MPIDII_COMM(comm, map).reg.stride.offset);
+        break;
+    case MPIDII_RANK_MAP_STRIDE_BLOCK_INTRA:
+        *index = MPIDII_CALC_STRIDE(rank, MPIDII_COMM(comm, map).reg.stride.stride,
+                                    MPIDII_COMM(comm, map).reg.stride.blocksize,
+                                    MPIDII_COMM(comm, map).reg.stride.offset);
+        break;
+    case MPIDII_RANK_MAP_LUT:
+        *avtid = MPIDII_COMM(comm, map).avtid;
+        *index = MPIDII_COMM(comm, map).irreg.lut.lpid[rank];
+        break;
+    case MPIDII_RANK_MAP_LUT_INTRA:
+        *index = MPIDII_COMM(comm, map).irreg.lut.lpid[rank];
+        break;
+    case MPIDII_RANK_MAP_MLUT:
+        /* Multi-table lookup: each rank carries its own (avtid, lpid) pair. */
+        *index = MPIDII_COMM(comm, map).irreg.mlut.gpid[rank].lpid;
+        *avtid = MPIDII_COMM(comm, map).irreg.mlut.gpid[rank].avtid;
+        break;
+    case MPIDII_RANK_MAP_NONE:
+        MPIR_Assert(0);
+        break;
+    }
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                    (MPL_DBG_FDEST, " rank=%d, index=%d", rank, *index));
+    return *index;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDIU_comm_rank_to_av
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Return a pointer to the address-vector entry for a communicator rank.
+ *
+ * The index computation mirrors MPIDIU_comm_rank_to_pid.  Non-INTRA modes
+ * look the entry up in MPIDII_av_table[avtid]; the *_INTRA modes use
+ * MPIDII_av_table0 directly.
+ *
+ * Returns NULL for MPIDII_RANK_MAP_NONE (after asserting) and for any mode
+ * value not handled by the switch.
+ */
+static inline MPIDII_av_entry_t *MPIDIU_comm_rank_to_av(MPIR_Comm * comm, int rank)
+{
+    switch (MPIDII_COMM(comm, map).mode) {
+    case MPIDII_RANK_MAP_DIRECT:
+        return &MPIDII_av_table[MPIDII_COMM(comm, map).avtid]->table[rank];
+    case MPIDII_RANK_MAP_DIRECT_INTRA:
+        return &MPIDII_av_table0->table[rank];
+    case MPIDII_RANK_MAP_OFFSET:
+        return &MPIDII_av_table[MPIDII_COMM(comm, map).avtid]
+            ->table[rank + MPIDII_COMM(comm, map).reg.offset];
+    case MPIDII_RANK_MAP_OFFSET_INTRA:
+        return &MPIDII_av_table0->table[rank + MPIDII_COMM(comm, map).reg.offset];
+    case MPIDII_RANK_MAP_STRIDE:
+        return &MPIDII_av_table[MPIDII_COMM(comm, map).avtid]
+            ->table[MPIDII_CALC_STRIDE_SIMPLE(rank,
+                                              MPIDII_COMM(comm, map).reg.stride.stride,
+                                              MPIDII_COMM(comm, map).reg.stride.offset)];
+    case MPIDII_RANK_MAP_STRIDE_INTRA:
+        return &MPIDII_av_table0->table[MPIDII_CALC_STRIDE_SIMPLE(rank,
+                                                                  MPIDII_COMM(comm,
+                                                                              map).reg.stride.
+                                                                  stride, MPIDII_COMM(comm,
+                                                                                      map).reg.
+                                                                  stride.offset)];
+    case MPIDII_RANK_MAP_STRIDE_BLOCK:
+        return &MPIDII_av_table[MPIDII_COMM(comm, map).avtid]
+            ->table[MPIDII_CALC_STRIDE(rank,
+                                       MPIDII_COMM(comm, map).reg.stride.stride,
+                                       MPIDII_COMM(comm, map).reg.stride.blocksize,
+                                       MPIDII_COMM(comm, map).reg.stride.offset)];
+    case MPIDII_RANK_MAP_STRIDE_BLOCK_INTRA:
+        return &MPIDII_av_table0->table[MPIDII_CALC_STRIDE(rank,
+                                                           MPIDII_COMM(comm, map).reg.stride.stride,
+                                                           MPIDII_COMM(comm,
+                                                                       map).reg.stride.blocksize,
+                                                           MPIDII_COMM(comm,
+                                                                       map).reg.stride.offset)];
+    case MPIDII_RANK_MAP_LUT:
+        return &MPIDII_av_table[MPIDII_COMM(comm, map).avtid]
+            ->table[MPIDII_COMM(comm, map).irreg.lut.lpid[rank]];
+    case MPIDII_RANK_MAP_LUT_INTRA:
+        return &MPIDII_av_table0->table[MPIDII_COMM(comm, map).irreg.lut.lpid[rank]];
+    case MPIDII_RANK_MAP_MLUT:
+        /* Multi-table lookup: the table id itself is per rank. */
+        return &MPIDII_av_table[MPIDII_COMM(comm, map).irreg.mlut.gpid[rank].avtid]
+            ->table[MPIDII_COMM(comm, map).irreg.mlut.gpid[rank].lpid];
+    case MPIDII_RANK_MAP_NONE:
+        MPIR_Assert(0);
+        return NULL;
+    }
+    return NULL;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDIU_comm_rank_to_pid_local
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Translate a rank into a process index using the communicator's
+ * `local_map` (as opposed to `map`, which MPIDIU_comm_rank_to_pid uses).
+ *
+ *   comm   communicator whose `local_map` is consulted
+ *   rank   rank to translate
+ *   index  out: computed process index
+ *   avtid  out: local_map.avtid, or the per-rank table id in MLUT mode
+ *
+ * Returns *index.  For MPIDII_RANK_MAP_NONE the function asserts and
+ * *index is not written.
+ */
+static inline int MPIDIU_comm_rank_to_pid_local(MPIR_Comm * comm, int rank, int *index, int *avtid)
+{
+    *avtid = MPIDII_COMM(comm, local_map).avtid;
+    switch (MPIDII_COMM(comm, local_map).mode) {
+    case MPIDII_RANK_MAP_DIRECT:
+    case MPIDII_RANK_MAP_DIRECT_INTRA:
+        *index = rank;
+        break;
+    case MPIDII_RANK_MAP_OFFSET:
+    case MPIDII_RANK_MAP_OFFSET_INTRA:
+        *index = rank + MPIDII_COMM(comm, local_map).reg.offset;
+        break;
+    case MPIDII_RANK_MAP_STRIDE:
+    case MPIDII_RANK_MAP_STRIDE_INTRA:
+        /* Stride parameters must come from local_map, like every other
+         * case in this function, not from the remote `map`. */
+        *index = MPIDII_CALC_STRIDE_SIMPLE(rank, MPIDII_COMM(comm, local_map).reg.stride.stride,
+                                           MPIDII_COMM(comm, local_map).reg.stride.offset);
+        break;
+    case MPIDII_RANK_MAP_STRIDE_BLOCK:
+    case MPIDII_RANK_MAP_STRIDE_BLOCK_INTRA:
+        *index = MPIDII_CALC_STRIDE(rank, MPIDII_COMM(comm, local_map).reg.stride.stride,
+                                    MPIDII_COMM(comm, local_map).reg.stride.blocksize,
+                                    MPIDII_COMM(comm, local_map).reg.stride.offset);
+        break;
+    case MPIDII_RANK_MAP_LUT:
+    case MPIDII_RANK_MAP_LUT_INTRA:
+        *index = MPIDII_COMM(comm, local_map).irreg.lut.lpid[rank];
+        break;
+    case MPIDII_RANK_MAP_MLUT:
+        *index = MPIDII_COMM(comm, local_map).irreg.mlut.gpid[rank].lpid;
+        *avtid = MPIDII_COMM(comm, local_map).irreg.mlut.gpid[rank].avtid;
+        break;
+    case MPIDII_RANK_MAP_NONE:
+        MPIR_Assert(0);
+        break;
+    }
+    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                    (MPL_DBG_FDEST, " rank: rank=%d, index=%d", rank, *index));
+    return *index;
+}
+
+/*
+ * Report whether a rank of comm is flagged as local in its address-vector
+ * entry.
+ *
+ * Returns 0 for intercommunicators and whenever the build does not define
+ * MPIDI_BUILD_CH4_LOCALITY_INFO; otherwise returns the is_local field of
+ * the rank's address-vector entry.
+ */
+static inline int MPIDI_CH4U_rank_is_local(int rank, MPIR_Comm * comm)
+{
+    /* Defined result even when locality information is compiled out. */
+    int ret = 0;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDIU_RANK_IS_LOCAL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDIU_RANK_IS_LOCAL);
+
+#ifdef MPIDI_BUILD_CH4_LOCALITY_INFO
+    if (comm->comm_kind != MPIR_COMM_KIND__INTERCOMM) {
+        ret = MPIDIU_comm_rank_to_av(comm, rank)->is_local;
+        MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE,
+                        (MPL_DBG_FDEST, " is_local=%d, rank=%d", ret, rank));
+    }
+#endif
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDIU_RANK_IS_LOCAL);
+    return ret;
+}
+
+
+/*
+ * Map a communicator rank to its process index.  The index is returned
+ * only when the rank resolves to address-vector table 0; any other table
+ * yields -1.
+ */
+static inline int MPIDI_CH4U_rank_to_lpid(int rank, MPIR_Comm * comm)
+{
+    int ret;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDIU_RANK_TO_LPID);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDIU_RANK_TO_LPID);
+
+    /* Both outputs start at 0 because the lookup does not write avtid in
+     * every map mode. */
+    int table_id = 0, proc_idx = 0;
+    MPIDIU_comm_rank_to_pid(comm, rank, &proc_idx, &table_id);
+    ret = (table_id == 0) ? proc_idx : -1;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDIU_RANK_TO_LPID);
+    return ret;
+}
+
+/*
+ * Store in *id_p the node id recorded in the global node map for the given
+ * rank of comm.  Always returns MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_get_node_id(MPIR_Comm * comm, int rank, MPID_Node_id_t * id_p)
+{
+    /* Pre-zeroed: the lookup does not write avtid in every map mode. */
+    int table_id = 0;
+    int proc_idx = 0;
+
+    MPIDIU_comm_rank_to_pid(comm, rank, &proc_idx, &table_id);
+    *id_p = MPIDI_CH4_Global.node_map[table_id][proc_idx];
+
+    return MPI_SUCCESS;
+}
+
+/*
+ * Store the global max_node_id in *max_id_p.  comm is not consulted.
+ * Always returns MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4U_get_max_node_id(MPIR_Comm * comm, MPID_Node_id_t * max_id_p)
+{
+    *max_id_p = MPIDI_CH4_Global.max_node_id;
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_build_nodemap
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Thin wrapper around MPIR_NODEMAP_build_nodemap.  Note the argument order
+ * of the wrapped call is (sz, myrank, ...); comm is not forwarded.
+ */
+static inline int MPIDI_CH4U_build_nodemap(int myrank,
+                                           MPIR_Comm * comm,
+                                           int sz,
+                                           MPID_Node_id_t * out_nodemap, MPID_Node_id_t * sz_out)
+{
+    int rc;
+
+    rc = MPIR_NODEMAP_build_nodemap(sz, myrank, out_nodemap, sz_out);
+    return rc;
+}
+
+/* Return the n_avts counter of the global address-vector-table manager. */
+static inline int MPIDIU_get_n_avts()
+{
+    return MPIDI_CH4_Global.avt_mgr.n_avts;
+}
+
+/* Return the max_n_avts field of the global address-vector-table manager. */
+static inline int MPIDIU_get_max_n_avts()
+{
+    return MPIDI_CH4_Global.avt_mgr.max_n_avts;
+}
+
+/* Return the size field of address-vector table avtid. */
+static inline int MPIDIU_get_avt_size(int avtid)
+{
+    return MPIDII_av_table[avtid]->size;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDIU_alloc_globals_for_avtid
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Allocate the per-table global state for address-vector table avtid.
+ *
+ * Grows the global node_map pointer array when the manager's max_n_avts
+ * exceeds the recorded allocated_max_n_avts, then allocates the node map
+ * row for avtid with one entry per element of the table.
+ *
+ * Always returns MPI_SUCCESS.
+ */
+static inline int MPIDIU_alloc_globals_for_avtid(int avtid)
+{
+    int max_n_avts;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATER_ALLOC_GLOBALS_FOR_AVTID);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATER_ALLOC_GLOBALS_FOR_AVTID);
+    max_n_avts = MPIDIU_get_max_n_avts();
+    if (max_n_avts > MPIDI_CH4_Global.allocated_max_n_avts) {
+        MPIDI_CH4_Global.node_map = (MPID_Node_id_t **) MPL_realloc(MPIDI_CH4_Global.node_map,
+                                                                    max_n_avts *
+                                                                    sizeof(MPID_Node_id_t *));
+        MPIR_Assert(MPIDI_CH4_Global.node_map);
+        /* Record the new capacity so the guard above reflects what is
+         * actually allocated. */
+        MPIDI_CH4_Global.allocated_max_n_avts = max_n_avts;
+    }
+
+    MPIDI_CH4_Global.node_map[avtid] =
+        (MPID_Node_id_t *) MPL_malloc(MPIDII_av_table[avtid]->size * sizeof(MPID_Node_id_t));
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATER_ALLOC_GLOBALS_FOR_AVTID);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDIU_free_globals_for_avtid
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Free the node map row that belongs to avtid and reset its slot to NULL.
+ * Always returns MPI_SUCCESS.
+ */
+static inline int MPIDIU_free_globals_for_avtid(int avtid)
+{
+    MPID_Node_id_t **row;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATER_FREE_GLOBALS_FOR_AVTID);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATER_FREE_GLOBALS_FOR_AVTID);
+
+    row = &MPIDI_CH4_Global.node_map[avtid];
+    MPL_free(*row);
+    *row = NULL;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATER_FREE_GLOBALS_FOR_AVTID);
+    return MPI_SUCCESS;
+}
+
+/*
+ * Hand out the next free address-vector-table id.
+ *
+ * Free ids form a singly linked list stored in avt_mgr.free_avtid:
+ * next_avtid is the head, free_avtid[i] holds the id that follows i, and -1
+ * terminates the list.  When the list is empty the array is grown by one
+ * slot, which becomes the new head.
+ *
+ * Writes the id to *avtid and also returns it.
+ */
+static inline int MPIDIU_get_next_avtid(int *avtid)
+{
+    if (MPIDI_CH4_Global.avt_mgr.next_avtid == -1) {    /* out of free avtids */
+        int old_max, new_max, i;
+        old_max = MPIDI_CH4_Global.avt_mgr.max_n_avts;
+        new_max = old_max + 1;
+        MPIDI_CH4_Global.avt_mgr.free_avtid =
+            (int *) MPL_realloc(MPIDI_CH4_Global.avt_mgr.free_avtid, new_max * sizeof(int));
+        /* With a growth step of one this loop body never runs; it chains
+         * the new slots together only if more than one slot is added. */
+        for (i = old_max; i < new_max - 1; i++) {
+            MPIDI_CH4_Global.avt_mgr.free_avtid[i] = i + 1;
+        }
+        MPIDI_CH4_Global.avt_mgr.free_avtid[new_max - 1] = -1;
+        MPIDI_CH4_Global.avt_mgr.max_n_avts = new_max;
+        MPIDI_CH4_Global.avt_mgr.next_avtid = old_max;
+    }
+
+    /* Pop the head of the free list and mark its slot as in use (-1). */
+    *avtid = MPIDI_CH4_Global.avt_mgr.next_avtid;
+    MPIDI_CH4_Global.avt_mgr.next_avtid = MPIDI_CH4_Global.avt_mgr.free_avtid[*avtid];
+    MPIDI_CH4_Global.avt_mgr.free_avtid[*avtid] = -1;
+    MPIDI_CH4_Global.avt_mgr.n_avts++;
+    MPIR_Assert(MPIDI_CH4_Global.avt_mgr.n_avts <= MPIDI_CH4_Global.avt_mgr.max_n_avts);
+    return *avtid;
+}
+
+/*
+ * Return avtid to the free list by pushing it onto the head
+ * (avt_mgr.next_avtid) and decrementing the in-use counter.
+ * Always returns 0.
+ */
+static inline int MPIDIU_free_avtid(int avtid)
+{
+    MPIR_Assert(MPIDI_CH4_Global.avt_mgr.n_avts > 0);
+    MPIDI_CH4_Global.avt_mgr.free_avtid[avtid] = MPIDI_CH4_Global.avt_mgr.next_avtid;
+    MPIDI_CH4_Global.avt_mgr.next_avtid = avtid;
+    MPIDI_CH4_Global.avt_mgr.n_avts--;
+    return 0;
+}
+
+/*
+ * Create a new address-vector table with room for `size` entries.
+ *
+ * Obtains a fresh table id (returned through *avtid), allocates the table
+ * header together with its entries in a single block, resizes the global
+ * table-pointer array to the manager's current max_n_avts, installs the new
+ * table with a reference count of 0, and allocates its per-table globals.
+ *
+ * Returns mpi_errno, which this body never changes from MPI_SUCCESS; the
+ * allocation results are not checked here.
+ */
+static inline int MPIDIU_new_avt(int size, int *avtid)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int max_n_avts;
+    MPIDII_av_table_t *new_av_table;
+    MPIDIU_get_next_avtid(avtid);
+
+    new_av_table = (MPIDII_av_table_t *) MPL_malloc(size * sizeof(MPIDII_av_entry_t)
+                                                    + sizeof(MPIDII_av_table_t));
+    /* Read after MPIDIU_get_next_avtid, which may have raised max_n_avts. */
+    max_n_avts = MPIDIU_get_max_n_avts();
+    MPIDII_av_table = (MPIDII_av_table_t **) MPL_realloc(MPIDII_av_table,
+                                                         max_n_avts * sizeof(MPIDII_av_table_t *));
+    new_av_table->size = size;
+    MPIDII_av_table[*avtid] = new_av_table;
+
+    MPIR_Object_set_ref(MPIDII_av_table[*avtid], 0);
+
+    MPIDIU_alloc_globals_for_avtid(*avtid);
+    return mpi_errno;
+}
+
+/*
+ * Destroy address-vector table avtid: free its per-table globals, free the
+ * table itself, clear its slot and return the id to the free list.
+ * Always returns MPI_SUCCESS.
+ */
+static inline int MPIDIU_free_avt(int avtid)
+{
+    MPIDIU_free_globals_for_avtid(avtid);
+
+    MPL_free(MPIDII_av_table[avtid]);
+    MPIDII_av_table[avtid] = NULL;
+
+    MPIDIU_free_avtid(avtid);
+
+    return MPI_SUCCESS;
+}
+
+/* Add one reference to address-vector table avtid.  Always returns MPI_SUCCESS. */
+static inline int MPIDIU_avt_add_ref(int avtid)
+{
+    MPIR_Object_add_ref(MPIDII_av_table[avtid]);
+    return MPI_SUCCESS;
+}
+
+/*
+ * Drop one reference to address-vector table avtid and destroy the table
+ * when the count reaches zero.  Always returns MPI_SUCCESS.
+ */
+static inline int MPIDIU_avt_release_ref(int avtid)
+{
+    int count;
+    MPIR_Object_release_ref(MPIDIU_get_av_table(avtid), &count);
+    if (count == 0) {
+        /* MPIDIU_free_avt already releases the per-table globals, so they
+         * must not be released a second time here. */
+        MPIDIU_free_avt(avtid);
+    }
+    return MPI_SUCCESS;
+}
+
+/*
+ * Initialize the address-vector-table id manager with a capacity of one id
+ * and no ids in use.  The free list is built so that id 0 is the head and
+ * the last slot is terminated with -1.  Always returns MPI_SUCCESS.
+ */
+static inline int MPIDIU_avt_init()
+{
+    int i;
+    MPIDI_CH4_Global.avt_mgr.max_n_avts = 1;
+    MPIDI_CH4_Global.avt_mgr.next_avtid = 0;
+    MPIDI_CH4_Global.avt_mgr.n_avts = 0;
+    MPIDI_CH4_Global.avt_mgr.free_avtid =
+        (int *) MPL_malloc(MPIDI_CH4_Global.avt_mgr.max_n_avts * sizeof(int));
+
+    /* Chain slot i to i + 1; with a capacity of one this loop does not run. */
+    for (i = 0; i < MPIDI_CH4_Global.avt_mgr.max_n_avts - 1; i++) {
+        MPIDI_CH4_Global.avt_mgr.free_avtid[i] = i + 1;
+    }
+    MPIDI_CH4_Global.avt_mgr.free_avtid[MPIDI_CH4_Global.avt_mgr.max_n_avts - 1] = -1;
+
+    return MPI_SUCCESS;
+}
+
+/* Free the manager's free_avtid array.  Always returns MPI_SUCCESS. */
+static inline int MPIDIU_avt_destroy()
+{
+    MPL_free(MPIDI_CH4_Global.avt_mgr.free_avtid);
+    return MPI_SUCCESS;
+}
+
+/*
+ * Build the node map for address-vector table avtid by calling
+ * MPIDI_CH4U_build_nodemap with the global node map row for avtid as the
+ * output array and the global max_node_id as the second output.
+ * Returns whatever MPIDI_CH4U_build_nodemap returns.
+ */
+static inline int MPIDIU_build_nodemap_avtid(int myrank, MPIR_Comm * comm, int sz, int avtid)
+{
+    return MPIDI_CH4U_build_nodemap(myrank, comm, sz,
+                                    MPIDI_CH4_Global.node_map[avtid],
+                                    &MPIDI_CH4_Global.max_node_id);
+}
+
+#endif /* CH4R_PROC_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4r_recv.h b/src/mpid/ch4/src/ch4r_recv.h
new file mode 100644
index 0000000..c77040c
--- /dev/null
+++ b/src/mpid/ch4/src/ch4r_recv.h
@@ -0,0 +1,429 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4R_RECV_H_INCLUDED
+#define CH4R_RECV_H_INCLUDED
+
+#include "ch4_impl.h"
+#include "ch4r_proc.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4I_prepare_recv_req
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Record the user's receive buffer description (address, element count and
+ * datatype) in the CH4U fields of `rreq`.  Always returns MPI_SUCCESS. */
+static inline int MPIDI_CH4I_prepare_recv_req(void *buf, int count, MPI_Datatype datatype,
+                                              MPIR_Request * rreq)
+{
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_PREPARE_RECV_BUFFER);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_PREPARE_RECV_BUFFER);
+
+    MPIDI_CH4U_REQUEST(rreq, buffer) = (char *) buf;
+    MPIDI_CH4U_REQUEST(rreq, count) = count;
+    MPIDI_CH4U_REQUEST(rreq, datatype) = datatype;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_PREPARE_RECV_BUFFER);
+    return MPI_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4I_handle_unexpected
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Deliver an already-buffered ("unexpected") message into the user's receive
+ * buffer and complete the request.
+ *
+ *   buf, count, datatype  - description of the user's receive buffer
+ *   comm, context_offset  - not referenced by this function
+ *   rreq                  - the unexpected request; on entry its CH4U `buffer`
+ *                           holds the staged data and its CH4U `count` is used
+ *                           here as the staged size in bytes
+ *
+ * Returns MPI_SUCCESS or an MPI error code.  A datatype mismatch detected
+ * while unpacking is stored in rreq->status.MPI_ERROR and also returned, but
+ * processing continues so the request is still completed.
+ */
+static inline int MPIDI_CH4I_handle_unexpected(void *buf,
+                                               int count,
+                                               MPI_Datatype datatype,
+                                               MPIR_Comm * comm,
+                                               int context_offset, MPIR_Request * rreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int dt_contig;
+    MPI_Aint dt_true_lb, last;
+    MPIR_Datatype *dt_ptr;
+    size_t in_data_sz, dt_sz, nbytes;
+    MPID_Segment *segment_ptr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_HANDLE_UNEXPECTED);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_HANDLE_UNEXPECTED);
+
+    in_data_sz = MPIDI_CH4U_REQUEST(rreq, count);
+    MPID_Datatype_get_size_macro(datatype, dt_sz);
+
+    /* Clip to the capacity of the user buffer; more staged data than fits is
+     * reported as truncation. */
+    if (in_data_sz > dt_sz * count) {
+        rreq->status.MPI_ERROR = MPI_ERR_TRUNCATE;
+        nbytes = dt_sz * count;
+    }
+    else {
+        rreq->status.MPI_ERROR = MPI_SUCCESS;
+        nbytes = in_data_sz;
+        /* Shrink count to the number of whole elements actually received
+         * (guarding against a zero-size datatype). */
+        count = dt_sz ? nbytes / dt_sz : 0;
+    }
+    MPIR_STATUS_SET_COUNT(rreq->status, nbytes);
+    MPIDI_CH4U_REQUEST(rreq, datatype) = datatype;
+    MPIDI_CH4U_REQUEST(rreq, count) = nbytes;
+
+    MPIDI_Datatype_get_info(count, datatype, dt_contig, dt_sz, dt_ptr, dt_true_lb);
+
+    if (!dt_contig) {
+        /* Non-contiguous layout: unpack the staged bytes through a segment. */
+        segment_ptr = MPID_Segment_alloc();
+        /* NOTE(review): this jump to fn_fail skips the MPL_free of the staging
+         * buffer and the request completion further down. */
+        MPIR_ERR_CHKANDJUMP1(segment_ptr == NULL, mpi_errno,
+                             MPI_ERR_OTHER, "**nomem", "**nomem %s", "Recv MPID_Segment_alloc");
+        MPID_Segment_init(buf, count, datatype, segment_ptr, 0);
+
+        last = nbytes;
+        MPID_Segment_unpack(segment_ptr, 0, &last, MPIDI_CH4U_REQUEST(rreq, buffer));
+        MPID_Segment_free(segment_ptr);
+        /* `last` is compared with the requested byte count; a difference is
+         * reported as a datatype mismatch. */
+        if (last != (MPI_Aint) (nbytes)) {
+            mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+                                             __FUNCTION__, __LINE__,
+                                             MPI_ERR_TYPE, "**dtypemismatch", 0);
+            rreq->status.MPI_ERROR = mpi_errno;
+        }
+    }
+    else {
+        /* Contiguous layout: a single copy, offset by the datatype's true
+         * lower bound. */
+        MPIR_Memcpy((char *) buf + dt_true_lb, MPIDI_CH4U_REQUEST(rreq, buffer), nbytes);
+    }
+
+    /* The message is no longer unexpected and its staging buffer is done. */
+    MPIDI_CH4U_REQUEST(rreq, req->status) &= ~MPIDI_CH4U_REQ_UNEXPECTED;
+    MPL_free(MPIDI_CH4U_REQUEST(rreq, buffer));
+
+    rreq->status.MPI_SOURCE = MPIDI_CH4U_REQUEST(rreq, src_rank);
+    rreq->status.MPI_TAG = MPIDI_CH4U_get_tag(MPIDI_CH4U_REQUEST(rreq, tag));
+
+    /* When the request is flagged PEER_SSEND, send the ssend reply before
+     * completing the request. */
+    if (MPIDI_CH4U_REQUEST(rreq, req->status) & MPIDI_CH4U_REQ_PEER_SSEND) {
+        mpi_errno = MPIDI_CH4U_reply_ssend(rreq);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+    MPIDI_CH4I_am_request_complete(rreq);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_HANDLE_UNEXPECTED);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4I_do_irecv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Common implementation behind the receive entry points in this file.
+ *
+ * The unexpected queue is searched first.  If a matching message is found:
+ *   - BUSY      : it is only flagged MATCHED; a new request is then created
+ *                 below and linked to it through `match_req` instead of being
+ *                 posted.
+ *   - LONG_RTS  : the user buffer is attached to the unexpected request and
+ *                 MPIDI_NM_am_recv() is called on it.
+ *   - otherwise : the data is copied out by MPIDI_CH4I_handle_unexpected().
+ * In the last two cases the unexpected request itself is returned in
+ * *request.
+ *
+ * If nothing matched, a new request is created, filled in and appended to
+ * the posted queue (or completed immediately for MPI_PROC_NULL).
+ *
+ * `alloc_req` must be non-zero: the zero case ends in MPIR_Assert(0).
+ * `flags` is not referenced.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ */
+static inline int MPIDI_CH4I_do_irecv(void *buf,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      int rank,
+                                      int tag,
+                                      MPIR_Comm * comm,
+                                      int context_offset,
+                                      MPIR_Request ** request, int alloc_req, uint64_t flags)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *rreq = NULL, *unexp_req = NULL;
+    uint64_t match_bits, mask_bits;
+    MPIR_Context_id_t context_id = comm->recvcontext_id + context_offset;
+    MPIR_Comm *root_comm;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_DO_IRECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_DO_IRECV);
+
+    match_bits = MPIDI_CH4U_init_recvtag(&mask_bits, context_id, rank, tag);
+    /* The queues are looked up via the base recvcontext_id, without the
+     * context offset. */
+    root_comm = MPIDI_CH4U_context_id_to_comm(comm->recvcontext_id);
+    unexp_req = MPIDI_CH4U_dequeue_unexp(match_bits, mask_bits,
+                                         &MPIDI_CH4U_COMM(root_comm, unexp_list));
+
+    if (unexp_req) {
+        MPIR_Comm_release(root_comm);   /* -1 for removing from unexp_list */
+        if (MPIDI_CH4U_REQUEST(unexp_req, req->status) & MPIDI_CH4U_REQ_BUSY) {
+            /* Only flag it here; control falls through to the request
+             * creation below. */
+            MPIDI_CH4U_REQUEST(unexp_req, req->status) |= MPIDI_CH4U_REQ_MATCHED;
+        }
+        else if (MPIDI_CH4U_REQUEST(unexp_req, req->status) & MPIDI_CH4U_REQ_LONG_RTS) {
+            /* Matching receive is now posted, tell the netmod */
+            dtype_add_ref_if_not_builtin(datatype);
+            MPIDI_CH4U_REQUEST(unexp_req, datatype) = datatype;
+            MPIDI_CH4U_REQUEST(unexp_req, buffer) = (char *) buf;
+            MPIDI_CH4U_REQUEST(unexp_req, count) = count;
+            *request = unexp_req;
+            mpi_errno = MPIDI_NM_am_recv(unexp_req);
+            if (mpi_errno)
+                MPIR_ERR_POP(mpi_errno);
+            goto fn_exit;
+        }
+        else {
+            /* Data is already staged: copy it out and complete right away.
+             * Note that context_id is passed in the context_offset position;
+             * the callee does not reference that argument. */
+            *request = unexp_req;
+            mpi_errno = MPIDI_CH4I_handle_unexpected(buf, count, datatype,
+                                                     root_comm, context_id, unexp_req);
+            if (mpi_errno)
+                MPIR_ERR_POP(mpi_errno);
+            goto fn_exit;
+        }
+    }
+
+    if (alloc_req) {
+        rreq = MPIDI_CH4I_am_request_create(MPIR_REQUEST_KIND__RECV);
+    }
+    else {
+        /* Caller-supplied requests always trip this assertion. */
+        rreq = *request;
+        MPIR_Assert(0);
+    }
+
+    *request = rreq;
+    if (unlikely(rank == MPI_PROC_NULL)) {
+        /* Receive from MPI_PROC_NULL: nothing will arrive, complete now. */
+        rreq->kind = MPIR_REQUEST_KIND__RECV;
+        rreq->status.MPI_ERROR = MPI_SUCCESS;
+        rreq->status.MPI_SOURCE = rank;
+        rreq->status.MPI_TAG = tag;
+        MPIDI_CH4I_am_request_complete(rreq);
+        goto fn_exit;
+    }
+
+    dtype_add_ref_if_not_builtin(datatype);
+    MPIDI_CH4U_REQUEST(rreq, tag) = match_bits;
+    MPIDI_CH4U_REQUEST(rreq, req->rreq.ignore) = mask_bits;
+    /* The datatype is stored here and again inside prepare_recv_req(). */
+    MPIDI_CH4U_REQUEST(rreq, datatype) = datatype;
+
+    mpi_errno = MPIDI_CH4I_prepare_recv_req(buf, count, datatype, rreq);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+
+    if (!unexp_req) {
+        /* MPIDI_CS_ENTER(); */
+        /* Increment refcnt for comm before posting rreq to posted_list,
+         * to make sure comm is alive while holding an entry in the posted_list */
+        MPIR_Comm_add_ref(root_comm);
+        MPIDI_CH4U_enqueue_posted(rreq, &MPIDI_CH4U_COMM(root_comm, posted_list));
+        /* MPIDI_CS_EXIT(); */
+    }
+    else {
+        /* Reached only via the BUSY case above: link the new request to the
+         * in-flight unexpected one instead of posting it. */
+        MPIDI_CH4U_REQUEST(unexp_req, req->rreq.match_req) = (uint64_t) rreq;
+    }
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_DO_IRECV);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_recv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Receive entry point: starts the receive through MPIDI_CH4I_do_irecv() with
+ * a freshly allocated request and returns that request in *request.  This
+ * function itself does not wait and does not touch `status`.
+ * Returns MPI_SUCCESS or an MPI error code. */
+__CH4_INLINE__ int MPIDI_CH4U_recv(void *buf,
+                                   int count,
+                                   MPI_Datatype datatype,
+                                   int rank,
+                                   int tag,
+                                   MPIR_Comm * comm,
+                                   int context_offset, MPI_Status * status, MPIR_Request ** request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_RECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_RECV);
+
+    /* alloc_req = 1 (let do_irecv create the request), flags = 0 */
+    mpi_errno = MPIDI_CH4I_do_irecv(buf, count, datatype, rank, tag, comm,
+                                    context_offset, request, 1, 0ULL);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_RECV);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_recv_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Create a persistent receive request.  The buffer description and an
+ * encoded (context id, rank, tag) value are stored in the request, a
+ * reference is taken on `comm` and on a non-builtin `datatype`, and the
+ * request is handed back in *request with no real request attached yet.
+ * Always returns MPI_SUCCESS. */
+__CH4_INLINE__ int MPIDI_CH4U_recv_init(void *buf,
+                                        int count,
+                                        MPI_Datatype datatype,
+                                        int rank,
+                                        int tag,
+                                        MPIR_Comm * comm,
+                                        int context_offset, MPIR_Request ** request)
+{
+    MPIR_Request *preq;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_RECV_INIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_RECV_INIT);
+
+    preq = MPIDI_CH4I_am_request_create(MPIR_REQUEST_KIND__PREQUEST_RECV);
+    *request = preq;
+
+    /* The request keeps the communicator alive. */
+    preq->comm = comm;
+    MPIR_Comm_add_ref(comm);
+
+    /* Remember everything needed to start the receive later. */
+    MPIDI_CH4U_REQUEST(preq, tag) =
+        MPIDI_CH4U_init_send_tag(comm->context_id + context_offset, rank, tag);
+    MPIDI_CH4U_REQUEST(preq, datatype) = datatype;
+    MPIDI_CH4U_REQUEST(preq, count) = count;
+    MPIDI_CH4U_REQUEST(preq, buffer) = (void *) buf;
+
+    preq->u.persist.real_request = NULL;
+    MPIDI_CH4I_am_request_complete(preq);
+    MPIDI_CH4U_REQUEST(preq, p_type) = MPIDI_PTYPE_RECV;
+    dtype_add_ref_if_not_builtin(datatype);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_RECV_INIT);
+    return MPI_SUCCESS;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_imrecv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Start a matched receive on `message`, a request of kind
+ * MPIR_REQUEST_KIND__MPROBE, and return the request to wait on in *rreqp.
+ *
+ *   message == NULL : a null receive request is created and returned.
+ *   BUSY            : the receive description is stored in the mrcv_* fields
+ *                     and the request is flagged UNEXP_CLAIMED; nothing else
+ *                     is done here.
+ *   LONG_RTS        : the request becomes a regular receive, the user buffer
+ *                     is attached and MPIDI_NM_am_recv() is called.
+ *   otherwise       : MPIDI_CH4U_unexp_mrecv_cmpl_handler() finishes the
+ *                     receive.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.  Errors from MPIDI_NM_am_recv()
+ * now go through MPIR_ERR_POP like every other failing call in this file.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_imrecv(void *buf,
+                                     int count,
+                                     MPI_Datatype datatype,
+                                     MPIR_Request * message, MPIR_Request ** rreqp)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *rreq;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_IMRECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_IMRECV);
+
+    if (message == NULL) {
+        MPIDI_Request_create_null_rreq(rreq, mpi_errno, fn_fail);
+        *rreqp = rreq;
+        goto fn_exit;
+    }
+
+    MPIR_Assert(message->kind == MPIR_REQUEST_KIND__MPROBE);
+    MPIDI_CH4U_REQUEST(message, req->rreq.mrcv_buffer) = buf;
+    MPIDI_CH4U_REQUEST(message, req->rreq.mrcv_count) = count;
+    MPIDI_CH4U_REQUEST(message, req->rreq.mrcv_datatype) = datatype;
+    *rreqp = message;
+
+    /* MPIDI_CS_ENTER(); */
+    if (MPIDI_CH4U_REQUEST(message, req->status) & MPIDI_CH4U_REQ_BUSY) {
+        MPIDI_CH4U_REQUEST(message, req->status) |= MPIDI_CH4U_REQ_UNEXP_CLAIMED;
+    }
+    else if (MPIDI_CH4U_REQUEST(message, req->status) & MPIDI_CH4U_REQ_LONG_RTS) {
+        /* Matching receive is now posted, tell the netmod */
+        message->kind = MPIR_REQUEST_KIND__RECV;
+        dtype_add_ref_if_not_builtin(datatype);
+        MPIDI_CH4U_REQUEST(message, datatype) = datatype;
+        MPIDI_CH4U_REQUEST(message, buffer) = (char *) buf;
+        MPIDI_CH4U_REQUEST(message, count) = count;
+        mpi_errno = MPIDI_NM_am_recv(message);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+    else {
+        mpi_errno = MPIDI_CH4U_unexp_mrecv_cmpl_handler(message);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+    }
+    /* MPIDI_CS_EXIT(); */
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_IMRECV);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_mrecv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Blocking matched receive: starts the receive with MPIDI_Imrecv(), drives
+ * the progress engine until the resulting request is complete, copies its
+ * status into `status` and then runs MPIR_Request_complete() on it.
+ * Returns MPI_SUCCESS or an MPI error code. */
+__CH4_INLINE__ int MPIDI_CH4U_mrecv(void *buf,
+                                    int count,
+                                    MPI_Datatype datatype,
+                                    MPIR_Request * message, MPI_Status * status)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int is_active;
+    MPI_Request scratch_handle;
+    MPIR_Request *msg_req = NULL;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_MRECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_MRECV);
+
+    mpi_errno = MPIDI_Imrecv(buf, count, datatype, message, &msg_req);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+    /* Poll the progress engine until the receive has finished. */
+    for (;;) {
+        if (MPIR_Request_is_complete(msg_req))
+            break;
+        MPIDI_Progress_test();
+    }
+
+    MPIR_Request_extract_status(msg_req, status);
+
+    mpi_errno = MPIR_Request_complete(&scratch_handle, msg_req, status, &is_active);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_MRECV);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_irecv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Nonblocking receive entry point: forwards to MPIDI_CH4I_do_irecv() asking
+ * it to allocate the request, which is returned in *request.
+ * Returns MPI_SUCCESS or an MPI error code. */
+__CH4_INLINE__ int MPIDI_CH4U_irecv(void *buf,
+                                    int count,
+                                    MPI_Datatype datatype,
+                                    int rank,
+                                    int tag,
+                                    MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_IRECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_IRECV);
+
+    /* alloc_req = 1 (let do_irecv create the request), flags = 0 */
+    mpi_errno = MPIDI_CH4I_do_irecv(buf, count, datatype, rank, tag, comm,
+                                    context_offset, request, 1, 0ULL);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_IRECV);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_cancel_recv
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Try to cancel a posted receive.  The request is looked up on the posted
+ * list of the communicator derived from its stored tag; if it is still there
+ * it is removed, marked cancelled with a zero count, and completed.
+ * Otherwise only the cancel bit is cleared.  Always returns MPI_SUCCESS. */
+__CH4_INLINE__ int MPIDI_CH4U_cancel_recv(MPIR_Request * rreq)
+{
+    int was_posted;
+    uint64_t stored_tag;
+    MPIR_Comm *owner_comm;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_CANCEL_RECV);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_CANCEL_RECV);
+
+    stored_tag = MPIDI_CH4U_REQUEST(rreq, tag);
+    owner_comm = MPIDI_CH4U_context_id_to_comm(MPIDI_CH4U_get_context(stored_tag));
+
+    /* MPIDI_CS_ENTER(); */
+    was_posted = MPIDI_CH4U_delete_posted(&rreq->dev.ch4.ch4u.req->rreq,
+                                          &MPIDI_CH4U_COMM(owner_comm, posted_list));
+    /* MPIDI_CS_EXIT(); */
+
+    if (!was_posted) {
+        MPIR_STATUS_SET_CANCEL_BIT(rreq->status, FALSE);
+    }
+    else {
+        MPIR_STATUS_SET_CANCEL_BIT(rreq->status, TRUE);
+        MPIR_STATUS_SET_COUNT(rreq->status, 0);
+        MPIR_Comm_release(owner_comm);  /* -1 for posted_list */
+        MPIDI_CH4I_am_request_complete(rreq);
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_CANCEL_RECV);
+    return MPI_SUCCESS;
+}
+
+#endif /* CH4R_RECV_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4r_recvq.h b/src/mpid/ch4/src/ch4r_recvq.h
new file mode 100644
index 0000000..ff98e36
--- /dev/null
+++ b/src/mpid/ch4/src/ch4r_recvq.h
@@ -0,0 +1,336 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4R_RECVQ_H_INCLUDED
+#define CH4R_RECVQ_H_INCLUDED
+
+#include <mpidimpl.h>
+#include "mpl_utlist.h"
+#include "ch4_impl.h"
+
+#ifdef MPIDI_CH4U_USE_PER_COMM_QUEUE
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_enqueue_posted
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Per-communicator queue: append `req` to the tail of the posted list *list.
+ * The list node stores a back-pointer to the owning request. */
+__CH4_INLINE__ void MPIDI_CH4U_enqueue_posted(MPIR_Request * req, MPIDI_CH4U_rreq_t ** list)
+{
+    MPIDI_CH4U_rreq_t *node = &req->dev.ch4.ch4u.req->rreq;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_ENQUEUE_POSTED);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_ENQUEUE_POSTED);
+
+    MPIDI_CH4U_REQUEST(req, req->rreq.request) = (uint64_t) req;
+    MPL_DL_APPEND(*list, node);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_ENQUEUE_POSTED);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_enqueue_unexp
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Per-communicator queue: append `req` to the tail of the unexpected list
+ * *list.  The list node stores a back-pointer to the owning request. */
+__CH4_INLINE__ void MPIDI_CH4U_enqueue_unexp(MPIR_Request * req, MPIDI_CH4U_rreq_t ** list)
+{
+    MPIDI_CH4U_rreq_t *node = &req->dev.ch4.ch4u.req->rreq;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_ENQUEUE_UNEXP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_ENQUEUE_UNEXP);
+
+    MPIDI_CH4U_REQUEST(req, req->rreq.request) = (uint64_t) req;
+    MPL_DL_APPEND(*list, node);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_ENQUEUE_UNEXP);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_delete_unexp
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Per-communicator queue: unlink `req`'s node from the unexpected list
+ * *list.  No membership check is performed here. */
+__CH4_INLINE__ void MPIDI_CH4U_delete_unexp(MPIR_Request * req, MPIDI_CH4U_rreq_t ** list)
+{
+    MPIDI_CH4U_rreq_t *node = &req->dev.ch4.ch4u.req->rreq;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_DELETE_UNEXP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_DELETE_UNEXP);
+
+    MPL_DL_DELETE(*list, node);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_DELETE_UNEXP);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_dequeue_unexp_strict
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Per-communicator queue: remove and return the first request on *list that
+ * is not flagged MPIDI_CH4U_REQ_BUSY and whose tag equals `tag` on every bit
+ * not set in `ignore`.  Returns NULL when no entry qualifies. */
+__CH4_INLINE__ MPIR_Request *MPIDI_CH4U_dequeue_unexp_strict(uint64_t tag, uint64_t ignore,
+                                                             MPIDI_CH4U_rreq_t ** list)
+{
+    const uint64_t wanted = tag & ~ignore;
+    MPIDI_CH4U_rreq_t *node, *next;
+    MPIR_Request *match = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_DEQUEUE_UNEXP_STRICT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_DEQUEUE_UNEXP_STRICT);
+
+    MPL_DL_FOREACH_SAFE(*list, node, next) {
+        MPIR_Request *cand = (MPIR_Request *) node->request;
+
+        if (MPIDI_CH4U_REQUEST(cand, req->status) & MPIDI_CH4U_REQ_BUSY)
+            continue;
+        if ((MPIDI_CH4U_REQUEST(cand, tag) & ~ignore) != wanted)
+            continue;
+
+        MPL_DL_DELETE(*list, node);
+        match = cand;
+        break;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_DEQUEUE_UNEXP_STRICT);
+    return match;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_dequeue_unexp
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Per-communicator queue: remove and return the first request on *list
+ * whose tag equals `tag` on every bit not set in `ignore` (busy entries are
+ * eligible too).  Returns NULL when nothing matches. */
+__CH4_INLINE__ MPIR_Request *MPIDI_CH4U_dequeue_unexp(uint64_t tag, uint64_t ignore,
+                                                      MPIDI_CH4U_rreq_t ** list)
+{
+    const uint64_t wanted = tag & ~ignore;
+    MPIDI_CH4U_rreq_t *node, *next;
+    MPIR_Request *match = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_DEQUEUE_UNEXP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_DEQUEUE_UNEXP);
+
+    MPL_DL_FOREACH_SAFE(*list, node, next) {
+        MPIR_Request *cand = (MPIR_Request *) node->request;
+
+        if ((MPIDI_CH4U_REQUEST(cand, tag) & ~ignore) != wanted)
+            continue;
+
+        MPL_DL_DELETE(*list, node);
+        match = cand;
+        break;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_DEQUEUE_UNEXP);
+    return match;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_find_unexp
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Per-communicator queue: return, without unlinking it, the first request on
+ * *list whose tag equals `tag` on every bit not set in `ignore`.
+ * Returns NULL when nothing matches. */
+__CH4_INLINE__ MPIR_Request *MPIDI_CH4U_find_unexp(uint64_t tag, uint64_t ignore,
+                                                   MPIDI_CH4U_rreq_t ** list)
+{
+    const uint64_t wanted = tag & ~ignore;
+    MPIDI_CH4U_rreq_t *node, *next;
+    MPIR_Request *match = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_FIND_UNEXP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_FIND_UNEXP);
+
+    MPL_DL_FOREACH_SAFE(*list, node, next) {
+        MPIR_Request *cand = (MPIR_Request *) node->request;
+
+        if ((MPIDI_CH4U_REQUEST(cand, tag) & ~ignore) == wanted) {
+            match = cand;
+            break;
+        }
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_FIND_UNEXP);
+    return match;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_dequeue_posted
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Per-communicator queue: remove and return the first posted request on
+ * *list that accepts `tag`.  Each posted request carries its own ignore
+ * mask; bits set in that mask are excluded from the comparison.
+ * Returns NULL when no posted request matches. */
+__CH4_INLINE__ MPIR_Request *MPIDI_CH4U_dequeue_posted(uint64_t tag, MPIDI_CH4U_rreq_t ** list)
+{
+    MPIDI_CH4U_rreq_t *node, *next;
+    MPIR_Request *match = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_DEQUEUE_POSTED);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_DEQUEUE_POSTED);
+
+    MPL_DL_FOREACH_SAFE(*list, node, next) {
+        MPIR_Request *cand = (MPIR_Request *) node->request;
+        uint64_t significant = ~(MPIDI_CH4U_REQUEST(cand, req->rreq.ignore));
+
+        if ((tag & significant) != (MPIDI_CH4U_REQUEST(cand, tag) & significant))
+            continue;
+
+        MPL_DL_DELETE(*list, node);
+        match = cand;
+        break;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_DEQUEUE_POSTED);
+    return match;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_delete_posted
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Per-communicator queue: unlink the node `req` from the posted list *list
+ * if it is currently on it.  Returns 1 when the node was found and removed,
+ * 0 otherwise. */
+__CH4_INLINE__ int MPIDI_CH4U_delete_posted(MPIDI_CH4U_rreq_t * req, MPIDI_CH4U_rreq_t ** list)
+{
+    int removed = 0;
+    MPIDI_CH4U_rreq_t *node, *next;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_DELETE_POSTED);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_DELETE_POSTED);
+
+    MPL_DL_FOREACH_SAFE(*list, node, next) {
+        if (node != req)
+            continue;
+
+        MPL_DL_DELETE(*list, node);
+        removed = 1;
+        break;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_DELETE_POSTED);
+    return removed;
+}
+
+#else /* #ifdef MPIDI_CH4U_USE_PER_COMM_QUEUE */
+
+/* Use global queue */
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_enqueue_posted
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Global queue: append `req` to the tail of MPIDI_CH4_Global.posted_list.
+ * The `list` argument is not used in this variant.  The list node stores a
+ * back-pointer to the owning request. */
+__CH4_INLINE__ void MPIDI_CH4U_enqueue_posted(MPIR_Request * req, MPIDI_CH4U_rreq_t ** list)
+{
+    MPIDI_CH4U_rreq_t *node = &req->dev.ch4.ch4u.req->rreq;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_ENQUEUE_POSTED);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_ENQUEUE_POSTED);
+
+    MPIDI_CH4U_REQUEST(req, req->rreq.request) = (uint64_t) req;
+    MPL_DL_APPEND(MPIDI_CH4_Global.posted_list, node);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_ENQUEUE_POSTED);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_enqueue_unexp
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Global queue: append `req` to the tail of MPIDI_CH4_Global.unexp_list.
+ * The `list` argument is not used in this variant.  The list node stores a
+ * back-pointer to the owning request. */
+__CH4_INLINE__ void MPIDI_CH4U_enqueue_unexp(MPIR_Request * req, MPIDI_CH4U_rreq_t ** list)
+{
+    MPIDI_CH4U_rreq_t *node = &req->dev.ch4.ch4u.req->rreq;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_ENQUEUE_UNEXP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_ENQUEUE_UNEXP);
+
+    MPIDI_CH4U_REQUEST(req, req->rreq.request) = (uint64_t) req;
+    MPL_DL_APPEND(MPIDI_CH4_Global.unexp_list, node);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_ENQUEUE_UNEXP);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_delete_unexp
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Global queue: unlink `req`'s node from MPIDI_CH4_Global.unexp_list.
+ * The `list` argument is not used in this variant, and no membership check
+ * is performed here. */
+__CH4_INLINE__ void MPIDI_CH4U_delete_unexp(MPIR_Request * req, MPIDI_CH4U_rreq_t ** list)
+{
+    MPIDI_CH4U_rreq_t *node = &req->dev.ch4.ch4u.req->rreq;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_DELETE_UNEXP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_DELETE_UNEXP);
+
+    MPL_DL_DELETE(MPIDI_CH4_Global.unexp_list, node);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_DELETE_UNEXP);
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_dequeue_unexp_strict
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Global queue: remove and return the first request on
+ * MPIDI_CH4_Global.unexp_list that is not flagged MPIDI_CH4U_REQ_BUSY and
+ * whose tag equals `tag` on every bit not set in `ignore`.  The `list`
+ * argument is not used in this variant.  Returns NULL when no entry
+ * qualifies. */
+__CH4_INLINE__ MPIR_Request *MPIDI_CH4U_dequeue_unexp_strict(uint64_t tag, uint64_t ignore,
+                                                             MPIDI_CH4U_rreq_t ** list)
+{
+    const uint64_t wanted = tag & ~ignore;
+    MPIDI_CH4U_rreq_t *node, *next;
+    MPIR_Request *match = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_DEQUEUE_UNEXP_STRICT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_DEQUEUE_UNEXP_STRICT);
+
+    MPL_DL_FOREACH_SAFE(MPIDI_CH4_Global.unexp_list, node, next) {
+        MPIR_Request *cand = (MPIR_Request *) node->request;
+
+        if (MPIDI_CH4U_REQUEST(cand, req->status) & MPIDI_CH4U_REQ_BUSY)
+            continue;
+        if ((MPIDI_CH4U_REQUEST(cand, tag) & ~ignore) != wanted)
+            continue;
+
+        MPL_DL_DELETE(MPIDI_CH4_Global.unexp_list, node);
+        match = cand;
+        break;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_DEQUEUE_UNEXP_STRICT);
+    return match;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_dequeue_unexp
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Global queue: remove and return the first request on
+ * MPIDI_CH4_Global.unexp_list whose tag equals `tag` on every bit not set in
+ * `ignore` (busy entries are eligible too).  The `list` argument is not used
+ * in this variant.  Returns NULL when nothing matches. */
+__CH4_INLINE__ MPIR_Request *MPIDI_CH4U_dequeue_unexp(uint64_t tag, uint64_t ignore,
+                                                      MPIDI_CH4U_rreq_t ** list)
+{
+    const uint64_t wanted = tag & ~ignore;
+    MPIDI_CH4U_rreq_t *node, *next;
+    MPIR_Request *match = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_DEQUEUE_UNEXP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_DEQUEUE_UNEXP);
+
+    MPL_DL_FOREACH_SAFE(MPIDI_CH4_Global.unexp_list, node, next) {
+        MPIR_Request *cand = (MPIR_Request *) node->request;
+
+        if ((MPIDI_CH4U_REQUEST(cand, tag) & ~ignore) != wanted)
+            continue;
+
+        MPL_DL_DELETE(MPIDI_CH4_Global.unexp_list, node);
+        match = cand;
+        break;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_DEQUEUE_UNEXP);
+    return match;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_find_unexp
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Global queue: return, without unlinking it, the first request on
+ * MPIDI_CH4_Global.unexp_list whose tag equals `tag` on every bit not set in
+ * `ignore`.  The `list` argument is not used in this variant.
+ * Returns NULL when nothing matches. */
+__CH4_INLINE__ MPIR_Request *MPIDI_CH4U_find_unexp(uint64_t tag, uint64_t ignore,
+                                                   MPIDI_CH4U_rreq_t ** list)
+{
+    const uint64_t wanted = tag & ~ignore;
+    MPIDI_CH4U_rreq_t *node, *next;
+    MPIR_Request *match = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_FIND_UNEXP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_FIND_UNEXP);
+
+    MPL_DL_FOREACH_SAFE(MPIDI_CH4_Global.unexp_list, node, next) {
+        MPIR_Request *cand = (MPIR_Request *) node->request;
+
+        if ((MPIDI_CH4U_REQUEST(cand, tag) & ~ignore) == wanted) {
+            match = cand;
+            break;
+        }
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_FIND_UNEXP);
+    return match;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_dequeue_posted
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Global queue: remove and return the first posted request on
+ * MPIDI_CH4_Global.posted_list that accepts `tag`.  Each posted request
+ * carries its own ignore mask; bits set in that mask are excluded from the
+ * comparison.  The `list` argument is not used in this variant.
+ * Returns NULL when no posted request matches. */
+__CH4_INLINE__ MPIR_Request *MPIDI_CH4U_dequeue_posted(uint64_t tag, MPIDI_CH4U_rreq_t ** list)
+{
+    MPIDI_CH4U_rreq_t *node, *next;
+    MPIR_Request *match = NULL;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_DEQUEUE_POSTED);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_DEQUEUE_POSTED);
+
+    MPL_DL_FOREACH_SAFE(MPIDI_CH4_Global.posted_list, node, next) {
+        MPIR_Request *cand = (MPIR_Request *) node->request;
+        uint64_t significant = ~MPIDI_CH4U_REQUEST(cand, req->rreq.ignore);
+
+        if ((tag & significant) != (MPIDI_CH4U_REQUEST(cand, tag) & significant))
+            continue;
+
+        MPL_DL_DELETE(MPIDI_CH4_Global.posted_list, node);
+        match = cand;
+        break;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_DEQUEUE_POSTED);
+    return match;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_delete_posted
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Global queue: unlink the node `req` from MPIDI_CH4_Global.posted_list if
+ * it is currently on it.  The `list` argument is not used in this variant.
+ * Returns 1 when the node was found and removed, 0 otherwise. */
+__CH4_INLINE__ int MPIDI_CH4U_delete_posted(MPIDI_CH4U_rreq_t * req, MPIDI_CH4U_rreq_t ** list)
+{
+    int removed = 0;
+    MPIDI_CH4U_rreq_t *node, *next;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_DELETE_POSTED);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_DELETE_POSTED);
+
+    MPL_DL_FOREACH_SAFE(MPIDI_CH4_Global.posted_list, node, next) {
+        if (node != req)
+            continue;
+
+        MPL_DL_DELETE(MPIDI_CH4_Global.posted_list, node);
+        removed = 1;
+        break;
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_DELETE_POSTED);
+    return removed;
+}
+
+#endif /* MPIDI_CH4U_USE_PER_COMM_QUEUE */
+
+#endif /* CH4R_RECVQ_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4r_request.h b/src/mpid/ch4/src/ch4r_request.h
new file mode 100644
index 0000000..68399c5
--- /dev/null
+++ b/src/mpid/ch4/src/ch4r_request.h
@@ -0,0 +1,126 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4R_REQUEST_H_INCLUDED
+#define CH4R_REQUEST_H_INCLUDED
+
+#include "ch4_types.h"
+#include "ch4r_buf.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4I_am_request_create
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Create a request of the given kind for the active-message path: the
+ * netmod part is initialised, one extra reference is taken, and a
+ * MPIDI_CH4U_req_ext_t drawn from the global buffer pool is attached with
+ * its status word cleared.  Returns the new request. */
+static inline MPIR_Request *MPIDI_CH4I_am_request_create(MPIR_Request_kind_t kind)
+{
+    MPIR_Request *new_req;
+    MPIDI_CH4U_req_ext_t *ext;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_REQUEST_CREATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_REQUEST_CREATE);
+
+    new_req = MPIR_Request_create(kind);
+    MPIDI_NM_am_request_init(new_req);
+    MPIR_Request_add_ref(new_req);
+
+    /* The extension struct must fit into one pool buffer. */
+    CH4_COMPILE_TIME_ASSERT(sizeof(MPIDI_CH4U_req_ext_t) <= MPIDI_CH4I_BUF_POOL_SZ);
+    ext = (MPIDI_CH4U_req_ext_t *) MPIDI_CH4R_get_buf(MPIDI_CH4_Global.buf_pool);
+    MPIDI_CH4U_REQUEST(new_req, req) = ext;
+    MPIR_Assert(MPIDI_CH4U_REQUEST(new_req, req));
+    MPIDI_CH4U_REQUEST(new_req, req->status) = 0;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_REQUEST_CREATE);
+
+    return new_req;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4I_am_win_request_create
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Create a request of kind MPIR_REQUEST_KIND__UNDEFINED for window
+ * operations.  Unlike MPIDI_CH4I_am_request_create() no extra reference is
+ * taken here.  The netmod part is initialised and a MPIDI_CH4U_req_ext_t
+ * from the global buffer pool is attached with its status word cleared.
+ * Returns the new request. */
+static inline MPIR_Request *MPIDI_CH4I_am_win_request_create()
+{
+    MPIR_Request *new_req;
+    MPIDI_CH4U_req_ext_t *ext;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_REQUEST_CREATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_REQUEST_CREATE);
+
+    new_req = MPIR_Request_create(MPIR_REQUEST_KIND__UNDEFINED);
+    MPIDI_NM_am_request_init(new_req);
+
+    /* The extension struct must fit into one pool buffer. */
+    CH4_COMPILE_TIME_ASSERT(sizeof(MPIDI_CH4U_req_ext_t) <= MPIDI_CH4I_BUF_POOL_SZ);
+    ext = (MPIDI_CH4U_req_ext_t *) MPIDI_CH4R_get_buf(MPIDI_CH4_Global.buf_pool);
+    MPIDI_CH4U_REQUEST(new_req, req) = ext;
+    MPIR_Assert(MPIDI_CH4U_REQUEST(new_req, req));
+    MPIDI_CH4U_REQUEST(new_req, req->status) = 0;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_REQUEST_CREATE);
+    return new_req;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4I_am_request_complete
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Decrement the completion counter that req->cc_ptr points at.  Once it
+ * reaches zero: the extension buffer goes back to the pool (only when the
+ * request's own `cc` is complete as well), the netmod part is finalised and
+ * one reference on the request is dropped. */
+__CH4_INLINE__ void MPIDI_CH4I_am_request_complete(MPIR_Request * req)
+{
+    int still_pending;
+
+    MPIR_cc_decr(req->cc_ptr, &still_pending);
+    if (still_pending)
+        return;
+
+    if (MPIDI_CH4U_REQUEST(req, req) && MPIR_cc_is_complete(&req->cc)) {
+        MPIDI_CH4R_release_buf(MPIDI_CH4U_REQUEST(req, req));
+        MPIDI_CH4U_REQUEST(req, req) = NULL;
+    }
+
+    MPIDI_NM_am_request_finalize(req);
+    MPIDI_CH4U_request_release(req);
+}
+
+/* This function should be called any time an anysource request is matched so
+ * the upper layer will have a chance to arbitrate who wins the race between
+ * the netmod and the shmod. This will cancel the request of the other side and
+ * take care of copying any relevant data. */
+/*
+ *   rreq               - the request that was just matched
+ *   caller             - MPIDI_CH4R_NETMOD or MPIDI_CH4R_SHM (asserted)
+ *   continue_matching  - out: set to 0 when the netmod is the caller; when
+ *                        shared memory is the caller, set to 1 unless the
+ *                        cancel bit is set in rreq->status after
+ *                        MPIDI_NM_cancel_recv()
+ *
+ * Returns the value produced by the cancel call that was made, or
+ * MPI_SUCCESS when none was made (netmod caller in a build without
+ * MPIDI_BUILD_CH4_SHM).
+ *
+ * NOTE(review): if `caller` is neither value and assertions are compiled
+ * out, *continue_matching is left unwritten.
+ */
+static inline int MPIDI_CH4R_anysource_matched(MPIR_Request * rreq, int caller,
+                                               int *continue_matching)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPIDI_CH4R_ANYSOURCE_MATCHED);
+    MPIR_FUNC_VERBOSE_ENTER(MPIDI_CH4R_ANYSOURCE_MATCHED);
+
+    MPIR_Assert(MPIDI_CH4R_NETMOD == caller || MPIDI_CH4R_SHM == caller);
+
+    if (MPIDI_CH4R_NETMOD == caller) {
+#ifdef MPIDI_BUILD_CH4_SHM
+        /* The result is returned as is; no MPIR_ERR_POP is applied here. */
+        mpi_errno = MPIDI_SHM_cancel_recv(rreq);
+
+        /* If the netmod is cancelling the request, then shared memory will
+         * just copy the status from the shared memory side because the netmod
+         * will always win the race condition here. */
+        if (MPIR_STATUS_GET_CANCEL_BIT(rreq->status)) {
+            /* If the request is cancelled, copy the status object from the
+             * partner request */
+            rreq->status = MPIDI_CH4I_REQUEST_ANYSOURCE_PARTNER(rreq)->status;
+        }
+#endif
+        *continue_matching = 0;
+    }
+    else if (MPIDI_CH4R_SHM == caller) {
+        mpi_errno = MPIDI_NM_cancel_recv(rreq);
+
+        /* If the netmod has already matched this request, shared memory will
+         * lose and should stop matching this request */
+        *continue_matching = !MPIR_STATUS_GET_CANCEL_BIT(rreq->status);
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPIDI_CH4R_ANYSOURCE_MATCHED);
+    return mpi_errno;
+}
+
+#endif /* CH4R_REQUEST_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4r_rma.h b/src/mpid/ch4/src/ch4r_rma.h
new file mode 100644
index 0000000..b0411db
--- /dev/null
+++ b/src/mpid/ch4/src/ch4r_rma.h
@@ -0,0 +1,761 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4R_RMA_H_INCLUDED
+#define CH4R_RMA_H_INCLUDED
+
+#include "ch4_impl.h"
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4I_do_put
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4I_do_put - shared implementation behind MPIDI_CH4U_put and
+ * MPIDI_CH4U_rput.
+ *
+ * origin_addr/origin_count/origin_datatype - local source buffer description
+ * target_rank     - destination rank in win->comm_ptr (MPI_PROC_NULL allowed)
+ * target_disp     - displacement in the target window, scaled by
+ *                   win->disp_unit for the local-copy path
+ * target_count/target_datatype - layout of the data at the target
+ * win             - window the operation is issued on
+ * request         - optional; when non-NULL it receives the internal request
+ *                   with one extra reference added for the caller
+ *
+ * Three cases are handled:
+ *   1. zero bytes or MPI_PROC_NULL: the request is completed immediately;
+ *   2. target is the calling rank: the data is moved with MPIR_Localcopy;
+ *   3. otherwise an active message is sent, carrying the flattened target
+ *      datatype (an iovec array) when the target datatype is not builtin.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.  Every path leaves through
+ * fn_exit so that MPIR_FUNC_VERBOSE_ENTER is always paired with
+ * MPIR_FUNC_VERBOSE_EXIT.
+ */
+static inline int MPIDI_CH4I_do_put(const void *origin_addr,
+                                    int origin_count,
+                                    MPI_Datatype origin_datatype,
+                                    int target_rank,
+                                    MPI_Aint target_disp,
+                                    int target_count,
+                                    MPI_Datatype target_datatype,
+                                    MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS, n_iov, c;
+    MPIR_Request *sreq = NULL;
+    MPIDI_CH4U_put_msg_t am_hdr;
+    uint64_t offset;
+    size_t data_sz;
+    MPI_Aint last, num_iov;
+    MPID_Segment *segment_ptr;
+    struct iovec *dt_iov, am_iov[2];
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_DO_PUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_DO_PUT);
+
+    MPIDI_CH4U_EPOCH_CHECK_SYNC(win, mpi_errno, goto fn_fail);
+
+    sreq = MPIDI_CH4I_am_win_request_create();
+    MPIR_Assert(sreq);
+    sreq->kind = MPIR_REQUEST_KIND__RMA;
+    if (request) {
+        /* The caller keeps its own reference to the request. */
+        *request = sreq;
+        MPIDI_Request_add_ref(sreq);
+    }
+
+    MPIDI_CH4U_REQUEST(sreq, req->preq.win_ptr) = win;
+    MPIDI_Datatype_check_size(origin_datatype, origin_count, data_sz);
+    if (data_sz == 0 || target_rank == MPI_PROC_NULL) {
+        /* Nothing to transfer. */
+        MPIDI_CH4I_am_request_complete(sreq);
+        goto fn_exit;
+    }
+
+    if (target_rank == win->comm_ptr->rank) {
+        /* Self-targeted put: copy directly into the local window.  The
+         * result is routed through fn_exit instead of being returned from
+         * here, so the function-exit marker is not skipped. */
+        offset = win->disp_unit * target_disp;
+        MPIDI_CH4I_am_request_complete(sreq);
+        mpi_errno = MPIR_Localcopy(origin_addr,
+                                   origin_count,
+                                   origin_datatype,
+                                   (char *) win->base + offset, target_count, target_datatype);
+        goto fn_exit;
+    }
+
+    MPIDI_CH4U_EPOCH_START_CHECK(win, mpi_errno, goto fn_fail);
+    MPIR_cc_incr(sreq->cc_ptr, &c);
+    am_hdr.src_rank = win->comm_ptr->rank;
+    am_hdr.target_disp = target_disp;
+    am_hdr.count = target_count;
+    am_hdr.datatype = target_datatype;
+    am_hdr.preq_ptr = (uint64_t) sreq;
+    am_hdr.win_id = MPIDI_CH4U_WIN(win, win_id);
+
+    /* MPIDI_CS_ENTER(); */
+    OPA_incr_int(&MPIDI_CH4U_WIN(win, outstanding_ops));
+    /* MPIDI_CS_EXIT(); */
+
+    if (HANDLE_GET_KIND(target_datatype) == HANDLE_KIND_BUILTIN) {
+        /* Builtin target type: no layout description has to be shipped. */
+        am_hdr.n_iov = 0;
+        MPIDI_CH4U_REQUEST(sreq, req->preq.dt_iov) = NULL;
+
+        mpi_errno = MPIDI_NM_send_am(target_rank, win->comm_ptr, MPIDI_CH4U_PUT_REQ,
+                                     &am_hdr, sizeof(am_hdr), origin_addr,
+                                     origin_count, origin_datatype, sreq, NULL);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+        goto fn_exit;
+    }
+
+    /* Non-builtin target type: flatten it into an iovec array built against
+     * a NULL base address. */
+    segment_ptr = MPIDU_Segment_alloc();
+    MPIR_Assert(segment_ptr);
+
+    MPIDU_Segment_init(NULL, target_count, target_datatype, segment_ptr, 0);
+    last = data_sz;
+    MPIDU_Segment_count_contig_blocks(segment_ptr, 0, &last, &num_iov);
+    n_iov = (int) num_iov;
+    MPIR_Assert(n_iov > 0);
+    am_hdr.n_iov = n_iov;
+    dt_iov = (struct iovec *) MPL_malloc(n_iov * sizeof(struct iovec));
+    MPIR_Assert(dt_iov);
+
+    last = data_sz;
+    MPIDU_Segment_pack_vector(segment_ptr, 0, &last, dt_iov, &n_iov);
+    MPIR_Assert(last == (MPI_Aint) data_sz);
+    MPL_free(segment_ptr);
+
+    am_iov[0].iov_base = &am_hdr;
+    am_iov[0].iov_len = sizeof(am_hdr);
+    am_iov[1].iov_base = dt_iov;
+    am_iov[1].iov_len = sizeof(struct iovec) * am_hdr.n_iov;
+
+    /* Recorded on the request; this function does not free dt_iov itself. */
+    MPIDI_CH4U_REQUEST(sreq, req->preq.dt_iov) = dt_iov;
+
+    if ((am_iov[0].iov_len + am_iov[1].iov_len) <= MPIDI_NM_am_hdr_max_sz()) {
+        /* Header plus layout fit in one AM header: send them together with
+         * the payload. */
+        mpi_errno = MPIDI_NM_send_amv(target_rank, win->comm_ptr, MPIDI_CH4U_PUT_REQ,
+                                      &am_iov[0], 2, origin_addr, origin_count, origin_datatype,
+                                      sreq, NULL);
+    }
+    else {
+        /* Layout is too large for the header: send the iovec array as the
+         * payload of a PUT_IOV_REQ and stash the origin buffer description
+         * on the request (holding a datatype reference) for later use. */
+        MPIDI_CH4U_REQUEST(sreq, req->preq.origin_addr) = (void *) origin_addr;
+        MPIDI_CH4U_REQUEST(sreq, req->preq.origin_count) = origin_count;
+        MPIDI_CH4U_REQUEST(sreq, req->preq.origin_datatype) = origin_datatype;
+        MPIDI_CH4U_REQUEST(sreq, src_rank) = target_rank;
+        dtype_add_ref_if_not_builtin(origin_datatype);
+
+        mpi_errno = MPIDI_NM_send_am(target_rank, win->comm_ptr, MPIDI_CH4U_PUT_IOV_REQ,
+                                     &am_hdr, sizeof(am_hdr), am_iov[1].iov_base,
+                                     am_iov[1].iov_len, MPI_BYTE, sreq, NULL);
+    }
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_DO_PUT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4I_do_get
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4I_do_get - shared implementation behind MPIDI_CH4U_get and
+ * MPIDI_CH4U_rget.
+ *
+ * origin_addr/origin_count/origin_datatype - local destination buffer
+ * target_rank     - rank to read from in win->comm_ptr (MPI_PROC_NULL allowed)
+ * target_disp     - displacement in the target window, scaled by
+ *                   win->disp_unit for the local-copy path
+ * target_count/target_datatype - layout of the data at the target
+ * win             - window the operation is issued on
+ * request         - optional; when non-NULL it receives the internal request
+ *                   with one extra reference added for the caller
+ *
+ * Zero-byte and MPI_PROC_NULL gets complete immediately, a self-targeted get
+ * is served by MPIR_Localcopy, and everything else is sent as a GET_REQ
+ * active message (with the flattened target datatype as payload when that
+ * datatype is not builtin).
+ *
+ * Returns MPI_SUCCESS or an MPI error code.  Every path leaves through
+ * fn_exit so that MPIR_FUNC_VERBOSE_ENTER is always paired with
+ * MPIR_FUNC_VERBOSE_EXIT.
+ */
+static inline int MPIDI_CH4I_do_get(void *origin_addr,
+                                    int origin_count,
+                                    MPI_Datatype origin_datatype,
+                                    int target_rank,
+                                    MPI_Aint target_disp,
+                                    int target_count,
+                                    MPI_Datatype target_datatype,
+                                    MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS, n_iov, c;
+    size_t offset;
+    MPIR_Request *sreq = NULL;
+    MPIDI_CH4U_get_req_msg_t am_hdr;
+    size_t data_sz;
+    MPI_Aint last, num_iov;
+    MPID_Segment *segment_ptr;
+    struct iovec *dt_iov;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NETMOD_MPIDI_CH4I_DO_GET);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NETMOD_MPIDI_CH4I_DO_GET);
+
+    MPIDI_CH4U_EPOCH_CHECK_SYNC(win, mpi_errno, goto fn_fail);
+
+    sreq = MPIDI_CH4I_am_win_request_create();
+    MPIR_Assert(sreq);
+    sreq->kind = MPIR_REQUEST_KIND__RMA;
+    if (request) {
+        /* The caller keeps its own reference to the request. */
+        *request = sreq;
+        MPIDI_Request_add_ref(sreq);
+    }
+
+    MPIDI_Datatype_check_size(origin_datatype, origin_count, data_sz);
+    if (data_sz == 0 || target_rank == MPI_PROC_NULL) {
+        /* Nothing to transfer. */
+        MPIDI_CH4I_am_request_complete(sreq);
+        goto fn_exit;
+    }
+
+    /* Remember where the fetched data has to be delivered. */
+    MPIDI_CH4U_REQUEST(sreq, req->greq.win_ptr) = win;
+    MPIDI_CH4U_REQUEST(sreq, req->greq.addr) = (uint64_t) ((char *) origin_addr);
+    MPIDI_CH4U_REQUEST(sreq, req->greq.count) = origin_count;
+    MPIDI_CH4U_REQUEST(sreq, req->greq.datatype) = origin_datatype;
+
+    if (target_rank == win->comm_ptr->rank) {
+        /* Self-targeted get: copy directly out of the local window.  The
+         * result is routed through fn_exit instead of being returned from
+         * here, so the function-exit marker is not skipped. */
+        MPIDI_CH4I_am_request_complete(sreq);
+        offset = win->disp_unit * target_disp;
+        mpi_errno = MPIR_Localcopy((char *) win->base + offset,
+                                   target_count,
+                                   target_datatype, origin_addr, origin_count, origin_datatype);
+        goto fn_exit;
+    }
+
+    MPIDI_CH4U_EPOCH_START_CHECK(win, mpi_errno, goto fn_fail);
+    MPIR_cc_incr(sreq->cc_ptr, &c);
+    am_hdr.target_disp = target_disp;
+    am_hdr.count = target_count;
+    am_hdr.datatype = target_datatype;
+    am_hdr.greq_ptr = (uint64_t) sreq;
+    am_hdr.win_id = MPIDI_CH4U_WIN(win, win_id);
+    am_hdr.src_rank = win->comm_ptr->rank;
+
+    /* MPIDI_CS_ENTER(); */
+    OPA_incr_int(&MPIDI_CH4U_WIN(win, outstanding_ops));
+    /* MPIDI_CS_EXIT(); */
+
+    if (HANDLE_GET_KIND(target_datatype) == HANDLE_KIND_BUILTIN) {
+        /* Builtin target type: a header-only message is sufficient. */
+        am_hdr.n_iov = 0;
+        MPIDI_CH4U_REQUEST(sreq, req->greq.dt_iov) = NULL;
+
+        mpi_errno = MPIDI_NM_send_am_hdr(target_rank, win->comm_ptr,
+                                         MPIDI_CH4U_GET_REQ, &am_hdr, sizeof(am_hdr), sreq, NULL);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+        goto fn_exit;
+    }
+
+    /* Non-builtin target type: flatten it into an iovec array built against
+     * a NULL base address and ship that array as the message payload. */
+    segment_ptr = MPIDU_Segment_alloc();
+    MPIR_Assert(segment_ptr);
+
+    MPIDU_Segment_init(NULL, target_count, target_datatype, segment_ptr, 0);
+    last = data_sz;
+    MPIDU_Segment_count_contig_blocks(segment_ptr, 0, &last, &num_iov);
+    n_iov = (int) num_iov;
+    MPIR_Assert(n_iov > 0);
+    am_hdr.n_iov = n_iov;
+    dt_iov = (struct iovec *) MPL_malloc(n_iov * sizeof(struct iovec));
+    MPIR_Assert(dt_iov);
+
+    last = data_sz;
+    MPIDU_Segment_pack_vector(segment_ptr, 0, &last, dt_iov, &n_iov);
+    MPIR_Assert(last == (MPI_Aint) data_sz);
+    MPL_free(segment_ptr);
+
+    /* Recorded on the request; this function does not free dt_iov itself. */
+    MPIDI_CH4U_REQUEST(sreq, req->greq.dt_iov) = dt_iov;
+    mpi_errno = MPIDI_NM_send_am(target_rank, win->comm_ptr, MPIDI_CH4U_GET_REQ,
+                                 &am_hdr, sizeof(am_hdr), dt_iov,
+                                 sizeof(struct iovec) * am_hdr.n_iov, MPI_BYTE, sreq, NULL);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_MPIDI_CH4I_DO_GET);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_put
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Request-less put: hands all arguments to MPIDI_CH4I_do_put and passes NULL
+ * for the request pointer, so no request handle is returned to the caller.
+ * Returns the (possibly MPIR_ERR_POP-annotated) error code of that call. */
+__CH4_INLINE__ int MPIDI_CH4U_put(const void *origin_addr,
+                                  int origin_count,
+                                  MPI_Datatype origin_datatype,
+                                  int target_rank,
+                                  MPI_Aint target_disp,
+                                  int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_PUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_PUT);
+
+    /* Trailing NULL: the caller does not want a request handle. */
+    mpi_errno = MPIDI_CH4I_do_put(origin_addr, origin_count, origin_datatype, target_rank,
+                                  target_disp, target_count, target_datatype, win, NULL);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_PUT);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_rput
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Request-returning put: hands all arguments, including the caller's request
+ * pointer, to MPIDI_CH4I_do_put.  Returns the (possibly
+ * MPIR_ERR_POP-annotated) error code of that call. */
+__CH4_INLINE__ int MPIDI_CH4U_rput(const void *origin_addr,
+                                   int origin_count,
+                                   MPI_Datatype origin_datatype,
+                                   int target_rank,
+                                   MPI_Aint target_disp,
+                                   int target_count,
+                                   MPI_Datatype target_datatype,
+                                   MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_RPUT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_RPUT);
+
+    /* The request pointer is forwarded untouched. */
+    mpi_errno = MPIDI_CH4I_do_put(origin_addr, origin_count, origin_datatype, target_rank,
+                                  target_disp, target_count, target_datatype, win, request);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_RPUT);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_get
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Request-less get: hands all arguments to MPIDI_CH4I_do_get and passes NULL
+ * for the request pointer, so no request handle is returned to the caller.
+ * Returns the (possibly MPIR_ERR_POP-annotated) error code of that call. */
+__CH4_INLINE__ int MPIDI_CH4U_get(void *origin_addr,
+                                  int origin_count,
+                                  MPI_Datatype origin_datatype,
+                                  int target_rank,
+                                  MPI_Aint target_disp,
+                                  int target_count, MPI_Datatype target_datatype, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_GET);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_GET);
+
+    /* Trailing NULL: the caller does not want a request handle. */
+    mpi_errno = MPIDI_CH4I_do_get(origin_addr, origin_count, origin_datatype, target_rank,
+                                  target_disp, target_count, target_datatype, win, NULL);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_GET);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_rget
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Request-returning get: hands all arguments, including the caller's request
+ * pointer, to MPIDI_CH4I_do_get.  Returns the (possibly
+ * MPIR_ERR_POP-annotated) error code of that call. */
+__CH4_INLINE__ int MPIDI_CH4U_rget(void *origin_addr,
+                                   int origin_count,
+                                   MPI_Datatype origin_datatype,
+                                   int target_rank,
+                                   MPI_Aint target_disp,
+                                   int target_count,
+                                   MPI_Datatype target_datatype,
+                                   MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_RGET);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_RGET);
+
+    /* The request pointer is forwarded untouched. */
+    mpi_errno = MPIDI_CH4I_do_get(origin_addr, origin_count, origin_datatype, target_rank,
+                                  target_disp, target_count, target_datatype, win, request);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_RGET);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4I_do_accumulate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4I_do_accumulate - shared implementation behind the accumulate and
+ * get_accumulate entry points.
+ *
+ * origin_addr/origin_count/origin_datatype - local operand buffer
+ * target_rank     - destination rank in win->comm_ptr (MPI_PROC_NULL allowed)
+ * target_disp     - displacement forwarded to the target in the AM header
+ * target_count/target_datatype - layout of the data at the target
+ * op              - reduction operation; for MPI_NO_OP the origin data is
+ *                   sent with a count of 0
+ * win             - window the operation is issued on
+ * do_get          - non-zero for get_accumulate; in that case the caller must
+ *                   already have filled req->areq.result_count and
+ *                   req->areq.result_datatype on sreq, which are read here
+ * sreq            - request created by the caller; completed here on the
+ *                   early-out path
+ *
+ * Returns MPI_SUCCESS or an MPI error code.  The function-exit marker sits
+ * under the fn_exit label so that the early-out and failure paths, which
+ * jump to fn_exit, emit it as well.
+ */
+__CH4_INLINE__ int MPIDI_CH4I_do_accumulate(const void *origin_addr,
+                                            int origin_count,
+                                            MPI_Datatype origin_datatype,
+                                            int target_rank,
+                                            MPI_Aint target_disp,
+                                            int target_count,
+                                            MPI_Datatype target_datatype,
+                                            MPI_Op op, MPIR_Win * win,
+                                            int do_get, MPIR_Request * sreq)
+{
+    int mpi_errno = MPI_SUCCESS, c, n_iov;
+    size_t basic_type_size;
+    MPIDI_CH4U_acc_req_msg_t am_hdr;
+    uint64_t data_sz, result_data_sz, target_data_sz;
+    MPI_Aint last, num_iov;
+    MPID_Segment *segment_ptr;
+    struct iovec *dt_iov, am_iov[2];
+    MPIR_Datatype *dt_ptr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_DO_ACCUMULATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_DO_ACCUMULATE);
+
+    MPIDI_CH4U_EPOCH_CHECK_SYNC(win, mpi_errno, goto fn_fail);
+
+    MPIDI_Datatype_get_size_dt_ptr(origin_count, origin_datatype, data_sz, dt_ptr);
+    MPIDI_Datatype_check_size(target_datatype, target_count, target_data_sz);
+
+    /* Early out when there is nothing to do.  For get_accumulate the result
+     * datatype reference held on the request is dropped first. */
+    if ((data_sz == 0 && do_get == 0) ||
+        target_rank == MPI_PROC_NULL || target_count == 0 || target_data_sz == 0 ||
+        (do_get == 1 && origin_count == 0 &&
+         MPIDI_CH4U_REQUEST(sreq, req->areq.result_count) == 0)) {
+        if (do_get)
+            dtype_release_if_not_builtin(MPIDI_CH4U_REQUEST(sreq, req->areq.result_datatype));
+        MPIDI_CH4I_am_request_complete(sreq);
+        goto fn_exit;
+    }
+
+    MPIDI_CH4U_REQUEST(sreq, req->areq.win_ptr) = win;
+    MPIDI_CH4U_EPOCH_START_CHECK(win, mpi_errno, goto fn_fail);
+    MPIR_cc_incr(sreq->cc_ptr, &c);
+
+    am_hdr.req_ptr = (uint64_t) sreq;
+    am_hdr.origin_count = origin_count;
+    am_hdr.do_get = do_get;
+
+    if (HANDLE_GET_KIND(origin_datatype) == HANDLE_KIND_BUILTIN) {
+        am_hdr.origin_datatype = origin_datatype;
+    }
+    else {
+        /* Describe a derived origin type to the target as a count of its
+         * basic type; a zero-sized basic type yields a count of 0. */
+        am_hdr.origin_datatype = (dt_ptr) ? dt_ptr->basic_type : MPI_DATATYPE_NULL;
+        MPID_Datatype_get_size_macro(am_hdr.origin_datatype, basic_type_size);
+        am_hdr.origin_count = (basic_type_size > 0) ? data_sz / basic_type_size : 0;
+    }
+
+    am_hdr.target_count = target_count;
+    am_hdr.target_datatype = target_datatype;
+    am_hdr.target_disp = target_disp;
+    am_hdr.op = op;
+    am_hdr.win_id = MPIDI_CH4U_WIN(win, win_id);
+    am_hdr.src_rank = win->comm_ptr->rank;
+
+    if (do_get) {
+        /* Tell the target how many result bytes are expected back. */
+        MPIDI_Datatype_check_size(MPIDI_CH4U_REQUEST(sreq, req->areq.result_datatype),
+                                  MPIDI_CH4U_REQUEST(sreq, req->areq.result_count), result_data_sz);
+        am_hdr.result_data_sz = result_data_sz;
+    }
+
+    /* MPIDI_CS_ENTER(); */
+    OPA_incr_int(&MPIDI_CH4U_WIN(win, outstanding_ops));
+    /* MPIDI_CS_EXIT(); */
+
+    MPIDI_CH4U_REQUEST(sreq, req->areq.data_sz) = data_sz;
+    if (HANDLE_GET_KIND(target_datatype) == HANDLE_KIND_BUILTIN) {
+        /* Builtin target type: no layout description has to be shipped. */
+        am_hdr.n_iov = 0;
+        MPIDI_CH4U_REQUEST(sreq, req->areq.dt_iov) = NULL;
+
+        mpi_errno = MPIDI_NM_send_am(target_rank, win->comm_ptr, MPIDI_CH4U_ACC_REQ,
+                                     &am_hdr, sizeof(am_hdr), origin_addr,
+                                     (op == MPI_NO_OP) ? 0 : origin_count,
+                                     origin_datatype, sreq, NULL);
+        if (mpi_errno)
+            MPIR_ERR_POP(mpi_errno);
+        goto fn_exit;
+    }
+
+    /* Derived target type: the header carries its basic type and element
+     * count, and data_sz is recomputed from the target description for the
+     * flattening below. */
+    MPIDI_Datatype_get_size_dt_ptr(target_count, target_datatype, data_sz, dt_ptr);
+    am_hdr.target_datatype = dt_ptr->basic_type;
+    am_hdr.target_count = dt_ptr->n_builtin_elements;
+
+    segment_ptr = MPIDU_Segment_alloc();
+    MPIR_Assert(segment_ptr);
+
+
+    MPIDU_Segment_init(NULL, target_count, target_datatype, segment_ptr, 0);
+    last = data_sz;
+    MPIDU_Segment_count_contig_blocks(segment_ptr, 0, &last, &num_iov);
+    n_iov = (int) num_iov;
+    MPIR_Assert(n_iov > 0);
+    am_hdr.n_iov = n_iov;
+    dt_iov = (struct iovec *) MPL_malloc(n_iov * sizeof(struct iovec));
+    MPIR_Assert(dt_iov);
+
+    last = data_sz;
+    MPIDU_Segment_pack_vector(segment_ptr, 0, &last, dt_iov, &n_iov);
+    MPIR_Assert(last == (MPI_Aint) data_sz);
+    MPL_free(segment_ptr);
+
+    am_iov[0].iov_base = &am_hdr;
+    am_iov[0].iov_len = sizeof(am_hdr);
+    am_iov[1].iov_base = dt_iov;
+    am_iov[1].iov_len = sizeof(struct iovec) * am_hdr.n_iov;
+    /* Recorded on the request; this function does not free dt_iov itself. */
+    MPIDI_CH4U_REQUEST(sreq, req->areq.dt_iov) = dt_iov;
+
+    if ((am_iov[0].iov_len + am_iov[1].iov_len) <= MPIDI_NM_am_hdr_max_sz()) {
+        /* Header plus layout fit in one AM header. */
+        mpi_errno = MPIDI_NM_send_amv(target_rank, win->comm_ptr, MPIDI_CH4U_ACC_REQ,
+                                      &am_iov[0], 2, origin_addr,
+                                      (op == MPI_NO_OP) ? 0 : origin_count,
+                                      origin_datatype, sreq, NULL);
+    }
+    else {
+        /* Layout is too large for the header: send the iovec array as the
+         * payload of an ACC_IOV_REQ and stash the origin buffer description
+         * on the request (holding a datatype reference) for later use. */
+        MPIDI_CH4U_REQUEST(sreq, req->areq.origin_addr) = (void *) origin_addr;
+        MPIDI_CH4U_REQUEST(sreq, req->areq.origin_count) = origin_count;
+        MPIDI_CH4U_REQUEST(sreq, req->areq.origin_datatype) = origin_datatype;
+        MPIDI_CH4U_REQUEST(sreq, src_rank) = target_rank;
+        dtype_add_ref_if_not_builtin(origin_datatype);
+
+        mpi_errno = MPIDI_NM_send_am(target_rank, win->comm_ptr, MPIDI_CH4U_ACC_IOV_REQ,
+                                     &am_hdr, sizeof(am_hdr), am_iov[1].iov_base,
+                                     am_iov[1].iov_len, MPI_BYTE, sreq, NULL);
+    }
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_DO_ACCUMULATE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_raccumulate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Request-returning accumulate.  Creates the RMA request, optionally hands it
+ * (with an extra reference) back through 'request', and then delegates to
+ * MPIDI_CH4I_do_accumulate with do_get = 0.  Returns the (possibly
+ * MPIR_ERR_POP-annotated) error code of that call. */
+__CH4_INLINE__ int MPIDI_CH4U_raccumulate(const void *origin_addr,
+                                          int origin_count,
+                                          MPI_Datatype origin_datatype,
+                                          int target_rank,
+                                          MPI_Aint target_disp,
+                                          int target_count,
+                                          MPI_Datatype target_datatype,
+                                          MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *sreq;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_RACCUMULATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_RACCUMULATE);
+
+    sreq = MPIDI_CH4I_am_win_request_create();
+    MPIR_Assert(sreq);
+    sreq->kind = MPIR_REQUEST_KIND__RMA;
+
+    if (request != NULL) {
+        /* Hand the request to the caller along with its own reference. */
+        *request = sreq;
+        MPIDI_Request_add_ref(sreq);
+    }
+
+    /* do_get = 0: plain accumulate, nothing is fetched back. */
+    mpi_errno = MPIDI_CH4I_do_accumulate(origin_addr, origin_count, origin_datatype,
+                                         target_rank, target_disp,
+                                         target_count, target_datatype, op, win, 0, sreq);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_RACCUMULATE);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_accumulate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Request-less accumulate: forwards to MPIDI_CH4U_raccumulate with a NULL
+ * request pointer.  Returns the (possibly MPIR_ERR_POP-annotated) error code
+ * of that call. */
+__CH4_INLINE__ int MPIDI_CH4U_accumulate(const void *origin_addr,
+                                         int origin_count,
+                                         MPI_Datatype origin_datatype,
+                                         int target_rank,
+                                         MPI_Aint target_disp,
+                                         int target_count,
+                                         MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ACCUMULATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ACCUMULATE);
+
+    /* Trailing NULL: the caller does not want a request handle. */
+    mpi_errno = MPIDI_CH4U_raccumulate(origin_addr, origin_count, origin_datatype,
+                                       target_rank, target_disp,
+                                       target_count, target_datatype, op, win, NULL);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ACCUMULATE);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_rget_accumulate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Request-returning get_accumulate.  Creates the RMA request, records the
+ * result buffer description on it (taking a reference on a non-builtin result
+ * datatype), optionally hands the request back through 'request', and then
+ * delegates to MPIDI_CH4I_do_accumulate with do_get = 1.  Returns the
+ * (possibly MPIR_ERR_POP-annotated) error code of that call. */
+__CH4_INLINE__ int MPIDI_CH4U_rget_accumulate(const void *origin_addr,
+                                              int origin_count,
+                                              MPI_Datatype origin_datatype,
+                                              void *result_addr,
+                                              int result_count,
+                                              MPI_Datatype result_datatype,
+                                              int target_rank,
+                                              MPI_Aint target_disp,
+                                              int target_count,
+                                              MPI_Datatype target_datatype,
+                                              MPI_Op op, MPIR_Win * win, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *sreq;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_RGET_ACCUMULATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_RGET_ACCUMULATE);
+
+    sreq = MPIDI_CH4I_am_win_request_create();
+    MPIR_Assert(sreq);
+    sreq->kind = MPIR_REQUEST_KIND__RMA;
+
+    /* MPIDI_CH4I_do_accumulate reads the result description back from the
+     * request, so it has to be in place before the call below. */
+    MPIDI_CH4U_REQUEST(sreq, req->areq.result_addr) = result_addr;
+    MPIDI_CH4U_REQUEST(sreq, req->areq.result_count) = result_count;
+    MPIDI_CH4U_REQUEST(sreq, req->areq.result_datatype) = result_datatype;
+    dtype_add_ref_if_not_builtin(result_datatype);
+
+    if (request != NULL) {
+        /* Hand the request to the caller along with its own reference. */
+        *request = sreq;
+        MPIDI_Request_add_ref(sreq);
+    }
+
+    /* do_get = 1: the target data is fetched into the result buffer. */
+    mpi_errno = MPIDI_CH4I_do_accumulate(origin_addr, origin_count, origin_datatype,
+                                         target_rank, target_disp,
+                                         target_count, target_datatype, op, win, 1, sreq);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_RGET_ACCUMULATE);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_get_accumulate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Request-less get_accumulate: forwards to MPIDI_CH4U_rget_accumulate with a
+ * NULL request pointer.  Returns the (possibly MPIR_ERR_POP-annotated) error
+ * code of that call. */
+__CH4_INLINE__ int MPIDI_CH4U_get_accumulate(const void *origin_addr,
+                                             int origin_count,
+                                             MPI_Datatype origin_datatype,
+                                             void *result_addr,
+                                             int result_count,
+                                             MPI_Datatype result_datatype,
+                                             int target_rank,
+                                             MPI_Aint target_disp,
+                                             int target_count,
+                                             MPI_Datatype target_datatype,
+                                             MPI_Op op, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_GET_ACCUMULATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_GET_ACCUMULATE);
+
+    /* Trailing NULL: the caller does not want a request handle. */
+    mpi_errno = MPIDI_CH4U_rget_accumulate(origin_addr, origin_count, origin_datatype,
+                                           result_addr, result_count, result_datatype,
+                                           target_rank, target_disp,
+                                           target_count, target_datatype, op, win, NULL);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_GET_ACCUMULATE);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_compare_and_swap
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4U_compare_and_swap - active-message compare-and-swap.
+ *
+ * origin_addr  - one element of 'datatype' holding the replacement value
+ * compare_addr - one element of 'datatype' holding the comparand
+ * result_addr  - where the fetched value is to be stored (recorded on the
+ *                request)
+ * datatype     - element datatype
+ * target_rank  - destination rank in win->comm_ptr (MPI_PROC_NULL allowed)
+ * target_disp  - displacement forwarded to the target in the AM header
+ * win          - window the operation is issued on
+ *
+ * The origin and compare values are packed back to back into a heap buffer
+ * that is sent as a two-element payload of a CSWAP_REQ message.  The buffer
+ * is attached to the request (req->creq.data); it is not freed here.
+ *
+ * The epoch start check runs before the staging buffer is allocated, so a
+ * check that jumps to fn_fail cannot leak the buffer.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_compare_and_swap(const void *origin_addr,
+                                               const void *compare_addr,
+                                               void *result_addr,
+                                               MPI_Datatype datatype,
+                                               int target_rank,
+                                               MPI_Aint target_disp, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS, c;
+    MPIR_Request *sreq = NULL;
+    MPIDI_CH4U_cswap_req_msg_t am_hdr;
+    size_t data_sz;
+    void *p_data;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_COMPARE_AND_SWAP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_COMPARE_AND_SWAP);
+
+    MPIDI_CH4U_EPOCH_CHECK_SYNC(win, mpi_errno, goto fn_fail);
+
+    sreq = MPIDI_CH4I_am_win_request_create();
+    MPIR_Assert(sreq);
+    sreq->kind = MPIR_REQUEST_KIND__RMA;
+
+    MPIDI_Datatype_check_size(datatype, 1, data_sz);
+    if (data_sz == 0 || target_rank == MPI_PROC_NULL) {
+        /* Nothing to transfer. */
+        MPIDI_CH4I_am_request_complete(sreq);
+        goto fn_exit;
+    }
+
+    /* Validate the epoch before any buffer is allocated. */
+    MPIDI_CH4U_EPOCH_START_CHECK(win, mpi_errno, goto fn_fail);
+
+    /* Staging buffer layout: [origin value][compare value]. */
+    p_data = MPL_malloc(data_sz * 2);
+    MPIR_Assert(p_data);
+    MPIR_Memcpy(p_data, (char *) origin_addr, data_sz);
+    MPIR_Memcpy((char *) p_data + data_sz, (char *) compare_addr, data_sz);
+
+    MPIDI_CH4U_REQUEST(sreq, req->creq.win_ptr) = win;
+    MPIDI_CH4U_REQUEST(sreq, req->creq.addr) = (uint64_t) ((char *) result_addr);
+    MPIDI_CH4U_REQUEST(sreq, req->creq.datatype) = datatype;
+    MPIDI_CH4U_REQUEST(sreq, req->creq.result_addr) = result_addr;
+    MPIDI_CH4U_REQUEST(sreq, req->creq.data) = p_data;
+
+    MPIR_cc_incr(sreq->cc_ptr, &c);
+
+    am_hdr.target_disp = target_disp;
+    am_hdr.datatype = datatype;
+    am_hdr.req_ptr = (uint64_t) sreq;
+    am_hdr.win_id = MPIDI_CH4U_WIN(win, win_id);
+    am_hdr.src_rank = win->comm_ptr->rank;
+
+    /* MPIDI_CS_ENTER(); */
+    OPA_incr_int(&MPIDI_CH4U_WIN(win, outstanding_ops));
+    /* MPIDI_CS_EXIT(); */
+
+    /* Payload is the two packed elements of 'datatype'. */
+    mpi_errno = MPIDI_NM_send_am(target_rank, win->comm_ptr, MPIDI_CH4U_CSWAP_REQ,
+                                 &am_hdr, sizeof(am_hdr), (char *) p_data, 2, datatype, sreq, NULL);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_COMPARE_AND_SWAP);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_fetch_and_op
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/* Fetch-and-op expressed as a single-element get_accumulate: the origin,
+ * result and target descriptions all use a count of 1 and the same datatype.
+ * Returns the (possibly MPIR_ERR_POP-annotated) error code of
+ * MPIDI_CH4U_get_accumulate. */
+__CH4_INLINE__ int MPIDI_CH4U_fetch_and_op(const void *origin_addr,
+                                           void *result_addr,
+                                           MPI_Datatype datatype,
+                                           int target_rank,
+                                           MPI_Aint target_disp, MPI_Op op, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_FETCH_AND_OP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_FETCH_AND_OP);
+
+    /* One element each for origin, result and target. */
+    mpi_errno = MPIDI_CH4U_get_accumulate(origin_addr, 1, datatype,
+                                          result_addr, 1, datatype,
+                                          target_rank, target_disp,
+                                          1, datatype, op, win);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_FETCH_AND_OP);
+    return mpi_errno;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+#endif /* CH4R_RMA_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4r_send.h b/src/mpid/ch4/src/ch4r_send.h
new file mode 100644
index 0000000..84a67e3
--- /dev/null
+++ b/src/mpid/ch4/src/ch4r_send.h
@@ -0,0 +1,468 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4R_SEND_H_INCLUDED
+#define CH4R_SEND_H_INCLUDED
+
+#include "ch4_impl.h"
+
+#include <../mpi/pt2pt/bsendutil.h>
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4I_do_send
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4I_do_send - create a send request and hand the message to the
+ * netmod active-message layer.
+ *
+ *   buf, count, datatype - payload description, forwarded to MPIDI_NM_send_am
+ *   rank                 - destination rank in comm
+ *   tag, context_offset  - combined with comm->context_id and comm->rank into
+ *                          the match bits carried in the header
+ *   request (out)        - receives the newly created request
+ *   type                 - MPIDI_CH4U_SSEND_REQ selects the synchronous-send
+ *                          message; any other value sends MPIDI_CH4U_SEND
+ *
+ * Returns MPI_SUCCESS or the error reported by MPIDI_NM_send_am.
+ */
+static inline int MPIDI_CH4I_do_send(const void *buf,
+                                     int count,
+                                     MPI_Datatype datatype,
+                                     int rank,
+                                     int tag,
+                                     MPIR_Comm * comm,
+                                     int context_offset, MPIR_Request ** request, int type)
+{
+    int mpi_errno = MPI_SUCCESS, c;
+    MPIR_Request *sreq = NULL;
+    uint64_t tag_bits;
+    MPIDI_CH4U_hdr_t am_hdr;
+    MPIDI_CH4U_ssend_req_msg_t ssend_req;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_DO_SEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_DO_SEND);
+
+    sreq = MPIDI_CH4I_am_request_create(MPIR_REQUEST_KIND__SEND);
+    MPIR_Assert(sreq);
+    *request = sreq;
+
+    /* Header shared by both message kinds: match bits plus our own rank. */
+    tag_bits = MPIDI_CH4U_init_send_tag(comm->context_id + context_offset, comm->rank, tag);
+    am_hdr.msg_tag = tag_bits;
+    am_hdr.src_rank = comm->rank;
+
+    if (type != MPIDI_CH4U_SSEND_REQ) {
+        mpi_errno = MPIDI_NM_send_am(rank, comm, MPIDI_CH4U_SEND,
+                                     &am_hdr, sizeof(am_hdr), buf, count, datatype, sreq, NULL);
+    }
+    else {
+        /* The synchronous-send message wraps the common header and also
+         * carries the address of the send request. */
+        ssend_req.hdr = am_hdr;
+        ssend_req.sreq_ptr = (uint64_t) sreq;
+
+        /* Extra completion count on the request; presumably dropped when the
+         * target acknowledges the match (not visible in this function). */
+        MPIR_cc_incr(sreq->cc_ptr, &c);
+
+        mpi_errno = MPIDI_NM_send_am(rank, comm, MPIDI_CH4U_SSEND_REQ,
+                                     &ssend_req, sizeof(ssend_req),
+                                     buf, count, datatype, sreq, NULL);
+    }
+
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_DO_SEND);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4I_send
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4I_send - common front end for the send variants.
+ *
+ * A send to MPI_PROC_NULL moves no data: it succeeds immediately and, unless
+ * 'noreq' is set, returns an already-completed request in *request.  Every
+ * other destination is forwarded to MPIDI_CH4I_do_send together with 'type'.
+ *
+ * Returns MPI_SUCCESS or the error code from MPIDI_CH4I_do_send.
+ */
+static inline int MPIDI_CH4I_send(const void *buf, int count, MPI_Datatype datatype,
+                                  int rank, int tag, MPIR_Comm * comm, int context_offset,
+                                  MPIR_Request ** request, int noreq, int type)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_NM_SEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_NM_SEND);
+
+    if (unlikely(rank == MPI_PROC_NULL)) {
+        /* Nothing to transmit; only fabricate a request when one is wanted. */
+        if (!noreq) {
+            *request = MPIDI_CH4I_am_request_create(MPIR_REQUEST_KIND__SEND);
+            MPIDI_Request_complete((*request));
+        }
+    }
+    else {
+        mpi_errno = MPIDI_CH4I_do_send(buf, count, datatype, rank, tag, comm,
+                                       context_offset, request, type);
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_NM_SEND);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4I_psend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4I_psend - create a persistent send request.
+ *
+ * No data is transferred here.  The send arguments are saved in the request
+ * so that MPIDI_CH4U_startall can issue the real operation later:
+ *   buffer, count, datatype - the user payload description
+ *   tag                     - packed match bits built from the context id
+ *                             (plus context_offset), 'rank' and 'tag'
+ *   src_rank                - the peer (destination) rank, despite the name
+ * A reference is added to comm, and dtype_add_ref_if_not_builtin is called on
+ * datatype.  The *_init wrappers set the request's p_type after this returns.
+ *
+ * Always returns MPI_SUCCESS; *request receives the new request.
+ */
+static inline int MPIDI_CH4I_psend(const void *buf,
+                                   int count,
+                                   MPI_Datatype datatype,
+                                   int rank,
+                                   int tag,
+                                   MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    MPIR_Request *sreq;
+    uint64_t match_bits;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_NM_PSEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_NM_PSEND);
+
+    sreq = MPIDI_CH4I_am_request_create(MPIR_REQUEST_KIND__PREQUEST_SEND);
+    *request = sreq;
+
+    MPIR_Comm_add_ref(comm);
+    sreq->comm = comm;
+    match_bits = MPIDI_CH4U_init_send_tag(comm->context_id + context_offset, rank, tag);
+
+    MPIDI_CH4U_REQUEST(sreq, buffer) = (void *) buf;
+    MPIDI_CH4U_REQUEST(sreq, count) = count;
+    MPIDI_CH4U_REQUEST(sreq, datatype) = datatype;
+    MPIDI_CH4U_REQUEST(sreq, tag) = match_bits;
+    MPIDI_CH4U_REQUEST(sreq, src_rank) = rank;
+
+    /* No operation is in flight until the request is started. */
+    sreq->u.persist.real_request = NULL;
+    MPIDI_Request_complete(sreq);
+
+    dtype_add_ref_if_not_builtin(datatype);
+    /* Exit the same trace state that was entered above (the original exited
+     * MPID_STATE_NETMOD_OFI_NM_PSEND, leaving enter/exit unbalanced). */
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_NM_PSEND);
+    return MPI_SUCCESS;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_send
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4U_send - send entry point of the CH4U active-message path.
+ *
+ * Forwards all arguments to MPIDI_CH4I_send with noreq = 1 (no request is
+ * fabricated for an MPI_PROC_NULL destination) and type = 0.
+ * Returns the error code produced by MPIDI_CH4I_send.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_send(const void *buf, int count, MPI_Datatype datatype,
+                                   int rank, int tag, MPIR_Comm * comm,
+                                   int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_SEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_SEND);
+
+    mpi_errno = MPIDI_CH4I_send(buf, count, datatype, rank, tag, comm, context_offset,
+                                request, 1 /* noreq */ , 0ULL /* type */);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_SEND);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_isend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4U_isend - isend entry point of the CH4U active-message path.
+ *
+ * Forwards all arguments to MPIDI_CH4I_send with noreq = 0 (a completed
+ * request is returned even for an MPI_PROC_NULL destination) and type = 0.
+ * Returns the error code produced by MPIDI_CH4I_send.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_isend(const void *buf, int count, MPI_Datatype datatype,
+                                    int rank, int tag, MPIR_Comm * comm,
+                                    int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ISEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ISEND);
+
+    mpi_errno = MPIDI_CH4I_send(buf, count, datatype, rank, tag, comm, context_offset,
+                                request, 0 /* noreq */ , 0ULL /* type */);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ISEND);
+    return mpi_errno;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_rsend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4U_rsend - rsend entry point of the CH4U active-message path.
+ *
+ * Issues exactly the same call as MPIDI_CH4U_send: MPIDI_CH4I_send with
+ * noreq = 1 and type = 0.
+ * Returns the error code produced by MPIDI_CH4I_send.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_rsend(const void *buf, int count, MPI_Datatype datatype,
+                                    int rank, int tag, MPIR_Comm * comm,
+                                    int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_RSEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_RSEND);
+
+    mpi_errno = MPIDI_CH4I_send(buf, count, datatype, rank, tag, comm, context_offset,
+                                request, 1 /* noreq */ , 0ULL /* type */);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_RSEND);
+    return mpi_errno;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_irsend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4U_irsend - irsend entry point of the CH4U active-message path.
+ *
+ * Issues the same call as MPIDI_CH4U_isend: MPIDI_CH4I_send with noreq = 0
+ * and type = 0.
+ * Returns the error code produced by MPIDI_CH4I_send.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_irsend(const void *buf,
+                                     int count,
+                                     MPI_Datatype datatype,
+                                     int rank,
+                                     int tag,
+                                     MPIR_Comm * comm, int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_IRSEND);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_IRSEND);
+    mpi_errno = MPIDI_CH4I_send(buf, count, datatype, rank, tag, comm,
+                                context_offset, request, 0, 0ULL);
+    /* Exit the state entered above (the original exited MPID_STATE_CH4U_SEND,
+     * leaving the enter/exit pair unbalanced). */
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_IRSEND);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_ssend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4U_ssend - ssend entry point of the CH4U active-message path.
+ *
+ * Forwards all arguments to MPIDI_CH4I_send with noreq = 1 and
+ * type = MPIDI_CH4U_SSEND_REQ, which makes MPIDI_CH4I_do_send emit the
+ * synchronous-send request message.
+ * Returns the error code produced by MPIDI_CH4I_send.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_ssend(const void *buf, int count, MPI_Datatype datatype,
+                                    int rank, int tag, MPIR_Comm * comm,
+                                    int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_SSEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_SSEND);
+
+    mpi_errno = MPIDI_CH4I_send(buf, count, datatype, rank, tag, comm, context_offset,
+                                request, 1 /* noreq */ , MPIDI_CH4U_SSEND_REQ);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_SSEND);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_issend
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4U_issend - issend entry point of the CH4U active-message path.
+ *
+ * Forwards all arguments to MPIDI_CH4I_send with noreq = 0 and
+ * type = MPIDI_CH4U_SSEND_REQ, which makes MPIDI_CH4I_do_send emit the
+ * synchronous-send request message.
+ * Returns the error code produced by MPIDI_CH4I_send.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_issend(const void *buf, int count, MPI_Datatype datatype,
+                                     int rank, int tag, MPIR_Comm * comm,
+                                     int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_ISSEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_ISSEND);
+
+    mpi_errno = MPIDI_CH4I_send(buf, count, datatype, rank, tag, comm, context_offset,
+                                request, 0 /* noreq */ , MPIDI_CH4U_SSEND_REQ);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_ISSEND);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_startall
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+__CH4_INLINE__ int MPIDI_CH4U_startall(int count, MPIR_Request * requests[])
+{
+    int mpi_errno = MPI_SUCCESS, i;
+    int rank, tag, context_offset;
+    MPI_Datatype datatype;
+    uint64_t msg_tag;
+    /* Start each of the 'count' persistent requests in 'requests' by issuing
+     * the nonblocking operation selected by its p_type, using the arguments
+     * saved in the request.  The operation's request is stored in
+     * preq->u.persist.real_request. */
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_STARTALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_STARTALL);
+
+    for (i = 0; i < count; i++) {
+        MPIR_Request *const preq = requests[i];
+        MPI_Request sreq_handle;
+        /* Saved arguments; for send-type requests they are written by
+         * MPIDI_CH4I_psend. */
+        msg_tag = MPIDI_CH4U_REQUEST(preq, tag);
+        datatype = MPIDI_CH4U_REQUEST(preq, datatype);
+        /* The saved tag is the packed match-bits word: split it back into the
+         * user tag and the context offset relative to the communicator's
+         * context id.  src_rank holds the peer rank. */
+        tag = MPIDI_CH4U_get_tag(msg_tag);
+        rank = MPIDI_CH4U_REQUEST(preq, src_rank);
+        context_offset = MPIDI_CH4U_get_context(msg_tag) - preq->comm->context_id;
+
+        switch (MPIDI_CH4U_REQUEST(preq, p_type)) {
+        /* With MPIDI_BUILD_CH4_SHM defined the netmod entry points
+         * (MPIDI_NM_*) are called directly; otherwise the generic MPIDI_*
+         * entry points are used. */
+        case MPIDI_PTYPE_RECV:
+#ifdef MPIDI_BUILD_CH4_SHM
+            mpi_errno = MPIDI_NM_irecv(MPIDI_CH4U_REQUEST(preq, buffer),
+                                       MPIDI_CH4U_REQUEST(preq, count),
+                                       datatype, rank, tag,
+                                       preq->comm, context_offset, &preq->u.persist.real_request);
+#else
+            mpi_errno = MPIDI_Irecv(MPIDI_CH4U_REQUEST(preq, buffer),
+                                    MPIDI_CH4U_REQUEST(preq, count),
+                                    datatype, rank, tag,
+                                    preq->comm, context_offset, &preq->u.persist.real_request);
+#endif
+            break;
+
+        case MPIDI_PTYPE_SEND:
+#ifdef MPIDI_BUILD_CH4_SHM
+            mpi_errno = MPIDI_NM_isend(MPIDI_CH4U_REQUEST(preq, buffer),
+                                       MPIDI_CH4U_REQUEST(preq, count),
+                                       datatype, rank, tag,
+                                       preq->comm, context_offset, &preq->u.persist.real_request);
+#else
+            mpi_errno = MPIDI_Isend(MPIDI_CH4U_REQUEST(preq, buffer),
+                                    MPIDI_CH4U_REQUEST(preq, count),
+                                    datatype, rank, tag,
+                                    preq->comm, context_offset, &preq->u.persist.real_request);
+#endif
+            break;
+
+        case MPIDI_PTYPE_SSEND:
+#ifdef MPIDI_BUILD_CH4_SHM
+            mpi_errno = MPIDI_NM_issend(MPIDI_CH4U_REQUEST(preq, buffer),
+                                        MPIDI_CH4U_REQUEST(preq, count),
+                                        datatype, rank, tag,
+                                        preq->comm, context_offset, &preq->u.persist.real_request);
+#else
+            mpi_errno = MPIDI_Issend(MPIDI_CH4U_REQUEST(preq, buffer),
+                                     MPIDI_CH4U_REQUEST(preq, count),
+                                     datatype, rank, tag,
+                                     preq->comm, context_offset, &preq->u.persist.real_request);
+#endif
+            break;
+
+        case MPIDI_PTYPE_BSEND:        /* buffered send goes through the MPIR layer; no context_offset is passed */
+            mpi_errno = MPIR_Ibsend_impl(MPIDI_CH4U_REQUEST(preq, buffer),
+                                         MPIDI_CH4U_REQUEST(preq, count),
+                                         datatype, rank, tag, preq->comm, &sreq_handle);
+            if (mpi_errno == MPI_SUCCESS)
+                MPIR_Request_get_ptr(sreq_handle, preq->u.persist.real_request);
+            /* MPIR_Ibsend_impl returns a handle, hence the conversion to a
+             * request pointer just above. */
+            break;
+
+        default:               /* unrecognised p_type: fatal internal error (last label, so no break) */
+            mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, __FUNCTION__,
+                                             __LINE__, MPI_ERR_INTERN, "**ch3|badreqtype",
+                                             "**ch3|badreqtype %d", MPIDI_CH4U_REQUEST(preq,
+                                                                                       p_type));
+        }
+        /* Success: the persistent request normally shares the completion
+         * counter of the real request; a buffered send instead keeps its own
+         * counter and is marked completed right away. */
+        if (mpi_errno == MPI_SUCCESS) {
+            preq->status.MPI_ERROR = MPI_SUCCESS;
+
+            if (MPIDI_CH4U_REQUEST(preq, p_type) == MPIDI_PTYPE_BSEND) {
+                preq->cc_ptr = &preq->cc;
+                MPIDI_Request_set_completed(preq);
+            }
+            else
+                preq->cc_ptr = &preq->u.persist.real_request->cc;
+        }
+        else {                  /* failure: record the error in the request and mark it completed */
+            preq->u.persist.real_request = NULL;
+            preq->status.MPI_ERROR = mpi_errno;
+            preq->cc_ptr = &preq->cc;
+            MPIDI_Request_set_completed(preq);
+        }
+        dtype_release_if_not_builtin(datatype);        /* NOTE(review): released on every start, whereas MPIDI_CH4I_psend adds a reference only once - confirm this balances for requests started repeatedly */
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_STARTALL);
+    return mpi_errno;           /* NOTE(review): this is the status of the last request only; earlier failures are visible only in each preq->status.MPI_ERROR */
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_send_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4U_send_init - build a persistent request for a send.
+ *
+ * MPIDI_CH4I_psend creates the request and saves the arguments; the request
+ * is then tagged MPIDI_PTYPE_SEND so that MPIDI_CH4U_startall issues an
+ * isend for it.  Returns the code produced by MPIDI_CH4I_psend.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_send_init(const void *buf, int count, MPI_Datatype datatype,
+                                        int rank, int tag, MPIR_Comm * comm,
+                                        int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_SEND_INIT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_SEND_INIT);
+
+    mpi_errno = MPIDI_CH4I_psend(buf, count, datatype, rank, tag, comm, context_offset, request);
+
+    /* Record which operation startall has to issue. */
+    MPIDI_CH4U_REQUEST((*request), p_type) = MPIDI_PTYPE_SEND;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_SEND_INIT);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_ssend_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4U_ssend_init - build a persistent request for a synchronous send.
+ *
+ * MPIDI_CH4I_psend creates the request and saves the arguments; the request
+ * is then tagged MPIDI_PTYPE_SSEND so that MPIDI_CH4U_startall issues an
+ * issend for it.  Returns the code produced by MPIDI_CH4I_psend.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_ssend_init(const void *buf, int count, MPI_Datatype datatype,
+                                         int rank, int tag, MPIR_Comm * comm,
+                                         int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_SSEND_INIT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_SSEND_INIT);
+
+    mpi_errno = MPIDI_CH4I_psend(buf, count, datatype, rank, tag, comm, context_offset, request);
+
+    /* Record which operation startall has to issue. */
+    MPIDI_CH4U_REQUEST((*request), p_type) = MPIDI_PTYPE_SSEND;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_SSEND_INIT);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_bsend_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4U_bsend_init - build a persistent request for a buffered send.
+ *
+ * MPIDI_CH4I_psend creates the request and saves the arguments; the request
+ * is then tagged MPIDI_PTYPE_BSEND so that MPIDI_CH4U_startall routes it
+ * through MPIR_Ibsend_impl.  Returns the code produced by MPIDI_CH4I_psend.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_bsend_init(const void *buf, int count, MPI_Datatype datatype,
+                                         int rank, int tag, MPIR_Comm * comm,
+                                         int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_BSEND_INIT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_BSEND_INIT);
+
+    mpi_errno = MPIDI_CH4I_psend(buf, count, datatype, rank, tag, comm, context_offset, request);
+
+    /* Record which operation startall has to issue. */
+    MPIDI_CH4U_REQUEST((*request), p_type) = MPIDI_PTYPE_BSEND;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_BSEND_INIT);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_rsend_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4U_rsend_init - build a persistent request for a ready send.
+ *
+ * MPIDI_CH4I_psend creates the request and saves the arguments.  The request
+ * is tagged MPIDI_PTYPE_SEND, the same type MPIDI_CH4U_send_init uses, so
+ * MPIDI_CH4U_startall starts it as an ordinary isend.
+ * Returns the code produced by MPIDI_CH4I_psend.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_rsend_init(const void *buf, int count, MPI_Datatype datatype,
+                                         int rank, int tag, MPIR_Comm * comm,
+                                         int context_offset, MPIR_Request ** request)
+{
+    int mpi_errno;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_RSEND_INIT);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_RSEND_INIT);
+
+    mpi_errno = MPIDI_CH4I_psend(buf, count, datatype, rank, tag, comm, context_offset, request);
+
+    /* Record which operation startall has to issue. */
+    MPIDI_CH4U_REQUEST((*request), p_type) = MPIDI_PTYPE_SEND;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_RSEND_INIT);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4U_cancel_send
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4U_cancel_send - request cancellation of a send.
+ *
+ * Sends cannot be cancelled on this path, so 'sreq' is left untouched and
+ * MPI_SUCCESS is returned.
+ */
+__CH4_INLINE__ int MPIDI_CH4U_cancel_send(MPIR_Request * sreq)
+{
+    const int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_CANCEL_SEND);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_CANCEL_SEND);
+    /* Intentionally empty: a send cannot be cancelled. */
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_CANCEL_SEND);
+
+    return mpi_errno;
+}
+
+#endif /* CH4R_SEND_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4r_symheap.h b/src/mpid/ch4/src/ch4r_symheap.h
new file mode 100644
index 0000000..d64db40
--- /dev/null
+++ b/src/mpid/ch4/src/ch4r_symheap.h
@@ -0,0 +1,258 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4R_SYMHEAP_H_INCLUDED
+#define CH4R_SYMHEAP_H_INCLUDED
+
+#include <mpichconf.h>
+
+#include <opa_primitives.h>
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif /* HAVE_SYS_MMAN_H */
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif /* HAVE_SYS_TIME_H */
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif /* HAVE_SYS_STAT_H */
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif /* HAVE_FCNTL_H */
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif /* HAVE_STDINT_H */
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_get_mapsize
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4R_get_mapsize - round 'size' up to a whole number of pages.
+ *
+ * The page size is obtained from sysconf(_SC_PAGESIZE) and also stored in
+ * *psz.  Returns the rounded size; the mask arithmetic relies on the page
+ * size being a power of two.
+ */
+static inline size_t MPIDI_CH4R_get_mapsize(size_t size, size_t * psz)
+{
+    long page_sz;
+    long page_mask;
+    size_t rounded;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4R_GET_MAPSIZE);
+
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4R_GET_MAPSIZE);
+
+    page_sz = sysconf(_SC_PAGESIZE);
+    page_mask = page_sz - 1;
+    *psz = page_sz;
+
+    /* Add page_sz - 1, then clear the offset-within-page bits. */
+    rounded = (size + page_mask) & (~page_mask);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4R_GET_MAPSIZE);
+    return rounded;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_check_maprange_ok
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4R_check_maprange_ok - test whether an address range is unmapped.
+ *
+ * The range [start, start + size), with size rounded up to whole pages, is
+ * probed one page at a time with msync().  A probe that fails (asserted to
+ * be ENOMEM) means the page is not mapped; a probe that succeeds means
+ * something already lives there.
+ *
+ * Returns 1 when every page is unmapped (also when there are no pages to
+ * probe), and 0 as soon as a mapped page is found.
+ */
+static inline int MPIDI_CH4R_check_maprange_ok(void *start, size_t size)
+{
+    int range_is_free = 1;
+    size_t page_sz;
+    size_t mapsize = MPIDI_CH4R_get_mapsize(size, &page_sz);
+    size_t pages_left = mapsize / page_sz;
+    char *page = (char *) start;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4R_CHECK_MAPRANGE_OK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4R_CHECK_MAPRANGE_OK);
+
+    for (; pages_left > 0; pages_left--, page += page_sz) {
+        if (msync(page, page_sz, 0) != -1) {
+            /* msync succeeded, so this page is already mapped. */
+            range_is_free = 0;
+            break;
+        }
+        MPIR_Assert(errno == ENOMEM);
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4R_CHECK_MAPRANGE_OK);
+    return range_is_free;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_generate_random_addr
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline void *MPIDI_CH4R_generate_random_addr(size_t size)
+{
+    /* Pick a random, page-aligned candidate address at which 'size' bytes
+     * (rounded up to pages) are currently unmapped, or return (void *) -1
+     * when no such address is found or USE_SYM_HEAP is not defined.
+     *
+     * starting position for pointer to map
+     * This is not generic, probably only works properly on Linux
+     * but it's not fatal since we bail after a fixed number of iterations
+     *
+     * The macro masks the 64-bit random value down to a page-aligned address
+     * below 0x0000700000000000.
+     * NOTE(review): the 0x0000600000000000 term is OR-ed into the mask, not
+     * into the result, so it does not force those address bits on - confirm
+     * that this is the intent.
+     */
+#define MPIDI_CH4I_MAP_POINTER ((random_unsigned&((0x00006FFFFFFFFFFF&(~(page_sz-1)))|0x0000600000000000)))
+    uintptr_t map_pointer;
+#ifdef USE_SYM_HEAP
+    char random_state[256];
+    size_t page_sz;
+    uint64_t random_unsigned;
+    size_t mapsize = MPIDI_CH4R_get_mapsize(size, &page_sz);
+    struct timeval ts;
+    int iter = 100;             /* maximum number of redraws before giving up */
+    int32_t rh, rl;
+    struct random_data rbuf;
+#endif
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4R_GENERATE_RANDOM_ADDR);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4R_GENERATE_RANDOM_ADDR);
+
+#ifndef USE_SYM_HEAP
+    map_pointer = -1ULL;        /* symmetric heap support compiled out: always report failure */
+    goto fn_exit;
+#else
+
+    /* rbuf must be zero-cleared otherwise it results in SIGSEGV in glibc
+     * (http://stackoverflow.com/questions/4167034/c-initstate-r-crashing) */
+    memset(&rbuf, 0, sizeof(rbuf));
+    /* Seed a private, reentrant PRNG state from the microsecond part of the
+     * current time. */
+    gettimeofday(&ts, NULL);
+
+    initstate_r(ts.tv_usec, random_state, sizeof(random_state), &rbuf);
+    random_r(&rbuf, &rh);
+    random_r(&rbuf, &rl);
+    random_unsigned = ((uint64_t) rh) << 32 | (uint64_t) rl;      /* two 32-bit draws -> one 64-bit value */
+    map_pointer = MPIDI_CH4I_MAP_POINTER;
+    /* Redraw while some page of the candidate range is already mapped
+     * (check_maprange_ok returns 0 in that case). */
+    while (MPIDI_CH4R_check_maprange_ok((void *) map_pointer, mapsize) == 0) {
+        random_r(&rbuf, &rh);
+        random_r(&rbuf, &rl);
+        random_unsigned = ((uint64_t) rh) << 32 | (uint64_t) rl;
+        map_pointer = MPIDI_CH4I_MAP_POINTER;
+        iter--;
+
+        if (iter == 0) {        /* retry budget exhausted; the last candidate is discarded unchecked */
+            map_pointer = -1ULL;
+            goto fn_exit;
+        }
+    }
+
+#endif
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4R_GENERATE_RANDOM_ADDR);
+    return (void *) map_pointer;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_get_symmetric_heap
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static inline int MPIDI_CH4R_get_symmetric_heap(MPI_Aint size,
+                                                MPIR_Comm * comm, void **base, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int iter = 100;             /* retry budget; reaching 0 selects the MPL_malloc fallback */
+    void *baseP;
+    size_t mapsize;             /* assigned only when USE_SYM_HEAP is defined */
+#ifdef USE_SYM_HEAP
+    unsigned test, result;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    size_t page_sz;
+#endif
+    /* Collectively try to mmap 'size' bytes (rounded up to whole pages) at
+     * the same virtual address on every rank of 'comm'.  If no common
+     * address is found within the retry budget, or USE_SYM_HEAP is not
+     * defined, fall back to MPL_malloc(size).  *base receives the buffer and
+     * the window's mmap_sz / mmap_addr fields record which path was taken.
+     * Returns MPI_SUCCESS or an MPI error code. */
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4R_GET_SYMMETRIC_HEAP);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4R_GET_SYMMETRIC_HEAP);
+
+#ifndef USE_SYM_HEAP
+    iter = 0;                   /* forces the MPL_malloc branch below; mapsize is never read on that path */
+#else
+
+    mapsize = MPIDI_CH4R_get_mapsize(size, &page_sz);
+    /* Find the largest requested size and the rank that asked for it; that
+     * rank proposes candidate addresses.
+     * NOTE(review): the pair is reduced as MPI_LONG_INT although sz is a
+     * uint64_t - this assumes long is 64 bits wide; confirm. */
+    struct {
+        uint64_t sz;
+        int loc;
+    } maxloc, maxloc_result;
+
+    maxloc.sz = size;
+    maxloc.loc = comm->rank;
+    mpi_errno = MPIR_Allreduce_impl(&maxloc,
+                                    &maxloc_result, 1, MPI_LONG_INT, MPI_MAXLOC, comm, &errflag);
+
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+    if (maxloc_result.sz > 0) {
+        result = 0;
+        /* One pass = owner proposes and maps an address, broadcasts it, the
+         * other ranks try to map the same range, then all ranks vote. */
+        while (!result && --iter != 0) {
+            uintptr_t map_pointer = 0ULL;
+
+            baseP = (void *) -1ULL;
+
+            if (comm->rank == maxloc_result.loc) {
+                map_pointer = (uintptr_t) MPIDI_CH4R_generate_random_addr(mapsize);
+                baseP = mmap((void *) map_pointer,
+                             mapsize,
+                             PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
+            }
+
+            mpi_errno = MPIR_Bcast_impl(&map_pointer,
+                                        1, MPI_UNSIGNED_LONG, maxloc_result.loc, comm, &errflag);
+
+            if (mpi_errno != MPI_SUCCESS)
+                goto fn_fail;
+            /* Non-owner ranks map the range only after checking that it is
+             * currently unmapped, because MAP_FIXED is used. */
+            if (comm->rank != maxloc_result.loc) {
+                int rc = MPIDI_CH4R_check_maprange_ok((void *) map_pointer, mapsize);
+
+                if (rc) {
+                    baseP = mmap((void *) map_pointer,
+                                 mapsize,
+                                 PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
+                }
+                else
+                    baseP = (void *) -1ULL;
+            }
+            /* A rank whose own request rounds to zero bytes simply adopts the
+             * proposed address - presumably so that it does not veto the
+             * attempt; confirm. */
+            if (mapsize == 0)
+                baseP = (void *) map_pointer;
+            /* Vote: the bitwise AND is 1 only if every rank got an address. */
+            test = ((uintptr_t) baseP != -1ULL) ? 1 : 0;
+            mpi_errno = MPIR_Allreduce_impl(&test,
+                                            &result, 1, MPI_UNSIGNED, MPI_BAND, comm, &errflag);
+
+            if (mpi_errno != MPI_SUCCESS)
+                goto fn_fail;
+            /* Some rank failed: release our own mapping before retrying. */
+            if (result == 0 && baseP != (void *) -1ULL)
+                munmap(baseP, mapsize);
+        }
+    }
+    else
+        baseP = NULL;           /* every rank asked for zero bytes */
+#endif
+    /* iter == 0: retries exhausted, or symmetric heap support compiled out. */
+    if (iter == 0) {
+        MPL_DBG_MSG(MPIDI_CH4_DBG_GENERAL, VERBOSE,
+                    "WARNING: Win_allocate:  Unable to allocate symmetric heap\n");
+        baseP = MPL_malloc(size);
+        MPIR_ERR_CHKANDJUMP((baseP == NULL), mpi_errno, MPI_ERR_BUFFER, "**bufnull");
+        MPIDI_CH4U_WIN(win, mmap_sz) = -1ULL;  /* all-ones size with a NULL address: no mmap region backs this window */
+        MPIDI_CH4U_WIN(win, mmap_addr) = NULL;
+    }
+    else {
+        MPIDI_CH4U_WIN(win, mmap_sz) = mapsize;
+        MPIDI_CH4U_WIN(win, mmap_addr) = baseP;
+    }
+
+    *base = baseP;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4R_GET_SYMMETRIC_HEAP);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#endif /* CH4R_SYMHEAP_H_INCLUDED */
diff --git a/src/mpid/ch4/src/ch4r_win.h b/src/mpid/ch4/src/ch4r_win.h
new file mode 100644
index 0000000..840f890
--- /dev/null
+++ b/src/mpid/ch4/src/ch4r_win.h
@@ -0,0 +1,1250 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+#ifndef CH4R_WIN_H_INCLUDED
+#define CH4R_WIN_H_INCLUDED
+
+#include "ch4_impl.h"
+#include "ch4i_util.h"
+#include <opa_primitives.h>
+#include "mpir_info.h"
+#include "mpl_uthash.h"
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif /* HAVE_SYS_MMAN_H */
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_set_info
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Apply the RMA hints stored in an info object to a window.
+ *
+ * The info list is walked starting at info->next.  Recognized keys:
+ *   "no_locks"            - "true" sets the flag, any other value clears it
+ *   "accumulate_ordering" - comma separated list of rar/raw/war/waw
+ *   "accumulate_ops"      - "same_op" selects MPIDI_CH4I_ACCU_SAME_OP
+ * All other keys are ignored.  The routine finishes with a barrier on the
+ * window's communicator.
+ *
+ * Returns MPI_SUCCESS, an MPI_ERR_ARG class error when an ordering token is
+ * not recognized (the window's ordering is left untouched in that case), or
+ * the result of the barrier.
+ */
+static inline int MPIDI_CH4R_win_set_info(MPIR_Win * win, MPIR_Info * info)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    /* Use the same state name for DECL/ENTER/EXIT. */
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_SET_INFO);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_SET_INFO);
+
+    MPIR_Info *curr_ptr;
+
+    curr_ptr = info->next;
+
+    while (curr_ptr) {
+        if (!strcmp(curr_ptr->key, "no_locks")) {
+            if (!strcmp(curr_ptr->value, "true"))
+                MPIDI_CH4U_WIN(win, info_args).no_locks = 1;
+            else
+                MPIDI_CH4U_WIN(win, info_args).no_locks = 0;
+        }
+        else if (!strcmp(curr_ptr->key, "accumulate_ordering")) {
+            /* Scan the value without modifying it: the string belongs to the
+             * info object and must stay intact for later queries. */
+            const char *cursor = curr_ptr->value;
+            unsigned int ordering = 0;
+
+            while (*cursor != '\0') {
+                size_t token_len = strcspn(cursor, ",");
+
+                /* Empty tokens (e.g. ",,") are skipped.  A token is accepted
+                 * when its first three characters name an ordering. */
+                if (token_len > 0) {
+                    if (token_len >= 3 && !strncmp(cursor, "rar", 3))
+                        ordering |= MPIDI_CH4I_ACCU_ORDER_RAR;
+                    else if (token_len >= 3 && !strncmp(cursor, "raw", 3))
+                        ordering |= MPIDI_CH4I_ACCU_ORDER_RAW;
+                    else if (token_len >= 3 && !strncmp(cursor, "war", 3))
+                        ordering |= MPIDI_CH4I_ACCU_ORDER_WAR;
+                    else if (token_len >= 3 && !strncmp(cursor, "waw", 3))
+                        ordering |= MPIDI_CH4I_ACCU_ORDER_WAW;
+                    else
+                        MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_ARG, goto fn_fail, "**info");
+                }
+
+                cursor += token_len;
+                if (*cursor == ',')
+                    cursor++;
+            }
+
+            /* An empty list keeps whatever ordering the window already had. */
+            if (ordering != 0)
+                MPIDI_CH4U_WIN(win, info_args).accumulate_ordering = ordering;
+        }
+        else if (!strcmp(curr_ptr->key, "accumulate_ops")) {
+            /* the default setting is MPIDI_ACCU_SAME_OP_NO_OP */
+            if (!strcmp(curr_ptr->value, "same_op"))
+                MPIDI_CH4U_WIN(win, info_args).accumulate_ops = MPIDI_CH4I_ACCU_SAME_OP;
+        }
+
+        curr_ptr = curr_ptr->next;
+    }
+
+    mpi_errno = MPIR_Barrier_impl(win->comm_ptr, &errflag);
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_SET_INFO);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_init
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Allocate a window object and fill in the fields common to all flavors.
+ *
+ * On success *win_ptr holds the new window: it references comm_ptr (a
+ * reference is added), carries the default info hints overridden by any
+ * hints found in `info`, has a freshly generated window id and is registered
+ * in MPIDI_CH4_Global.win_hash.
+ *
+ * Returns MPI_SUCCESS, or an MPI_ERR_NO_MEM class error when the handle
+ * allocation fails.  A failure while applying `info` trips an assertion.
+ */
+static inline int MPIDI_CH4R_win_init(MPI_Aint length,
+                                      int disp_unit,
+                                      MPIR_Win ** win_ptr,
+                                      MPIR_Info * info,
+                                      MPIR_Comm * comm_ptr, int create_flavor, int model)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_Win *win = (MPIR_Win *) MPIR_Handle_obj_alloc(&MPIR_Win_mem);
+    MPIR_ERR_CHKANDSTMT(win == NULL, mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+    *win_ptr = win;
+
+    memset(&win->dev.ch4u, 0, sizeof(MPIDI_CH4U_win_t));
+    win->comm_ptr = comm_ptr;
+    MPIR_Comm_add_ref(comm_ptr);
+
+    win->errhandler = NULL;
+    win->base = NULL;
+    win->size = length;
+    win->disp_unit = disp_unit;
+    win->create_flavor = (MPIR_Win_flavor_t) create_flavor;
+    win->model = (MPIR_Win_model_t) model;
+    win->copyCreateFlavor = (MPIR_Win_flavor_t) 0;
+    win->copyModel = (MPIR_Win_model_t) 0;
+    win->attributes = NULL;
+    win->copyDispUnit = 0;
+    win->copySize = 0;
+    MPIDI_CH4U_WIN(win, shared_table) = NULL;
+
+    /* Initialize the info (hint) flags per window.  The defaults must be in
+     * place BEFORE the user's info object is applied, otherwise they would
+     * overwrite the hints that were just parsed. */
+    MPIDI_CH4U_WIN(win, info_args).no_locks = 0;
+    MPIDI_CH4U_WIN(win, info_args).accumulate_ordering = (MPIDI_CH4I_ACCU_ORDER_RAR |
+                                                          MPIDI_CH4I_ACCU_ORDER_RAW |
+                                                          MPIDI_CH4I_ACCU_ORDER_WAR |
+                                                          MPIDI_CH4I_ACCU_ORDER_WAW);
+    MPIDI_CH4U_WIN(win, info_args).accumulate_ops = MPIDI_CH4I_ACCU_SAME_OP_NO_OP;
+    MPIDI_CH4U_WIN(win, info_args).same_size = 0;
+    MPIDI_CH4U_WIN(win, info_args).alloc_shared_noncontig = 0;
+    MPIDI_CH4U_WIN(win, mmap_sz) = 0;
+    MPIDI_CH4U_WIN(win, mmap_addr) = NULL;
+
+    /* Override the defaults with the caller's hints, if any. */
+    if ((info != NULL) && ((int *) info != (int *) MPI_INFO_NULL)) {
+        mpi_errno = MPIDI_CH4R_win_set_info(win, info);
+        MPIR_Assert(mpi_errno == 0);
+    }
+
+    MPIDI_CH4U_WIN(win, win_id) = MPIDI_CH4U_generate_win_id(comm_ptr);
+    MPL_HASH_ADD(dev.ch4u.hash_handle, MPIDI_CH4_Global.win_hash,
+                 dev.ch4u.win_id, sizeof(uint64_t), win);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4I_fill_ranks_in_win_grp
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Translate every rank of group_ptr into the corresponding rank of the group
+ * of the window's communicator.
+ *
+ * ranks_in_win_grp must have room for group_ptr->size ints; entry i receives
+ * the translated rank of group rank i.
+ *
+ * Returns MPI_SUCCESS or the error reported by the group routines.  The
+ * temporary rank array is released on every path.
+ */
+static inline int MPIDI_CH4I_fill_ranks_in_win_grp(MPIR_Win * win_ptr, MPIR_Group * group_ptr,
+                                                   int *ranks_in_win_grp)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int i, *ranks_in_grp = NULL;
+    MPIR_Group *win_grp_ptr;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_FILL_RANKS_IN_WIN_GRP);
+    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_CH4I_FILL_RANKS_IN_WIN_GRP);
+
+    /* Identity list 0..size-1: the ranks to translate. */
+    ranks_in_grp = (int *) MPL_malloc(group_ptr->size * sizeof(int));
+    MPIR_Assert(ranks_in_grp);
+    for (i = 0; i < group_ptr->size; i++)
+        ranks_in_grp[i] = i;
+
+    mpi_errno = MPIR_Comm_group_impl(win_ptr->comm_ptr, &win_grp_ptr);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIR_Group_translate_ranks_impl(group_ptr, group_ptr->size,
+                                                ranks_in_grp, win_grp_ptr, ranks_in_win_grp);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIR_Group_free_impl(win_grp_ptr);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_POP(mpi_errno);
+
+  fn_exit:
+    /* Released here so that the error paths do not leak the buffer. */
+    if (ranks_in_grp)
+        MPL_free(ranks_in_grp);
+    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_CH4I_FILL_RANKS_IN_WIN_GRP);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4I_progress_win_fence
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Run the progress engine until the window's outstanding_ops counter reads
+ * zero.  Progress is invoked at least once, even when nothing is pending.
+ */
+static inline int MPIDI_CH4I_progress_win_fence(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4_PROGRESS_WIN_FENCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4_PROGRESS_WIN_FENCE);
+
+    for (;;) {
+        MPIDI_CH4R_PROGRESS();
+
+        /* Counter is sampled only after a progress pass. */
+        if (OPA_load_int(&MPIDI_CH4U_WIN(win, outstanding_ops)) == 0)
+            break;
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4_PROGRESS_WIN_FENCE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_start
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Open an access epoch (start/complete synchronization) towards `group`.
+ *
+ * Progress is driven until the window's post counter equals group->size; the
+ * counter is then reset, the group is recorded in the window (with an added
+ * reference) and the origin epoch type becomes MPIDI_CH4U_EPOTYPE_START.
+ *
+ * Returns MPI_SUCCESS, an error from the epoch check, or an MPI_ERR_GROUP
+ * class error when a start group is already recorded on the window.
+ * `assert` is not used.
+ */
+static inline int MPIDI_CH4R_win_start(MPIR_Group * group, int assert, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_WIN_START);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_WIN_START);
+
+    MPIDI_CH4U_EPOCH_CHECK_TYPE(win, mpi_errno, goto fn_fail);
+
+    /* Reject a nested start before touching any state, so that a failure
+     * neither consumes the post counter nor leaks a group reference. */
+    MPIR_ERR_CHKANDJUMP((MPIDI_CH4U_WIN(win, sync).sc.group != NULL),
+                        mpi_errno, MPI_ERR_GROUP, "**group");
+
+    MPIDI_CH4R_PROGRESS_WHILE(group->size != (int) MPIDI_CH4U_WIN(win, sync).pw.count);
+    MPIDI_CH4U_WIN(win, sync).pw.count = 0;
+
+    /* Take the reference only once the group is actually stored. */
+    MPIR_Group_add_ref(group);
+    MPIDI_CH4U_WIN(win, sync).sc.group = group;
+    MPIDI_CH4U_WIN(win, sync).origin_epoch_type = MPIDI_CH4U_EPOTYPE_START;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_WIN_START);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_complete
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Close the access epoch opened by MPIDI_CH4R_win_start.
+ *
+ * Waits for the window's outstanding operations, then injects a
+ * MPIDI_CH4U_WIN_COMPLETE control message to every member of the start
+ * group, releases that group and clears it from the window.
+ *
+ * Returns MPI_SUCCESS, an error from the epoch check / progress / rank
+ * translation, or an MPI_ERR_RMA_SYNC class error when an injection fails.
+ * The temporary rank array is released on every path.
+ */
+static inline int MPIDI_CH4R_win_complete(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_CH4U_win_cntrl_msg_t msg;
+    int index, peer;
+    MPIR_Group *group;
+    int *ranks_in_win_grp = NULL;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_WIN_COMPLETE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_WIN_COMPLETE);
+
+    MPIDI_CH4U_EPOCH_START_CHECK2(win, mpi_errno, goto fn_fail);
+
+    mpi_errno = MPIDI_CH4I_progress_win_fence(win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    group = MPIDI_CH4U_WIN(win, sync).sc.group;
+    MPIR_Assert(group != NULL);
+
+    msg.win_id = MPIDI_CH4U_WIN(win, win_id);
+    msg.origin_rank = win->comm_ptr->rank;
+    msg.type = MPIDI_CH4U_WIN_COMPLETE;
+
+    ranks_in_win_grp = (int *) MPL_malloc(sizeof(int) * group->size);
+    MPIR_Assert(ranks_in_win_grp);
+
+    mpi_errno = MPIDI_CH4I_fill_ranks_in_win_grp(win, group, ranks_in_win_grp);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_POP(mpi_errno);
+
+    for (index = 0; index < group->size; ++index) {
+        peer = ranks_in_win_grp[index];
+        mpi_errno = MPIDI_NM_inject_am_hdr(peer, win->comm_ptr,
+                                           MPIDI_CH4U_WIN_CTRL, &msg, sizeof(msg), NULL);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+    }
+
+    MPIDI_CH4U_EPOCH_TARGET_EVENT(win);
+    MPIR_Group_release(MPIDI_CH4U_WIN(win, sync).sc.group);
+    MPIDI_CH4U_WIN(win, sync).sc.group = NULL;
+
+  fn_exit:
+    /* Released here so that the error paths do not leak the buffer. */
+    if (ranks_in_win_grp)
+        MPL_free(ranks_in_win_grp);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_WIN_COMPLETE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_post
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Open an exposure epoch (post/wait synchronization) for `group`.
+ *
+ * Records the group on the window (adding a reference), injects a
+ * MPIDI_CH4U_WIN_POST control message to every member of the group and sets
+ * the target epoch type to MPIDI_CH4U_EPOTYPE_POST.
+ *
+ * Returns MPI_SUCCESS, an error from the epoch check / rank translation, an
+ * MPI_ERR_GROUP class error when a post group is already recorded, or an
+ * MPI_ERR_RMA_SYNC class error when an injection fails.  `assert` is not
+ * used.
+ */
+static inline int MPIDI_CH4R_win_post(MPIR_Group * group, int assert, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_CH4U_win_cntrl_msg_t msg;
+    int index, peer;
+    int *ranks_in_win_grp = NULL;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4U_WIN_POST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4U_WIN_POST);
+
+    MPIDI_CH4U_EPOCH_POST_CHECK(win, mpi_errno, goto fn_fail);
+
+    /* Validate before the group is dereferenced or referenced, so that a
+     * rejected call does not leak a group reference. */
+    MPIR_Assert(group != NULL);
+    MPIR_ERR_CHKANDJUMP((MPIDI_CH4U_WIN(win, sync).pw.group != NULL),
+                        mpi_errno, MPI_ERR_GROUP, "**group");
+
+    MPIR_Group_add_ref(group);
+    MPIDI_CH4U_WIN(win, sync).pw.group = group;
+
+    msg.win_id = MPIDI_CH4U_WIN(win, win_id);
+    msg.origin_rank = win->comm_ptr->rank;
+    msg.type = MPIDI_CH4U_WIN_POST;
+
+    ranks_in_win_grp = (int *) MPL_malloc(sizeof(int) * group->size);
+    MPIR_Assert(ranks_in_win_grp);
+
+    mpi_errno = MPIDI_CH4I_fill_ranks_in_win_grp(win, group, ranks_in_win_grp);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_POP(mpi_errno);
+
+    for (index = 0; index < group->size; ++index) {
+        peer = ranks_in_win_grp[index];
+        mpi_errno = MPIDI_NM_inject_am_hdr(peer, win->comm_ptr,
+                                           MPIDI_CH4U_WIN_CTRL, &msg, sizeof(msg), NULL);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+    }
+
+    MPIDI_CH4U_WIN(win, sync).target_epoch_type = MPIDI_CH4U_EPOTYPE_POST;
+  fn_exit:
+    /* Released here so that the error paths do not leak the buffer. */
+    if (ranks_in_win_grp)
+        MPL_free(ranks_in_win_grp);
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4U_WIN_POST);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_wait
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Close the exposure epoch opened by MPIDI_CH4R_win_post: drive progress
+ * until the window's complete counter equals the size of the posted group,
+ * then reset the counter, drop the group from the window and release it.
+ */
+static inline int MPIDI_CH4R_win_wait(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Group *posted;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_WAIT);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_WAIT);
+
+    MPIDI_CH4U_EPOCH_TARGET_CHECK(win, MPIDI_CH4U_EPOTYPE_POST, mpi_errno, goto fn_fail);
+
+    posted = MPIDI_CH4U_WIN(win, sync).pw.group;
+
+    /* Block until every member of the posted group has reported in. */
+    MPIDI_CH4R_PROGRESS_WHILE((int) MPIDI_CH4U_WIN(win, sync).sc.count != posted->size);
+
+    MPIDI_CH4U_WIN(win, sync).pw.group = NULL;
+    MPIDI_CH4U_WIN(win, sync).sc.count = 0;
+    MPIR_Group_release(posted);
+    MPIDI_CH4U_EPOCH_ORIGIN_EVENT(win);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_WAIT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_test
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Non-blocking counterpart of MPIDI_CH4R_win_wait.
+ *
+ * *flag is set to 1 (and the exposure epoch is closed) when the complete
+ * counter already equals the size of the posted group; otherwise one
+ * progress pass is made and *flag is set to 0.
+ */
+static inline int MPIDI_CH4R_win_test(MPIR_Win * win, int *flag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Group *posted;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_TEST);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_TEST);
+
+    MPIDI_CH4U_EPOCH_TARGET_CHECK(win, MPIDI_CH4U_EPOTYPE_POST, mpi_errno, goto fn_fail);
+
+    posted = MPIDI_CH4U_WIN(win, sync).pw.group;
+
+    if ((int) MPIDI_CH4U_WIN(win, sync).sc.count != posted->size) {
+        /* Not everybody has completed yet: poke progress and report "no". */
+        MPIDI_CH4R_PROGRESS();
+        *flag = 0;
+    }
+    else {
+        MPIDI_CH4U_WIN(win, sync).sc.count = 0;
+        MPIDI_CH4U_WIN(win, sync).pw.group = NULL;
+        *flag = 1;
+        MPIR_Group_release(posted);
+        MPIDI_CH4U_EPOCH_ORIGIN_EVENT(win);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_TEST);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_lock
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Acquire a passive-target lock on `rank`.
+ *
+ * For a real target a MPIDI_CH4U_WIN_LOCK control message carrying lock_type
+ * is injected and progress is driven until the window's remote lock counter
+ * has grown by one.  For MPI_PROC_NULL nothing is sent.  In both cases the
+ * origin epoch type becomes MPIDI_CH4U_EPOTYPE_LOCK.  `assert` is not used.
+ */
+static inline int MPIDI_CH4R_win_lock(int lock_type, int rank, int assert, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    unsigned expected;
+    MPIDI_CH4U_win_cntrl_msg_t msg;
+    MPIDI_CH4U_win_sync_lock *lock_state;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_LOCK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_LOCK);
+
+    lock_state = &MPIDI_CH4U_WIN(win, sync).lock;
+
+    if (rank != MPI_PROC_NULL) {
+        MPIDI_CH4U_EPOCH_CHECK_TYPE(win, mpi_errno, goto fn_fail);
+
+        msg.type = MPIDI_CH4U_WIN_LOCK;
+        msg.lock_type = lock_type;
+        msg.origin_rank = win->comm_ptr->rank;
+        msg.win_id = MPIDI_CH4U_WIN(win, win_id);
+
+        /* Value the counter must reach once the lock has been granted. */
+        expected = lock_state->remote.locked + 1;
+
+        mpi_errno = MPIDI_NM_inject_am_hdr(rank, win->comm_ptr,
+                                           MPIDI_CH4U_WIN_CTRL, &msg, sizeof(msg), NULL);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+
+        MPIDI_CH4R_PROGRESS_WHILE(lock_state->remote.locked != expected);
+    }
+
+    MPIDI_CH4U_WIN(win, sync).origin_epoch_type = MPIDI_CH4U_EPOTYPE_LOCK;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_LOCK);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_unlock
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Release a passive-target lock on `rank`.
+ *
+ * For a real target the window's outstanding operations are completed, a
+ * MPIDI_CH4U_WIN_UNLOCK control message is injected and progress is driven
+ * until the remote lock counter has dropped by one.  For MPI_PROC_NULL
+ * nothing is sent.  When no remote lock remains, both epoch types are reset
+ * to MPIDI_CH4U_EPOTYPE_NONE.
+ *
+ * Returns MPI_SUCCESS, an error from the epoch check / progress, or an
+ * MPI_ERR_RMA_SYNC class error when the injection fails.
+ */
+static inline int MPIDI_CH4R_win_unlock(int rank, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    unsigned unlocked;
+    MPIDI_CH4U_win_cntrl_msg_t msg;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_UNLOCK);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_UNLOCK);
+    if (rank == MPI_PROC_NULL)
+        goto fn_exit0;
+
+    /* Leave through fn_fail (not a bare return) so that the function-exit
+     * hook below always pairs with the enter hook above. */
+    MPIDI_CH4U_EPOCH_ORIGIN_CHECK(win, MPIDI_CH4U_EPOTYPE_LOCK, mpi_errno, goto fn_fail);
+
+    mpi_errno = MPIDI_CH4I_progress_win_fence(win);
+    if (mpi_errno)
+        MPIR_ERR_POP(mpi_errno);
+
+    msg.win_id = MPIDI_CH4U_WIN(win, win_id);
+    msg.origin_rank = win->comm_ptr->rank;
+    msg.type = MPIDI_CH4U_WIN_UNLOCK;
+    /* Value the counter must reach once the unlock has been acknowledged. */
+    unlocked = MPIDI_CH4U_WIN(win, sync).lock.remote.locked - 1;
+
+    mpi_errno = MPIDI_NM_inject_am_hdr(rank, win->comm_ptr,
+                                       MPIDI_CH4U_WIN_CTRL, &msg, sizeof(msg), NULL);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+
+    MPIDI_CH4R_PROGRESS_WHILE(MPIDI_CH4U_WIN(win, sync).lock.remote.locked != unlocked);
+  fn_exit0:
+
+    if (!MPIDI_CH4U_WIN(win, sync).lock.remote.locked) {
+        MPIDI_CH4U_WIN(win, sync).origin_epoch_type = MPIDI_CH4U_EPOTYPE_NONE;
+        MPIDI_CH4U_WIN(win, sync).target_epoch_type = MPIDI_CH4U_EPOTYPE_NONE;
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_UNLOCK);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_get_info
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Build a new info object describing the hints currently set on a window.
+ *
+ * The object returned through info_p_p always contains "no_locks",
+ * "accumulate_ordering" and "accumulate_ops".  Shared-flavor windows also
+ * report "alloc_shared_noncontig"; allocate-flavor windows also report
+ * "same_size".
+ *
+ * Every step is guarded by an assertion; the value returned is the result of
+ * the last info-set call.
+ */
+static inline int MPIDI_CH4R_win_get_info(MPIR_Win * win, MPIR_Info ** info_p_p)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_GET_INFO);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_GET_INFO);
+
+    mpi_errno = MPIR_Info_alloc(info_p_p);
+    MPIR_Assert(mpi_errno == MPI_SUCCESS);
+
+    if (MPIDI_CH4U_WIN(win, info_args).no_locks)
+        mpi_errno = MPIR_Info_set_impl(*info_p_p, "no_locks", "true");
+    else
+        mpi_errno = MPIR_Info_set_impl(*info_p_p, "no_locks", "false");
+
+    MPIR_Assert(mpi_errno == MPI_SUCCESS);
+
+    {
+#define BUFSIZE 32
+        char buf[BUFSIZE];
+        int c = 0;
+
+        /* Append each enabled ordering, comma separated. */
+        if (MPIDI_CH4U_WIN(win, info_args).accumulate_ordering & MPIDI_CH4I_ACCU_ORDER_RAR)
+            c += snprintf(buf + c, BUFSIZE - c, "%srar", (c > 0) ? "," : "");
+
+        if (MPIDI_CH4U_WIN(win, info_args).accumulate_ordering & MPIDI_CH4I_ACCU_ORDER_RAW)
+            c += snprintf(buf + c, BUFSIZE - c, "%sraw", (c > 0) ? "," : "");
+
+        if (MPIDI_CH4U_WIN(win, info_args).accumulate_ordering & MPIDI_CH4I_ACCU_ORDER_WAR)
+            c += snprintf(buf + c, BUFSIZE - c, "%swar", (c > 0) ? "," : "");
+
+        if (MPIDI_CH4U_WIN(win, info_args).accumulate_ordering & MPIDI_CH4I_ACCU_ORDER_WAW)
+            c += snprintf(buf + c, BUFSIZE - c, "%swaw", (c > 0) ? "," : "");
+
+        if (c == 0) {
+            /* buf is handed to the info layer as a C string, so it must be
+             * NUL terminated; snprintf guarantees that. */
+            snprintf(buf, BUFSIZE, "%s", "not set   ");
+        }
+
+        /* Capture the result so that the assertion checks this call. */
+        mpi_errno = MPIR_Info_set_impl(*info_p_p, "accumulate_ordering", buf);
+        MPIR_Assert(mpi_errno == MPI_SUCCESS);
+#undef BUFSIZE
+    }
+
+    if (MPIDI_CH4U_WIN(win, info_args).accumulate_ops == MPIDI_CH4I_ACCU_SAME_OP)
+        mpi_errno = MPIR_Info_set_impl(*info_p_p, "accumulate_ops", "same_op");
+    else
+        mpi_errno = MPIR_Info_set_impl(*info_p_p, "accumulate_ops", "same_op_no_op");
+
+    MPIR_Assert(mpi_errno == MPI_SUCCESS);
+
+    if (win->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+        if (MPIDI_CH4U_WIN(win, info_args).alloc_shared_noncontig)
+            mpi_errno = MPIR_Info_set_impl(*info_p_p, "alloc_shared_noncontig", "true");
+        else
+            mpi_errno = MPIR_Info_set_impl(*info_p_p, "alloc_shared_noncontig", "false");
+
+        MPIR_Assert(mpi_errno == MPI_SUCCESS);
+    }
+    else if (win->create_flavor == MPI_WIN_FLAVOR_ALLOCATE) {
+        if (MPIDI_CH4U_WIN(win, info_args).same_size)
+            mpi_errno = MPIR_Info_set_impl(*info_p_p, "same_size", "true");
+        else
+            mpi_errno = MPIR_Info_set_impl(*info_p_p, "same_size", "false");
+
+        MPIR_Assert(mpi_errno == MPI_SUCCESS);
+    }
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_GET_INFO);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_finalize
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Release every resource owned by a window and free the window object.
+ *
+ * Frees or unmaps the window memory according to its flavor, frees the lock
+ * queue and the shared-window tables, removes the window from
+ * MPIDI_CH4_Global.win_hash, releases the communicator reference and returns
+ * the handle to MPIR_Win_mem.  Always returns MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4R_win_finalize(MPIR_Win ** win_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Win *win = *win_ptr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_FINALIZE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_FINALIZE);
+
+    if (win->create_flavor == MPI_WIN_FLAVOR_ALLOCATE && win->base) {
+        /* mmap_sz == -1 marks memory that came from MPL_malloc rather than
+         * mmap.  The sentinel is tested first so that the outcome does not
+         * depend on whether mmap_sz is a signed or an unsigned type
+         * (NOTE(review): its declaration is not visible here). */
+        if (MPIDI_CH4U_WIN(win, mmap_sz) == -1)
+            MPL_free(win->base);
+        else if (MPIDI_CH4U_WIN(win, mmap_sz) > 0)
+            munmap(MPIDI_CH4U_WIN(win, mmap_addr), MPIDI_CH4U_WIN(win, mmap_sz));
+    }
+
+    if (win->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+        if (MPIDI_CH4U_WIN(win, mmap_addr))
+            munmap(MPIDI_CH4U_WIN(win, mmap_addr), MPIDI_CH4U_WIN(win, mmap_sz));
+        MPL_free(MPIDI_CH4U_WIN(win, sizes));
+    }
+
+    if (MPIDI_CH4U_WIN(win, lockQ)) {
+        MPL_free(MPIDI_CH4U_WIN(win, lockQ));
+        MPIDI_CH4U_WIN(win, lockQ) = NULL;
+    }
+
+    MPL_HASH_DELETE(dev.ch4u.hash_handle, MPIDI_CH4_Global.win_hash, win);
+
+    if (win->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+        MPL_free(MPIDI_CH4U_WIN(win, shared_table));
+    }
+
+    MPIR_Comm_release(win->comm_ptr);
+    MPIR_Handle_obj_free(&MPIR_Win_mem, win);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_FINALIZE);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_free
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Collective window destruction: verify the epoch state, synchronize all
+ * processes of the window's communicator with a barrier and, when the
+ * barrier succeeded, tear the window down with MPIDI_CH4R_win_finalize.
+ */
+static inline int MPIDI_CH4R_win_free(MPIR_Win ** win_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    MPIR_Win *win = *win_ptr;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_FREE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_FREE);
+
+    MPIDI_CH4U_EPOCH_FREE_CHECK(win, mpi_errno, goto fn_fail);
+
+    mpi_errno = MPIR_Barrier_impl(win->comm_ptr, &errflag);
+
+    /* The window is only destroyed after a successful barrier. */
+    if (mpi_errno == MPI_SUCCESS)
+        MPIDI_CH4R_win_finalize(win_ptr);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_FREE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_fence
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Fence synchronization: complete the window's outstanding operations,
+ * record the fence epoch event and synchronize with a barrier on the
+ * window's communicator.
+ */
+static inline int MPIDI_CH4R_win_fence(int massert, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_FENCE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_FENCE);
+
+    MPIDI_CH4U_EPOCH_FENCE_CHECK(win, mpi_errno, goto fn_fail);
+
+    mpi_errno = MPIDI_CH4I_progress_win_fence(win);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIR_ERR_POP(mpi_errno);
+
+    MPIDI_CH4U_EPOCH_FENCE_EVENT(win, massert);
+
+    /*
+     * The barrier is unconditional, MPI_MODE_NOPRECEDE included.  RMA
+     * operations are no longer deferred until the synchronization call as
+     * they were in CH3, so without the barrier a sequence such as the
+     * following (cf. f77/rma/wingetf) would not work correctly:
+     *
+     * Rank 0                          Rank 1
+     * ----                            ----
+     * Store to local mem in window
+     * MPI_Win_fence(MODE_NOPRECEDE)   MPI_Win_fence(MODE_NOPRECEDE)
+     * MPI_Get(from rank 1)
+     */
+    mpi_errno = MPIR_Barrier_impl(win->comm_ptr, &errflag);
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_FENCE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_create
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Create a window over caller-provided memory: initialize a
+ * MPI_WIN_FLAVOR_CREATE / MPI_WIN_UNIFIED window, attach `base` to it and
+ * synchronize the communicator with a barrier.
+ */
+static inline int MPIDI_CH4R_win_create(void *base,
+                                        MPI_Aint length,
+                                        int disp_unit,
+                                        MPIR_Info * info, MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_CREATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_CREATE);
+
+    mpi_errno = MPIDI_CH4R_win_init(length, disp_unit, win_ptr, info, comm_ptr,
+                                    MPI_WIN_FLAVOR_CREATE, MPI_WIN_UNIFIED);
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+    /* The window simply exposes the user's buffer. */
+    (*win_ptr)->base = base;
+
+    mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_CREATE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_attach
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Attach memory to a window.  The only work done here is the flavor check:
+ * anything but a dynamic window yields an MPI_ERR_RMA_FLAVOR class error.
+ * `base` and `size` are not used.
+ */
+static inline int MPIDI_CH4R_win_attach(MPIR_Win * win, void *base, MPI_Aint size)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_ATTACH);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_ATTACH);
+
+    MPIR_ERR_CHKANDSTMT(!(win->create_flavor == MPI_WIN_FLAVOR_DYNAMIC),
+                        mpi_errno, MPI_ERR_RMA_FLAVOR, goto fn_fail, "**rmaflavor");
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_ATTACH);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_allocate_shared
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Collectively allocate a shared-memory window.
+ *
+ * Every process contributes `size` bytes.  The per-process sizes and
+ * displacement units are exchanged with an allgather; one POSIX shared
+ * memory object large enough for all contributions is created and mapped at
+ * the same address in every process (rank 0 picks the address and
+ * broadcasts it).  *base_ptr receives the start of the caller's slice, or
+ * NULL when `size` is 0.  When the total size is 0 nothing is mapped.
+ *
+ * Returns MPI_SUCCESS, an error from window initialization or a collective,
+ * or an MPI_ERR_NO_MEM class error when an allocation, shm_open, ftruncate
+ * or mmap fails.  The descriptor and the shm name are released on every
+ * path.
+ */
+static inline int MPIDI_CH4R_win_allocate_shared(MPI_Aint size,
+                                                 int disp_unit,
+                                                 MPIR_Info * info_ptr,
+                                                 MPIR_Comm * comm_ptr,
+                                                 void **base_ptr, MPIR_Win ** win_ptr)
+{
+    int i = 0, fd = -1, rc, first = 0, mpi_errno = MPI_SUCCESS;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    void *baseP = NULL;
+    MPIR_Win *win = NULL;
+    ssize_t total_size = 0LL;
+    MPI_Aint size_out = 0;
+    MPIDI_CH4U_win_shared_info_t *shared_table = NULL;
+    char shm_key[64];
+    void *map_ptr = NULL;
+    size_t page_sz, mapsize;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_ALLOCATE_SHARED);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_ALLOCATE_SHARED);
+
+    mpi_errno = MPIDI_CH4R_win_init(size, disp_unit, win_ptr, info_ptr, comm_ptr,
+                                    MPI_WIN_FLAVOR_SHARED, MPI_WIN_UNIFIED);
+    /* *win_ptr is only valid when initialization succeeded. */
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+    win = *win_ptr;
+    shared_table =
+        (MPIDI_CH4U_win_shared_info_t *) MPL_malloc(sizeof(MPIDI_CH4U_win_shared_info_t) *
+                                                    comm_ptr->local_size);
+    MPIR_ERR_CHKANDSTMT(shared_table == NULL, mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+    MPIDI_CH4U_WIN(win, shared_table) = shared_table;
+    shared_table[comm_ptr->rank].size = size;
+    shared_table[comm_ptr->rank].disp_unit = disp_unit;
+
+    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE,
+                                    0,
+                                    MPI_DATATYPE_NULL,
+                                    shared_table,
+                                    sizeof(MPIDI_CH4U_win_shared_info_t),
+                                    MPI_BYTE, comm_ptr, &errflag);
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+    /* No allreduce here because this is a shared memory domain
+     * and should be a relatively small number of processes
+     * and a non performance sensitive API.
+     */
+    for (i = 0; i < comm_ptr->local_size; i++)
+        total_size += shared_table[i].size;
+
+    if (total_size == 0)
+        goto fn_zero;
+
+    /* Bounded formatting: shm_key is a fixed-size buffer. */
+    snprintf(shm_key, sizeof(shm_key), "/mpi-%X-%" PRIx64, MPIDI_CH4_Global.jobid,
+             MPIDI_CH4U_WIN(win, win_id));
+
+    /* Whoever manages the exclusive create is "first" and is responsible
+     * for unlinking the name afterwards; everybody else just opens it. */
+    fd = shm_open(shm_key, O_CREAT | O_EXCL | O_RDWR, 0600);
+    first = (fd != -1);
+
+    if (!first) {
+        fd = shm_open(shm_key, O_RDWR, 0);
+
+        if (fd == -1) {
+            shm_unlink(shm_key);
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+        }
+    }
+
+    /* Make the addresses symmetric by using MAP_FIXED */
+    mapsize = MPIDI_CH4R_get_mapsize(total_size, &page_sz);
+    rc = ftruncate(fd, mapsize);
+
+    if (rc == -1)
+        MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+
+    if (comm_ptr->rank == 0) {
+        map_ptr = MPIDI_CH4R_generate_random_addr(mapsize);
+        map_ptr = mmap(map_ptr, mapsize, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0);
+
+        if (map_ptr == NULL || map_ptr == MAP_FAILED)
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+
+        mpi_errno = MPIR_Bcast_impl(&map_ptr, 1, MPI_UNSIGNED_LONG, 0, comm_ptr, &errflag);
+
+        if (mpi_errno != MPI_SUCCESS)
+            goto fn_fail;
+
+        MPIDI_CH4U_WIN(win, mmap_addr) = map_ptr;
+        MPIDI_CH4U_WIN(win, mmap_sz) = mapsize;
+    }
+    else {
+        mpi_errno = MPIR_Bcast_impl(&map_ptr, 1, MPI_UNSIGNED_LONG, 0, comm_ptr, &errflag);
+
+        if (mpi_errno != MPI_SUCCESS)
+            goto fn_fail;
+
+        rc = MPIDI_CH4R_check_maprange_ok(map_ptr, mapsize);
+        /* If we hit this assert, we need to iterate
+         * trying more addresses
+         */
+        MPIR_Assert(rc == 1);
+        map_ptr = mmap(map_ptr, mapsize, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0);
+
+        /* Only a successful mapping is recorded in the window, so that the
+         * teardown code never sees MAP_FAILED as a mapped address. */
+        if (map_ptr == NULL || map_ptr == MAP_FAILED)
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+
+        MPIDI_CH4U_WIN(win, mmap_addr) = map_ptr;
+        MPIDI_CH4U_WIN(win, mmap_sz) = mapsize;
+    }
+
+    /* Scan for my offset into the buffer             */
+    /* Could use exscan if this is expensive at scale */
+    for (i = 0; i < comm_ptr->rank; i++)
+        size_out += shared_table[i].size;
+
+  fn_zero:
+
+    baseP = (size == 0) ? NULL : (void *) ((char *) map_ptr + size_out);
+    win->base = baseP;
+    win->size = size;
+
+    *(void **) base_ptr = (void *) win->base;
+    mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);
+
+  fn_exit:
+    /* Shared by the success and the failure paths: the mapping (if any)
+     * stays valid after the descriptor is closed and the name unlinked. */
+    if (fd >= 0)
+        close(fd);
+
+    if (first)
+        shm_unlink(shm_key);
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_ALLOCATE_SHARED);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_detach
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Detach memory from a window.  The only work done here is the flavor
+ * check: anything but a dynamic window yields an MPI_ERR_RMA_FLAVOR class
+ * error.  `base` is not used.
+ */
+static inline int MPIDI_CH4R_win_detach(MPIR_Win * win, const void *base)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_DETACH);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_DETACH);
+
+    MPIR_ERR_CHKANDSTMT(!(win->create_flavor == MPI_WIN_FLAVOR_DYNAMIC),
+                        mpi_errno, MPI_ERR_RMA_FLAVOR, goto fn_fail, "**rmaflavor");
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_DETACH);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_shared_query
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * Report size, displacement unit and local address of the segment that
+ * `rank` contributed to a shared window.
+ *
+ * A negative rank (MPI_PROC_NULL) selects the lowest rank that contributed
+ * a non-empty segment, as the MPI standard requires; when every segment is
+ * empty, rank 0's entry is reported.  *baseptr (really a void **) is set to
+ * NULL for an empty segment, otherwise to the mapping address plus the sizes
+ * of all lower ranks.
+ *
+ * Always returns MPI_SUCCESS.
+ */
+static inline int MPIDI_CH4R_win_shared_query(MPIR_Win * win,
+                                              int rank,
+                                              MPI_Aint * size, int *disp_unit, void *baseptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    uintptr_t base = (uintptr_t) MPIDI_CH4U_WIN(win, mmap_addr);
+    int offset = rank, i;
+    MPIDI_CH4U_win_shared_info_t *shared_table = MPIDI_CH4U_WIN(win, shared_table);
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_SHARED_QUERY);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_SHARED_QUERY);
+
+    if (rank < 0) {
+        /* Default when nobody contributed memory. */
+        offset = 0;
+
+        /* The table has one entry per process of the window's communicator. */
+        for (i = 0; i < win->comm_ptr->local_size; i++) {
+            if (shared_table[i].size > 0) {
+                offset = i;
+                break;
+            }
+        }
+    }
+
+    *size = shared_table[offset].size;
+    *disp_unit = shared_table[offset].disp_unit;
+    if (*size > 0) {
+        /* Segments are laid out back to back in rank order. */
+        for (i = 0; i < offset; i++)
+            base += shared_table[i].size;
+        *(void **) baseptr = (void *) base;
+    }
+    else
+        *(void **) baseptr = NULL;
+
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_SHARED_QUERY);
+    return mpi_errno;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_allocate
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4R_win_allocate - create a window backed by memory obtained from
+ * MPIDI_CH4R_get_symmetric_heap.
+ *
+ * size, disp_unit, info, comm - forwarded to MPIDI_CH4R_win_init
+ * baseptr - [out] really a "void **": receives the window base address
+ * win_ptr - [out] receives the newly initialised window
+ *
+ * The window is initialised with flavor MPI_WIN_FLAVOR_ALLOCATE and model
+ * MPI_WIN_UNIFIED, and the call ends with a barrier on comm.  Returns
+ * MPI_SUCCESS or the first error code produced by one of the three steps.
+ */
+static inline int MPIDI_CH4R_win_allocate(MPI_Aint size,
+                                          int disp_unit,
+                                          MPIR_Info * info,
+                                          MPIR_Comm * comm, void *baseptr, MPIR_Win ** win_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    void *heap_base;
+    MPIR_Win *new_win;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_ALLOCATE);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_ALLOCATE);
+
+    /* Step 1: generic window setup; step 2 (only if step 1 succeeded):
+     * obtain the backing memory. */
+    mpi_errno = MPIDI_CH4R_win_init(size, disp_unit, win_ptr, info, comm,
+                                    MPI_WIN_FLAVOR_ALLOCATE, MPI_WIN_UNIFIED);
+    if (mpi_errno == MPI_SUCCESS) {
+        mpi_errno = MPIDI_CH4R_get_symmetric_heap(size, comm, &heap_base, *win_ptr);
+    }
+    if (mpi_errno != MPI_SUCCESS) {
+        goto fn_fail;
+    }
+
+    /* Publish the base address both in the window and to the caller. */
+    new_win = *win_ptr;
+    new_win->base = heap_base;
+    *(void **) baseptr = (void *) new_win->base;
+
+    /* Step 3: barrier over the window's communicator before returning. */
+    mpi_errno = MPIR_Barrier_impl(comm, &errflag);
+    if (mpi_errno != MPI_SUCCESS) {
+        goto fn_fail;
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_ALLOCATE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_flush
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4R_win_flush - flush outstanding operations on a window.
+ *
+ * rank - target rank (not referenced: the whole window is progressed)
+ * win  - window to flush
+ *
+ * Runs the epoch lock check and then MPIDI_CH4I_progress_win_fence on the
+ * window.  Returns MPI_SUCCESS or the error code from either step.
+ */
+static inline int MPIDI_CH4R_win_flush(int rank, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_FLUSH);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_FLUSH);
+
+    /* The check macro is handed "goto fn_fail" as its failure statement
+     * (presumably taken when the window is not in a lock epoch). */
+    MPIDI_CH4U_EPOCH_LOCK_CHECK(win, mpi_errno, goto fn_fail);
+
+    mpi_errno = MPIDI_CH4I_progress_win_fence(win);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_FLUSH);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_flush_local_all
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4R_win_flush_local_all - flush operations issued on a window.
+ *
+ * win - window to flush
+ *
+ * Runs the epoch lock check and then MPIDI_CH4I_progress_win_fence on the
+ * window.  Returns MPI_SUCCESS or the error code from either step.
+ */
+static inline int MPIDI_CH4R_win_flush_local_all(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_FLUSH_LOCAL_ALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_FLUSH_LOCAL_ALL);
+
+    /* Bail out through fn_fail if the epoch check rejects the call. */
+    MPIDI_CH4U_EPOCH_LOCK_CHECK(win, mpi_errno, goto fn_fail);
+
+    mpi_errno = MPIDI_CH4I_progress_win_fence(win);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_FLUSH_LOCAL_ALL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_unlock_all
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4R_win_unlock_all - end a lock-all epoch on a window.
+ *
+ * win - window being unlocked; its per-peer lock queue must already exist
+ *
+ * After the origin-epoch check and a progress fence, an UNLOCKALL control
+ * message is injected to every rank of the window's communicator.  The call
+ * then keeps making progress while the window's remote allLocked counter is
+ * non-zero, and finally resets both origin and target epoch types to NONE.
+ *
+ * Returns MPI_SUCCESS, or an error code if the epoch check, the progress
+ * fence, or a message injection (class MPI_ERR_RMA_SYNC) fails.
+ */
+static inline int MPIDI_CH4R_win_unlock_all(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int target;
+    MPIDI_CH4U_win_lock_info *lock_info;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_UNLOCK_ALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_UNLOCK_ALL);
+
+    MPIDI_CH4U_EPOCH_ORIGIN_CHECK(win, MPIDI_CH4U_EPOTYPE_LOCK_ALL, mpi_errno, goto fn_exit);
+
+    mpi_errno = MPIDI_CH4I_progress_win_fence(win);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+    /* The queue is allocated by the matching lock_all call. */
+    MPIR_Assert(MPIDI_CH4U_WIN(win, lockQ) != NULL);
+    lock_info = (MPIDI_CH4U_win_lock_info *) MPIDI_CH4U_WIN(win, lockQ);
+
+    for (target = 0; target < win->comm_ptr->local_size; target++) {
+        MPIDI_CH4U_win_cntrl_msg_t ctrl;
+
+        /* Reset the bookkeeping slot for this peer. */
+        lock_info[target].win = win;
+        lock_info[target].peer = target;
+        lock_info[target].done = 0;
+
+        /* Build and send the unlock request. */
+        ctrl.type = MPIDI_CH4U_WIN_UNLOCKALL;
+        ctrl.origin_rank = win->comm_ptr->rank;
+        ctrl.win_id = MPIDI_CH4U_WIN(win, win_id);
+
+        mpi_errno = MPIDI_NM_inject_am_hdr(target, win->comm_ptr,
+                                           MPIDI_CH4U_WIN_CTRL, &ctrl, sizeof(ctrl), NULL);
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+        }
+
+        if (MPIDI_CH4U_WIN(win, sync).lock.remote.allLocked == 1) {
+            lock_info[target].done = 1;
+        }
+    }
+
+    /* Keep progressing until the remote lock count has dropped to zero. */
+    MPIDI_CH4R_PROGRESS_WHILE(MPIDI_CH4U_WIN(win, sync).lock.remote.allLocked);
+
+    /* The epoch is over on both the origin and the target side. */
+    MPIDI_CH4U_WIN(win, sync).origin_epoch_type = MPIDI_CH4U_EPOTYPE_NONE;
+    MPIDI_CH4U_WIN(win, sync).target_epoch_type = MPIDI_CH4U_EPOTYPE_NONE;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_UNLOCK_ALL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_create_dynamic
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4R_win_create_dynamic - create a dynamic window.
+ *
+ * info    - info object forwarded to MPIDI_CH4R_win_init
+ * comm    - communicator the window is created over
+ * win_ptr - [out] receives the newly initialised window
+ *
+ * The window is initialised with size 0, displacement unit 1, flavor
+ * MPI_WIN_FLAVOR_DYNAMIC and model MPI_WIN_UNIFIED; its base is set to
+ * MPI_BOTTOM and the call ends with a barrier on comm.
+ *
+ * Returns MPI_SUCCESS, or the error code from window initialisation or from
+ * the barrier.  An initialisation failure is reported to the caller and
+ * *win_ptr is not dereferenced in that case.
+ */
+static inline int MPIDI_CH4R_win_create_dynamic(MPIR_Info * info,
+                                                MPIR_Comm * comm, MPIR_Win ** win_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
+    MPIR_Win *win;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_CREATE_DYNAMIC);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_CREATE_DYNAMIC);
+
+    /* Store the result directly in mpi_errno so that a failure here is
+     * what the function returns. */
+    mpi_errno = MPIDI_CH4R_win_init(0, 1, win_ptr, info, comm,
+                                    MPI_WIN_FLAVOR_DYNAMIC, MPI_WIN_UNIFIED);
+
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+    /* Dynamic windows have no fixed backing memory: addresses are absolute. */
+    win = *win_ptr;
+    win->base = MPI_BOTTOM;
+
+    mpi_errno = MPIR_Barrier_impl(comm, &errflag);
+
+    if (mpi_errno != MPI_SUCCESS)
+        goto fn_fail;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_CREATE_DYNAMIC);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_flush_local
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4R_win_flush_local - flush operations issued on a window.
+ *
+ * rank - target rank (not referenced: the whole window is progressed)
+ * win  - window to flush
+ *
+ * Runs the epoch lock check and then MPIDI_CH4I_progress_win_fence on the
+ * window.  Returns MPI_SUCCESS or the error code from either step.
+ */
+static inline int MPIDI_CH4R_win_flush_local(int rank, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_FLUSH_LOCAL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_FLUSH_LOCAL);
+
+    /* Leave through fn_fail if the epoch check rejects the call. */
+    MPIDI_CH4U_EPOCH_LOCK_CHECK(win, mpi_errno, goto fn_fail);
+
+    mpi_errno = MPIDI_CH4I_progress_win_fence(win);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_FLUSH_LOCAL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_sync
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4R_win_sync - synchronise this process's view of a window.
+ *
+ * win - window to synchronise
+ *
+ * After the epoch lock check, issues OPA_read_write_barrier().  Returns
+ * MPI_SUCCESS, or the error code set by the epoch check.
+ */
+static inline int MPIDI_CH4R_win_sync(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_SYNC);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_SYNC);
+
+    /* The check macro is handed "goto fn_fail" as its failure statement. */
+    MPIDI_CH4U_EPOCH_LOCK_CHECK(win, mpi_errno, goto fn_fail);
+
+    /* No communication is involved: a memory barrier is all that is issued. */
+    OPA_read_write_barrier();
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_SYNC);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_flush_all
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4R_win_flush_all - flush outstanding operations on a window.
+ *
+ * win - window to flush
+ *
+ * Runs the epoch lock check and then MPIDI_CH4I_progress_win_fence on the
+ * window.  Returns MPI_SUCCESS or the error code from either step.
+ */
+static inline int MPIDI_CH4R_win_flush_all(MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_FLUSH_ALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_FLUSH_ALL);
+
+    /* Exit via fn_fail if the epoch check rejects the call. */
+    MPIDI_CH4U_EPOCH_LOCK_CHECK(win, mpi_errno, goto fn_fail);
+
+    mpi_errno = MPIDI_CH4I_progress_win_fence(win);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIR_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_FLUSH_ALL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH4R_win_lock_all
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+/*
+ * MPIDI_CH4R_win_lock_all - begin a lock-all epoch on a window.
+ *
+ * assert - assertion flags from the caller (not referenced here)
+ * win    - window to lock
+ *
+ * Lazily allocates the window's per-peer lock queue (one entry per rank of
+ * the window's communicator), injects a shared LOCKALL control message to
+ * every rank, and keeps making progress until the window's remote allLocked
+ * counter equals the communicator size.  On success the origin epoch type
+ * becomes MPIDI_CH4U_EPOTYPE_LOCK_ALL.
+ *
+ * Returns MPI_SUCCESS, or an error code if the epoch check fails, the lock
+ * queue cannot be allocated (class MPI_ERR_NO_MEM), or a message injection
+ * fails (class MPI_ERR_RMA_SYNC).
+ */
+static inline int MPIDI_CH4R_win_lock_all(int assert, MPIR_Win * win)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int size;
+    int i;
+    MPIDI_CH4U_win_lock_info *lockQ;
+
+    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CH4I_WIN_LOCK_ALL);
+    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CH4I_WIN_LOCK_ALL);
+
+    MPIDI_CH4U_EPOCH_CHECK_TYPE(win, mpi_errno, goto fn_fail);
+
+    size = win->comm_ptr->local_size;
+
+    if (!MPIDI_CH4U_WIN(win, lockQ)) {
+        MPIDI_CH4U_WIN(win, lockQ) =
+            (MPIDI_CH4U_win_lock_info *) MPL_calloc(size, sizeof(MPIDI_CH4U_win_lock_info));
+
+        /* Report allocation failure as an error instead of relying on an
+         * assertion (NOTE(review): assertions may be compiled out in
+         * optimized builds, which would turn this into a NULL dereference). */
+        if (MPIDI_CH4U_WIN(win, lockQ) == NULL) {
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
+        }
+    }
+
+    lockQ = (MPIDI_CH4U_win_lock_info *) MPIDI_CH4U_WIN(win, lockQ);
+
+    for (i = 0; i < size; i++) {
+        MPIDI_CH4U_win_cntrl_msg_t msg;
+        msg.win_id = MPIDI_CH4U_WIN(win, win_id);
+        msg.origin_rank = win->comm_ptr->rank;
+        msg.type = MPIDI_CH4U_WIN_LOCKALL;
+        msg.lock_type = MPI_LOCK_SHARED;
+
+        /* Reset the bookkeeping slot for this peer. */
+        lockQ[i].done = 0;
+        lockQ[i].peer = i;
+        lockQ[i].win = win;
+        lockQ[i].lock_type = MPI_LOCK_SHARED;
+
+        mpi_errno = MPIDI_NM_inject_am_hdr(i, win->comm_ptr,
+                                           MPIDI_CH4U_WIN_CTRL, &msg, sizeof(msg), NULL);
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
+        }
+
+        if (MPIDI_CH4U_WIN(win, sync).lock.remote.allLocked == 1)
+            lockQ[i].done = 1;
+    }
+
+    /* Wait until every rank has been counted in allLocked. */
+    MPIDI_CH4R_PROGRESS_WHILE(size != (int) MPIDI_CH4U_WIN(win, sync).lock.remote.allLocked);
+    MPIDI_CH4U_WIN(win, sync).origin_epoch_type = MPIDI_CH4U_EPOTYPE_LOCK_ALL;
+
+  fn_exit:
+    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CH4I_WIN_LOCK_ALL);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#endif /* CH4R_WIN_H_INCLUDED */
diff --git a/src/mpid/ch4/src/mpid_ch4_net_array.c.in b/src/mpid/ch4/src/mpid_ch4_net_array.c.in
new file mode 100644
index 0000000..8765f39
--- /dev/null
+++ b/src/mpid/ch4/src/mpid_ch4_net_array.c.in
@@ -0,0 +1,28 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#include <mpidimpl.h>
+
+/*
+ * Tables of the network modules selected at configure time.  Every @name@
+ * token below is replaced by configure when this template is instantiated.
+ */
+
+/* Function-table objects; each one is defined by its own network module. */
+extern MPIDI_NM_funcs_t @ch4_nets_func_decl@;
+extern MPIDI_NM_native_funcs_t @ch4_nets_native_func_decl@;
+
+/* Number of configured netmods and their names, in table order. */
+int MPIDI_num_netmods = @ch4_nets_array_sz@;
+char MPIDI_NM_strings[@ch4_nets_array_sz@][MPIDI_MAX_NETMOD_STRING_LEN] =
+    { @ch4_nets_strings@ };
+
+#ifdef NETMOD_DIRECT
+/* Direct (inlined) netmod build: the dispatch tables are left zero-filled. */
+MPIDI_NM_funcs_t *MPIDI_NM_funcs[@ch4_nets_array_sz@] = { 0 };
+MPIDI_NM_native_funcs_t *MPIDI_NM_native_funcs[@ch4_nets_array_sz@] = { 0 };
+#else
+/* Indirect build: one pointer per configured netmod. */
+MPIDI_NM_funcs_t *MPIDI_NM_funcs[@ch4_nets_array_sz@] = { @ch4_nets_func_array@ };
+MPIDI_NM_native_funcs_t *MPIDI_NM_native_funcs[@ch4_nets_array_sz@] =
+    { @ch4_nets_native_func_array@ };
+#endif
diff --git a/src/mpid/ch4/src/mpid_ch4_shm_array.c.in b/src/mpid/ch4/src/mpid_ch4_shm_array.c.in
new file mode 100644
index 0000000..6bce704
--- /dev/null
+++ b/src/mpid/ch4/src/mpid_ch4_shm_array.c.in
@@ -0,0 +1,32 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Portions of this code were written by Intel Corporation.
+ *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
+ *  to Argonne National Laboratory subject to Software Grant and Corporate
+ *  Contributor License Agreement dated February 8, 2012.
+ */
+
+#include <mpidimpl.h>
+
+#ifdef MPIDI_BUILD_CH4_SHM
+
+/*
+ * Tables of the shared memory modules selected at configure time.  Every
+ * @name@ token below is replaced by configure when this template is
+ * instantiated.
+ */
+
+/* Function-table objects; each one is defined by its own shared memory module. */
+extern MPIDI_SHM_funcs_t @ch4_shm_func_decl@;
+extern MPIDI_SHM_native_funcs_t @ch4_shm_native_func_decl@;
+
+/* Number of configured shm modules and their names, in table order. */
+int MPIDI_num_shms = @ch4_shm_array_sz@;
+char MPIDI_SHM_strings[@ch4_shm_array_sz@][MPIDI_MAX_SHM_STRING_LEN] =
+    { @ch4_shm_strings@ };
+
+#ifdef SHM_DIRECT
+/* Direct (inlined) shm build: the dispatch tables are left zero-filled. */
+MPIDI_SHM_funcs_t *MPIDI_SHM_funcs[@ch4_shm_array_sz@] = { 0 };
+MPIDI_SHM_native_funcs_t *MPIDI_SHM_native_funcs[@ch4_shm_array_sz@] = { 0 };
+#else
+/* Indirect build: one pointer per configured shm module. */
+MPIDI_SHM_funcs_t *MPIDI_SHM_funcs[@ch4_shm_array_sz@] = { @ch4_shm_func_array@ };
+MPIDI_SHM_native_funcs_t *MPIDI_SHM_native_funcs[@ch4_shm_array_sz@] =
+    { @ch4_shm_native_func_array@ };
+#endif
+
+#endif /* MPIDI_BUILD_CH4_SHM */
diff --git a/src/mpid/ch4/subconfigure.m4 b/src/mpid/ch4/subconfigure.m4
new file mode 100644
index 0000000..f4ca5c5
--- /dev/null
+++ b/src/mpid/ch4/subconfigure.m4
@@ -0,0 +1,391 @@
+[#] start of __file__
+dnl MPICH_SUBCFG_BEFORE=src/mpid/common/sched
+dnl MPICH_SUBCFG_BEFORE=src/mpid/common/datatype
+dnl MPICH_SUBCFG_BEFORE=src/mpid/common/thread
+
+dnl _PREREQ handles the former role of mpichprereq, setup_device, etc
+[#] expansion is: PAC_SUBCFG_PREREQ_[]PAC_SUBCFG_AUTO_SUFFIX
+AC_DEFUN([PAC_SUBCFG_PREREQ_]PAC_SUBCFG_AUTO_SUFFIX,[
+AM_CONDITIONAL([BUILD_CH4],[test "$device_name" = "ch4"])
+
+# the CH4 device depends on the common NBC scheduler, datatype and thread code
+build_mpid_common_sched=yes
+build_mpid_common_datatype=yes
+build_mpid_common_thread=yes
+
+MPID_MAX_THREAD_LEVEL=MPI_THREAD_MULTIPLE
+MPID_MAX_PROCESSOR_NAME=128
+MPID_MAX_ERROR_STRING=512
+
+AM_COND_IF([BUILD_CH4],[
+AC_MSG_NOTICE([RUNNING PREREQ FOR CH4 DEVICE])
+
+# $device_args - contains the netmods as a comma separated list.
+# Default to the ofi netmod when none was requested.
+if test -z "${device_args}" ; then
+    ch4_netmods="ofi"
+else
+    ch4_netmods=`echo ${device_args} | sed -e 's/,/ /g'`
+fi
+export ch4_netmods
+
+#
+# reset DEVICE so that it (a) always includes the channel name, and (b) does not include channel options
+#
+DEVICE="${device_name}:${ch4_netmods}"
+
+# Start every accumulator from a known-empty value so that a variable of the
+# same name inherited from the environment cannot leak into generated files.
+ch4_nets_array=""
+ch4_nets_func_decl=""
+ch4_nets_native_func_decl=""
+ch4_nets_func_array=""
+ch4_nets_native_func_array=""
+ch4_nets_strings=""
+ch4_netmod_pre_include=""
+ch4_netmod_amrequest_decl=""
+ch4_netmod_request_decl=""
+ch4_netmod_comm_decl=""
+ch4_netmod_dt_decl=""
+ch4_netmod_op_decl=""
+ch4_netmod_win_decl=""
+ch4_netmod_gpid_decl=""
+ch4_netmod_addr_decl=""
+net_index=0
+
+# For each requested netmod: check that its source directory exists and then
+# append one entry to each of the comma separated lists, the include list and
+# the per-object declaration lists that are substituted further below.
+for net in $ch4_netmods ; do
+    if test ! -d "$srcdir/src/mpid/ch4/netmod/${net}" ; then
+        AC_MSG_ERROR([Network module ${net} is unknown "$srcdir/src/mpid/ch4/netmod/${net}"])
+    fi
+
+    # upper-case form of the netmod name, used in macro and type names
+    net_upper=`echo ${net} | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'`
+    net_macro="MPIDI_CH4_${net_upper}"
+
+    if test -z "$ch4_nets_array" ; then
+        ch4_nets_array="$net_macro"
+    else
+        ch4_nets_array="$ch4_nets_array, $net_macro"
+    fi
+
+    if test -z "$ch4_nets_func_decl" ; then
+        ch4_nets_func_decl="MPIDI_NM_${net}_funcs"
+    else
+        ch4_nets_func_decl="${ch4_nets_func_decl}, MPIDI_NM_${net}_funcs"
+    fi
+
+    if test -z "$ch4_nets_native_func_decl" ; then
+        ch4_nets_native_func_decl="MPIDI_NM_native_${net}_funcs"
+    else
+        ch4_nets_native_func_decl="${ch4_nets_native_func_decl}, MPIDI_NM_native_${net}_funcs"
+    fi
+
+    if test -z "$ch4_nets_func_array" ; then
+        ch4_nets_func_array="&MPIDI_NM_${net}_funcs"
+    else
+        ch4_nets_func_array="${ch4_nets_func_array}, &MPIDI_NM_${net}_funcs"
+    fi
+
+    if test -z "$ch4_nets_native_func_array" ; then
+        ch4_nets_native_func_array="&MPIDI_NM_native_${net}_funcs"
+    else
+        ch4_nets_native_func_array="${ch4_nets_native_func_array}, &MPIDI_NM_native_${net}_funcs"
+    fi
+
+    if test -z "$ch4_nets_strings" ; then
+        ch4_nets_strings="\"$net\""
+    else
+        ch4_nets_strings="$ch4_nets_strings, \"$net\""
+    fi
+
+    # one include line per netmod, separated by newlines
+    if test -z "$ch4_netmod_pre_include" ; then
+        ch4_netmod_pre_include="#include \"../netmod/${net}/${net}_pre.h\""
+    else
+        ch4_netmod_pre_include="${ch4_netmod_pre_include}
+#include \"../netmod/${net}/${net}_pre.h\""
+    fi
+
+    # member declarations; entries are joined with a backslash-newline
+    if test -z "$ch4_netmod_amrequest_decl" ; then
+        ch4_netmod_amrequest_decl="MPIDI_${net_upper}_am_request_t ${net};"
+    else
+        ch4_netmod_amrequest_decl="${ch4_netmod_amrequest_decl} \\
+MPIDI_${net_upper}_am_request_t ${net};"
+    fi
+
+    if test -z "$ch4_netmod_request_decl" ; then
+        ch4_netmod_request_decl="MPIDI_${net_upper}_request_t ${net};"
+    else
+        ch4_netmod_request_decl="${ch4_netmod_request_decl} \\
+MPIDI_${net_upper}_request_t ${net};"
+    fi
+
+    if test -z "$ch4_netmod_comm_decl" ; then
+        ch4_netmod_comm_decl="MPIDI_${net_upper}_comm_t ${net};"
+    else
+        ch4_netmod_comm_decl="${ch4_netmod_comm_decl} \\
+MPIDI_${net_upper}_comm_t ${net};"
+    fi
+
+    if test -z "$ch4_netmod_dt_decl" ; then
+        ch4_netmod_dt_decl="MPIDI_${net_upper}_dt_t ${net};"
+    else
+        ch4_netmod_dt_decl="${ch4_netmod_dt_decl} \\
+MPIDI_${net_upper}_dt_t ${net};"
+    fi
+
+    if test -z "$ch4_netmod_op_decl" ; then
+        ch4_netmod_op_decl="MPIDI_${net_upper}_op_t ${net};"
+    else
+        ch4_netmod_op_decl="${ch4_netmod_op_decl} \\
+MPIDI_${net_upper}_op_t ${net};"
+    fi
+
+    if test -z "$ch4_netmod_win_decl" ; then
+        ch4_netmod_win_decl="MPIDI_${net_upper}_win_t ${net};"
+    else
+        ch4_netmod_win_decl="${ch4_netmod_win_decl} \\
+MPIDI_${net_upper}_win_t ${net};"
+    fi
+
+    if test -z "$ch4_netmod_gpid_decl" ; then
+        ch4_netmod_gpid_decl="MPIDI_${net_upper}_gpid_t ${net};"
+    else
+        ch4_netmod_gpid_decl="${ch4_netmod_gpid_decl} \\
+MPIDI_${net_upper}_gpid_t ${net};"
+    fi
+
+    if test -z "$ch4_netmod_addr_decl" ; then
+        ch4_netmod_addr_decl="MPIDI_${net_upper}_addr_t ${net};"
+    else
+        ch4_netmod_addr_decl="${ch4_netmod_addr_decl} \\
+MPIDI_${net_upper}_addr_t ${net};"
+    fi
+
+    net_index=`expr $net_index + 1`
+done
+ch4_nets_array_sz=$net_index
+
+AC_SUBST(device_name)
+AC_SUBST(ch4_netmods)
+AC_SUBST(ch4_nets_array)
+AC_SUBST(ch4_nets_array_sz)
+AC_SUBST(ch4_nets_func_decl)
+AC_SUBST(ch4_nets_native_func_decl)
+AC_SUBST(ch4_nets_func_array)
+AC_SUBST(ch4_nets_native_func_array)
+AC_SUBST(ch4_nets_strings)
+AC_SUBST(ch4_netmod_pre_include)
+AC_SUBST(ch4_netmod_amrequest_decl)
+AC_SUBST(ch4_netmod_request_decl)
+AC_SUBST(ch4_netmod_comm_decl)
+AC_SUBST(ch4_netmod_dt_decl)
+AC_SUBST(ch4_netmod_win_decl)
+AC_SUBST(ch4_netmod_gpid_decl)
+AC_SUBST(ch4_netmod_addr_decl)
+AC_SUBST(ch4_netmod_op_decl)
+AM_SUBST_NOTMAKE(ch4_netmod_pre_include)
+AM_SUBST_NOTMAKE(ch4_netmod_amrequest_decl)
+AM_SUBST_NOTMAKE(ch4_netmod_request_decl)
+AM_SUBST_NOTMAKE(ch4_netmod_comm_decl)
+AM_SUBST_NOTMAKE(ch4_netmod_dt_decl)
+AM_SUBST_NOTMAKE(ch4_netmod_win_decl)
+AM_SUBST_NOTMAKE(ch4_netmod_gpid_decl)
+AM_SUBST_NOTMAKE(ch4_netmod_addr_decl)
+AM_SUBST_NOTMAKE(ch4_netmod_op_decl)
+
+AC_ARG_ENABLE(ch4-netmod-direct,
+    [--enable-ch4-netmod-direct
+       Enables inlined netmod build when a single netmod is used
+       level:
+         yes       - Enabled (default)
+         no        - Disabled (may improve build times and code size)
+    ],,enable_ch4_netmod_direct=yes)
+
+
+# The direct build is only selected when exactly one netmod is configured.
+if test "$ch4_nets_array_sz" = "1" && test "$enable_ch4_netmod_direct" = "yes" ;  then
+   PAC_APPEND_FLAG([-DNETMOD_DIRECT=__netmod_direct_${ch4_netmods}__], [CPPFLAGS])
+fi
+
+
+AC_ARG_ENABLE(ch4-shm,
+    [--enable-ch4-shm=level:module
+       Control whether CH4 shared memory is built and/or used.
+       level:
+         no        - Do not build or use CH4 shared memory.
+         yes       - Build CH4 shared memory, but do not use it by default (Your chosen netmod must provide it).
+         exclusive - Build and exclusively use CH4 shared memory. (Default)
+       module-list(optional).  comma separated list of shared memory modules:
+         posix     - POSIX shared memory implementation
+    ],,enable_ch4_shm=exclusive:posix)
+
+AC_ARG_ENABLE(ch4-shm-direct,
+    [--enable-ch4-shm-direct
+       Enables inlined shared memory build when a single shared memory module is used
+       level:
+         yes       - Enabled (default)
+         no        - Disabled (may improve build times and code size)
+    ],,enable_ch4_shm_direct=yes)
+
+# Split "level:module-list" into the level and the (possibly empty) list.
+ch4_shm_level=`echo $enable_ch4_shm | sed -e 's/:.*$//'`
+changequote(<<,>>)
+ch4_shm=`echo $enable_ch4_shm | sed -e 's/^[^:]*//' -e 's/^://'`
+changequote([,])
+
+if test "$ch4_shm_level" != "no" ; then
+    AC_DEFINE([MPIDI_BUILD_CH4_SHM], [1],
+        [Define if CH4 will build the default shared memory implementation as opposed to only using a netmod implementation])
+fi
+
+if test "$ch4_shm_level" = "exclusive" ; then
+    # This variable is set only when the user wants CH4 to handle all shared memory operations
+    AC_DEFINE(MPIDI_CH4_EXCLUSIVE_SHM, 1, [Define if CH4 will be providing the exclusive implementation of shared memory])
+
+    # This variable can be set either when the user asks for CH4 exclusive shared memory
+    # or when the netmod doesn't want to implement its own locality information
+    AC_DEFINE(MPIDI_BUILD_CH4_LOCALITY_INFO, 1, [CH4 should build locality info])
+fi
+
+# $ch4_shm - contains the shm mods; default to posix unless shm is disabled
+if test -z "${ch4_shm}" ; then
+   if test "$ch4_shm_level" != "no" ; then
+      ch4_shm="posix"
+   fi
+else
+   ch4_shm=`echo ${ch4_shm} | sed -e 's/,/ /g'`
+fi
+export ch4_shm
+
+# Same accumulator reset as for the netmods above.
+ch4_shm_array=""
+ch4_shm_func_decl=""
+ch4_shm_native_func_decl=""
+ch4_shm_func_array=""
+ch4_shm_native_func_array=""
+ch4_shm_strings=""
+ch4_shm_pre_include=""
+ch4_shm_request_decl=""
+ch4_shm_comm_decl=""
+shm_index=0
+
+# For each requested shm module: check that its source directory exists and
+# append one entry to each list substituted further below.
+for shm in $ch4_shm ; do
+    if test ! -d "$srcdir/src/mpid/ch4/shm/${shm}" ; then
+        AC_MSG_ERROR([Shared memory module ${shm} is unknown "$srcdir/src/mpid/ch4/shm/${shm}"])
+    fi
+
+    # upper-case form of the module name, used in macro and type names
+    shm_upper=`echo ${shm} | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'`
+    shm_macro="MPIDI_SHM_${shm_upper}"
+
+    if test -z "$ch4_shm_array" ; then
+        ch4_shm_array="$shm_macro"
+    else
+        ch4_shm_array="$ch4_shm_array, $shm_macro"
+    fi
+
+    if test -z "$ch4_shm_func_decl" ; then
+        ch4_shm_func_decl="MPIDI_SHM_${shm}_funcs"
+    else
+        ch4_shm_func_decl="${ch4_shm_func_decl}, MPIDI_SHM_${shm}_funcs"
+    fi
+
+    if test -z "$ch4_shm_native_func_decl" ; then
+        ch4_shm_native_func_decl="MPIDI_SHM_native_${shm}_funcs"
+    else
+        ch4_shm_native_func_decl="${ch4_shm_native_func_decl}, MPIDI_SHM_native_${shm}_funcs"
+    fi
+
+    if test -z "$ch4_shm_func_array" ; then
+        ch4_shm_func_array="&MPIDI_SHM_${shm}_funcs"
+    else
+        ch4_shm_func_array="${ch4_shm_func_array}, &MPIDI_SHM_${shm}_funcs"
+    fi
+
+    if test -z "$ch4_shm_native_func_array" ; then
+        ch4_shm_native_func_array="&MPIDI_SHM_native_${shm}_funcs"
+    else
+        ch4_shm_native_func_array="${ch4_shm_native_func_array}, &MPIDI_SHM_native_${shm}_funcs"
+    fi
+
+    if test -z "$ch4_shm_strings" ; then
+        ch4_shm_strings="\"$shm\""
+    else
+        ch4_shm_strings="$ch4_shm_strings, \"$shm\""
+    fi
+
+    # one include line per shm module, separated by newlines
+    if test -z "$ch4_shm_pre_include" ; then
+        ch4_shm_pre_include="#include \"../shm/${shm}/${shm}_pre.h\""
+    else
+        ch4_shm_pre_include="${ch4_shm_pre_include}
+#include \"../shm/${shm}/${shm}_pre.h\""
+    fi
+
+    # member declarations; entries are joined with a backslash-newline
+    if test -z "$ch4_shm_request_decl" ; then
+        ch4_shm_request_decl="MPIDI_${shm_upper}_request_t ${shm};"
+    else
+        ch4_shm_request_decl="${ch4_shm_request_decl} \\
+MPIDI_${shm_upper}_request_t ${shm};"
+    fi
+
+    if test -z "$ch4_shm_comm_decl" ; then
+        ch4_shm_comm_decl="MPIDI_${shm_upper}_comm_t ${shm};"
+    else
+        ch4_shm_comm_decl="${ch4_shm_comm_decl} \\
+MPIDI_${shm_upper}_comm_t ${shm};"
+    fi
+
+    shm_index=`expr $shm_index + 1`
+done
+ch4_shm_array_sz=$shm_index
+
+AC_SUBST(ch4_shm)
+AC_SUBST(ch4_shm_array)
+AC_SUBST(ch4_shm_array_sz)
+AC_SUBST(ch4_shm_func_decl)
+AC_SUBST(ch4_shm_native_func_decl)
+AC_SUBST(ch4_shm_func_array)
+AC_SUBST(ch4_shm_native_func_array)
+AC_SUBST(ch4_shm_strings)
+AC_SUBST(ch4_shm_pre_include)
+AC_SUBST(ch4_shm_request_decl)
+AC_SUBST(ch4_shm_comm_decl)
+AM_SUBST_NOTMAKE(ch4_shm_pre_include)
+AM_SUBST_NOTMAKE(ch4_shm_request_decl)
+AM_SUBST_NOTMAKE(ch4_shm_comm_decl)
+
+# The direct build is only selected when exactly one shm module is configured.
+if test "$ch4_shm_array_sz" = "1"  && test "$enable_ch4_shm_direct" = "yes" ;  then
+   PAC_APPEND_FLAG([-DSHM_DIRECT=__shm_direct_${ch4_shm}__], [CPPFLAGS])
+fi
+
+])dnl end AM_COND_IF(BUILD_CH4,...)
+])dnl end PREREQ
+
+AC_DEFUN([PAC_SUBCFG_BODY_]PAC_SUBCFG_AUTO_SUFFIX,[
+AM_COND_IF([BUILD_CH4],[
+AC_MSG_NOTICE([RUNNING CONFIGURE FOR CH4 DEVICE])
+
+# Width of the rank field; only 16 and 32 are accepted, 32 is the default.
+AC_ARG_WITH(ch4-rank-bits, [--with-ch4-rank-bits=16/32     Number of bits allocated to the rank field (16 or 32)],
+            [ rankbits=$withval ],
+            [ rankbits=32 ])
+if test "$rankbits" != "16" && test "$rankbits" != "32" ; then
+   AC_MSG_ERROR([Only 16 or 32-bit ranks are supported])
+fi
+AC_DEFINE_UNQUOTED(CH4_RANK_BITS,$rankbits,[Define the number of CH4_RANK_BITS])
+
+AC_ARG_ENABLE(ch4r-per-comm-msg-queue,
+    [--enable-ch4r-per-comm-msg-queue=option
+       Enable use of per-communicator message queues for posted recvs/unexpected messages
+         yes       - Use per-communicator message queue. (Default)
+         no        - Use global queue for posted recvs/unexpected messages.
+    ],,enable_ch4r_per_comm_msg_queue=yes)
+
+if test "$enable_ch4r_per_comm_msg_queue" = "yes" ; then
+    AC_DEFINE([MPIDI_CH4U_USE_PER_COMM_QUEUE], [1],
+        [Define if CH4U will use per-communicator message queues])
+fi
+
+PAC_ARG_SHARED_MEMORY
+
+# Files generated from the .in templates using the lists built in the prereq step.
+AC_CONFIG_FILES([
+src/mpid/ch4/src/mpid_ch4_net_array.c
+src/mpid/ch4/src/mpid_ch4_shm_array.c
+src/mpid/ch4/include/netmodpre.h
+src/mpid/ch4/include/shmpre.h
+])
+])dnl end AM_COND_IF(BUILD_CH4,...)
+
+AM_CONDITIONAL([BUILD_CH4_SHM],[test "$ch4_shm_level" = "yes" || test "$ch4_shm_level" = "exclusive"])
+
+AC_CHECK_HEADERS(sys/mman.h sys/stat.h fcntl.h)
+
+# Use the symmetric heap only when every listed function is available
+# (presumably the heap code needs both - confirm against its implementation).
+found_rand_funcs=yes
+AC_CHECK_FUNCS([random_r initstate_r], [], [found_rand_funcs=no])
+if test "$found_rand_funcs" = yes ; then
+    AC_DEFINE(USE_SYM_HEAP,1,[Define if we can use a symmetric heap])
+    AC_MSG_NOTICE([Using a symmetric heap])
+else
+    AC_MSG_NOTICE([Using a non-symmetric heap])
+fi
+
+
+])dnl end _BODY
+
+[#] end of __file__

-----------------------------------------------------------------------

Summary of changes:
 .gitignore                                   |    3 +
 src/mpid/Makefile.mk                         |    1 +
 src/mpid/ch4/.gitignore                      |    3 +
 src/mpid/ch4/Makefile.mk                     |   20 +
 src/mpid/ch4/cross/gcc-linux-x86-8           |   13 +
 src/mpid/ch4/cross/icc-linux-x86-8           |   13 +
 src/mpid/ch4/errnames.txt                    |   47 +
 src/mpid/ch4/include/Makefile.mk             |   23 +
 src/mpid/ch4/include/mpid_sched.h            |   30 +
 src/mpid/ch4/include/mpid_thread.h           |   68 +
 src/mpid/ch4/include/mpid_ticketlock.h       |  159 ++
 src/mpid/ch4/include/mpidch4.h               |  435 ++++
 src/mpid/ch4/include/mpidch4r.h              |   26 +
 src/mpid/ch4/include/mpidimpl.h              |   30 +
 src/mpid/ch4/include/mpidpost.h              |   30 +
 src/mpid/ch4/include/mpidpre.h               |  465 +++++
 src/mpid/ch4/include/netmodpre.h.in          |   26 +
 src/mpid/ch4/include/shmpre.h.in             |   19 +
 src/mpid/ch4/netmod/Makefile.mk              |   22 +
 src/mpid/ch4/netmod/include/netmod.h         | 1133 ++++++++++
 src/mpid/ch4/netmod/include/netmod_impl.h    | 1083 ++++++++++
 src/mpid/ch4/netmod/ofi/Makefile.mk          |   23 +
 src/mpid/ch4/netmod/ofi/catalog.c            |   61 +
 src/mpid/ch4/netmod/ofi/errnames.txt         |   97 +
 src/mpid/ch4/netmod/ofi/fi_list.h            |  196 ++
 src/mpid/ch4/netmod/ofi/func_table.c         |  157 ++
 src/mpid/ch4/netmod/ofi/globals.c            |   13 +
 src/mpid/ch4/netmod/ofi/netmod_direct.h      |   41 +
 src/mpid/ch4/netmod/ofi/ofi_am.h             |  383 ++++
 src/mpid/ch4/netmod/ofi/ofi_am_events.h      |  414 ++++
 src/mpid/ch4/netmod/ofi/ofi_am_impl.h        |  565 +++++
 src/mpid/ch4/netmod/ofi/ofi_am_probe.h       |   39 +
 src/mpid/ch4/netmod/ofi/ofi_am_recv.h        |   61 +
 src/mpid/ch4/netmod/ofi/ofi_am_rma.h         |  148 ++
 src/mpid/ch4/netmod/ofi/ofi_am_send.h        |  128 ++
 src/mpid/ch4/netmod/ofi/ofi_am_spawn.h       |   50 +
 src/mpid/ch4/netmod/ofi/ofi_am_win.h         |  160 ++
 src/mpid/ch4/netmod/ofi/ofi_coll.h           |  869 ++++++++
 src/mpid/ch4/netmod/ofi/ofi_comm.h           |   65 +
 src/mpid/ch4/netmod/ofi/ofi_control.h        |   72 +
 src/mpid/ch4/netmod/ofi/ofi_datatype.h       |   31 +
 src/mpid/ch4/netmod/ofi/ofi_events.h         |  804 ++++++++
 src/mpid/ch4/netmod/ofi/ofi_impl.h           |  484 +++++
 src/mpid/ch4/netmod/ofi/ofi_init.h           |  896 ++++++++
 src/mpid/ch4/netmod/ofi/ofi_iovec_util.h     |  391 ++++
 src/mpid/ch4/netmod/ofi/ofi_op.h             |   27 +
 src/mpid/ch4/netmod/ofi/ofi_pre.h            |  190 ++
 src/mpid/ch4/netmod/ofi/ofi_probe.h          |  168 ++
 src/mpid/ch4/netmod/ofi/ofi_proc.h           |   29 +
 src/mpid/ch4/netmod/ofi/ofi_progress.h       |   63 +
 src/mpid/ch4/netmod/ofi/ofi_recv.h           |  280 +++
 src/mpid/ch4/netmod/ofi/ofi_rma.h            | 1298 ++++++++++++
 src/mpid/ch4/netmod/ofi/ofi_send.h           |  566 +++++
 src/mpid/ch4/netmod/ofi/ofi_spawn.h          |  561 +++++
 src/mpid/ch4/netmod/ofi/ofi_types.h          |  545 +++++
 src/mpid/ch4/netmod/ofi/ofi_unimpl.h         |   19 +
 src/mpid/ch4/netmod/ofi/ofi_win.h            | 1254 ++++++++++++
 src/mpid/ch4/netmod/ofi/subconfigure.m4      |  153 ++
 src/mpid/ch4/netmod/ofi/util.c               |  822 ++++++++
 src/mpid/ch4/netmod/portals4/Makefile.mk     |    7 +
 src/mpid/ch4/netmod/portals4/errnames.txt    |   10 +
 src/mpid/ch4/netmod/portals4/func_table.c    |  157 ++
 src/mpid/ch4/netmod/portals4/globals.c       |   14 +
 src/mpid/ch4/netmod/portals4/netmod_direct.h |   30 +
 src/mpid/ch4/netmod/portals4/portals4_pre.h  |   51 +
 src/mpid/ch4/netmod/portals4/ptl_am.h        |  370 ++++
 src/mpid/ch4/netmod/portals4/ptl_coll.h      |  871 ++++++++
 src/mpid/ch4/netmod/portals4/ptl_comm.h      |   39 +
 src/mpid/ch4/netmod/portals4/ptl_datatype.h  |   28 +
 src/mpid/ch4/netmod/portals4/ptl_impl.h      |   58 +
 src/mpid/ch4/netmod/portals4/ptl_init.h      |  271 +++
 src/mpid/ch4/netmod/portals4/ptl_op.h        |   24 +
 src/mpid/ch4/netmod/portals4/ptl_probe.h     |   39 +
 src/mpid/ch4/netmod/portals4/ptl_proc.h      |   29 +
 src/mpid/ch4/netmod/portals4/ptl_progress.h  |  185 ++
 src/mpid/ch4/netmod/portals4/ptl_recv.h      |   63 +
 src/mpid/ch4/netmod/portals4/ptl_request.h   |   32 +
 src/mpid/ch4/netmod/portals4/ptl_rma.h       |  148 ++
 src/mpid/ch4/netmod/portals4/ptl_send.h      |  128 ++
 src/mpid/ch4/netmod/portals4/ptl_spawn.h     |   50 +
 src/mpid/ch4/netmod/portals4/ptl_types.h     |   76 +
 src/mpid/ch4/netmod/portals4/ptl_unimpl.h    |   19 +
 src/mpid/ch4/netmod/portals4/ptl_win.h       |  160 ++
 src/mpid/ch4/netmod/portals4/subconfigure.m4 |   33 +
 src/mpid/ch4/netmod/stubnm/Makefile.mk       |    6 +
 src/mpid/ch4/netmod/stubnm/globals.c         |  157 ++
 src/mpid/ch4/netmod/stubnm/netmod_direct.h   |   29 +
 src/mpid/ch4/netmod/stubnm/stubnm_am.h       |  138 ++
 src/mpid/ch4/netmod/stubnm/stubnm_coll.h     |  871 ++++++++
 src/mpid/ch4/netmod/stubnm/stubnm_comm.h     |   39 +
 src/mpid/ch4/netmod/stubnm/stubnm_datatype.h |   35 +
 src/mpid/ch4/netmod/stubnm/stubnm_impl.h     |   17 +
 src/mpid/ch4/netmod/stubnm/stubnm_init.h     |   95 +
 src/mpid/ch4/netmod/stubnm/stubnm_op.h       |   29 +
 src/mpid/ch4/netmod/stubnm/stubnm_pre.h      |   47 +
 src/mpid/ch4/netmod/stubnm/stubnm_probe.h    |   39 +
 src/mpid/ch4/netmod/stubnm/stubnm_proc.h     |   28 +
 src/mpid/ch4/netmod/stubnm/stubnm_progress.h |   76 +
 src/mpid/ch4/netmod/stubnm/stubnm_recv.h     |   67 +
 src/mpid/ch4/netmod/stubnm/stubnm_request.h  |   26 +
 src/mpid/ch4/netmod/stubnm/stubnm_rma.h      |  148 ++
 src/mpid/ch4/netmod/stubnm/stubnm_send.h     |  128 ++
 src/mpid/ch4/netmod/stubnm/stubnm_spawn.h    |   50 +
 src/mpid/ch4/netmod/stubnm/stubnm_unimpl.h   |   19 +
 src/mpid/ch4/netmod/stubnm/stubnm_win.h      |  160 ++
 src/mpid/ch4/netmod/stubnm/subconfigure.m4   |   21 +
 src/mpid/ch4/netmod/ucx/Makefile.mk          |   15 +
 src/mpid/ch4/netmod/ucx/errnames.txt         |   16 +
 src/mpid/ch4/netmod/ucx/func_table.c         |  155 ++
 src/mpid/ch4/netmod/ucx/globals.c            |   13 +
 src/mpid/ch4/netmod/ucx/netmod_direct.h      |   34 +
 src/mpid/ch4/netmod/ucx/subconfigure.m4      |   61 +
 src/mpid/ch4/netmod/ucx/ucx_am.h             |  615 ++++++
 src/mpid/ch4/netmod/ucx/ucx_am_recv.h        |   60 +
 src/mpid/ch4/netmod/ucx/ucx_am_rma.h         |  149 ++
 src/mpid/ch4/netmod/ucx/ucx_am_send.h        |  128 ++
 src/mpid/ch4/netmod/ucx/ucx_am_win.h         |  160 ++
 src/mpid/ch4/netmod/ucx/ucx_coll.h           |  867 ++++++++
 src/mpid/ch4/netmod/ucx/ucx_comm.h           |   48 +
 src/mpid/ch4/netmod/ucx/ucx_datatype.h       |  150 ++
 src/mpid/ch4/netmod/ucx/ucx_impl.h           |  151 ++
 src/mpid/ch4/netmod/ucx/ucx_init.h           |  330 +++
 src/mpid/ch4/netmod/ucx/ucx_op.h             |   24 +
 src/mpid/ch4/netmod/ucx/ucx_pre.h            |   75 +
 src/mpid/ch4/netmod/ucx/ucx_probe.h          |   99 +
 src/mpid/ch4/netmod/ucx/ucx_proc.h           |   26 +
 src/mpid/ch4/netmod/ucx/ucx_progress.h       |  147 ++
 src/mpid/ch4/netmod/ucx/ucx_recv.h           |  244 +++
 src/mpid/ch4/netmod/ucx/ucx_request.h        |  116 ++
 src/mpid/ch4/netmod/ucx/ucx_rma.h            |  299 +++
 src/mpid/ch4/netmod/ucx/ucx_send.h           |  488 +++++
 src/mpid/ch4/netmod/ucx/ucx_spawn.h          |  104 +
 src/mpid/ch4/netmod/ucx/ucx_types.h          |   66 +
 src/mpid/ch4/netmod/ucx/ucx_win.h            |  503 +++++
 src/mpid/ch4/shm/Makefile.mk                 |   24 +
 src/mpid/ch4/shm/include/shm.h               | 1192 +++++++++++
 src/mpid/ch4/shm/include/shm_impl.h          | 1109 ++++++++++
 src/mpid/ch4/shm/posix/Makefile.mk           |   38 +
 src/mpid/ch4/shm/posix/barrier.c             |   42 +
 src/mpid/ch4/shm/posix/func_table.c          |  152 ++
 src/mpid/ch4/shm/posix/globals.c             |   22 +
 src/mpid/ch4/shm/posix/posix_am.h            |  173 ++
 src/mpid/ch4/shm/posix/posix_coll.h          |  876 ++++++++
 src/mpid/ch4/shm/posix/posix_comm.h          |   45 +
 src/mpid/ch4/shm/posix/posix_datatypes.h     |  185 ++
 src/mpid/ch4/shm/posix/posix_defs.h          |  115 ++
 src/mpid/ch4/shm/posix/posix_impl.h          |  174 ++
 src/mpid/ch4/shm/posix/posix_init.h          |  324 +++
 src/mpid/ch4/shm/posix/posix_pre.h           |   41 +
 src/mpid/ch4/shm/posix/posix_probe.h         |  146 ++
 src/mpid/ch4/shm/posix/posix_progress.h      |  454 ++++
 src/mpid/ch4/shm/posix/posix_queue.h         |  343 ++++
 src/mpid/ch4/shm/posix/posix_recv.h          |  355 ++++
 src/mpid/ch4/shm/posix/posix_request.h       |   26 +
 src/mpid/ch4/shm/posix/posix_rma.h           |  143 ++
 src/mpid/ch4/shm/posix/posix_send.h          |  462 +++++
 src/mpid/ch4/shm/posix/posix_spawn.h         |   50 +
 src/mpid/ch4/shm/posix/posix_unimpl.h        |   19 +
 src/mpid/ch4/shm/posix/posix_win.h           |  185 ++
 src/mpid/ch4/shm/posix/shm_direct.h          |   27 +
 src/mpid/ch4/shm/posix/subconfigure.m4       |   53 +
 src/mpid/ch4/shm/stubshm/Makefile.mk         |    7 +
 src/mpid/ch4/shm/stubshm/func_table.c        |  152 ++
 src/mpid/ch4/shm/stubshm/globals.c           |   13 +
 src/mpid/ch4/shm/stubshm/shm_direct.h        |   27 +
 src/mpid/ch4/shm/stubshm/stubshm_am.h        |  173 ++
 src/mpid/ch4/shm/stubshm/stubshm_coll.h      |  695 +++++++
 src/mpid/ch4/shm/stubshm/stubshm_comm.h      |   29 +
 src/mpid/ch4/shm/stubshm/stubshm_impl.h      |   14 +
 src/mpid/ch4/shm/stubshm/stubshm_init.h      |   85 +
 src/mpid/ch4/shm/stubshm/stubshm_pre.h       |   27 +
 src/mpid/ch4/shm/stubshm/stubshm_probe.h     |   36 +
 src/mpid/ch4/shm/stubshm/stubshm_proc.h      |   21 +
 src/mpid/ch4/shm/stubshm/stubshm_progress.h  |   88 +
 src/mpid/ch4/shm/stubshm/stubshm_recv.h      |   70 +
 src/mpid/ch4/shm/stubshm/stubshm_request.h   |   26 +
 src/mpid/ch4/shm/stubshm/stubshm_rma.h       |  143 ++
 src/mpid/ch4/shm/stubshm/stubshm_send.h      |  140 ++
 src/mpid/ch4/shm/stubshm/stubshm_spawn.h     |   50 +
 src/mpid/ch4/shm/stubshm/stubshm_unimpl.h    |   19 +
 src/mpid/ch4/shm/stubshm/stubshm_win.h       |  185 ++
 src/mpid/ch4/shm/stubshm/subconfigure.m4     |   19 +
 src/mpid/ch4/src/Makefile.mk                 |   44 +
 src/mpid/ch4/src/ch4_coll.h                  |  378 ++++
 src/mpid/ch4/src/ch4_comm.h                  |  249 +++
 src/mpid/ch4/src/ch4_globals.c               |   62 +
 src/mpid/ch4/src/ch4_impl.h                  |  553 +++++
 src/mpid/ch4/src/ch4_init.h                  |  751 +++++++
 src/mpid/ch4/src/ch4_probe.h                 |  220 ++
 src/mpid/ch4/src/ch4_proc.h                  |   37 +
 src/mpid/ch4/src/ch4_progress.h              |  185 ++
 src/mpid/ch4/src/ch4_recv.h                  |  411 ++++
 src/mpid/ch4/src/ch4_request.h               |   91 +
 src/mpid/ch4/src/ch4_rma.h                   |  297 +++
 src/mpid/ch4/src/ch4_send.h                  |  515 +++++
 src/mpid/ch4/src/ch4_spawn.h                 |  319 +++
 src/mpid/ch4/src/ch4_types.h                 |  285 +++
 src/mpid/ch4/src/ch4_win.h                   |  530 +++++
 src/mpid/ch4/src/ch4i_comm.h                 | 1161 +++++++++++
 src/mpid/ch4/src/ch4i_util.h                 |   20 +
 src/mpid/ch4/src/ch4r_buf.h                  |  168 ++
 src/mpid/ch4/src/ch4r_callbacks.h            | 2836 ++++++++++++++++++++++++++
 src/mpid/ch4/src/ch4r_init.h                 |  333 +++
 src/mpid/ch4/src/ch4r_probe.h                |  176 ++
 src/mpid/ch4/src/ch4r_proc.h                 |  411 ++++
 src/mpid/ch4/src/ch4r_recv.h                 |  429 ++++
 src/mpid/ch4/src/ch4r_recvq.h                |  336 +++
 src/mpid/ch4/src/ch4r_request.h              |  126 ++
 src/mpid/ch4/src/ch4r_rma.h                  |  761 +++++++
 src/mpid/ch4/src/ch4r_send.h                 |  468 +++++
 src/mpid/ch4/src/ch4r_symheap.h              |  258 +++
 src/mpid/ch4/src/ch4r_win.h                  | 1250 ++++++++++++
 src/mpid/ch4/src/mpid_ch4_net_array.c.in     |   28 +
 src/mpid/ch4/src/mpid_ch4_shm_array.c.in     |   32 +
 src/mpid/ch4/subconfigure.m4                 |  391 ++++
 215 files changed, 48937 insertions(+), 0 deletions(-)
 create mode 100644 src/mpid/ch4/.gitignore
 create mode 100644 src/mpid/ch4/Makefile.mk
 create mode 100644 src/mpid/ch4/cross/gcc-linux-x86-8
 create mode 100644 src/mpid/ch4/cross/icc-linux-x86-8
 create mode 100644 src/mpid/ch4/errnames.txt
 create mode 100644 src/mpid/ch4/include/Makefile.mk
 create mode 100644 src/mpid/ch4/include/mpid_sched.h
 create mode 100644 src/mpid/ch4/include/mpid_thread.h
 create mode 100644 src/mpid/ch4/include/mpid_ticketlock.h
 create mode 100644 src/mpid/ch4/include/mpidch4.h
 create mode 100644 src/mpid/ch4/include/mpidch4r.h
 create mode 100644 src/mpid/ch4/include/mpidimpl.h
 create mode 100644 src/mpid/ch4/include/mpidpost.h
 create mode 100644 src/mpid/ch4/include/mpidpre.h
 create mode 100644 src/mpid/ch4/include/netmodpre.h.in
 create mode 100644 src/mpid/ch4/include/shmpre.h.in
 create mode 100644 src/mpid/ch4/netmod/Makefile.mk
 create mode 100644 src/mpid/ch4/netmod/include/netmod.h
 create mode 100644 src/mpid/ch4/netmod/include/netmod_impl.h
 create mode 100644 src/mpid/ch4/netmod/ofi/Makefile.mk
 create mode 100644 src/mpid/ch4/netmod/ofi/catalog.c
 create mode 100644 src/mpid/ch4/netmod/ofi/errnames.txt
 create mode 100644 src/mpid/ch4/netmod/ofi/fi_list.h
 create mode 100644 src/mpid/ch4/netmod/ofi/func_table.c
 create mode 100644 src/mpid/ch4/netmod/ofi/globals.c
 create mode 100644 src/mpid/ch4/netmod/ofi/netmod_direct.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_am.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_am_events.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_am_impl.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_am_probe.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_am_recv.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_am_rma.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_am_send.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_am_spawn.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_am_win.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_coll.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_comm.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_control.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_datatype.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_events.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_impl.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_init.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_iovec_util.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_op.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_pre.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_probe.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_proc.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_progress.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_recv.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_rma.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_send.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_spawn.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_types.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_unimpl.h
 create mode 100644 src/mpid/ch4/netmod/ofi/ofi_win.h
 create mode 100644 src/mpid/ch4/netmod/ofi/subconfigure.m4
 create mode 100644 src/mpid/ch4/netmod/ofi/util.c
 create mode 100644 src/mpid/ch4/netmod/portals4/Makefile.mk
 create mode 100644 src/mpid/ch4/netmod/portals4/errnames.txt
 create mode 100644 src/mpid/ch4/netmod/portals4/func_table.c
 create mode 100644 src/mpid/ch4/netmod/portals4/globals.c
 create mode 100644 src/mpid/ch4/netmod/portals4/netmod_direct.h
 create mode 100644 src/mpid/ch4/netmod/portals4/portals4_pre.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_am.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_coll.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_comm.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_datatype.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_impl.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_init.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_op.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_probe.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_proc.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_progress.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_recv.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_request.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_rma.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_send.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_spawn.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_types.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_unimpl.h
 create mode 100644 src/mpid/ch4/netmod/portals4/ptl_win.h
 create mode 100644 src/mpid/ch4/netmod/portals4/subconfigure.m4
 create mode 100644 src/mpid/ch4/netmod/stubnm/Makefile.mk
 create mode 100644 src/mpid/ch4/netmod/stubnm/globals.c
 create mode 100644 src/mpid/ch4/netmod/stubnm/netmod_direct.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_am.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_coll.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_comm.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_datatype.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_impl.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_init.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_op.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_pre.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_probe.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_proc.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_progress.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_recv.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_request.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_rma.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_send.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_spawn.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_unimpl.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/stubnm_win.h
 create mode 100644 src/mpid/ch4/netmod/stubnm/subconfigure.m4
 create mode 100644 src/mpid/ch4/netmod/ucx/Makefile.mk
 create mode 100644 src/mpid/ch4/netmod/ucx/errnames.txt
 create mode 100644 src/mpid/ch4/netmod/ucx/func_table.c
 create mode 100644 src/mpid/ch4/netmod/ucx/globals.c
 create mode 100644 src/mpid/ch4/netmod/ucx/netmod_direct.h
 create mode 100644 src/mpid/ch4/netmod/ucx/subconfigure.m4
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_am.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_am_recv.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_am_rma.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_am_send.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_am_win.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_coll.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_comm.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_datatype.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_impl.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_init.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_op.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_pre.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_probe.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_proc.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_progress.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_recv.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_request.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_rma.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_send.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_spawn.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_types.h
 create mode 100644 src/mpid/ch4/netmod/ucx/ucx_win.h
 create mode 100644 src/mpid/ch4/shm/Makefile.mk
 create mode 100644 src/mpid/ch4/shm/include/shm.h
 create mode 100644 src/mpid/ch4/shm/include/shm_impl.h
 create mode 100644 src/mpid/ch4/shm/posix/Makefile.mk
 create mode 100644 src/mpid/ch4/shm/posix/barrier.c
 create mode 100644 src/mpid/ch4/shm/posix/func_table.c
 create mode 100644 src/mpid/ch4/shm/posix/globals.c
 create mode 100644 src/mpid/ch4/shm/posix/posix_am.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_coll.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_comm.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_datatypes.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_defs.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_impl.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_init.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_pre.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_probe.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_progress.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_queue.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_recv.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_request.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_rma.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_send.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_spawn.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_unimpl.h
 create mode 100644 src/mpid/ch4/shm/posix/posix_win.h
 create mode 100644 src/mpid/ch4/shm/posix/shm_direct.h
 create mode 100644 src/mpid/ch4/shm/posix/subconfigure.m4
 create mode 100644 src/mpid/ch4/shm/stubshm/Makefile.mk
 create mode 100644 src/mpid/ch4/shm/stubshm/func_table.c
 create mode 100644 src/mpid/ch4/shm/stubshm/globals.c
 create mode 100644 src/mpid/ch4/shm/stubshm/shm_direct.h
 create mode 100644 src/mpid/ch4/shm/stubshm/stubshm_am.h
 create mode 100644 src/mpid/ch4/shm/stubshm/stubshm_coll.h
 create mode 100644 src/mpid/ch4/shm/stubshm/stubshm_comm.h
 create mode 100644 src/mpid/ch4/shm/stubshm/stubshm_impl.h
 create mode 100644 src/mpid/ch4/shm/stubshm/stubshm_init.h
 create mode 100644 src/mpid/ch4/shm/stubshm/stubshm_pre.h
 create mode 100644 src/mpid/ch4/shm/stubshm/stubshm_probe.h
 create mode 100644 src/mpid/ch4/shm/stubshm/stubshm_proc.h
 create mode 100644 src/mpid/ch4/shm/stubshm/stubshm_progress.h
 create mode 100644 src/mpid/ch4/shm/stubshm/stubshm_recv.h
 create mode 100644 src/mpid/ch4/shm/stubshm/stubshm_request.h
 create mode 100644 src/mpid/ch4/shm/stubshm/stubshm_rma.h
 create mode 100644 src/mpid/ch4/shm/stubshm/stubshm_send.h
 create mode 100644 src/mpid/ch4/shm/stubshm/stubshm_spawn.h
 create mode 100644 src/mpid/ch4/shm/stubshm/stubshm_unimpl.h
 create mode 100644 src/mpid/ch4/shm/stubshm/stubshm_win.h
 create mode 100644 src/mpid/ch4/shm/stubshm/subconfigure.m4
 create mode 100644 src/mpid/ch4/src/Makefile.mk
 create mode 100644 src/mpid/ch4/src/ch4_coll.h
 create mode 100644 src/mpid/ch4/src/ch4_comm.h
 create mode 100644 src/mpid/ch4/src/ch4_globals.c
 create mode 100644 src/mpid/ch4/src/ch4_impl.h
 create mode 100644 src/mpid/ch4/src/ch4_init.h
 create mode 100644 src/mpid/ch4/src/ch4_probe.h
 create mode 100644 src/mpid/ch4/src/ch4_proc.h
 create mode 100644 src/mpid/ch4/src/ch4_progress.h
 create mode 100644 src/mpid/ch4/src/ch4_recv.h
 create mode 100644 src/mpid/ch4/src/ch4_request.h
 create mode 100644 src/mpid/ch4/src/ch4_rma.h
 create mode 100644 src/mpid/ch4/src/ch4_send.h
 create mode 100644 src/mpid/ch4/src/ch4_spawn.h
 create mode 100644 src/mpid/ch4/src/ch4_types.h
 create mode 100644 src/mpid/ch4/src/ch4_win.h
 create mode 100644 src/mpid/ch4/src/ch4i_comm.h
 create mode 100644 src/mpid/ch4/src/ch4i_util.h
 create mode 100644 src/mpid/ch4/src/ch4r_buf.h
 create mode 100644 src/mpid/ch4/src/ch4r_callbacks.h
 create mode 100644 src/mpid/ch4/src/ch4r_init.h
 create mode 100644 src/mpid/ch4/src/ch4r_probe.h
 create mode 100644 src/mpid/ch4/src/ch4r_proc.h
 create mode 100644 src/mpid/ch4/src/ch4r_recv.h
 create mode 100644 src/mpid/ch4/src/ch4r_recvq.h
 create mode 100644 src/mpid/ch4/src/ch4r_request.h
 create mode 100644 src/mpid/ch4/src/ch4r_rma.h
 create mode 100644 src/mpid/ch4/src/ch4r_send.h
 create mode 100644 src/mpid/ch4/src/ch4r_symheap.h
 create mode 100644 src/mpid/ch4/src/ch4r_win.h
 create mode 100644 src/mpid/ch4/src/mpid_ch4_net_array.c.in
 create mode 100644 src/mpid/ch4/src/mpid_ch4_shm_array.c.in
 create mode 100644 src/mpid/ch4/subconfigure.m4


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list