[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.1.2-32-ged41403

Service Account noreply at mpich.org
Thu Jul 31 08:37:49 CDT 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master, has been updated
       via  ed4140325102d2654e902ba70a08eb8ceb8daf16 (commit)
       via  19c00389e4915c68fef68d3e61b50bbded1ad46b (commit)
       via  76e70960dfa6962fcf04bdf187647061c4e499ee (commit)
       via  51f6709f5422b015910b26bebe651ed835a77f9b (commit)
       via  56a0b44574d7e609bed94047ecfb435b0a9e55b3 (commit)
       via  a499ad05e4208b4a76843b79eb726f80862ec97c (commit)
       via  fded59aee40a99eda9e5a96955751b5149c0d1bc (commit)
       via  3690597fbbf8cf8289530eaa5483967eebce2d31 (commit)
       via  586e7122784d8f45ea2d6c5fa7df2de2f53dfe2a (commit)
       via  df39ada6f58aecea6eb36c21521e1ab557e365a4 (commit)
       via  2f25f42745d009e86a0eb0c4fe21f04d76d7121a (commit)
       via  d5c2a5dab82547764974b27bdb695901f59801f7 (commit)
       via  5bfff7d3bccfff88d89e4e56565184a8ebf077b0 (commit)
      from  522c26881144def06dec5b52bde6f583073df0a9 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/ed4140325102d2654e902ba70a08eb8ceb8daf16

commit ed4140325102d2654e902ba70a08eb8ceb8daf16
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date:   Wed Jul 16 15:35:36 2014 +0900

    Fix build warnings in netmod-IB
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_finalize.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_finalize.c
index 3dd2881..c34a392 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_finalize.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_finalize.c
@@ -24,8 +24,10 @@
 int MPID_nem_ib_finalize(void)
 {
     int mpi_errno = MPI_SUCCESS;
+#if 0
     int ibcom_errno;
     int i;
+#endif
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_FINALIZE);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_FINALIZE);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
index 99ba26b..5f4ec31 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
@@ -13,8 +13,8 @@
  *	  be deallocated. Look at all functions.
  */
 #include "ib_ibcom.h"
-#include <sys/ipc.h>
-#include <sys/shm.h>
+//#include <sys/ipc.h>
+//#include <sys/shm.h>
 #include <sys/types.h>
 #include <assert.h>
 #include <linux/mman.h> /* make it define MAP_ANONYMOUS */
@@ -30,7 +30,6 @@
 #define dprintf(...)
 #endif
 
-static int sendwr_id = 10;
 static MPID_nem_ib_com_t contab[MPID_NEM_IB_COM_SIZE];
 static int ib_initialized = 0;
 static int maxcon;
@@ -506,8 +505,6 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
     MPID_nem_ib_com_t *conp;
     struct ibv_qp_init_attr qp_init_attr;
     struct ibv_sge *sge;
-    struct ibv_send_wr *sr;
-    struct ibv_recv_wr *rr, *bad_wr;
     int mr_flags;
     int i;
 
@@ -1201,7 +1198,9 @@ int MPID_nem_ib_com_alloc(int condesc, int sz)
 {
     MPID_nem_ib_com_t *conp;
     int ibcom_errno = 0;
+#ifdef MPID_NEM_IB_DEBUG_IBCOM
     int mr_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE;
+#endif
 
     MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
 
@@ -2559,7 +2558,6 @@ int MPID_nem_ib_com_connect_ringbuf(int condesc,
 {
     int ibcom_errno = 0;
     MPID_nem_ib_com_t *conp;
-    int i;
 
     MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
 
@@ -2589,7 +2587,7 @@ int MPID_nem_ib_com_connect_ringbuf(int condesc,
         conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.rkey = rkey;
     }
     dprintf
-        ("connect_ringbuf,ringbuf_type=%d,rkey=%08x,start=%p,nslot=%d,sseq_num=%d,lsr_seq_num_tail=%d,remote_vc=%lx,alloc_new_mr=%d\n",
+        ("connect_ringbuf,ringbuf_type=%d,rkey=%08x,start=%p,nslot=%d,sseq_num=%d,lsr_seq_num_tail=%d,remote_vc=%p,alloc_new_mr=%d\n",
          conp->local_ringbuf_type, conp->local_ringbuf_rkey, conp->local_ringbuf_start,
          conp->local_ringbuf_nslot, conp->sseq_num, conp->lsr_seq_num_tail, conp->remote_vc,
          alloc_new_mr);
@@ -2818,6 +2816,7 @@ int MPID_nem_ib_com_obtain_pointer(int condesc, MPID_nem_ib_com_t ** MPID_nem_ib
     goto fn_exit;
 }
 
+#if 0
 static void MPID_nem_ib_comShow(int condesc)
 {
     MPID_nem_ib_com_t *conp;
@@ -2838,24 +2837,25 @@ static void MPID_nem_ib_comShow(int condesc)
     }
     fprintf(stdout, "\n");
 }
+#endif
 
-static char *strerror_tbl[] = {
+static const char *strerror_tbl[] = {
     [0] = "zero",
     [1] = "one",
     [2] = "two",
     [3] = "three",
 };
 
-char *MPID_nem_ib_com_strerror(int errno)
+char *MPID_nem_ib_com_strerror(int err)
 {
     char *r;
-    if (-errno > 3) {
+    if (-err > 3) {
         r = MPIU_Malloc(256);
-        sprintf(r, "%d", -errno);
+        sprintf(r, "%d", -err);
         goto fn_exit;
     }
     else {
-        r = strerror_tbl[-errno];
+        r = (char *)strerror_tbl[-err];
     }
   fn_exit:
     return r;
@@ -2891,7 +2891,6 @@ int MPID_nem_ib_com_reg_mr(void *addr, long len, struct ibv_mr **mr,
 
 int MPID_nem_ib_com_dereg_mr(struct ibv_mr *mr)
 {
-    int i;
     int ib_errno;
     int ibcom_errno = 0;
 
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
index 75e3757..180a1fc 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
@@ -559,7 +559,7 @@ extern int MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(int condesc, int *n
 extern int MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get(int condesc, int **rstate);
 extern int MPID_nem_ib_com_rdmabuf_occupancy_notify_lstate_get(int condesc, int **lstate);
 
-extern char *MPID_nem_ib_com_strerror(int errno);
+extern char *MPID_nem_ib_com_strerror(int err);
 
 extern int MPID_nem_ib_com_mem_rdmawr_from(int condesc, void **out);
 //extern int MPID_nem_ib_com_mem_rdmawr_to(int condesc, int seq_num, void **out);
@@ -675,7 +675,7 @@ static inline void *MPID_nem_ib_rdmawr_from_alloc(uint32_t _sz)
         return p;
     }
     else {
-        char *q, r;
+        char *q;
         if (MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz]) {
             q = MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz];
             MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz] =
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
index 37f6bdc..1ab2502 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
@@ -472,8 +472,8 @@ static inline int MPID_nem_ib_cbf_hash1(uint64_t addr)
     return
         (((addr >> (MPID_nem_ib_cbf_lognslot * 0)) & (MPID_nem_ib_cbf_nslot - 1)) ^
          ((addr >> (MPID_nem_ib_cbf_lognslot * 3)) & (MPID_nem_ib_cbf_nslot - 1)) ^
-         ((addr >> (MPID_nem_ib_cbf_lognslot * 6)) & (MPID_nem_ib_cbf_nslot - 1))
-         + 1) & (MPID_nem_ib_cbf_nslot - 1);
+         (((addr >> (MPID_nem_ib_cbf_lognslot * 6)) & (MPID_nem_ib_cbf_nslot - 1))
+         + 1)) & (MPID_nem_ib_cbf_nslot - 1);
 }
 
 static inline int MPID_nem_ib_cbf_hash2(uint64_t addr)
@@ -482,8 +482,8 @@ static inline int MPID_nem_ib_cbf_hash2(uint64_t addr)
     return
         (((addr >> (MPID_nem_ib_cbf_lognslot * 1)) & (MPID_nem_ib_cbf_nslot - 1)) ^
          ((addr >> (MPID_nem_ib_cbf_lognslot * 4)) & (MPID_nem_ib_cbf_nslot - 1)) ^
-         ((addr >> (MPID_nem_ib_cbf_lognslot * 7)) & (MPID_nem_ib_cbf_nslot - 1))
-         + 1) & (MPID_nem_ib_cbf_nslot - 1);
+         (((addr >> (MPID_nem_ib_cbf_lognslot * 7)) & (MPID_nem_ib_cbf_nslot - 1))
+         + 1)) & (MPID_nem_ib_cbf_nslot - 1);
 }
 
 static inline int MPID_nem_ib_cbf_hash3(uint64_t addr)
@@ -492,8 +492,8 @@ static inline int MPID_nem_ib_cbf_hash3(uint64_t addr)
     return
         (((addr >> (MPID_nem_ib_cbf_lognslot * 2)) & (MPID_nem_ib_cbf_nslot - 1)) ^
          ((addr >> (MPID_nem_ib_cbf_lognslot * 5)) & (MPID_nem_ib_cbf_nslot - 1)) ^
-         ((addr >> (MPID_nem_ib_cbf_lognslot * 8)) & (MPID_nem_ib_cbf_nslot - 1))
-         + 2) & (MPID_nem_ib_cbf_nslot - 1);
+         (((addr >> (MPID_nem_ib_cbf_lognslot * 8)) & (MPID_nem_ib_cbf_nslot - 1))
+         + 2)) & (MPID_nem_ib_cbf_nslot - 1);
 
 }
 
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
index 3a3d21a..35b2ab2 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
@@ -103,8 +103,6 @@ uint8_t MPID_nem_ib_lmt_tail_addr_cbf[MPID_nem_ib_cbf_nslot * MPID_nem_ib_cbf_bi
                                       8] = { 0 };
 static uint32_t MPID_nem_ib_rand_next = 1;
 MPID_nem_ib_vc_area *MPID_nem_ib_debug_current_vc_ib;
-static int listen_fd;
-static int listen_port;
 uint64_t MPID_nem_ib_ringbuf_acquired[(MPID_NEM_IB_NRINGBUF + 63) / 64];
 uint64_t MPID_nem_ib_ringbuf_allocated[(MPID_NEM_IB_NRINGBUF + 63) / 64];
 MPID_nem_ib_ringbuf_t *MPID_nem_ib_ringbuf;
@@ -201,7 +199,9 @@ static int MPID_nem_ib_kvs_get_binary(int from, const char *postfix, char *buf,
     goto fn_exit;
 }
 
+#ifndef MPID_NEM_IB_ONDEMAND
 static int MPID_nem_ib_announce_network_addr(int my_rank, char **bc_val_p, int *val_max_sz_p);
+#endif
 
 #undef FUNCNAME
 #define FUNCNAME MPID_nem_ib_init
@@ -211,7 +211,6 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
 {
     int mpi_errno = MPI_SUCCESS;
     int ibcom_errno = 0, pmi_errno;
-    int ret;
     int i, j, k;
     int ib_port = 1;
 
@@ -626,7 +625,6 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
 int MPID_nem_ib_get_business_card(int my_rank, char **bc_val_p, int *val_max_sz_p)
 {
     int mpi_errno = MPI_SUCCESS;
-    int str_errno = MPIU_STR_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_GET_BUSINESS_CARD);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_GET_BUSINESS_CARD);
     dprintf("MPID_nem_ib_get_business_card,enter\n");
@@ -634,6 +632,7 @@ int MPID_nem_ib_get_business_card(int my_rank, char **bc_val_p, int *val_max_sz_
     return mpi_errno;
 }
 
+#ifndef MPID_NEM_IB_ONDEMAND
 #undef FUNCNAME
 #define FUNCNAME MPID_nem_ib_announce_network_addr
 #undef FCNAME
@@ -641,7 +640,6 @@ int MPID_nem_ib_get_business_card(int my_rank, char **bc_val_p, int *val_max_sz_
 static int MPID_nem_ib_announce_network_addr(int my_rank, char **bc_val_p, int *val_max_sz_p)
 {
     int mpi_errno = MPI_SUCCESS;
-    int str_errno = MPIU_STR_SUCCESS;
     int ibcom_errno;
     int i, j, nranks;
 
@@ -772,6 +770,7 @@ static int MPID_nem_ib_announce_network_addr(int my_rank, char **bc_val_p, int *
     MPIU_CHKLMEM_FREEALL();
     goto fn_exit;
 }
+#endif
 
 #undef FUNCNAME
 #define FUNCNAME MPID_nem_ib_connect_to_root
@@ -830,18 +829,6 @@ int MPID_nem_ib_vc_init(MPIDI_VC_t * vc)
     int mpi_errno = MPI_SUCCESS;
 
     MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
-    int ibcom_errno;
-    size_t s;
-    MPID_nem_ib_conn_t *sc;
-    off_t offset;
-
-    int remote_qpnum;
-    uint16_t remote_lid;
-    union ibv_gid remote_gid;
-    void *remote_rmem;
-    int remote_rkey;
-
-    char key_str[256], remote_rank_str[256];
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_VC_INIT);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_VC_INIT);
@@ -1006,8 +993,6 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
     dprintf("ib_vc_terminate,pg_rank=%d\n", vc->pg_rank);
     int mpi_errno = MPI_SUCCESS;
     int ibcom_errno;
-    int req_errno = MPI_SUCCESS;
-    int i;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_VC_TERMINATE);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_VC_TERMINATE);
 
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
index cd2980c..ed40e64 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
@@ -30,8 +30,9 @@ int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_p
     MPIDI_msg_sz_t data_sz;
     MPID_Datatype *dt_ptr;
     MPI_Aint dt_true_lb;
-    MPIDI_CH3I_VC *vc_ch = VC_CH(vc);
+#if 0
     MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
+#endif
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT);
@@ -89,7 +90,7 @@ int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_p
 #endif
     /* put sz, see MPID_nem_lmt_RndvSend (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c) */
     /* TODO remove sz field
-     * /* pkt_RTS_handler (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c)
+     *   pkt_RTS_handler (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c)
      * rreq->ch.lmt_data_sz = rts_pkt->data_sz; */
     //s_cookie_buf->sz = (uint32_t)((MPID_nem_pkt_lmt_rts_t*)rts_pkt)->data_sz;
 
@@ -208,12 +209,10 @@ int MPID_nem_ib_lmt_start_recv_core(struct MPID_Request *req, void *raddr, uint3
 int MPID_nem_ib_lmt_start_recv(struct MPIDI_VC *vc, struct MPID_Request *req, MPID_IOV s_cookie)
 {
     int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
     int dt_contig;
     MPIDI_msg_sz_t data_sz;
     MPID_Datatype *dt_ptr;
     MPI_Aint dt_true_lb;
-    MPIDI_CH3I_VC *vc_ch = VC_CH(vc);
     MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_START_RECV);
@@ -384,7 +383,9 @@ int MPID_nem_ib_lmt_switch_send(struct MPIDI_VC *vc, struct MPID_Request *req)
     REQ_FIELD(req, lmt_sender_tail) = *tailp;
     dprintf("lmt_switch_send,tail on sender=%02x,tail onreceiver=%02x,req=%p\n", *tailp,
             r_cookie_buf->tail, req);
+#ifdef MPID_NEM_IB_DEBUG_LMT
     uint8_t *tail_wordp = (uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint32_t) * 2);
+#endif
     dprintf("lmt_switch_send,tail on sender=%d\n", *tail_wordp);
     fflush(stdout);
 #endif
@@ -426,7 +427,6 @@ int MPID_nem_ib_lmt_handle_cookie(struct MPIDI_VC *vc, struct MPID_Request *req,
 int MPID_nem_ib_lmt_done_send(struct MPIDI_VC *vc, struct MPID_Request *req)
 {
     int mpi_errno = MPI_SUCCESS;
-    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_DONE_SEND);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_DONE_SEND);
 
@@ -476,7 +476,6 @@ int MPID_nem_ib_lmt_done_send(struct MPIDI_VC *vc, struct MPID_Request *req)
 int MPID_nem_ib_lmt_done_recv(struct MPIDI_VC *vc, struct MPID_Request *rreq)
 {
     int mpi_errno = MPI_SUCCESS;
-    MPIDI_CH3I_VC *vc_ch = VC_CH(vc);
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c
index ebbfd26..5cf81a4 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c
@@ -22,6 +22,10 @@
 #endif
 
 static void _local_malloc_initialize_hook(void);
+void *malloc(size_t size);
+void free(void *addr);
+void *realloc(void *addr, size_t size);
+void *calloc(size_t nmemb, size_t size);
 
 void (*__malloc_initialize_hook) (void) = _local_malloc_initialize_hook;
 
@@ -320,10 +324,10 @@ void *malloc(size_t size)
                 }
 
                 /* use head elem */
-                struct free_list *info = (struct free_list *) (arena_flist[pow].next);
-                ptr = (char *) info + CHUNK;
+                struct free_list *head = (struct free_list *) (arena_flist[pow].next);
+                ptr = (char *) head + CHUNK;
                 dprintf("malloc(%lu) [2^%d] ==> USE pool %p\n", size, pow, ptr);
-                list_del(info);
+                list_del(head);
             }
             else {
                 __init_pool_header(info, pow, alloc_sz);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
index d087a5a..4195152 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
@@ -33,14 +33,16 @@ static int entered_drain_scq = 0;
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_poll_eager"); \
          } \
 }
-//   int n;                                         \
-//   for(n = 0; n < MPID_nem_ib_npollingset; n++) {  \
-//       MPIDI_VC_t *vc_n = MPID_nem_ib_pollingset[n];  \
-//       /*MPID_nem_ib_debug_current_vc_ib = vc_ib;*/   \
-//       MPID_nem_ib_send_progress(vc_n);               \
-//   }                                                  \
+#if 0
+   int n;                                         \
+   for(n = 0; n < MPID_nem_ib_npollingset; n++) {  \
+       MPIDI_VC_t *vc_n = MPID_nem_ib_pollingset[n];  \
+       /*MPID_nem_ib_debug_current_vc_ib = vc_ib;*/   \
+       MPID_nem_ib_send_progress(vc_n);               \
+   }                                                  \
 
 #endif
+#endif
 #if 1
 #define MPID_NEM_IB_CHECK_AND_SEND_PROGRESS \
     if (!MPID_nem_ib_sendq_empty(vc_ib->sendq) && MPID_nem_ib_sendq_ready_to_send_head(vc_ib)) { \
@@ -112,7 +114,6 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
         dprintf("drain_scq,req=%p,req->ref_count=%d,cc_ptr=%d\n", req, req->ref_count,
                 *req->cc_ptr);
         if (req->ref_count <= 0) {
-            MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
             printf("%d\n", *(int *) 0);
         }
 
@@ -768,9 +769,9 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t * ringbuf)
     int ibcom_errno;
     struct MPIDI_VC *vc;
     MPID_nem_ib_vc_area *vc_ib;
-    int result;
-    struct ibv_wc cqe[MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN];
-    uint64_t tscs, tsce;
+    //int result;
+    //struct ibv_wc cqe[MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN];
+    //uint64_t tscs, tsce;
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_POLL_EAGER);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_POLL_EAGER);
@@ -814,7 +815,7 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t * ringbuf)
         (MPID_nem_ib_netmod_trailer_t *) ((uint8_t *) buf + off_pow2_aligned);
     dprintf("poll,off_pow2_aligned=%d,netmod_trailer=%p,sz=%d\n", off_pow2_aligned, netmod_trailer,
             MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf));
-    int k = 0;
+    //int k = 0;
     //tsce = MPID_nem_ib_rdtsc(); printf("9,%ld\n", tsce - tscs); // 55 for 512-byte
     //tscs = MPID_nem_ib_rdtsc();
     //#define MPID_NEM_IB_TLBPREF_POLL 20
@@ -913,12 +914,13 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t * ringbuf)
     /* responder releases resource and then embed largest sequence number into MPI message bound to initiator */
 #if 1
     if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
-        (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
+        (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf)) {
         dprintf
             ("handle_pkt,after,%d<-%d,id=%d,pkt->type=%d,eagershort=%d,close=%d,rts=%d,piggy-backed-eagersend=%d\n",
              MPID_nem_ib_myrank, vc->pg_rank, *remote_poll, pkt->type,
              MPIDI_CH3_PKT_EAGERSHORT_SEND, MPIDI_CH3_PKT_CLOSE, MPIDI_NEM_PKT_LMT_RTS,
              MPIDI_NEM_IB_PKT_EAGER_SEND);
+    }
 
     int notify_rate;
     if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
@@ -952,14 +954,13 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t * ringbuf)
          * because rreq->dev.tmpbuf is set to zero in ch3_eager.c
          */
         if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
-            (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
+            (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf)) {
             dprintf("poll_eager,released,type=%d,MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM=%d\n", pkt->type,
                     MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM);
-        if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
-            (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
             MPID_nem_ib_recv_buf_released(vc,
                                           (void *) ((uint8_t *) buf +
                                                     sz_pkt + sizeof(MPIDI_CH3_Pkt_t)));
+        }
     }
     else {
         if (MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) == sz_pkt + sizeof(MPIDI_CH3_Pkt_t)) {
@@ -977,16 +978,15 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t * ringbuf)
 #endif
 
     if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
-        (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
+        (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf)) {
         dprintf("ib_poll,hdr_ringbuf_type=%d\n", MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf));
 
-    if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
-        (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
         if (MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf) & MPID_NEM_IB_RINGBUF_RELINDEX) {
             vc_ib->ibcom->lsr_seq_num_tail = MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf);
             dprintf("ib_poll,local_tail is updated to %d\n",
                     MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf));
         }
+    }
 
     /* Clear flag */
     if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
@@ -1108,7 +1108,6 @@ int MPID_nem_ib_poll(int in_blocking_poll)
                 (uint8_t *) ((uint8_t *) write_to_buf /*+ REQ_FIELD(rreq, lmt_dt_true_lb) */  +
                              rreq->ch.lmt_data_sz - sizeof(uint8_t));
 
-            uint8_t lmt_tail = REQ_FIELD(rreq, lmt_tail);
             if (*tailmagic != REQ_FIELD(rreq, lmt_tail)) {
                 goto next;
             }
@@ -1158,7 +1157,9 @@ int MPID_nem_ib_poll(int in_blocking_poll)
             }
 
             /* send done to sender. vc is stashed in MPID_nem_ib_lmt_start_recv (in ib_lmt.c) */
+#ifdef MPID_NEM_IB_DEBUG_POLL
             MPID_nem_ib_vc_area *vc_ib = VC_IB(rreq->ch.vc);
+#endif
             dprintf("ib_poll,GET,lmt_send_GET_DONE,rsr_seq_num_tail=%d\n",
                     vc_ib->ibcom->rsr_seq_num_tail);
             MPID_nem_ib_lmt_send_GET_DONE(rreq->ch.vc, rreq);
@@ -1638,13 +1639,7 @@ int MPID_nem_ib_PktHandler_EagerSend(MPIDI_VC_t * vc,
     MPID_nem_ib_pkt_prefix_t *netmod_pkt = (MPID_nem_ib_pkt_prefix_t *) pkt;
     MPIDI_CH3_Pkt_eager_send_t *ch3_pkt =
         (MPIDI_CH3_Pkt_eager_send_t *) ((uint8_t *) pkt + sizeof(MPID_nem_ib_pkt_prefix_t));
-    MPID_Request *rreq;
-    int found;
-    int complete;
-    char *data_buf;
-    MPIDI_msg_sz_t data_len;
     int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_EAGERSEND);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_EAGERSEND);
     dprintf("ib_pkthandler_eagersend,tag=%d\n", ch3_pkt->match.parts.tag);
@@ -1785,10 +1780,8 @@ int MPID_nem_ib_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                                MPIDI_msg_sz_t * buflen /* out */ ,
                                MPID_Request ** rreqp /* out */)
 {
-    MPID_nem_ib_pkt_prefix_t *netmod_pkt = (MPID_nem_ib_pkt_prefix_t *) pkt;
     MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
     int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
     MPID_Request *req = NULL;
     MPIDI_CH3_Pkt_put_t *put_pkt =
         (MPIDI_CH3_Pkt_put_t *) ((uint8_t *) pkt + sizeof(MPIDI_CH3_Pkt_t));
@@ -1904,10 +1897,8 @@ int MPID_nem_ib_PktHandler_Accumulate(MPIDI_VC_t * vc,
                                       MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * buflen /* out */ ,
                                       MPID_Request ** rreqp /* out */)
 {
-    MPID_nem_ib_pkt_prefix_t *netmod_pkt = (MPID_nem_ib_pkt_prefix_t *) pkt;
     MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
     int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
     MPID_Request *req = NULL;
     MPIDI_CH3_Pkt_accum_t *accum_pkt =
         (MPIDI_CH3_Pkt_accum_t *) ((uint8_t *) pkt + sizeof(MPIDI_CH3_Pkt_t));
@@ -1917,9 +1908,6 @@ int MPID_nem_ib_PktHandler_Accumulate(MPIDI_VC_t * vc,
 
     /* ref. MPIDI_CH3_PktHandler_Accumulate */
     MPI_Aint true_lb, true_extent, extent;
-    int complete = 0;
-    char *data_buf = NULL;
-    MPIDI_msg_sz_t data_len;
     MPI_Aint type_size;
     MPID_Win *win_ptr;
 
@@ -2044,13 +2032,7 @@ int MPID_nem_ib_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPID_nem_ib_pkt_prefix_t *netmod_pkt = (MPID_nem_ib_pkt_prefix_t *) pkt;
     MPIDI_CH3_Pkt_get_t *ch3_pkt =
         (MPIDI_CH3_Pkt_get_t *) ((uint8_t *) pkt + sizeof(MPID_nem_ib_pkt_prefix_t));
-    MPID_Request *rreq;
-    int found;
-    int complete;
-    char *data_buf;
-    MPIDI_msg_sz_t data_len;
     int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_GET);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_GET);
     /* Update occupation status of local SR (send request) queue */
@@ -2103,10 +2085,8 @@ int MPID_nem_ib_PktHandler_GetResp(MPIDI_VC_t * vc,
                                    MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * buflen /* out */ ,
                                    MPID_Request ** rreqp /* out */)
 {
-    MPID_nem_ib_pkt_prefix_t *netmod_pkt = (MPID_nem_ib_pkt_prefix_t *) pkt;
     MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
     int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
     MPID_Request *req = NULL;
     MPIDI_CH3_Pkt_get_resp_t *get_resp_pkt =
         (MPIDI_CH3_Pkt_get_resp_t *) ((uint8_t *) pkt + sizeof(MPIDI_CH3_Pkt_t));
@@ -2178,7 +2158,6 @@ int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc,
                                      MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
 {
     int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
     MPID_nem_ib_pkt_lmt_get_done_t *const done_pkt = (MPID_nem_ib_pkt_lmt_get_done_t *) pkt;
     MPID_Request *req;
     MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
@@ -2254,9 +2233,7 @@ int MPID_nem_ib_PktHandler_req_seq_num(MPIDI_VC_t * vc,
                                        MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
 {
     int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
     MPID_nem_ib_pkt_req_seq_num_t *const req_pkt = (MPID_nem_ib_pkt_req_seq_num_t *) pkt;
-    MPID_Request *req;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REQ_SEQ_NUM);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REQ_SEQ_NUM);
     /* mark as all of the message is read */
@@ -2294,9 +2271,7 @@ int MPID_nem_ib_PktHandler_reply_seq_num(MPIDI_VC_t * vc,
                                          MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
 {
     int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
     MPID_nem_ib_pkt_reply_seq_num_t *const reply_pkt = (MPID_nem_ib_pkt_reply_seq_num_t *) pkt;
-    MPID_Request *req;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REPLY_SEQ_NUM);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REPLY_SEQ_NUM);
     /* mark as all of the message is consumed */
@@ -2337,7 +2312,6 @@ int MPID_nem_ib_PktHandler_change_rdmabuf_occupancy_notify_state
     int ibcom_errno;
     MPID_nem_ib_pkt_change_rdmabuf_occupancy_notify_state_t *const reply_pkt =
         (MPID_nem_ib_pkt_change_rdmabuf_occupancy_notify_state_t *) pkt;
-    MPID_Request *req;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_CHANGE_RDMABUF_OCCUPANCY_NOTIFY_STATE);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_CHANGE_RDMABUF_OCCUPANCY_NOTIFY_STATE);
     /* mark as all of the message is read */
@@ -2373,7 +2347,6 @@ int MPID_nem_ib_pkt_rma_lmt_getdone(MPIDI_VC_t * vc,
                                     MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
 {
     int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
     MPID_nem_ib_pkt_lmt_get_done_t *const done_pkt = (MPID_nem_ib_pkt_lmt_get_done_t *) pkt;
     MPID_Request *req;
     int req_type;
@@ -2885,7 +2858,6 @@ int MPID_nem_ib_cm_poll_syn()
     int mpi_errno = MPI_SUCCESS;
     int ibcom_errno;
     int ib_port = 1;
-    int i;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_POLL_SYN);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_POLL_SYN);
     /* scratch pad is freed after receiving CLOSE */
@@ -2986,7 +2958,7 @@ int MPID_nem_ib_cm_poll_syn()
                 if (is_synack) {
                     MPID_NEM_IB_CM_COMPOSE_SYNACK(cmd, req, syn->initiator_req);
                     dprintf
-                        ("cm_poll_syn,composing synack,responder_req=%p,cmd->rmem=%lx,rkey=%08x,ringbuf_nslot=%d,remote_vc=%lx\n",
+                        ("cm_poll_syn,composing synack,responder_req=%p,cmd->rmem=%p,rkey=%08x,ringbuf_nslot=%d,remote_vc=%p\n",
                          cmd->responder_req, cmd->rmem, cmd->rkey, cmd->ringbuf_nslot,
                          cmd->remote_vc);
                     cmd->initiator_ringbuf_index = req->initiator_ringbuf_index =
@@ -3197,7 +3169,7 @@ int MPID_nem_ib_cm_poll()
                         icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
                     MPID_NEM_IB_CM_COMPOSE_ACK1(cmd, req, synack->responder_req);
                     dprintf
-                        ("cm_poll,composing ack1,cmd->responder_req=%p,cmd->rmem=%lx,rkey=%08x,ringbuf_nslot=%d,remote_vc=%lx\n",
+                        ("cm_poll,composing ack1,cmd->responder_req=%p,cmd->rmem=%p,rkey=%08x,ringbuf_nslot=%d,remote_vc=%p\n",
                          cmd->responder_req, cmd->rmem, cmd->rkey, cmd->ringbuf_nslot,
                          cmd->remote_vc);
                     MPID_nem_ib_cm_cmd_shadow_t *shadow = (MPID_nem_ib_cm_cmd_shadow_t *)
@@ -3237,6 +3209,7 @@ int MPID_nem_ib_cm_poll()
                 MPID_nem_ib_cm_req_t *req = (MPID_nem_ib_cm_req_t *) synack->initiator_req;
                 dprintf
                     ("cm_poll,synack detected!,responder_req=%p,responder_rank=%d,ringbuf_index=%d,tx=%d\n",
+                     synack->responder_req, req->responder_rank, synack->initiator_ringbuf_index,
                      req->ibcom->outstanding_connection_tx);
                 /* These mean the end of CM-op, so decrement here. */
                 req->ibcom->outstanding_connection_tx -= 1;
@@ -3439,7 +3412,6 @@ int MPID_nem_ib_cm_poll()
 int MPID_nem_ib_ringbuf_alloc(MPIDI_VC_t * vc)
 {
     int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
     int i;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_ALLOC);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_ALLOC);
@@ -3517,7 +3489,6 @@ int MPID_nem_ib_ringbuf_alloc(MPIDI_VC_t * vc)
 int MPID_nem_ib_ringbuf_free(MPIDI_VC_t * vc)
 {
     int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
     int i;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_FREE);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_FREE);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
index 7eeabee..bbabe11 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
@@ -34,8 +34,10 @@ static int ref_count;
 typedef struct {
     char *next;
 } free_list_t;
+#if 0
 static char *free_list_front[MPID_NEM_IB_NIALLOCID] = { 0 };
 static char *arena_flist[MPID_NEM_IB_NIALLOCID] = { 0 };
+#endif
 
 #define MPID_NEM_IB_SZARENA 4096
 #define MPID_NEM_IB_CLUSTER_SIZE (MPID_NEM_IB_SZARENA/sz)
@@ -157,7 +159,7 @@ static inline void __lru_queue_display()
              p != (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) &MPID_nem_ib_com_reg_mr_cache[i];
              p = (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) p->lru_next) {
             if (p && p->addr) {
-                dprintf("-------- p=%p,addr=%p,len=%d,refc=%d,lru_next=%p\n", p, p->addr, p->len,
+                dprintf("-------- p=%p,addr=%p,len=%ld,refc=%d,lru_next=%p\n", p, p->addr, p->len,
                         p->refc, p->lru_next);
             }
             else {
@@ -201,7 +203,7 @@ struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
 #endif
     key = MPID_nem_ib_com_hash_func(addr);
 
-    dprintf("[MrCache] addr=%p, len=%d\n", addr, len);
+    dprintf("[MrCache] addr=%p, len=%ld\n", addr, len);
     dprintf("[MrCache] aligned addr=%p, len=%ld\n", addr_aligned, len_aligned);
 
     //__lru_queue_display();
@@ -342,6 +344,7 @@ struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
 #endif
 }
 
+#if 0
 static void MPID_nem_ib_com_reg_mr_dereg(struct ibv_mr *mr)
 {
 
@@ -354,6 +357,7 @@ static void MPID_nem_ib_com_reg_mr_dereg(struct ibv_mr *mr)
     //dprintf("MPID_nem_ib_com_reg_mr_dereg,entry=%p,mr=%p,addr=%p,refc=%d,offset=%lx\n", e, mr, e->mr->addr,
     //e->refc, offset);
 }
+#endif
 
 int MPID_nem_ib_com_register_cache_init()
 {
@@ -406,7 +410,7 @@ int MPID_nem_ib_com_register_cache_release()
              MPID_nem_ib_com_reg_mr_cache[i].lru_next;
              p !=
              (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) &MPID_nem_ib_com_reg_mr_cache[i];) {
-            if (p && p->addr > 0) {
+            if (p && p->addr) {
                 ib_errno = MPID_nem_ib_com_dereg_mr(p->mr);
                 MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, printf("MPID_nem_ib_com_dereg_mr"));
                 struct MPID_nem_ib_com_reg_mr_cache_entry_t *p_old = p;
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
index 50b21eb..66ef497 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
@@ -171,7 +171,9 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
 
     /* send RDMA-write-to buffer occupancy information */
     /* embed SR occupancy information and remember the last one sent */
+#if 0
     MPIDI_CH3_Pkt_t *ch3_hdr = (MPIDI_CH3_Pkt_t *) hdr;
+#endif
     if (MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent)
         > notify_rate) {
 #if 0   /* debug, disabling piggy-back */
@@ -295,7 +297,9 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
         dprintf("isendcontig_core,MPIDI_CH3_PKT_ACCUMULATE,ref_count=%d\n", sreq->ref_count);
     }
 
+#ifdef MPID_NEM_IB_DEBUG_SEND
     int msg_type = MPIDI_Request_get_msg_type(sreq);
+#endif
 
     dprintf
         ("isendcontig_core,sreq=%p,prefix=%p,sz_prefix=%d,hdr=%p,sz_hdr=%ld,data=%p,sz_data=%d,remote_ringbuf->type=%d\n",
@@ -441,7 +445,9 @@ int MPID_nem_ib_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
                             MPIDI_msg_sz_t hdr_sz, void *data, MPIDI_msg_sz_t data_sz)
 {
     int mpi_errno = MPI_SUCCESS;
+#if 0
     int ibcom_errno;
+#endif
     MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_ISENDCONTIG);
@@ -675,9 +681,11 @@ int MPID_nem_ib_iStartContigMsg(MPIDI_VC_t * vc, void *hdr, MPIDI_msg_sz_t hdr_s
 {
     MPID_Request *sreq = NULL;
     int mpi_errno = MPI_SUCCESS;
+#if 0
     int ibcom_errno;
     MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
     int sseq_num;
+#endif
     //uint64_t tscs, tsce;
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_ISTARTCONTIGMSG);
@@ -935,8 +943,10 @@ int MPID_nem_ib_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
                               MPIDI_msg_sz_t hdr_sz)
 {
     int mpi_errno = MPI_SUCCESS;
+#if 0
     int ibcom_errno;
     MPIDI_msg_sz_t last;
+#endif
     MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_SENDNONCONTIG);
@@ -1047,12 +1057,11 @@ int MPID_nem_ib_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
 int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
 {
     int mpi_errno = MPI_SUCCESS;
+#if 0
     int ibcom_errno;
+#endif
     MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
-    MPID_IOV *iov;
-    int n_iov;
     MPID_Request *sreq, *prev_sreq;
-    int again = 0;
     int req_type, msg_type;
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_SEND_PROGRESS);
@@ -1208,8 +1217,10 @@ int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
                         /* send current rsr_seq_num_tail because message from target to initiator
                          * might have happened while being queued */
                     case MPIDI_NEM_PKT_LMT_RTS:{
+#if 0
                             MPID_nem_ib_lmt_cookie_t *s_cookie_buf =
                                 (MPID_nem_ib_lmt_cookie_t *) sreq->dev.iov[1].MPID_IOV_BUF;
+#endif
                             dprintf("send_progress,MPIDI_NEM_PKT_LMT_RTS,rsr_seq_num_tail=%d\n",
                                     vc_ib->ibcom->rsr_seq_num_tail);
 #if 0   /* moving to packet header */
@@ -1223,8 +1234,10 @@ int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
                         }
 
                     case MPIDI_NEM_PKT_LMT_CTS:{
+#if 0
                             MPID_nem_ib_lmt_cookie_t *s_cookie_buf =
                                 (MPID_nem_ib_lmt_cookie_t *) sreq->dev.iov[1].MPID_IOV_BUF;
+#endif
                             dprintf("send_progress,MPIDI_NEM_PKT_LMT_CTS,rsr_seq_num_tail=%d\n",
                                     vc_ib->ibcom->rsr_seq_num_tail);
 #if 0   /* moving to packet header */
@@ -1335,7 +1348,7 @@ int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
             }
 
             /* save sreq->dev.next (and sreq) because decrementing reference-counter might free sreq */
-            MPID_Request *tmp_sreq = sreq;
+            //MPID_Request *tmp_sreq = sreq;
             sreq = MPID_nem_ib_sendq_next(sreq);
             goto next_unlinked;
             //next:
@@ -1366,7 +1379,6 @@ int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
 int MPID_nem_ib_cm_progress()
 {
     int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
     MPID_nem_ib_cm_req_t *sreq, *prev_sreq;
     MPID_nem_ib_cm_cmd_shadow_t *shadow;
     int is_established = 0;
@@ -1619,7 +1631,6 @@ int MPID_nem_ib_cm_cas_core(int rank, MPID_nem_ib_cm_cmd_shadow_t * shadow)
 {
     int mpi_errno = MPI_SUCCESS;
     int ibcom_errno;
-    int val;
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_CAS_CORE);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_CAS_CORE);
@@ -1655,7 +1666,6 @@ int MPID_nem_ib_cm_cas(MPIDI_VC_t * vc, uint32_t ask_on_connect)
 {
     int mpi_errno = MPI_SUCCESS;
     int ibcom_errno;
-    int val;
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_CAS);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_CAS);
@@ -1718,8 +1728,6 @@ int MPID_nem_ib_cm_cmd_core(int rank, MPID_nem_ib_cm_cmd_shadow_t * shadow, void
 {
     int mpi_errno = MPI_SUCCESS;
     int ibcom_errno;
-    int val;
-    MPID_nem_ib_cm_cmd_t cmd;
     int ib_port = 1;
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_CMD_CORE);
@@ -1905,7 +1913,6 @@ int MPID_nem_ib_ringbuf_ask_fetch(MPIDI_VC_t * vc)
 {
     int mpi_errno = MPI_SUCCESS;
     int ibcom_errno;
-    int val;
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_FETCH);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_FETCH);
@@ -1971,7 +1978,6 @@ int MPID_nem_ib_ringbuf_ask_cas_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_sh
 {
     int mpi_errno = MPI_SUCCESS;
     int ibcom_errno;
-    int val;
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_CAS_CORE);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_CAS_CORE);
@@ -2010,8 +2016,6 @@ int MPID_nem_ib_ringbuf_ask_cas_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_sh
 int MPID_nem_ib_ringbuf_ask_cas(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_req_t * req)
 {
     int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
-    int val;
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_CAS);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_CAS);
@@ -2081,7 +2085,6 @@ int MPID_nem_ib_ringbuf_ask_cas(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_req_t * req
 int MPID_nem_ib_ringbuf_progress()
 {
     int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
     MPID_nem_ib_ringbuf_req_t *sreq, *prev_sreq;
     MPID_nem_ib_ringbuf_cmd_shadow_t *shadow;
 
@@ -2164,7 +2167,7 @@ int MPID_nem_ib_ringbuf_progress()
             }
 
             /* save sreq->dev.next (and sreq) because decrementing reference-counter might free sreq */
-            MPID_nem_ib_ringbuf_req_t *tmp_sreq = sreq;
+            //MPID_nem_ib_ringbuf_req_t *tmp_sreq = sreq;
             sreq = MPID_nem_ib_ringbuf_sendq_next(sreq);
 
             goto next_unlinked;

http://git.mpich.org/mpich.git/commitdiff/19c00389e4915c68fef68d3e61b50bbded1ad46b

commit 19c00389e4915c68fef68d3e61b50bbded1ad46b
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date:   Wed Jul 2 15:56:04 2014 +0900

    Fix netmod-IB to pass RMA-test
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt b/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
index 9154e1e..82f58f8 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
@@ -35,6 +35,7 @@
 **MPID_nem_ib_com_wr_scratch_pad:MPID_nem_ib_com_wr_scratch_pad failed
 **MPID_nem_ib_drain_scq:MPID_nem_ib_drain_scq failed
 **MPID_nem_ib_drain_scq_scratch_pad:MPID_nem_ib_drain_scq_scratch_pad failed
+**MPID_nem_ib_handle_pkt_bh:MPID_nem_ib_handle_pkt_bh failed
 **MPID_nem_ib_kvs_put_binary:MPID_nem_ib_kvs_put_binary failed
 **MPID_nem_ib_lmt_done_recv:MPID_nem_ib_lmt_done_recv failed
 **MPID_nem_ib_lmt_done_send:MPID_nem_ib_lmt_done_send failed
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
index 3be6a44..37f6bdc 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
@@ -553,6 +553,8 @@ int MPID_nem_ib_drain_scq_scratch_pad(void);
 int MPID_nem_ib_poll(int in_blocking_poll);
 int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t * ringbuf);
 int MPID_nem_ib_ring_alloc(MPIDI_VC_t * vc);
+int MPID_nem_ib_handle_pkt_bh(MPIDI_VC_t * vc, MPID_Request * req, char *buf,
+                              MPIDI_msg_sz_t buflen);
 
 int MPID_nem_ib_cm_drain_scq(void);
 int MPID_nem_ib_cm_drain_rcq(void);
@@ -689,6 +691,14 @@ typedef struct {
     uint8_t tail;               /* last word of payload */
 } MPID_nem_ib_lmt_cookie_t;
 
+typedef struct {
+    void *addr;
+    uint32_t rkey;
+    uint8_t tail;               /* last word of payload */
+    int len;
+    MPI_Request sender_req_id;  /* request id of sender side */
+} MPID_nem_ib_rma_lmt_cookie_t;
+
 typedef enum MPID_nem_ib_pkt_subtype {
     MPIDI_NEM_IB_PKT_EAGER_SEND,
 #if 0                           /* modification of mpid_nem_lmt.c is required */
@@ -702,6 +712,7 @@ typedef enum MPID_nem_ib_pkt_subtype {
     MPIDI_NEM_IB_PKT_REQ_SEQ_NUM,
     MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM,
     MPIDI_NEM_IB_PKT_CHG_RDMABUF_OCC_NOTIFY_STATE,
+    MPIDI_NEM_IB_PKT_RMA_LMT_GET_DONE,
     MPIDI_NEM_IB_PKT_NUM_PKT_HANDLERS
 } MPID_nem_ib_pkt_subtype_t;
 
@@ -778,6 +789,9 @@ int MPID_nem_ib_PktHandler_change_rdmabuf_occupancy_notify_state(MPIDI_VC_t * vc
                                                                  MPIDI_CH3_Pkt_t * pkt,
                                                                  MPIDI_msg_sz_t * buflen,
                                                                  MPID_Request ** rreqp);
+int MPID_nem_ib_pkt_rma_lmt_getdone(MPIDI_VC_t * vc,
+                                    MPIDI_CH3_Pkt_t * pkt,
+                                    MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp);
 
 /* MPID_nem_ib_PktHandler_lmt_done is a wrapper of pkt_DONE_handler and calls it */
 /* pkt_DONE_handler (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c) is not exported */
@@ -885,6 +899,30 @@ int pkt_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * bu
         }                                                                                                       \
     } while (0)
 
+#define MPID_nem_ib_lmt_send_PKT_LMT_DONE(vc, rreq) do {                                                                   \
+        MPID_PKT_DECL_CAST(_upkt, MPID_nem_ib_pkt_lmt_get_done_t, _done_pkt);                                          \
+        MPID_Request *_done_req;                                                                                \
+                                                                                                                \
+        MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending rndv DONE packet"); \
+        MPIDI_Pkt_init(_done_pkt, MPIDI_NEM_PKT_NETMOD); \
+        _done_pkt->subtype = MPIDI_NEM_IB_PKT_RMA_LMT_GET_DONE;\
+        _done_pkt->req_id = (rreq)->ch.lmt_req_id; \
+            /* embed SR occupancy information */ \
+        _done_pkt->seq_num_tail = VC_FIELD(vc, ibcom->rsr_seq_num_tail); \
+ \
+            /* remember the last one sent */ \
+        VC_FIELD(vc, ibcom->rsr_seq_num_tail_last_sent) = VC_FIELD(vc, ibcom->rsr_seq_num_tail); \
+                                                                                                                \
+        mpi_errno = MPIDI_CH3_iStartMsg((vc), _done_pkt, sizeof(*_done_pkt), &_done_req);                       \
+        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_lmt_send_GET_DONE");                                  \
+        if (_done_req != NULL)                                                                                  \
+        {                                                                                                       \
+            MPIU_ERR_CHKANDJUMP(_done_req->status.MPI_ERROR, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_lmt_send_GET_DONE");            \
+            MPID_Request_release(_done_req);                                                                    \
+            dprintf("send_get_done,release,req=%p\n", _done_req);       \
+        }                                                                                                       \
+    } while (0)
+
 /* Allocator for packing buffer for non-contiguous data
    - Allocate performs dequeue
      - Slow to "malloc" (two load and one store instructions)
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
index 1d9fa9d..3a3d21a 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
@@ -960,6 +960,7 @@ int MPID_nem_ib_vc_init(MPIDI_VC_t * vc)
     MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM] = MPID_nem_ib_PktHandler_reply_seq_num;
     MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_CHG_RDMABUF_OCC_NOTIFY_STATE] =
         MPID_nem_ib_PktHandler_change_rdmabuf_occupancy_notify_state;
+    MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_RMA_LMT_GET_DONE] = MPID_nem_ib_pkt_rma_lmt_getdone;
 
     /* register CH3 send/recv functions */
     vc_ch->iStartContigMsg = MPID_nem_ib_iStartContigMsg;
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
index 4d931e6..d087a5a 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
@@ -6,6 +6,7 @@
  */
 
 #include "ib_impl.h"
+#include "mpidrma.h"
 
 //#define MPID_NEM_IB_DEBUG_POLL
 #ifdef dprintf  /* avoid redefinition with src/mpid/ch3/include/mpidimpl.h */
@@ -168,6 +169,16 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
                 }
             }
 
+            /* As for request by PKT_PUT, both req->type and req->comm are not set.
+             * If receiver's data type is derived-type, req->dev.datatype_ptr is set.
+             */
+            if ((*req->cc_ptr == 1) && (req_type == 0) && !req->comm) {
+                if (req->dev.datatype_ptr && (req->dev.segment_size > 0) &&
+                    REQ_FIELD(req, lmt_pack_buf)) {
+                    MPIU_Free(REQ_FIELD(req, lmt_pack_buf));
+                }
+            }
+
             /* decrement the number of entries in IB command queue */
             vc_ib->ibcom->ncom -= 1;
             MPID_nem_ib_ncqe -= 1;
@@ -260,6 +271,12 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
             MPID_nem_ib_ncqe -= 1;
             MPID_nem_ib_rdmawr_from_free(REQ_FIELD(req, buf_from), REQ_FIELD(req, buf_from_sz));
 
+            /* this request may be from Noncontig */
+            if ((*req->cc_ptr == 1) && req->dev.datatype_ptr && (req->dev.segment_size > 0) &&
+                REQ_FIELD(req, lmt_pack_buf)) {
+                MPIU_Free(REQ_FIELD(req, lmt_pack_buf));
+            }
+
             dprintf("drain_scq,GET_RESP,ncqe=%d\n", MPID_nem_ib_ncqe);
             MPIU_Assert(req->ref_count == 1 || req->ref_count == 2);
 
@@ -415,6 +432,183 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
             //MPID_NEM_IB_SEND_PROGRESS_POLLINGSET
             //}
         }
+        else if (req_type == 13 && cqe[i].opcode == IBV_WC_RDMA_READ) {
+            MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
+
+            /* end of packet */
+            if (req_wrap->mf == 0) {
+                MPIDI_msg_sz_t data_len = req->ch.lmt_data_sz;
+                MPI_Aint type_size;
+
+                MPID_Datatype_get_size_macro(req->dev.datatype, type_size);
+                req->dev.recv_data_sz = type_size * req->dev.user_count;
+
+                int complete = 0;
+                mpi_errno =
+                    MPIDI_CH3U_Receive_data_found(req, REQ_FIELD(req, lmt_pack_buf), &data_len,
+                                                  &complete);
+
+                /* Data receive must be completed */
+                MPIU_Assert(complete == TRUE);
+
+                MPIU_Free(REQ_FIELD(req, lmt_pack_buf));
+
+                MPID_nem_ib_lmt_send_PKT_LMT_DONE(req->ch.vc, req);
+                MPIDI_CH3U_Request_complete(req);
+            }
+
+            /* decrement the number of entries in IB command queue */
+            vc_ib->ibcom->ncom -= 1;
+            MPID_nem_ib_ncqe -= 1;
+
+            MPIU_Free(req_wrap);
+        }
+        else if (req_type == MPIDI_REQUEST_TYPE_PUT_RESP && cqe[i].opcode == IBV_WC_RDMA_READ) {
+            MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
+
+            /* end of packet */
+            if (req_wrap->mf == 0) {
+                MPIDI_msg_sz_t data_len = req->ch.lmt_data_sz;
+                int complete = 0;
+                mpi_errno =
+                    MPIDI_CH3U_Receive_data_found(req, REQ_FIELD(req, lmt_pack_buf), &data_len,
+                                                  &complete);
+
+                /* Data receive must be completed */
+                MPIU_Assert(complete == TRUE);
+
+                MPIU_Free(REQ_FIELD(req, lmt_pack_buf));
+
+                complete = 0;
+                mpi_errno = MPIDI_CH3_ReqHandler_PutAccumRespComplete(req->ch.vc, req, &complete);      // call MPIDI_CH3U_Request_complete()
+                if (mpi_errno)
+                    MPIU_ERR_POP(mpi_errno);
+                MPIU_Assert(complete == TRUE);
+
+                MPID_nem_ib_lmt_send_PKT_LMT_DONE(req->ch.vc, req);
+                MPIDI_CH3U_Request_complete(req);
+            }
+
+            /* decrement the number of entries in IB command queue */
+            vc_ib->ibcom->ncom -= 1;
+            MPID_nem_ib_ncqe -= 1;
+
+            MPIU_Free(req_wrap);
+        }
+        else if (req_type == MPIDI_REQUEST_TYPE_PUT_RESP_DERIVED_DT &&
+                 cqe[i].opcode == IBV_WC_RDMA_READ) {
+            MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
+            /* end of packet */
+            if (req_wrap->mf == 0) {
+                MPIDI_msg_sz_t buflen = req->ch.lmt_data_sz;
+                char *buf = (char *) REQ_FIELD(req, lmt_pack_buf);
+                int complete = 0;
+                int dataloop_size = *(int *) req->dev.dtype_info;       /* copy from temp store area */
+
+                /* copy all of dtype_info and dataloop */
+                MPIU_Memcpy(req->dev.dtype_info, buf, sizeof(MPIDI_RMA_dtype_info));
+                MPIU_Memcpy(req->dev.dataloop, buf + sizeof(MPIDI_RMA_dtype_info), dataloop_size);
+
+
+                /* All dtype data has been received, call req handler */
+                mpi_errno =
+                    MPIDI_CH3_ReqHandler_PutRespDerivedDTComplete(req->ch.vc, req, &complete);
+                MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                                     "**ch3|postrecv %s", "MPIDI_CH3_PKT_PUT");
+                /* return 'complete == FALSE' */
+
+                buflen -= (sizeof(MPIDI_RMA_dtype_info) + dataloop_size);
+                buf += (sizeof(MPIDI_RMA_dtype_info) + dataloop_size);
+
+                mpi_errno = MPID_nem_ib_handle_pkt_bh(req->ch.vc, req, buf, buflen);
+                MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+                                    "**MPID_nem_ib_handle_pkt_bh");
+
+                MPID_nem_ib_lmt_send_PKT_LMT_DONE(req->ch.vc, req);
+
+                MPIU_Free(REQ_FIELD(req, lmt_pack_buf));
+                MPIDI_CH3U_Request_complete(req);
+            }
+
+            /* decrement the number of entries in IB command queue */
+            vc_ib->ibcom->ncom -= 1;
+            MPID_nem_ib_ncqe -= 1;
+
+            MPIU_Free(req_wrap);
+        }
+        else if (req_type == MPIDI_REQUEST_TYPE_ACCUM_RESP && cqe[i].opcode == IBV_WC_RDMA_READ) {
+            MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
+
+            /* end of packet */
+            if (req_wrap->mf == 0) {
+                MPIDI_msg_sz_t data_len = req->ch.lmt_data_sz;
+                int complete = 0;
+                mpi_errno =
+                    MPIDI_CH3U_Receive_data_found(req, REQ_FIELD(req, lmt_pack_buf), &data_len,
+                                                  &complete);
+
+                /* Data receive must be completed */
+                MPIU_Assert(complete == TRUE);
+
+                MPIU_Free(REQ_FIELD(req, lmt_pack_buf));
+
+                complete = 0;
+                mpi_errno = MPIDI_CH3_ReqHandler_PutAccumRespComplete(req->ch.vc, req, &complete);      // call MPIDI_CH3U_Request_complete()
+                if (mpi_errno)
+                    MPIU_ERR_POP(mpi_errno);
+                MPIU_Assert(complete == TRUE);
+
+                MPID_nem_ib_lmt_send_PKT_LMT_DONE(req->ch.vc, req);
+                MPIDI_CH3U_Request_complete(req);
+            }
+
+            /* decrement the number of entries in IB command queue */
+            vc_ib->ibcom->ncom -= 1;
+            MPID_nem_ib_ncqe -= 1;
+
+            MPIU_Free(req_wrap);
+        }
+        else if (req_type == MPIDI_REQUEST_TYPE_ACCUM_RESP_DERIVED_DT &&
+                 cqe[i].opcode == IBV_WC_RDMA_READ) {
+            MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
+            /* end of packet */
+            if (req_wrap->mf == 0) {
+                MPIDI_msg_sz_t buflen = req->ch.lmt_data_sz;
+                char *buf = (char *) REQ_FIELD(req, lmt_pack_buf);
+                int complete = 0;
+                int dataloop_size = *(int *) req->dev.dtype_info;       /* copy from temp store area */
+
+                /* copy all of dtype_info and dataloop */
+                MPIU_Memcpy(req->dev.dtype_info, buf, sizeof(MPIDI_RMA_dtype_info));
+                MPIU_Memcpy(req->dev.dataloop, buf + sizeof(MPIDI_RMA_dtype_info), dataloop_size);
+
+
+                /* All dtype data has been received, call req handler */
+                mpi_errno =
+                    MPIDI_CH3_ReqHandler_AccumRespDerivedDTComplete(req->ch.vc, req, &complete);
+                MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                                     "**ch3|postrecv %s", "MPIDI_CH3_ACCUMULATE");
+                /* return 'complete == FALSE' */
+
+                buflen -= (sizeof(MPIDI_RMA_dtype_info) + dataloop_size);
+                buf += (sizeof(MPIDI_RMA_dtype_info) + dataloop_size);
+
+                mpi_errno = MPID_nem_ib_handle_pkt_bh(req->ch.vc, req, buf, buflen);
+                MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+                                    "**MPID_nem_ib_handle_pkt_bh");
+
+                MPID_nem_ib_lmt_send_PKT_LMT_DONE(req->ch.vc, req);
+
+                MPIU_Free(REQ_FIELD(req, lmt_pack_buf));
+                MPIDI_CH3U_Request_complete(req);
+            }
+
+            /* decrement the number of entries in IB command queue */
+            vc_ib->ibcom->ncom -= 1;
+            MPID_nem_ib_ncqe -= 1;
+
+            MPIU_Free(req_wrap);
+        }
         else {
             printf("drain_scq,unknown kind=%d,req_type=%d,msg_type=%d\n", kind, req_type, msg_type);
             assert(0);
@@ -437,6 +631,71 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
     goto fn_exit;
 }
 
+/* bottom part of MPID_nem_handle_pkt() */
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_handle_pkt_bh
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_handle_pkt_bh(MPIDI_VC_t * vc, MPID_Request * req, char *buf, MPIDI_msg_sz_t buflen)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int complete = 0;
+
+    while (buflen && !complete) {
+        MPID_IOV *iov;
+        int n_iov;
+        iov = &req->dev.iov[req->dev.iov_offset];
+        n_iov = req->dev.iov_count;
+
+        while (n_iov && buflen >= iov->MPID_IOV_LEN) {
+            size_t iov_len = iov->MPID_IOV_LEN;
+            MPIU_Memcpy(iov->MPID_IOV_BUF, buf, iov_len);
+
+            buflen -= iov_len;
+            buf += iov_len;
+            --n_iov;
+            ++iov;
+        }
+
+        if (n_iov) {
+            if (buflen > 0) {
+                MPIU_Memcpy(iov->MPID_IOV_BUF, buf, buflen);
+                iov->MPID_IOV_BUF = (void *) ((char *) iov->MPID_IOV_BUF + buflen);
+                iov->MPID_IOV_LEN -= buflen;
+                buflen = 0;
+            }
+
+            req->dev.iov_offset = iov - req->dev.iov;
+            req->dev.iov_count = n_iov;
+        }
+        else {
+            int (*reqFn) (MPIDI_VC_t *, MPID_Request *, int *);
+
+            reqFn = req->dev.OnDataAvail;
+            if (!reqFn) {
+                MPIDI_CH3U_Request_complete(req);
+                complete = TRUE;
+            }
+            else {
+                mpi_errno = reqFn(vc, req, &complete);
+                if (mpi_errno)
+                    MPIU_ERR_POP(mpi_errno);
+            }
+
+            if (!complete) {
+                req->dev.iov_offset = 0;
+                MPIU_Assert(req->dev.iov_count > 0 &&
+                            req->dev.iov[req->dev.iov_offset].MPID_IOV_LEN > 0);
+            }
+        }
+    }
+  fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_DRAIN_SCQ);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
 #undef FUNCNAME
 #define FUNCNAME MPID_nem_ib_drain_scq_scratch_pad
 #undef FCNAME
@@ -1527,50 +1786,106 @@ int MPID_nem_ib_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                                MPID_Request ** rreqp /* out */)
 {
     MPID_nem_ib_pkt_prefix_t *netmod_pkt = (MPID_nem_ib_pkt_prefix_t *) pkt;
-    MPIDI_CH3_Pkt_put_t *ch3_pkt =
-        (MPIDI_CH3_Pkt_put_t *) ((uint8_t *) pkt + sizeof(MPID_nem_ib_pkt_prefix_t));
-    MPID_Request *rreq;
-    int found;
-    int complete;
-    char *data_buf;
-    MPIDI_msg_sz_t data_len;
+    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
     int mpi_errno = MPI_SUCCESS;
     int ibcom_errno;
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_PUT);
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_PUT);
-    /* Update occupation status of local SR (send request) queue */
-    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
-    dprintf
-        ("MPID_nem_ib_Pkthandler_Put,lsr_seq_num_tail=%d,put_pkt->seq_num_tail=%d\n",
-         vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
-    vc_ib->ibcom->lsr_seq_num_tail = netmod_pkt->seq_num_tail;
-    dprintf("MPID_nem_ib_Pkthandler_Put,lsr_seq_num_tail updated to %d\n",
-            vc_ib->ibcom->lsr_seq_num_tail);
-#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
-    /* change remote notification policy of RDMA-write-to buf */
-    dprintf("pkthandler,put,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
-    MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, &vc_ib->ibcom->lsr_seq_num_tail);
-    dprintf("pkthandler,put,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
-#endif
-    dprintf("pkthandler,put,sendq_empty=%d,ncom=%d,rdmabuf_occ=%d\n",
-            MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom,
-            MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
-    /* try to send from sendq because at least one RDMA-write-to buffer has been released */
-    dprintf("pkthandler,put,send_progress\n");
-    fflush(stdout);
-    MPID_NEM_IB_CHECK_AND_SEND_PROGRESS;
-    /* fall back to the original handler */
-    /* we don't need to worry about the difference caused by embedding seq_num
-     * because size of MPI-header of MPIDI_CH3_PKT_PUT equals to sizeof(MPIDI_CH3_Pkt_t)
-     * see MPID_nem_ib_iSendContig
-     */
-    MPIDI_msg_sz_t ch3_buflen = *buflen - sizeof(MPID_nem_ib_pkt_prefix_t);
-    mpi_errno = MPIDI_CH3_PktHandler_Put(vc, (MPIDI_CH3_Pkt_t *) ch3_pkt, &ch3_buflen, rreqp);
-    *buflen = ch3_buflen + sizeof(MPID_nem_ib_pkt_prefix_t);
-    if (mpi_errno) {
-        MPIU_ERR_POP(mpi_errno);
+    MPID_Request *req = NULL;
+    MPIDI_CH3_Pkt_put_t *put_pkt =
+        (MPIDI_CH3_Pkt_put_t *) ((uint8_t *) pkt + sizeof(MPIDI_CH3_Pkt_t));
+    MPID_nem_ib_rma_lmt_cookie_t *s_cookie_buf =
+        (MPID_nem_ib_rma_lmt_cookie_t *) ((uint8_t *) pkt + sizeof(MPIDI_CH3_Pkt_t) +
+                                          sizeof(MPIDI_CH3_Pkt_t));
+
+    /* ref. MPIDI_CH3_PktHandler_Put (= pktArray[MPIDI_CH3_PKT_PUT]) */
+    MPI_Aint type_size;
+
+    MPID_Win *win_ptr;
+
+    MPIU_Assert(put_pkt->target_win_handle != MPI_WIN_NULL);
+    MPID_Win_get_ptr(put_pkt->target_win_handle, win_ptr);
+    mpi_errno = MPIDI_CH3_Start_rma_op_target(win_ptr, put_pkt->flags);
+
+    req = MPID_Request_create();
+    MPIU_Object_set_ref(req, 1);        /* decrement only in drain_scq ? */
+    int incomplete;
+    MPIDI_CH3U_Request_increment_cc(req, &incomplete);  // decrement in drain_scq
+
+    req->dev.user_buf = put_pkt->addr;
+    req->dev.user_count = put_pkt->count;
+    req->dev.target_win_handle = put_pkt->target_win_handle;
+    req->dev.source_win_handle = put_pkt->source_win_handle;
+    req->dev.flags = put_pkt->flags;
+
+    if (MPIR_DATATYPE_IS_PREDEFINED(put_pkt->datatype)) {
+        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_PUT_RESP);
+        req->dev.datatype = put_pkt->datatype;
+
+        MPID_Datatype_get_size_macro(put_pkt->datatype, type_size);
+        req->dev.recv_data_sz = type_size * put_pkt->count;
+    }
+    else {
+        /* derived datatype */
+        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_PUT_RESP_DERIVED_DT);
+        req->dev.datatype = MPI_DATATYPE_NULL;
+
+        req->dev.dtype_info = (MPIDI_RMA_dtype_info *) MPIU_Malloc(sizeof(MPIDI_RMA_dtype_info));
+        req->dev.dataloop = MPIU_Malloc(put_pkt->dataloop_size);
+
+        /* We have to store the value of 'put_pkt->dataloop_size' which we use in drain_scq.
+         * Temporarily, put it in req->dev.dtype_info.
+         */
+        *(int *) req->dev.dtype_info = put_pkt->dataloop_size;
+    }
+
+    /* ref. pkt_RTS_handler (= pktArray[MPIDI_NEM_PKT_LMT_RTS]) */
+
+    void *write_to_buf;
+
+    req->ch.lmt_data_sz = s_cookie_buf->len;
+    req->ch.lmt_req_id = s_cookie_buf->sender_req_id;
+
+    REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc((size_t) req->ch.lmt_data_sz);
+    write_to_buf = REQ_FIELD(req, lmt_pack_buf);
+
+    /* stash vc for ib_poll */
+    req->ch.vc = vc;
+
+    REQ_FIELD(req, lmt_tail) = s_cookie_buf->tail;
+
+    /* try to issue RDMA-read command */
+    int slack = 1;              /* slack for control packet bringing sequence number */
+    if (MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
+        vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
+        MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack) {
+        mpi_errno = MPID_nem_ib_lmt_start_recv_core(req, s_cookie_buf->addr, s_cookie_buf->rkey, write_to_buf); /* fast path not storing raddr and rkey */
+        if (mpi_errno) {
+            MPIU_ERR_POP(mpi_errno);
+        }
+    }
+    else {
+        /* enqueue command into send_queue */
+        dprintf("lmt_start_recv, enqueuing,sendq_empty=%d,ncom=%d,ncqe=%d\n",
+                MPID_nem_ib_sendq_empty(vc_ib->sendq),
+                vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
+                MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
+
+        /* make raddr, (sz is in rreq->ch.lmt_data_sz), rkey, (user_buf is in req->dev.user_buf) survive enqueue, free cookie, dequeue */
+        REQ_FIELD(req, lmt_raddr) = s_cookie_buf->addr;
+        REQ_FIELD(req, lmt_rkey) = s_cookie_buf->rkey;
+        REQ_FIELD(req, lmt_write_to_buf) = write_to_buf;
+
+        /* set for send_progress */
+        MPIDI_Request_set_msg_type(req, MPIDI_REQUEST_RNDV_MSG);
+        req->kind = MPID_REQUEST_RECV;
+
+        MPID_nem_ib_sendq_enqueue(&vc_ib->sendq, req);
     }
 
+    /* prefix + header + data */
+    *buflen =
+        sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPID_nem_ib_rma_lmt_cookie_t);
+    *rreqp = NULL;
+
   fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_PKTHANDLER_PUT);
     return mpi_errno;
@@ -1590,51 +1905,124 @@ int MPID_nem_ib_PktHandler_Accumulate(MPIDI_VC_t * vc,
                                       MPID_Request ** rreqp /* out */)
 {
     MPID_nem_ib_pkt_prefix_t *netmod_pkt = (MPID_nem_ib_pkt_prefix_t *) pkt;
-    MPIDI_CH3_Pkt_accum_t *ch3_pkt =
-        (MPIDI_CH3_Pkt_accum_t *) ((uint8_t *) pkt + sizeof(MPID_nem_ib_pkt_prefix_t));
-    MPID_Request *rreq;
-    int found;
-    int complete;
-    char *data_buf;
-    MPIDI_msg_sz_t data_len;
+    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
     int mpi_errno = MPI_SUCCESS;
     int ibcom_errno;
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_ACCUMULATE);
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_ACCUMULATE);
-    /* Update occupation status of local SR (send request) queue */
-    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
-    dprintf
-        ("MPID_nem_ib_Pkthandler_Accumulate,lsr_seq_num_tail=%d,accum_pkt->seq_num_tail=%d\n",
-         vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
-    vc_ib->ibcom->lsr_seq_num_tail = netmod_pkt->seq_num_tail;
-    dprintf
-        ("MPID_nem_ib_Pkthandler_Accumulate,lsr_seq_num_tail updated to %d\n",
-         vc_ib->ibcom->lsr_seq_num_tail);
-#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
-    /* change remote notification policy of RDMA-write-to buf */
-    dprintf("pkthandler,put,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
-    MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, &vc_ib->ibcom->lsr_seq_num_tail);
-    dprintf("pkthandler,put,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
-#endif
-    dprintf("pkthandler,put,sendq_empty=%d,ncom=%d,rdmabuf_occ=%d\n",
-            MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom,
-            MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
-    /* try to send from sendq because at least one RDMA-write-to buffer has been released */
-    dprintf("pkthandler,put,send_progress\n");
-    fflush(stdout);
-    MPID_NEM_IB_CHECK_AND_SEND_PROGRESS
-        /* fall back to the original handler */
-        /* we don't need to worry about the difference caused by embedding seq_num
-         * because size of MPI-header of MPIDI_CH3_PKT_PUT equals to sizeof(MPIDI_CH3_Pkt_t)
-         * see MPID_nem_ib_iSendContig
+    MPID_Request *req = NULL;
+    MPIDI_CH3_Pkt_accum_t *accum_pkt =
+        (MPIDI_CH3_Pkt_accum_t *) ((uint8_t *) pkt + sizeof(MPIDI_CH3_Pkt_t));
+    MPID_nem_ib_rma_lmt_cookie_t *s_cookie_buf =
+        (MPID_nem_ib_rma_lmt_cookie_t *) ((uint8_t *) pkt + sizeof(MPIDI_CH3_Pkt_t) +
+                                          sizeof(MPIDI_CH3_Pkt_t));
+
+    /* ref. MPIDI_CH3_PktHandler_Accumulate */
+    MPI_Aint true_lb, true_extent, extent;
+    int complete = 0;
+    char *data_buf = NULL;
+    MPIDI_msg_sz_t data_len;
+    MPI_Aint type_size;
+    MPID_Win *win_ptr;
+
+    MPIU_Assert(accum_pkt->target_win_handle != MPI_WIN_NULL);
+    MPID_Win_get_ptr(accum_pkt->target_win_handle, win_ptr);
+    mpi_errno = MPIDI_CH3_Start_rma_op_target(win_ptr, accum_pkt->flags);
+
+    req = MPID_Request_create();
+    MPIU_Object_set_ref(req, 1);
+
+    int incomplete;
+    MPIDI_CH3U_Request_increment_cc(req, &incomplete);  // decrement in drain_scq
+
+    req->dev.user_count = accum_pkt->count;
+    req->dev.op = accum_pkt->op;
+    req->dev.real_user_buf = accum_pkt->addr;
+    req->dev.target_win_handle = accum_pkt->target_win_handle;
+    req->dev.source_win_handle = accum_pkt->source_win_handle;
+    req->dev.flags = accum_pkt->flags;
+
+    if (accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM) {
+        req->dev.resp_request_handle = accum_pkt->request_handle;
+    }
+    else {
+        req->dev.resp_request_handle = MPI_REQUEST_NULL;
+    }
+
+    if (MPIR_DATATYPE_IS_PREDEFINED(accum_pkt->datatype)) {
+        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RESP);
+        req->dev.datatype = accum_pkt->datatype;
+
+        MPIR_Type_get_true_extent_impl(accum_pkt->datatype, &true_lb, &true_extent);
+        MPID_Datatype_get_extent_macro(accum_pkt->datatype, extent);
+
+        /* Predefined types should always have zero lb */
+        MPIU_Assert(true_lb == 0);
+
+        req->dev.user_buf = MPIU_Malloc(accum_pkt->count * (MPIR_MAX(extent, true_extent)));
+
+        MPID_Datatype_get_size_macro(accum_pkt->datatype, type_size);
+        req->dev.recv_data_sz = type_size * accum_pkt->count;
+    }
+    else {
+        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RESP_DERIVED_DT);
+        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_AccumRespDerivedDTComplete;
+        req->dev.datatype = MPI_DATATYPE_NULL;
+
+        req->dev.dtype_info = (MPIDI_RMA_dtype_info *) MPIU_Malloc(sizeof(MPIDI_RMA_dtype_info));
+        req->dev.dataloop = MPIU_Malloc(accum_pkt->dataloop_size);
+
+        /* We have to store the value of 'put_pkt->dataloop_size' which we use in drain_scq.
+         * Temporarily, put it in req->dev.dtype_info.
          */
-        MPIDI_msg_sz_t ch3_buflen = *buflen - sizeof(MPID_nem_ib_pkt_prefix_t);
-    mpi_errno =
-        MPIDI_CH3_PktHandler_Accumulate(vc, (MPIDI_CH3_Pkt_t *) ch3_pkt, &ch3_buflen, rreqp);
-    *buflen = ch3_buflen + sizeof(MPID_nem_ib_pkt_prefix_t);
-    if (mpi_errno) {
-        MPIU_ERR_POP(mpi_errno);
+        *(int *) req->dev.dtype_info = accum_pkt->dataloop_size;
+    }
+
+    /* ref. pkt_RTS_handler (= pktArray[MPIDI_NEM_PKT_LMT_RTS]) */
+    void *write_to_buf;
+
+    req->ch.lmt_data_sz = s_cookie_buf->len;
+    req->ch.lmt_req_id = s_cookie_buf->sender_req_id;
+
+    REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc((size_t) req->ch.lmt_data_sz);
+    write_to_buf = REQ_FIELD(req, lmt_pack_buf);
+
+    /* stash vc for ib_poll */
+    req->ch.vc = vc;
+
+    REQ_FIELD(req, lmt_tail) = s_cookie_buf->tail;
+
+    /* try to issue RDMA-read command */
+    int slack = 1;              /* slack for control packet bringing sequence number */
+    if (MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
+        vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
+        MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack) {
+        mpi_errno = MPID_nem_ib_lmt_start_recv_core(req, s_cookie_buf->addr, s_cookie_buf->rkey, write_to_buf); /* fast path not storing raddr and rkey */
+        if (mpi_errno) {
+            MPIU_ERR_POP(mpi_errno);
+        }
     }
+    else {
+        /* enqueue command into send_queue */
+        dprintf("lmt_start_recv, enqueuing,sendq_empty=%d,ncom=%d,ncqe=%d\n",
+                MPID_nem_ib_sendq_empty(vc_ib->sendq),
+                vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
+                MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
+
+        /* make raddr, (sz is in rreq->ch.lmt_data_sz), rkey, (user_buf is in req->dev.user_buf) survive enqueue, free cookie, dequeue */
+        REQ_FIELD(req, lmt_raddr) = s_cookie_buf->addr;
+        REQ_FIELD(req, lmt_rkey) = s_cookie_buf->rkey;
+        REQ_FIELD(req, lmt_write_to_buf) = write_to_buf;
+
+        /* set for send_progress */
+        MPIDI_Request_set_msg_type(req, MPIDI_REQUEST_RNDV_MSG);
+        req->kind = MPID_REQUEST_RECV;
+
+        MPID_nem_ib_sendq_enqueue(&vc_ib->sendq, req);
+    }
+
+    /* prefix + header + data */
+    *buflen =
+        sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPID_nem_ib_rma_lmt_cookie_t);
+    *rreqp = NULL;
 
   fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_PKTHANDLER_ACCUMULATE);
@@ -1716,50 +2104,62 @@ int MPID_nem_ib_PktHandler_GetResp(MPIDI_VC_t * vc,
                                    MPID_Request ** rreqp /* out */)
 {
     MPID_nem_ib_pkt_prefix_t *netmod_pkt = (MPID_nem_ib_pkt_prefix_t *) pkt;
-    MPIDI_CH3_Pkt_get_t *ch3_pkt =
-        (MPIDI_CH3_Pkt_get_t *) ((uint8_t *) pkt + sizeof(MPID_nem_ib_pkt_prefix_t));
-    MPID_Request *rreq;
-    int found;
-    int complete;
-    char *data_buf;
-    MPIDI_msg_sz_t data_len;
+    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
     int mpi_errno = MPI_SUCCESS;
     int ibcom_errno;
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_GETRESP);
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_GETRESP);
-    /* Update occupation status of local SR (send request) queue */
-    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
-    dprintf
-        ("MPID_nem_ib_Pkthandler_GetResp,lsr_seq_num_tail=%d,get_pkt->seq_num_tail=%d\n",
-         vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
-    vc_ib->ibcom->lsr_seq_num_tail = netmod_pkt->seq_num_tail;
-    dprintf
-        ("MPID_nem_ib_Pkthandler_GetResp,lsr_seq_num_tail updated to %d\n",
-         vc_ib->ibcom->lsr_seq_num_tail);
-#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
-    /* change remote notification policy of RDMA-write-to buf */
-    dprintf("pkthandler,put,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
-    MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, &vc_ib->ibcom->lsr_seq_num_tail);
-    dprintf("pkthandler,put,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
-#endif
-    dprintf("pkthandler,put,sendq_empty=%d,ncom=%d,rdmabuf_occ=%d\n",
-            MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom,
-            MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
-    /* try to send from sendq because at least one RDMA-write-to buffer has been released */
-    dprintf("pkthandler,get,send_progress\n");
-    fflush(stdout);
-    MPID_NEM_IB_SEND_PROGRESS_POLLINGSET
-        /* fall back to the original handler */
-        /* we don't need to worry about the difference caused by embedding seq_num
-         * because size of MPI-header of MPIDI_CH3_PKT_PUT equals to sizeof(MPIDI_CH3_Pkt_t)
-         * see MPID_nem_ib_iSendContig
-         */
-        MPIDI_msg_sz_t ch3_buflen = *buflen - sizeof(MPID_nem_ib_pkt_prefix_t);
-    mpi_errno = MPIDI_CH3_PktHandler_GetResp(vc, (MPIDI_CH3_Pkt_t *) ch3_pkt, &ch3_buflen, rreqp);
-    *buflen = ch3_buflen + sizeof(MPID_nem_ib_pkt_prefix_t);
-    if (mpi_errno) {
-        MPIU_ERR_POP(mpi_errno);
+    MPID_Request *req = NULL;
+    MPIDI_CH3_Pkt_get_resp_t *get_resp_pkt =
+        (MPIDI_CH3_Pkt_get_resp_t *) ((uint8_t *) pkt + sizeof(MPIDI_CH3_Pkt_t));
+    MPID_nem_ib_rma_lmt_cookie_t *s_cookie_buf =
+        (MPID_nem_ib_rma_lmt_cookie_t *) ((uint8_t *) pkt + sizeof(MPIDI_CH3_Pkt_t) +
+                                          sizeof(MPIDI_CH3_Pkt_t));
+    MPID_Request_get_ptr(get_resp_pkt->request_handle, req);
+
+    void *write_to_buf;
+
+    req->ch.lmt_data_sz = s_cookie_buf->len;
+    req->ch.lmt_req_id = s_cookie_buf->sender_req_id;
+
+    REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc((size_t) req->ch.lmt_data_sz);
+    write_to_buf = REQ_FIELD(req, lmt_pack_buf);
+
+    /* This is magic number to pick up request in drain_scq */
+    MPIDI_Request_set_type(req, 13);    // currently Request-type is defined from 1 to 12.
+
+    /* stash vc for ib_poll */
+    req->ch.vc = vc;
+
+    REQ_FIELD(req, lmt_tail) = s_cookie_buf->tail;
+
+    /* try to issue RDMA-read command */
+    int slack = 1;              /* slack for control packet bringing sequence number */
+    if (MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
+        vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
+        MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack) {
+        mpi_errno = MPID_nem_ib_lmt_start_recv_core(req, s_cookie_buf->addr, s_cookie_buf->rkey, write_to_buf); /* fast path not storing raddr and rkey */
+        if (mpi_errno) {
+            MPIU_ERR_POP(mpi_errno);
+        }
     }
+    else {
+        /* enqueue command into send_queue */
+        dprintf("lmt_start_recv, enqueuing,sendq_empty=%d,ncom=%d,ncqe=%d\n",
+                MPID_nem_ib_sendq_empty(vc_ib->sendq),
+                vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
+                MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
+
+        /* make raddr, (sz is in rreq->ch.lmt_data_sz), rkey, (user_buf is in req->dev.user_buf) survive enqueue, free cookie, dequeue */
+        REQ_FIELD(req, lmt_raddr) = s_cookie_buf->addr;
+        REQ_FIELD(req, lmt_rkey) = s_cookie_buf->rkey;
+        REQ_FIELD(req, lmt_write_to_buf) = write_to_buf;
+
+        MPID_nem_ib_sendq_enqueue(&vc_ib->sendq, req);
+    }
+
+    /* prefix + header + data */
+    *buflen =
+        sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPID_nem_ib_rma_lmt_cookie_t);
+    *rreqp = NULL;
 
   fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_PKTHANDLER_GETRESP);
@@ -1964,6 +2364,48 @@ int MPID_nem_ib_PktHandler_change_rdmabuf_occupancy_notify_state
     goto fn_exit;
 }
 
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_pkt_rma_lmt_getdone
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_pkt_rma_lmt_getdone(MPIDI_VC_t * vc,
+                                    MPIDI_CH3_Pkt_t * pkt,
+                                    MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int ibcom_errno;
+    MPID_nem_ib_pkt_lmt_get_done_t *const done_pkt = (MPID_nem_ib_pkt_lmt_get_done_t *) pkt;
+    MPID_Request *req;
+    int req_type;
+
+    *buflen = sizeof(MPIDI_CH3_Pkt_t);
+    MPID_Request_get_ptr(done_pkt->req_id, req);
+
+    MPIU_THREAD_CS_ENTER(LMT,);
+
+    req_type = MPIDI_Request_get_type(req);
+    /* free memory area for cookie */
+    if (!req->ch.s_cookie) {
+        dprintf("lmt_done_send,enter,req->ch.s_cookie is zero");
+    }
+    MPIU_Free(req->ch.s_cookie);
+
+    if ((req_type == 0 && !req->comm) || (req_type == MPIDI_REQUEST_TYPE_GET_RESP)) {
+        if ((*req->cc_ptr == 1) && req->dev.datatype_ptr && (req->dev.segment_size > 0) &&
+            REQ_FIELD(req, lmt_pack_buf)) {
+            MPIU_Free(REQ_FIELD(req, lmt_pack_buf));
+        }
+    }
+    MPIDI_CH3U_Request_complete(req);
+
+    *rreqp = NULL;
+  fn_exit:
+    MPIU_THREAD_CS_EXIT(LMT,);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
 #ifdef MPID_NEM_IB_ONDEMAND
 #undef FUNCNAME
 #define FUNCNAME MPID_nem_ib_cm_drain_scq
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
index 28d9608..50b21eb 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
@@ -154,6 +154,8 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
     MPID_nem_ib_pkt_prefix_t pkt_netmod;
     void *prefix;
     int sz_prefix;
+    void *s_data;
+    int s_data_sz;
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_ISENDCONTIG_CORE);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_ISENDCONTIG_CORE);
@@ -215,6 +217,58 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
         sz_prefix = 0;
     }
 
+    s_data = data;
+    s_data_sz = data_sz;
+
+    if (hdr &&
+          ((((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_PUT)
+            || (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_GET_RESP)
+            || (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_ACCUMULATE))) {
+        /* If request length is too long, create LMT packet */
+        if (MPID_NEM_IB_NETMOD_HDR_SIZEOF(vc_ib->ibcom->local_ringbuf_type)
+               + sizeof(MPIDI_CH3_Pkt_t) + data_sz
+                 > MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_netmod_trailer_t)) {
+            pkt_netmod.type = MPIDI_NEM_PKT_NETMOD;
+
+            if (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_PUT)
+                pkt_netmod.subtype = MPIDI_NEM_IB_PKT_PUT;
+            else if (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_GET_RESP)
+                pkt_netmod.subtype = MPIDI_NEM_IB_PKT_GET_RESP;
+            else if (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_ACCUMULATE)
+                pkt_netmod.subtype = MPIDI_NEM_IB_PKT_ACCUMULATE;
+
+            void *write_from_buf = data;
+
+            MPID_nem_ib_rma_lmt_cookie_t *s_cookie_buf = (MPID_nem_ib_rma_lmt_cookie_t *) MPIU_Malloc(sizeof(MPID_nem_ib_rma_lmt_cookie_t));
+
+            sreq->ch.s_cookie = s_cookie_buf;
+
+            s_cookie_buf->tail = *((uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint8_t)));
+            /* put IB rkey */
+            struct ibv_mr *mr =
+                MPID_nem_ib_com_reg_mr_fetch(write_from_buf, data_sz, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
+            MPIU_ERR_CHKANDJUMP(!mr, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
+#ifdef HAVE_LIBDCFA
+            s_cookie_buf->addr = (void *) mr->host_addr;
+#else
+            s_cookie_buf->addr = write_from_buf;
+#endif
+            s_cookie_buf->rkey = mr->rkey;
+            s_cookie_buf->len = data_sz;
+            s_cookie_buf->sender_req_id = sreq->handle;
+
+	    /* set for ib_com_isend */
+	    prefix = (void *)&pkt_netmod;
+	    sz_prefix = sizeof(MPIDI_CH3_Pkt_t);
+	    s_data = (void *)s_cookie_buf;
+	    s_data_sz = sizeof(MPID_nem_ib_rma_lmt_cookie_t);
+
+	    /* Release Request, when sender receives DONE packet. */
+            int incomplete;
+            MPIDI_CH3U_Request_increment_cc(sreq, &incomplete); // decrement in drain_scq and pkt_rma_lmt_getdone
+        }
+    }
+
     /* packet handlers including MPIDI_CH3_PktHandler_EagerSend and MPID_nem_handle_pkt assume this */
     hdr_sz = sizeof(MPIDI_CH3_Pkt_t);
 
@@ -259,7 +313,7 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
                               (uint64_t) sreq,
                               prefix, sz_prefix,
                               hdr, hdr_sz,
-                              data, (int) data_sz,
+                              s_data, (int) s_data_sz,
                               &copied,
                               vc_ib->ibcom->local_ringbuf_type, vc_ib->ibcom->remote_ringbuf->type,
                               &REQ_FIELD(sreq, buf_from), &REQ_FIELD(sreq, buf_from_sz));
@@ -689,6 +743,15 @@ static int MPID_nem_ib_SendNoncontig_core(MPIDI_VC_t * vc, MPID_Request * sreq,
     MPIDI_msg_sz_t last;
     MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
 
+    void *prefix;
+    int prefix_sz;
+    void *data;
+    int data_sz;
+    MPID_nem_ib_pkt_prefix_t pkt_netmod;
+
+    prefix = NULL;
+    prefix_sz = 0;
+
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_SENDNONCONTIG_CORE);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_SENDNONCONTIG_CORE);
 
@@ -703,6 +766,58 @@ static int MPID_nem_ib_SendNoncontig_core(MPIDI_VC_t * vc, MPID_Request * sreq,
         MPIU_Assert(last == sreq->dev.segment_size);
     }
 
+    data = (void *)REQ_FIELD(sreq, lmt_pack_buf);
+    data_sz = last;
+
+    if (hdr &&
+          ((((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_PUT)
+            || (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_GET_RESP)
+            || (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_ACCUMULATE))) {
+	/* If request length is too long, create LMT packet */
+	if ( MPID_NEM_IB_NETMOD_HDR_SIZEOF(vc_ib->ibcom->local_ringbuf_type)
+               + sizeof(MPIDI_CH3_Pkt_t) + sreq->dev.segment_size
+                 > MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_netmod_trailer_t)) {
+            pkt_netmod.type = MPIDI_NEM_PKT_NETMOD;
+
+            if (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_PUT)
+                pkt_netmod.subtype = MPIDI_NEM_IB_PKT_PUT;
+            else if (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_GET_RESP)
+                pkt_netmod.subtype = MPIDI_NEM_IB_PKT_GET_RESP;
+            else if (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_ACCUMULATE)
+                pkt_netmod.subtype = MPIDI_NEM_IB_PKT_ACCUMULATE;
+
+            void *write_from_buf = REQ_FIELD(sreq, lmt_pack_buf);
+
+            MPID_nem_ib_rma_lmt_cookie_t *s_cookie_buf = (MPID_nem_ib_rma_lmt_cookie_t *) MPIU_Malloc(sizeof(MPID_nem_ib_rma_lmt_cookie_t));
+
+            sreq->ch.s_cookie = s_cookie_buf;
+
+            s_cookie_buf->tail = *((uint8_t *) ((uint8_t *) write_from_buf + last - sizeof(uint8_t)));
+            /* put IB rkey */
+            struct ibv_mr *mr =
+                MPID_nem_ib_com_reg_mr_fetch(write_from_buf, last, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
+            MPIU_ERR_CHKANDJUMP(!mr, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
+#ifdef HAVE_LIBDCFA
+            s_cookie_buf->addr = (void *) mr->host_addr;
+#else
+            s_cookie_buf->addr = write_from_buf;
+#endif
+            s_cookie_buf->rkey = mr->rkey;
+            s_cookie_buf->len = last;
+            s_cookie_buf->sender_req_id = sreq->handle;
+
+	    /* set for ib_com_isend */
+	    prefix = (void *)&pkt_netmod;
+	    prefix_sz = sizeof(MPIDI_CH3_Pkt_t);
+	    data = (void *)s_cookie_buf;
+	    data_sz = sizeof(MPID_nem_ib_rma_lmt_cookie_t);
+
+	    /* Release Request, when sender receives DONE packet. */
+            int incomplete;
+            MPIDI_CH3U_Request_increment_cc(sreq, &incomplete); // decrement in drain_scq and pkt_rma_lmt_getdone
+        }
+    }
+
     /* packet handlers assume this */
     hdr_sz = sizeof(MPIDI_CH3_Pkt_t);
 
@@ -724,9 +839,9 @@ static int MPID_nem_ib_SendNoncontig_core(MPIDI_VC_t * vc, MPID_Request * sreq,
     ibcom_errno =
         MPID_nem_ib_com_isend(vc_ib->sc->fd,
                               (uint64_t) sreq,
-                              NULL, 0,
+                              prefix, prefix_sz,
                               hdr, hdr_sz,
-                              (void *) REQ_FIELD(sreq, lmt_pack_buf), (int) last,
+                              data, data_sz,
                               &copied,
                               vc_ib->ibcom->local_ringbuf_type, vc_ib->ibcom->remote_ringbuf->type,
                               &REQ_FIELD(sreq, buf_from), &REQ_FIELD(sreq, buf_from_sz));

http://git.mpich.org/mpich.git/commitdiff/76e70960dfa6962fcf04bdf187647061c4e499ee

commit 76e70960dfa6962fcf04bdf187647061c4e499ee
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date:   Mon Jun 23 10:56:33 2014 +0900

    Fix the management of memory area for small size
    
    Instead of reusing a memory pool when all elements of the pool are freed,
    reuse an element when it is freed.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c
index 9458cb2..ebbfd26 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c
@@ -48,6 +48,8 @@ struct free_list {
     struct free_list *prev;
 };
 
+#define CHUNK (sizeof(struct free_list))
+
 static inline void list_init(struct free_list *head)
 {
     head->next = head;
@@ -192,7 +194,7 @@ static void __init_pool_header(struct pool_info *info, int i, int size)
 
 static void _local_malloc_initialize_hook(void)
 {
-    int i;
+    int i, j;
     char *aligned;
     size_t size;
     int count;
@@ -229,13 +231,24 @@ static void _local_malloc_initialize_hook(void)
 
         info = (struct pool_info *) aligned;
 
-        if (i <= MMAPED_OFFSET_POW)
+        if (i <= MMAPED_OFFSET_POW) {
             __init_pool_header_with_hole(info, i, size);
-        else
-            __init_pool_header(info, i, size);
 
-        /* add list tail */
-        list_add_tail(&(info->list), &arena_flist[i]);
+            int elem = (DEFAULT_POOL_SIZE - (info->hole_num * info->size)) / (CHUNK + info->size);
+            struct free_list *block_head = (struct free_list *) info->next_pos;
+            for (j = 0; j < elem; j++) {
+                if (((size_t) ((char *) block_head + CHUNK) & ((size_t) PAGE_SIZE - 1)) !=
+                    MMAPED_OFFSET) {
+                    list_add_tail(block_head, &arena_flist[i]);
+                }
+                block_head = (struct free_list *) ((char *) block_head + CHUNK + info->size);
+            }
+        }
+        else {
+            __init_pool_header(info, i, size);
+            /* add list tail */
+            list_add_tail(&(info->list), &arena_flist[i]);
+        }
 
         aligned += size;
     }
@@ -245,6 +258,7 @@ static void _local_malloc_initialize_hook(void)
 
 void *malloc(size_t size)
 {
+    int i;
     int pow;
     char *ptr = NULL;
 
@@ -291,21 +305,39 @@ void *malloc(size_t size)
 
             info = (struct pool_info *) tmp;
 
-            if (pow <= MMAPED_OFFSET_POW)
+            if (pow <= MMAPED_OFFSET_POW) {
                 __init_pool_header_with_hole(info, pow, alloc_sz);
-            else
-                __init_pool_header(info, pow, alloc_sz);
 
-            list_add_tail(&(info->list), &arena_flist[pow]);
+                int elem =
+                    (DEFAULT_POOL_SIZE - (info->hole_num * info->size)) / (CHUNK + info->size);
+                struct free_list *block_head = (struct free_list *) info->next_pos;
+                for (i = 0; i < elem; i++) {
+                    if (((size_t) ((char *) block_head + CHUNK) & ((size_t) PAGE_SIZE - 1)) !=
+                        MMAPED_OFFSET) {
+                        list_add_tail(block_head, &arena_flist[pow]);
+                    }
+                    block_head = (struct free_list *) ((char *) block_head + CHUNK + info->size);
+                }
 
-            ptr = info->next_pos;
-            info->next_pos += info->size;
+                /* use head elem */
+                struct free_list *info = (struct free_list *) (arena_flist[pow].next);
+                ptr = (char *) info + CHUNK;
+                dprintf("malloc(%lu) [2^%d] ==> USE pool %p\n", size, pow, ptr);
+                list_del(info);
+            }
+            else {
+                __init_pool_header(info, pow, alloc_sz);
+                list_add_tail(&(info->list), &arena_flist[pow]);
 
-            if (pow <= MMAPED_OFFSET_POW)
-                info->count++;
+                ptr = info->next_pos;
+                info->next_pos += info->size;
 
-            dprintf("malloc(%lu) [2^%d] ==> CREATE pool %p   use = %lu\n", size, pow, ptr,
-                    NUM_USED(info->next_pos, POOL_ALIGN_SIZE, info->size));
+                if (pow <= MMAPED_OFFSET_POW)
+                    info->count++;
+
+                dprintf("malloc(%lu) [2^%d] ==> CREATE pool %p   use = %lu\n", size, pow, ptr,
+                        NUM_USED(info->next_pos, POOL_ALIGN_SIZE, info->size));
+            }
         }
     }
     else {
@@ -319,7 +351,7 @@ void *malloc(size_t size)
 
             dprintf("malloc(%lu) [2^%d] ==> USE mmaped %p\n", size, pow, ptr);
         }
-        else {
+        else if (pow > MMAPED_OFFSET_POW) {
             struct pool_info *info = (struct pool_info *) (arena_flist[pow].next);
 
             ptr = info->next_pos;
@@ -332,15 +364,12 @@ void *malloc(size_t size)
             if (((size_t) info->next_pos & ~(POOL_ALIGN_SIZE - 1)) == (size_t) info->next_pos) {
                 list_del(&(info->list));
             }
-            else if (info->pow <= MMAPED_OFFSET_POW) {
-                info->count++;
-
-                if (info->count == info->num_per_page) {
-                    info->next_pos += (info->size * info->hole_num);
-                    info->count = info->hole_num;
-                    info->free_num += info->hole_num;
-                }
-            }
+        }
+        else {
+            char *info = (char *) (arena_flist[pow].next);
+            ptr = (char *) info + CHUNK;
+            dprintf("malloc(%lu) [2^%d] ==> USE pool %p\n", size, pow, ptr);
+            list_del((struct free_list *) info);
         }
     }
 
@@ -364,28 +393,28 @@ static inline void free_core(void *addr)
         struct pool_info *info =
             (struct pool_info *) ((size_t) addr & ~((size_t) POOL_ALIGN_SIZE - 1));
 
-        dprintf("free(%p) --> free POOL [2^%d] %lu / %u / %u (use / free / max)\n",
-                addr, info->pow,
-                NUM_USED(info->next_pos, POOL_ALIGN_SIZE, info->size),
-                info->free_num + 1, info->num);
-
-        info->free_num++;
-        if (info->free_num == info->num) {
-            /* intialize for reuse */
-            if (info->pow <= MMAPED_OFFSET_POW) {
-                info->count = info->hole_num;
-                info->free_num = info->hole_num;
-                info->next_pos = (char *) info + (info->size * info->hole_num);
-            }
-            else {
+        if (info->pow <= MMAPED_OFFSET_POW) {
+            struct free_list *block_head = (struct free_list *) ((size_t) addr - CHUNK);
+            list_add_head(block_head, &arena_flist[info->pow]);
+            dprintf("free(%p) --> free BLOCK [2^%d]\n", addr, info->pow);
+        }
+        else {
+            dprintf("free(%p) --> free POOL [2^%d] %lu / %u / %u (use / free / max)\n",
+                    addr, info->pow,
+                    NUM_USED(info->next_pos, POOL_ALIGN_SIZE, info->size),
+                    info->free_num + 1, info->num);
+
+            info->free_num++;
+            if (info->free_num == info->num) {
+                /* intialize for reuse */
                 info->free_num = 1;
                 info->next_pos = (char *) info + info->size;
-            }
 
-            list_add_tail(&(info->list), &arena_flist[info->pow]);
+                list_add_tail(&(info->list), &arena_flist[info->pow]);
 
-            dprintf("       POOL [2^%d]   ALL FREED -> add list [%p]\n", info->pow,
-                    &arena_flist[info->pow]);
+                dprintf("       POOL [2^%d]   ALL FREED -> add list [%p]\n", info->pow,
+                        &arena_flist[info->pow]);
+            }
         }
     }
 

http://git.mpich.org/mpich.git/commitdiff/51f6709f5422b015910b26bebe651ed835a77f9b

commit 51f6709f5422b015910b26bebe651ed835a77f9b
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date:   Thu Jun 19 13:12:04 2014 +0900

    Add queue to store outstanding_tx_empty command
    
    When the value of outstanding_connection_tx is not 0, it may not be
    possible to send the command that reports outstanding_tx_empty. Therefore,
    create a queue, enqueue the command, and send it later.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt b/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
index 210c2a5..9154e1e 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
@@ -8,6 +8,7 @@
 **MPID_nem_ib_cm_poll_syn:MPID_nem_ib_cm_poll_syn failed
 **MPID_nem_ib_cm_progress:MPID_nem_ib_cm_progress failed
 **MPID_nem_ib_cm_send_core:MPID_nem_ib_cm_send_core failed
+**MPID_nem_ib_cm_notify_send:MPID_nem_ib_cm_notify_send failed
 **MPID_nem_ib_com_alloc:MPID_nem_ib_com_alloc failed
 **MPID_nem_ib_com_cas_scratch_pad:MPID_nem_ib_com_cas_scratch_pad failed
 **MPID_nem_ib_com_close:MPID_nem_ib_com_close failed
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
index 74bc2d2..3be6a44 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
@@ -229,7 +229,14 @@ typedef struct {
 typedef struct {
     MPID_nem_ib_cm_cmd_type_t type;
     int initiator_rank;
-} MPID_nem_ib_cm_wr_send_t;
+} MPID_nem_ib_cm_notify_send_t;
+
+typedef struct MPID_nem_ib_cm_notify_send_req {
+    MPID_nem_ib_com_t *ibcom;
+    int my_rank;
+    int pg_rank;
+    struct MPID_nem_ib_cm_notify_send_req *sendq_next;
+} MPID_nem_ib_cm_notify_send_req_t;
 
 #define MPID_NEM_IB_CM_RELEASED ((uint64_t)(-1))
 #define MPID_NEM_IB_CM_OFF_SYN (256)    /* Align for 256-byte-write PCI command */
@@ -244,6 +251,14 @@ typedef GENERIC_Q_DECL(MPID_nem_ib_cm_req_t) MPID_nem_ib_cm_sendq_t;
 #define MPID_nem_ib_cm_sendq_next(ep) ((ep)->sendq_next)
 #define MPID_nem_ib_cm_sendq_enqueue(qp, ep) GENERICM_Q_ENQUEUE (qp, ep, MPID_nem_ib_cm_sendq_next_field, sendq_next);
 
+typedef GENERIC_Q_DECL(MPID_nem_ib_cm_notify_send_req_t) MPID_nem_ib_cm_notify_sendq_t;
+
+#define MPID_nem_ib_cm_notify_sendq_empty(q) GENERICM_Q_EMPTY (q)
+#define MPID_nem_ib_cm_notify_sendq_head(q) GENERICM_Q_HEAD (q)
+#define MPID_nem_ib_cm_notify_sendq_next_field(ep, next_field) ((ep)->next_field)
+#define MPID_nem_ib_cm_notify_sendq_next(ep) ((ep)->sendq_next)
+#define MPID_nem_ib_cm_notify_sendq_enqueue(qp, ep) GENERICM_Q_ENQUEUE (qp, ep, MPID_nem_ib_cm_notify_sendq_next_field, sendq_next);
+
 #ifdef HAVE_LIBDCFA
 #define MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO_MR_ADDR host_adddr
 #else
@@ -336,6 +351,9 @@ static inline void MPID_nem_ib_cm_request_release(MPID_nem_ib_cm_req_t * req)
 
 int MPID_nem_ib_cm_progress(void);
 int MPID_nem_ib_cm_release(uint16_t index);
+
+int MPID_nem_ib_cm_notify_send(int pg_rank, int myrank);
+int MPID_nem_ib_cm_notify_progress(void);
 #endif
 
 /* Ring buffer protocol
@@ -569,7 +587,6 @@ int MPID_nem_ib_ringbuf_ask_fetch(MPIDI_VC_t * vc);
 int MPID_nem_ib_ringbuf_ask_cas_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t * shadow,
                                      uint64_t head);
 int MPID_nem_ib_ringbuf_progress(void);
-int MPID_nem_ib_cm_wr_send(int pg_rank, int myrank);
 
 int MPID_nem_ib_ringbuf_alloc(MPIDI_VC_t * vc);
 int MPID_nem_ib_ringbuf_free(MPIDI_VC_t * vc);
@@ -630,6 +647,7 @@ extern uint64_t MPID_nem_ib_cm_ringbuf_released[(MPID_NEM_IB_CM_NSEG + 63) / 64]
 
 /* overflow queue when no more slots for responder to write on are available */
 extern MPID_nem_ib_cm_sendq_t MPID_nem_ib_cm_sendq;
+extern MPID_nem_ib_cm_notify_sendq_t MPID_nem_ib_cm_notify_sendq;
 
 extern MPID_nem_ib_ringbuf_sendq_t MPID_nem_ib_ringbuf_sendq;
 
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
index 21bd140..1d9fa9d 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
@@ -87,11 +87,13 @@ uint16_t MPID_nem_ib_cm_ringbuf_head;
 uint16_t MPID_nem_ib_cm_ringbuf_tail;
 uint64_t MPID_nem_ib_cm_ringbuf_released[(MPID_NEM_IB_CM_NSEG + 63) / 64];
 MPID_nem_ib_cm_sendq_t MPID_nem_ib_cm_sendq = { NULL, NULL };
+MPID_nem_ib_cm_notify_sendq_t MPID_nem_ib_cm_notify_sendq = { NULL, NULL };
 
 int MPID_nem_ib_ncqe_scratch_pad_to_drain;
 #endif
 MPID_nem_ib_ringbuf_sendq_t MPID_nem_ib_ringbuf_sendq = { NULL, NULL };
 
+
 int MPID_nem_ib_ncqe_scratch_pad;
 int MPID_nem_ib_ncqe_to_drain;
 int MPID_nem_ib_ncqe_nces;
@@ -493,7 +495,7 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
     for (i = 0; i < MPID_nem_ib_nranks; i++) {
         if (i != MPID_nem_ib_myrank) {
             for (j = 0; j < MPID_NEM_IB_COM_MAX_RQ_CAPACITY; j++) {
-                MPID_nem_ib_com_scratch_pad_recv(MPID_nem_ib_scratch_pad_fds[i], sizeof(MPID_nem_ib_cm_wr_send_t));
+                MPID_nem_ib_com_scratch_pad_recv(MPID_nem_ib_scratch_pad_fds[i], sizeof(MPID_nem_ib_cm_notify_send_t));
             }
         }
     }
@@ -1005,7 +1007,6 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
     int ibcom_errno;
     int req_errno = MPI_SUCCESS;
     int i;
-    int send_empty = 0;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_VC_TERMINATE);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_VC_TERMINATE);
 
@@ -1063,15 +1064,22 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
     }
 #endif
 
-#if 1
-    /* Wait until transmission and reception of NOTIFY_OUTSTANDING_TX_COMP are completed. */
-    while (1) {
-#else
+#ifdef MPID_NEM_IB_ONDEMAND
+    MPID_nem_ib_cm_notify_send_req_t *req = MPIU_Malloc(sizeof(MPID_nem_ib_cm_notify_send_req_t));
+    req->ibcom = MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank];
+    req->my_rank = MPID_nem_ib_myrank;
+    req->pg_rank = vc->pg_rank;
+    MPID_nem_ib_cm_notify_sendq_enqueue(&MPID_nem_ib_cm_notify_sendq, req);
+#endif
+
     /* Empty sendq */
     while (!MPID_nem_ib_sendq_empty(vc_ib->sendq) ||
            VC_FIELD(vc, pending_sends) > 0 ||
-           MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->outstanding_connection_tx > 0 ||
-           MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->incoming_connection_tx > 0) {
+           (MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->notify_outstanding_tx_empty !=
+            NOTIFY_OUTSTANDING_TX_COMP)) {
+#ifdef MPID_NEM_IB_ONDEMAND
+        MPID_nem_ib_cm_notify_progress();       /* progress cm_notify_sendq */
+        MPID_nem_ib_cm_drain_rcq();
 #endif
         /* mimic ib_poll because vc_terminate might be called from ib_poll_eager */
         mpi_errno = MPID_nem_ib_send_progress(vc);
@@ -1087,19 +1095,6 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
         MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_progress");
         ibcom_errno = MPID_nem_ib_cm_drain_scq();
         MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_scq");
-
-        if ((send_empty == 0) &&
-            (MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->outstanding_connection_tx == 0)) {
-            MPID_nem_ib_cm_wr_send(vc->pg_rank, MPID_nem_ib_myrank);
-
-            send_empty = 1;
-        }
-        MPID_nem_ib_cm_drain_rcq();
-
-        if (MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->notify_outstanding_tx_empty ==
-            NOTIFY_OUTSTANDING_TX_COMP) {
-            break;
-        }
 #endif
         ibcom_errno = MPID_nem_ib_ringbuf_progress();
         MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
index 6cd823c..4d931e6 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
@@ -2369,7 +2369,7 @@ int MPID_nem_ib_cm_drain_rcq(void)
     int result;
     int i;
     struct ibv_wc cqe[MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN];
-    MPID_nem_ib_cm_wr_send_t *shadow_cm;
+    MPID_nem_ib_cm_notify_send_t *shadow_cm;
 
     if (!MPID_nem_ib_rc_shared_rcq_scratch_pad) {
         dprintf("cm_drain_rcq,CQ is null\n");
@@ -2410,15 +2410,15 @@ int MPID_nem_ib_cm_drain_rcq(void)
                 MPID_nem_ib_com_t *ibcom;
 
                 dprintf("cm_drain_rcq,notify_outstanding_tx_empty\n");
-                shadow_cm = (MPID_nem_ib_cm_wr_send_t *) cqe[i].wr_id;
+                shadow_cm = (MPID_nem_ib_cm_notify_send_t *) cqe[i].wr_id;
                 initiator_rank = shadow_cm->initiator_rank;
 
-                MPID_nem_ib_rdmawr_from_free(shadow_cm, sizeof(MPID_nem_ib_cm_wr_send_t));
+                MPID_nem_ib_rdmawr_from_free(shadow_cm, sizeof(MPID_nem_ib_cm_notify_send_t));
 
                 MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[initiator_rank], &ibcom);
                 ibcom->notify_outstanding_tx_empty |= NOTIFY_OUTSTANDING_TX_RCQ;
                 MPID_nem_ib_com_scratch_pad_recv(MPID_nem_ib_scratch_pad_fds[initiator_rank],
-                                                 sizeof(MPID_nem_ib_cm_wr_send_t));
+                                                 sizeof(MPID_nem_ib_cm_notify_send_t));
             }
             break;
         default:
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
index c1b766d..28d9608 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
@@ -1656,6 +1656,95 @@ int MPID_nem_ib_cm_cmd_core(int rank, MPID_nem_ib_cm_cmd_shadow_t * shadow, void
   fn_fail:
     goto fn_exit;
 }
+
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_cm_notify_send
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_cm_notify_send(int pg_rank, int myrank)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int ibcom_errno;
+
+    MPID_nem_ib_cm_cmd_shadow_t *shadow =
+        (MPID_nem_ib_cm_cmd_shadow_t *) MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+    MPID_nem_ib_cm_notify_send_t *buf_from = (MPID_nem_ib_cm_notify_send_t *)
+        MPID_nem_ib_rdmawr_from_alloc(sizeof(MPID_nem_ib_cm_notify_send_t));
+    MPID_nem_ib_cm_req_t *req = MPIU_Malloc(sizeof(MPID_nem_ib_cm_req_t));
+
+    shadow->type = MPID_NEM_IB_NOTIFY_OUTSTANDING_TX_EMPTY;
+
+    buf_from->type = MPID_NEM_IB_NOTIFY_OUTSTANDING_TX_EMPTY;
+    buf_from->initiator_rank = myrank;
+    shadow->req = req;
+    shadow->buf_from = (void *) buf_from;
+    shadow->buf_from_sz = sizeof(MPID_nem_ib_cm_notify_send_t);
+
+    shadow->req->ibcom = MPID_nem_ib_scratch_pad_ibcoms[pg_rank];
+
+    ibcom_errno =
+        MPID_nem_ib_com_wr_scratch_pad(MPID_nem_ib_scratch_pad_fds[pg_rank],
+                                       (uint64_t) shadow, shadow->buf_from, shadow->buf_from_sz);
+
+    MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_wr_scratch_pad");
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_cm_notify_progress
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_cm_notify_progress(void)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int ibcom_errno;
+    MPID_nem_ib_cm_notify_send_req_t *sreq, *prev_sreq;
+
+    sreq = MPID_nem_ib_cm_notify_sendq_head(MPID_nem_ib_cm_notify_sendq);
+    if (sreq) {
+        prev_sreq = NULL;
+        do {
+            if (sreq->ibcom->outstanding_connection_tx != 0) {
+                goto next;
+            }
+
+            ibcom_errno = MPID_nem_ib_cm_notify_send(sreq->pg_rank, sreq->my_rank);
+            MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_notify_send");
+
+            /* unlink sreq */
+            if (prev_sreq != NULL) {
+                MPID_nem_ib_cm_notify_sendq_next(prev_sreq) = MPID_nem_ib_cm_notify_sendq_next(sreq);
+            }
+            else {
+                MPID_nem_ib_cm_notify_sendq_head(MPID_nem_ib_cm_notify_sendq) =
+                    MPID_nem_ib_cm_notify_sendq_next(sreq);
+            }
+            if (MPID_nem_ib_cm_notify_sendq_next(sreq) == NULL) {
+                MPID_nem_ib_cm_notify_sendq.tail = prev_sreq;
+            }
+
+            MPID_nem_ib_cm_notify_send_req_t *tmp_sreq = sreq;
+            sreq = MPID_nem_ib_cm_notify_sendq_next(sreq);
+
+            MPIU_Free(tmp_sreq);
+
+            goto next_unlinked;
+          next:
+            prev_sreq = sreq;
+            sreq = MPID_nem_ib_cm_notify_sendq_next(sreq);
+          next_unlinked:;
+        } while (sreq);
+    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
 #endif /* MPID_NEM_ONDEMAND */
 
 /* RDMA-read the head pointer of the shared ring buffer */
@@ -1978,40 +2067,3 @@ int MPID_nem_ib_ringbuf_progress()
   fn_fail:
     goto fn_exit;
 }
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_wr_send
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_cm_wr_send(int pg_rank, int myrank)
-{
-    int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
-
-    MPID_nem_ib_cm_cmd_shadow_t *shadow =
-        (MPID_nem_ib_cm_cmd_shadow_t *) MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
-    MPID_nem_ib_cm_wr_send_t *buf_from = (MPID_nem_ib_cm_wr_send_t *)
-        MPID_nem_ib_rdmawr_from_alloc(sizeof(MPID_nem_ib_cm_wr_send_t));
-    MPID_nem_ib_cm_req_t *req = MPIU_Malloc(sizeof(MPID_nem_ib_cm_req_t));
-
-    shadow->type = MPID_NEM_IB_NOTIFY_OUTSTANDING_TX_EMPTY;
-
-    buf_from->type = MPID_NEM_IB_NOTIFY_OUTSTANDING_TX_EMPTY;
-    buf_from->initiator_rank = myrank;
-    shadow->req = req;
-    shadow->buf_from = (void *) buf_from;
-    shadow->buf_from_sz = sizeof(MPID_nem_ib_cm_wr_send_t);
-
-    shadow->req->ibcom = MPID_nem_ib_scratch_pad_ibcoms[pg_rank];
-
-    ibcom_errno =
-        MPID_nem_ib_com_wr_scratch_pad(MPID_nem_ib_scratch_pad_fds[pg_rank],
-                                       (uint64_t) shadow, shadow->buf_from, shadow->buf_from_sz);
-
-    MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_wr_scratch_pad");
-
-  fn_exit:
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}

http://git.mpich.org/mpich.git/commitdiff/56a0b44574d7e609bed94047ecfb435b0a9e55b3

commit 56a0b44574d7e609bed94047ecfb435b0a9e55b3
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date:   Wed Jun 18 11:57:30 2014 +0900

    Improve the method of IB-dereg_mr
    
    Create a queue to store memory regions that are candidates for
    deregistration. When ibv_post_send fails with ENOMEM, dequeue some
    regions from the queue and release them.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
index a06a8ba..99ba26b 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
@@ -102,7 +102,8 @@ static int MPID_nem_ib_rdmawr_to_init(uint64_t sz)
 
     memset(start, 0, sz);
 
-    MPID_nem_ib_rdmawr_to_alloc_mr = MPID_nem_ib_com_reg_mr_fetch(start, sz, 0);
+    MPID_nem_ib_rdmawr_to_alloc_mr =
+        MPID_nem_ib_com_reg_mr_fetch(start, sz, 0, MPID_NEM_IB_COM_REG_MR_STICKY);
     MPID_NEM_IB_COM_ERR_CHKANDJUMP(!MPID_nem_ib_rdmawr_to_alloc_mr, -1,
                                    printf("MPID_nem_ib_com_reg_mr_fetchibv_reg_mr failed\n"));
     dprintf("rdmawr_to_init,rkey=%08x\n", MPID_nem_ib_rdmawr_to_alloc_mr->rkey);
@@ -808,7 +809,8 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
 
         conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] =
             MPID_nem_ib_com_reg_mr_fetch(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM],
-                                         conp->icom_msize[MPID_NEM_IB_COM_SCRATCH_PAD_FROM], 0);
+                                         conp->icom_msize[MPID_NEM_IB_COM_SCRATCH_PAD_FROM], 0,
+                                         MPID_NEM_IB_COM_REG_MR_STICKY);
         MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_FROM], -1,
                                        printf("ibv_reg_mr failed\n"));
 
@@ -854,7 +856,8 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
 
         conp->icom_mrlist[MPID_NEM_IB_COM_UDWR_FROM] =
             MPID_nem_ib_com_reg_mr_fetch(conp->icom_mem[MPID_NEM_IB_COM_UDWR_FROM],
-                                         conp->icom_msize[MPID_NEM_IB_COM_UDWR_FROM], 0);
+                                         conp->icom_msize[MPID_NEM_IB_COM_UDWR_FROM], 0,
+                                         MPID_NEM_IB_COM_REG_MR_STICKY);
         MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_mrlist[MPID_NEM_IB_COM_UDWR_FROM], -1,
                                        dprintf("ibv_reg_mr failed with mr_flags=0x%x\n", mr_flags));
 
@@ -875,7 +878,8 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
 
         conp->icom_mrlist[MPID_NEM_IB_COM_UDWR_TO] =
             MPID_nem_ib_com_reg_mr_fetch(conp->icom_mem[MPID_NEM_IB_COM_UDWR_TO],
-                                         conp->icom_msize[MPID_NEM_IB_COM_UDWR_TO], 0);
+                                         conp->icom_msize[MPID_NEM_IB_COM_UDWR_TO], 0,
+                                         MPID_NEM_IB_COM_REG_MR_STICKY);
         MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_mrlist[MPID_NEM_IB_COM_UDWR_TO], -1,
                                        dprintf("ibv_reg_mr failed with mr_flags=0x%x\n", mr_flags));
 
@@ -1221,7 +1225,7 @@ int MPID_nem_ib_com_alloc(int condesc, int sz)
         conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_TO] =
             MPID_nem_ib_com_reg_mr_fetch(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_TO],
                                          conp->icom_msize[MPID_NEM_IB_COM_SCRATCH_PAD_TO],
-                                         IBV_ACCESS_REMOTE_ATOMIC);
+                                         IBV_ACCESS_REMOTE_ATOMIC, MPID_NEM_IB_COM_REG_MR_STICKY);
         MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_TO], -1,
                                        dprintf("ibv_reg_mr failed with mr_flags=0x%x\n", mr_flags));
 
@@ -1474,7 +1478,8 @@ int MPID_nem_ib_com_isend(int condesc,
 
     if (sz_data) {
         //dprintf("MPID_nem_ib_com_isend,data=%p,sz_data=%d\n", data, sz_data);
-        struct ibv_mr *mr_data = MPID_nem_ib_com_reg_mr_fetch(data, sz_data, 0);
+        struct ibv_mr *mr_data =
+            MPID_nem_ib_com_reg_mr_fetch(data, sz_data, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
         MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_data, -1,
                                        printf("MPID_nem_ib_com_isend,ibv_reg_mr_fetch failed\n"));
 #ifdef HAVE_LIBDCFA
@@ -1961,7 +1966,8 @@ int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, long sz_data
     MPID_NEM_IB_COM_ERR_CHKANDJUMP(!sz_data, -1, dprintf("MPID_nem_ib_com_lrecv,sz_data==0\n"));
 
     /* register memory area containing data */
-    struct ibv_mr *mr_data = MPID_nem_ib_com_reg_mr_fetch(laddr, sz_data, 0);
+    struct ibv_mr *mr_data =
+        MPID_nem_ib_com_reg_mr_fetch(laddr, sz_data, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
     MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_data, -1,
                                    dprintf("MPID_nem_ib_com_lrecv,ibv_reg_mr_fetch failed\n"));
 
@@ -2103,7 +2109,8 @@ int MPID_nem_ib_com_put_lmt(int condesc, uint64_t wr_id, void *raddr, int sz_dat
     num_sge = 0;
 
     /* register memory area containing data */
-    struct ibv_mr *mr_data = MPID_nem_ib_com_reg_mr_fetch(laddr, sz_data, 0);
+    struct ibv_mr *mr_data =
+        MPID_nem_ib_com_reg_mr_fetch(laddr, sz_data, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
     MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_data, -1,
                                    dprintf("MPID_nem_ib_com_put_lmt,ibv_reg_mr_fetch failed\n"));
 
@@ -2860,6 +2867,7 @@ int MPID_nem_ib_com_reg_mr(void *addr, long len, struct ibv_mr **mr,
                            enum ibv_access_flags additional_flags)
 {
     int ibcom_errno = 0;
+    int err = -1;
     dprintf("MPID_nem_ib_com_reg_mr,addr=%p,len=%ld,mr=%p\n", addr, len, mr);
 
     *mr =
@@ -2867,7 +2875,12 @@ int MPID_nem_ib_com_reg_mr(void *addr, long len, struct ibv_mr **mr,
                    IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
                    IBV_ACCESS_REMOTE_READ | additional_flags);
 
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(*mr == 0, -1,
+    if (*mr == 0) {
+        err = errno;    /* copy errno of ibv_reg_mr */
+    }
+
+    /* return the errno of ibv_reg_mr when error occurs */
+    MPID_NEM_IB_COM_ERR_CHKANDJUMP(*mr == 0, err,
                                    dprintf("MPID_nem_ib_com_reg_mr,cannot register memory\n"));
 
   fn_exit:
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
index f7c7ffd..75e3757 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
@@ -570,7 +570,13 @@ extern int MPID_nem_ib_com_mem_udwr_to(int condesc, void **out);
 extern int MPID_nem_ib_com_register_cache_init(void);
 extern int MPID_nem_ib_com_register_cache_release(void);
 extern struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
-                                                   enum ibv_access_flags additional_flags);
+                                                   enum ibv_access_flags additional_flags,
+                                                   int mode);
+#define MPID_NEM_IB_COM_REG_MR_GLOBAL (0)
+#define MPID_NEM_IB_COM_REG_MR_STICKY (1)
+
+#define list_entry(ptr, type, member) \
+            ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
 
 extern int MPID_nem_ib_com_udbuf_init(void *q);
 
@@ -703,7 +709,8 @@ static inline void *MPID_nem_ib_rdmawr_from_alloc(uint32_t _sz)
             }
 
             ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->mr =
-                MPID_nem_ib_com_reg_mr_fetch(q, MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA, 0);
+                MPID_nem_ib_com_reg_mr_fetch(q, MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA, 0,
+                                             MPID_NEM_IB_COM_REG_MR_STICKY);
             if (!((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->mr) {
                 printf("ibv_reg_mr failed\n");
                 MPID_nem_ib_segv;
@@ -718,7 +725,8 @@ static inline void *MPID_nem_ib_rdmawr_from_alloc(uint32_t _sz)
                       1) * MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA;
                  p += MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA) {
                 ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) p)->mr =
-                    MPID_nem_ib_com_reg_mr_fetch(q, MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA, 0);
+                    MPID_nem_ib_com_reg_mr_fetch(q, MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA, 0,
+                                                 MPID_NEM_IB_COM_REG_MR_STICKY);
                 if (!((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) p)->mr) {
                     printf("ibv_reg_mr failed\n");
                     MPID_nem_ib_segv;
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
index 8c4cbda..cd2980c 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
@@ -110,7 +110,8 @@ int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_p
 #endif
 
     /* put IB rkey */
-    struct ibv_mr *mr = MPID_nem_ib_com_reg_mr_fetch(write_from_buf, data_sz, 0);
+    struct ibv_mr *mr =
+        MPID_nem_ib_com_reg_mr_fetch(write_from_buf, data_sz, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
     MPIU_ERR_CHKANDJUMP(!mr, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
 #ifdef HAVE_LIBDCFA
     s_cookie_buf->addr = (void *) mr->host_addr;
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
index f4a930d..7eeabee 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
@@ -103,6 +103,7 @@ struct MPID_nem_ib_com_reg_mr_cache_entry_t {
     /* : public MPID_nem_ib_com_reg_mr_listnode_t */
     struct MPID_nem_ib_com_reg_mr_listnode_t *lru_next;
     struct MPID_nem_ib_com_reg_mr_listnode_t *lru_prev;
+    struct MPID_nem_ib_com_reg_mr_listnode_t g_lru;
 
     struct ibv_mr *mr;
     void *addr;
@@ -110,6 +111,7 @@ struct MPID_nem_ib_com_reg_mr_cache_entry_t {
     int refc;
 };
 
+static struct MPID_nem_ib_com_reg_mr_listnode_t MPID_nem_ib_com_reg_mr_global_cache;
 static struct MPID_nem_ib_com_reg_mr_listnode_t
     MPID_nem_ib_com_reg_mr_cache[MPID_NEM_IB_COM_REG_MR_NLINE];
 
@@ -166,7 +168,7 @@ static inline void __lru_queue_display()
 }
 
 struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
-                                            enum ibv_access_flags additional_flags)
+                                            enum ibv_access_flags additional_flags, int mode)
 {
 #if 0   /* debug */
     struct ibv_mr *mr;
@@ -184,6 +186,7 @@ struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
     int ibcom_errno;
     int key;
     struct MPID_nem_ib_com_reg_mr_cache_entry_t *e;
+    static unsigned long long num_global_cache = 0ULL;
 
 #if 1   /*def HAVE_LIBDCFA */
     /* we can't change addr because ibv_post_send assumes mr->host_addr (output of this function)
@@ -220,6 +223,7 @@ struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
 
     // miss
 
+#if 0
     // evict an entry and de-register its MR when the cache-set is full
     if (way > MPID_NEM_IB_COM_REG_MR_NWAY) {
         struct MPID_nem_ib_com_reg_mr_cache_entry_t *victim =
@@ -235,6 +239,7 @@ struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
         }
         afree(victim, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
     }
+#endif
 
     e = aalloc(sizeof(struct MPID_nem_ib_com_reg_mr_cache_entry_t),
                MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
@@ -245,8 +250,47 @@ struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
     /* register memory */
     ibcom_errno = MPID_nem_ib_com_reg_mr(addr_aligned, len_aligned, &e->mr, additional_flags);
     if (ibcom_errno != 0) {
-        fprintf(stderr, "mrcache,MPID_nem_ib_com_reg_mr\n");
-        goto fn_fail;
+        /* ib_com_reg_mr returns the errno of ibv_reg_mr */
+        if (ibcom_errno == ENOMEM) {
+            /* deregister memory region. The value of 'num_global_cache' means the number of global-cached.
+             * delete 5 percents of global-cached */
+            int i;
+            int del_num = num_global_cache / 20;
+            struct MPID_nem_ib_com_reg_mr_cache_entry_t *victim;
+
+            dprintf("mrcache,MPID_nem_ib_com_reg_mr,ENOMEM,del_num(%d)\n", del_num);
+
+            for (i = 0; i < del_num; i++) {
+                /* get LRU data from MPID_nem_ib_com_reg_mr_global_cache */
+                victim = list_entry(MPID_nem_ib_com_reg_mr_global_cache.lru_prev, struct MPID_nem_ib_com_reg_mr_cache_entry_t, g_lru);
+
+                MPID_nem_ib_com_reg_mr_unlink((struct MPID_nem_ib_com_reg_mr_listnode_t *)victim);
+                MPID_nem_ib_com_reg_mr_unlink(&(victim->g_lru));
+
+                ibcom_errno = MPID_nem_ib_com_dereg_mr(victim->mr);
+                if (ibcom_errno) {
+                    printf("mrcache,MPID_nem_ib_com_dereg_mr\n");
+                    afree(e, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
+                    goto fn_fail;
+                }
+                afree(victim, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
+                num_global_cache--;
+            }
+
+            /* re-registraion */
+            ibcom_errno = MPID_nem_ib_com_reg_mr(addr_aligned, len_aligned, &e->mr, additional_flags);
+            if (ibcom_errno != 0) {
+                fprintf(stderr, "mrcache,MPID_nem_ib_com_reg_mr,retry,errno=%d\n", ibcom_errno);
+                afree(e, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
+                goto fn_fail;
+            }
+        }
+        else {
+            /* errno is not ENOMEM */
+            fprintf(stderr, "mrcache,MPID_nem_ib_com_reg_mr,errno=%d\n", ibcom_errno);
+            afree(e, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
+            goto fn_fail;
+        }
     }
     e->addr = addr_aligned;
     e->len = len_aligned;
@@ -257,6 +301,11 @@ struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
     /* register to cache */
     MPID_nem_ib_com_reg_mr_insert(&MPID_nem_ib_com_reg_mr_cache[key],
                                   (struct MPID_nem_ib_com_reg_mr_listnode_t *) e);
+    if (mode != MPID_NEM_IB_COM_REG_MR_STICKY) {
+        /* register to global-cache */
+        num_global_cache++;
+        MPID_nem_ib_com_reg_mr_insert(&MPID_nem_ib_com_reg_mr_global_cache, &(e->g_lru));
+    }
 
     //__lru_queue_display();
 
@@ -266,7 +315,7 @@ struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
 
     /* reference counter is used when evicting entry */
     e->refc++;
-#if 0   /* disable for debug */
+#if 1
     /* move to head of the list */
     if (e !=
         (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) MPID_nem_ib_com_reg_mr_cache[key].lru_next)
@@ -276,6 +325,11 @@ struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
                                       (struct MPID_nem_ib_com_reg_mr_listnode_t *) e);
     }
 #endif
+    if (mode != MPID_NEM_IB_COM_REG_MR_STICKY) {
+        /* move to head of the list in global-cache */
+        MPID_nem_ib_com_reg_mr_unlink(&(e->g_lru));
+        MPID_nem_ib_com_reg_mr_insert(&MPID_nem_ib_com_reg_mr_global_cache, &(e->g_lru));
+    }
     //dprintf("[MrCache] reuse e=%p,key=%d,mr=%p,refc=%d,addr=%p,len=%ld,lkey=%08x,rkey=%08x\n", e,
     //key, e->mr, e->refc, e->mr->addr, e->mr->length, e->mr->lkey, e->mr->rkey);
 
@@ -318,6 +372,10 @@ int MPID_nem_ib_com_register_cache_init()
             MPID_nem_ib_com_reg_mr_cache[i].lru_prev =
                 (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_cache[i];
         }
+        MPID_nem_ib_com_reg_mr_global_cache.lru_next =
+            (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_global_cache;
+        MPID_nem_ib_com_reg_mr_global_cache.lru_prev =
+            (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_global_cache;
 
         dprintf("[MrCache] cache initializes %d entries\n", MPID_NEM_IB_COM_REG_MR_NLINE);
     }

http://git.mpich.org/mpich.git/commitdiff/a499ad05e4208b4a76843b79eb726f80862ec97c

commit a499ad05e4208b4a76843b79eb726f80862ec97c
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date:   Wed Jun 4 16:50:16 2014 +0900

    Fix wrong type casting
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
index 32f46fa..a06a8ba 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
@@ -1969,8 +1969,8 @@ int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, long sz_data
     *post_num = 1;
 
     /* Type of max_msg_sz is uint32_t. */
-    if ((uint32_t) (sz_data) > conp->icom_pattr.max_msg_sz) {
-        *post_num += (uint32_t) (sz_data) / conp->icom_pattr.max_msg_sz;
+    if (sz_data > (long) conp->icom_pattr.max_msg_sz) {
+        *post_num +=  sz_data / (long)conp->icom_pattr.max_msg_sz;
     }
 
     for (i = 0; i < *post_num; i++) {
@@ -1996,10 +1996,10 @@ int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, long sz_data
         conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].addr =
             (uint64_t) laddr + (i * conp->icom_pattr.max_msg_sz);
 #endif
-        if ((uint32_t) sz_data > conp->icom_pattr.max_msg_sz) {
+        if (sz_data > (long) conp->icom_pattr.max_msg_sz) {
             if (i == *post_num - 1) {
                 conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].length =
-                    (uint32_t) sz_data - i * conp->icom_pattr.max_msg_sz;
+                    (uint32_t) (sz_data - (long) conp->icom_pattr.max_msg_sz * i);
             }
             else {
                 conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].length =

http://git.mpich.org/mpich.git/commitdiff/fded59aee40a99eda9e5a96955751b5149c0d1bc

commit fded59aee40a99eda9e5a96955751b5149c0d1bc
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date:   Tue Jun 3 14:48:04 2014 +0900

    Fix the buffer address for send/recv
    
    The data format to transmit or receive may be contiguous and still
    have a nonzero lower bound.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
index c4e426c..8c4cbda 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
@@ -55,7 +55,7 @@ int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_p
     //assert(dt_true_lb == 0);
     void *write_from_buf;
     if (dt_contig) {
-        write_from_buf = req->dev.user_buf;
+        write_from_buf = (void *) ((char *) req->dev.user_buf + dt_true_lb);
     }
     else {
         /* see MPIDI_CH3_EagerNoncontigSend (in ch3u_eager.c) */
@@ -232,7 +232,7 @@ int MPID_nem_ib_lmt_start_recv(struct MPIDI_VC *vc, struct MPID_Request *req, MP
 
     void *write_to_buf;
     if (dt_contig) {
-        write_to_buf = (void *) ((char *) req->dev.user_buf /*+ REQ_FIELD(req, lmt_dt_true_lb) */);
+        write_to_buf = (void *) ((char *) req->dev.user_buf + dt_true_lb);
     }
     else {
         //REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc((size_t)req->ch.lmt_data_sz);

http://git.mpich.org/mpich.git/commitdiff/3690597fbbf8cf8289530eaa5483967eebce2d31

commit 3690597fbbf8cf8289530eaa5483967eebce2d31
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date:   Tue Jun 3 08:50:38 2014 +0900

    Fix data size in a header
    
    It's not necessary to include the size of ib_netmod_trailer_t in the data size.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
index c4268bd..32f46fa 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
@@ -1439,8 +1439,7 @@ int MPID_nem_ib_com_isend(int condesc,
     uint32_t hdr_ringbuf_type = local_ringbuf_type;
     MPID_NEM_IB_NETMOD_HDR_SZ_SET(buf_from,
                                   MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) +
-                                  sz_prefix + sz_hdr + sz_data +
-                                  sizeof(MPID_nem_ib_netmod_trailer_t));
+                                  sz_prefix + sz_hdr + sz_data);
     if (remote_ringbuf_type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
         hdr_ringbuf_type |= MPID_NEM_IB_RINGBUF_RELINDEX;
         MPID_NEM_IB_NETMOD_HDR_RELINDEX_SET(buf_from, conp->rsr_seq_num_tail);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
index 1fa2525..6cd823c 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
@@ -633,8 +633,7 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t * ringbuf)
     ssize_t sz_pkt = MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf);
     MPIDI_CH3_Pkt_eager_send_t *pkt = (MPIDI_CH3_Pkt_eager_send_t *) ((uint8_t *) buf + sz_pkt);
     dprintf("pkt=%p,sizeof=%ld\n", pkt, sz_pkt);
-    MPIU_Assert(MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) >=
-                sz_pkt + sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPID_nem_ib_netmod_trailer_t));
+    MPIU_Assert(MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) >= sz_pkt + sizeof(MPIDI_CH3_Pkt_t));
     dprintf
         ("handle_pkt,before,%d<-%d,id=%d,pkt->type=%d,pcc=%d,MPIDI_CH3_PKT_END_ALL=%d,pkt=%p,subtype=%d\n",
          MPID_nem_ib_myrank, vc->pg_rank, *remote_poll, pkt->type,
@@ -643,8 +642,7 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t * ringbuf)
     /* see MPIDI_CH3_PktHandler_EagerSend (in src/mpid/ch3/src/ch3u_eager.c) */
     mpi_errno =
         MPID_nem_handle_pkt(vc, (char *) ((uint8_t *) buf + sz_pkt),
-                            (MPIDI_msg_sz_t) (MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) -
-                                              sz_pkt - sizeof(MPID_nem_ib_netmod_trailer_t)));
+                            (MPIDI_msg_sz_t) (MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) - sz_pkt));
     if (mpi_errno) {
         MPIU_ERR_POP(mpi_errno);
     }
@@ -705,8 +703,7 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t * ringbuf)
                                                     sz_pkt + sizeof(MPIDI_CH3_Pkt_t)));
     }
     else {
-        if (MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) ==
-            sz_pkt + sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPID_nem_ib_netmod_trailer_t)) {
+        if (MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) == sz_pkt + sizeof(MPIDI_CH3_Pkt_t)) {
             if (pkt->type == MPIDI_CH3_PKT_EAGERSHORT_SEND
                 //||                  pkt->type == MPIDI_CH3_PKT_GET
 ) {

http://git.mpich.org/mpich.git/commitdiff/586e7122784d8f45ea2d6c5fa7df2de2f53dfe2a

commit 586e7122784d8f45ea2d6c5fa7df2de2f53dfe2a
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date:   Mon Jun 2 10:38:44 2014 +0900

    Fix clz after calculating power of two
    
    The value of clz has to be decremented after the size is rounded up to a power of two.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
index 5d988f5..f7c7ffd 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
@@ -648,7 +648,12 @@ typedef struct {
     } else {                                                            \
         clz = __builtin_clz(_sz);                                       \
         int ctz = __builtin_ctz(_sz);                                   \
-        sz = (clz + ctz == 31) ? _sz : (1ULL << (32 - clz));            \
+        if (clz + ctz == 31) {                                          \
+            sz = _sz;                                                   \
+        } else {                                                        \
+            sz = (1ULL << (32 - clz));                                  \
+            clz = clz - 1;                                              \
+        }                                                               \
     }
 
 static inline void *MPID_nem_ib_rdmawr_from_alloc(uint32_t _sz)

http://git.mpich.org/mpich.git/commitdiff/df39ada6f58aecea6eb36c21521e1ab557e365a4

commit df39ada6f58aecea6eb36c21521e1ab557e365a4
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date:   Fri May 30 17:51:27 2014 +0900

    Fix transmission processing of large-message
    
    IB can transmit at most 'max_msg_sz' at one time. Fragmentation is
    required when transmitting a message that exceeds this size.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
index 0e4effe..c4268bd 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
@@ -1517,7 +1517,15 @@ int MPID_nem_ib_com_isend(int condesc,
             off_pow2_aligned, sz_pad, num_sge);
 
     conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].num_sge = num_sge;
+#if 1
+    MPID_nem_ib_rc_send_request *wrap_wr_id = MPIU_Malloc(sizeof(MPID_nem_ib_rc_send_request));
+    wrap_wr_id->wr_id = wr_id;
+    wrap_wr_id->mf = 0;
+
+    conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr_id = (uint64_t) wrap_wr_id;
+#else
     conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr_id = wr_id;
+#endif
     conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr =
         (uint64_t) conp->local_ringbuf_start +
         MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t) (conp->sseq_num % conp->local_ringbuf_nslot));
@@ -1936,29 +1944,101 @@ int MPID_nem_ib_com_udrecv(int condesc)
     goto fn_exit;
 }
 
-int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, int sz_data, uint32_t rkey,
-                          void *laddr)
+int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, long sz_data, uint32_t rkey,
+                          void *laddr, int *post_num)
 {
     MPID_nem_ib_com_t *conp;
     int ibcom_errno = 0;
     struct ibv_send_wr *bad_wr;
     int ib_errno;
-    int num_sge;
+    int num_sge = 0;
+    int i;
 
-    dprintf("MPID_nem_ib_com_lrecv,enter,raddr=%p,sz_data=%d,laddr=%p\n", raddr, sz_data, laddr);
+    dprintf("MPID_nem_ib_com_lrecv,enter,raddr=%p,sz_data=%ld,laddr=%p\n", raddr, sz_data, laddr);
 
     MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
     MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_connected, -1,
                                    dprintf("MPID_nem_ib_com_lrecv,not connected\n"));
     MPID_NEM_IB_COM_ERR_CHKANDJUMP(!sz_data, -1, dprintf("MPID_nem_ib_com_lrecv,sz_data==0\n"));
 
-    num_sge = 0;
-
     /* register memory area containing data */
     struct ibv_mr *mr_data = MPID_nem_ib_com_reg_mr_fetch(laddr, sz_data, 0);
     MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_data, -1,
                                    dprintf("MPID_nem_ib_com_lrecv,ibv_reg_mr_fetch failed\n"));
 
+#if 1
+    *post_num = 1;
+
+    /* Type of max_msg_sz is uint32_t. */
+    if ((uint32_t) (sz_data) > conp->icom_pattr.max_msg_sz) {
+        *post_num += (uint32_t) (sz_data) / conp->icom_pattr.max_msg_sz;
+    }
+
+    for (i = 0; i < *post_num; i++) {
+        MPID_nem_ib_rc_send_request *wrap_wr_id = MPIU_Malloc(sizeof(MPID_nem_ib_rc_send_request));
+        wrap_wr_id->wr_id = wr_id;
+
+        if (i == *post_num - 1)
+            wrap_wr_id->mf = 0; /* end of packet */
+        else
+            wrap_wr_id->mf = 1;
+
+        dprintf("MPID_nem_ib_com_lrecv,mf=%d,post=%d/%d\n", wrap_wr_id->mf, i + 1, *post_num);
+
+        num_sge = 0;
+
+#ifdef HAVE_LIBDCFA
+        conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].mic_addr =
+            (uint64_t) laddr + (i * conp->icom_pattr.max_msg_sz);
+        conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].addr =
+            mr_data->host_addr + (i * conp->icom_pattr.max_msg_sz) + ((uint64_t) laddr -
+                                                                      (uint64_t) laddr);
+#else
+        conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].addr =
+            (uint64_t) laddr + (i * conp->icom_pattr.max_msg_sz);
+#endif
+        if ((uint32_t) sz_data > conp->icom_pattr.max_msg_sz) {
+            if (i == *post_num - 1) {
+                conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].length =
+                    (uint32_t) sz_data - i * conp->icom_pattr.max_msg_sz;
+            }
+            else {
+                conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].length =
+                    conp->icom_pattr.max_msg_sz;
+            }
+        }
+        else {
+            conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].length = sz_data;
+        }
+        conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].lkey = mr_data->lkey;
+        num_sge += 1;
+
+        conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].num_sge = num_sge;
+        conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].wr_id = (uint64_t) wrap_wr_id;
+        conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].wr.rdma.remote_addr =
+            (uint64_t) raddr + (i * conp->icom_pattr.max_msg_sz);
+        conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].wr.rdma.rkey = rkey;
+
+#ifdef HAVE_LIBDCFA
+        ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR]);
+        MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
+                                       dprintf("MPID_nem_ib_com_lrecv, ibv_post_send, rc=%d\n",
+                                               ib_errno));
+#else
+        ib_errno =
+            ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR], &bad_wr);
+        MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
+                                       printf
+                                       ("MPID_nem_ib_com_lrecv, ibv_post_send, rc=%d, bad_wr=%p\n",
+                                        ib_errno, bad_wr));
+#endif
+
+        /* other commands can be executed before RDMA-rd command */
+        /* see the "Ordering and the Fence Indicator" section in "InfiniBand Architecture" by William T. Futral */
+
+        conp->ncom += 1;
+    }
+#else
     /* Erase magic, super bug!! */
     //((MPID_nem_ib_netmod_trailer_t*)(laddr + sz_data - sizeof(MPID_nem_ib_netmod_trailer_t)))->magic = 0;
 #ifdef HAVE_LIBDCFA
@@ -1996,6 +2076,7 @@ int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, int sz_data,
     conp->after_rdma_rd = 1;
 #endif
     conp->ncom += 1;
+#endif
 
   fn_exit:
     return ibcom_errno;
@@ -2039,7 +2120,15 @@ int MPID_nem_ib_com_put_lmt(int condesc, uint64_t wr_id, void *raddr, int sz_dat
     num_sge += 1;
 
     conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].num_sge = num_sge;
+#if 1
+    MPID_nem_ib_rc_send_request *wrap_wr_id = MPIU_Malloc(sizeof(MPID_nem_ib_rc_send_request));
+    wrap_wr_id->wr_id = wr_id;
+    wrap_wr_id->mf = 0;
+
+    conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].wr_id = (uint64_t) wrap_wr_id;
+#else
     conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].wr_id = wr_id;
+#endif
     conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].wr.rdma.remote_addr = (uint64_t) raddr;
     conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].wr.rdma.rkey = rkey;
 
@@ -2768,11 +2857,11 @@ char *MPID_nem_ib_com_strerror(int errno)
     goto fn_exit;
 }
 
-int MPID_nem_ib_com_reg_mr(void *addr, int len, struct ibv_mr **mr,
+int MPID_nem_ib_com_reg_mr(void *addr, long len, struct ibv_mr **mr,
                            enum ibv_access_flags additional_flags)
 {
     int ibcom_errno = 0;
-    dprintf("MPID_nem_ib_com_reg_mr,addr=%p,len=%d,mr=%p\n", addr, len, mr);
+    dprintf("MPID_nem_ib_com_reg_mr,addr=%p,len=%ld,mr=%p\n", addr, len, mr);
 
     *mr =
         ibv_reg_mr(ib_pd, addr, len,
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
index 182d39e..5d988f5 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
@@ -353,6 +353,11 @@ typedef struct {
 }
 MPID_nem_ib_rdmawr_to_alloc_hdr_t;
 
+typedef struct {
+    uint64_t wr_id;             /* address of MPID_Request */
+    int mf;                     /* more fragment (0 means the end of packet) */
+} MPID_nem_ib_rc_send_request;
+
 /* Ring-buffer to which a remote note RDMA-writes */
 #define MPID_NEM_IB_NRINGBUF 64
 #define MPID_NEM_IB_RINGBUF_NSLOT 16
@@ -533,8 +538,8 @@ extern int MPID_nem_ib_com_irecv(int condesc, uint64_t wr_id);
 extern int MPID_nem_ib_com_udsend(int condesc, union ibv_gid *remote_gid, uint16_t remote_lid,
                                   uint32_t remote_qpn, uint32_t imm_data, uint64_t wr_id);
 extern int MPID_nem_ib_com_udrecv(int condesc);
-extern int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, int sz_data,
-                                 uint32_t rkey, void *laddr);
+extern int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, long sz_data,
+                                 uint32_t rkey, void *laddr, int *post_num);
 extern int MPID_nem_ib_com_put_lmt(int condesc, uint64_t wr_id, void *raddr, int sz_data,
                                    uint32_t rkey, void *laddr);
 extern int MPID_nem_ib_com_scratch_pad_recv(int condesc, int sz_data);
@@ -543,7 +548,7 @@ extern int MPID_nem_ib_com_poll_cq(int which_cq, struct ibv_wc *wc, int *result)
 extern int MPID_nem_ib_com_obtain_pointer(int condesc, MPID_nem_ib_com_t ** MPID_nem_ib_com);
 
 /* for ib_reg_mr.c */
-extern int MPID_nem_ib_com_reg_mr(void *addr, int len, struct ibv_mr **mr,
+extern int MPID_nem_ib_com_reg_mr(void *addr, long len, struct ibv_mr **mr,
                                   enum ibv_access_flags additional_flags);
 extern int MPID_nem_ib_com_dereg_mr(struct ibv_mr *mr);
 
@@ -564,7 +569,7 @@ extern int MPID_nem_ib_com_mem_udwr_to(int condesc, void **out);
 /* ib_reg_mr.c */
 extern int MPID_nem_ib_com_register_cache_init(void);
 extern int MPID_nem_ib_com_register_cache_release(void);
-extern struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, int len,
+extern struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
                                                    enum ibv_access_flags additional_flags);
 
 extern int MPID_nem_ib_com_udbuf_init(void *q);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
index 123dde4..c4e426c 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
@@ -148,10 +148,19 @@ int MPID_nem_ib_lmt_start_recv_core(struct MPID_Request *req, void *raddr, uint3
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);
 
+#if 1
+    int post_num = 1;
+
+    ibcom_errno =
+        MPID_nem_ib_com_lrecv(vc_ib->sc->fd, (uint64_t) req, raddr, req->ch.lmt_data_sz, rkey,
+                              write_to_buf, &post_num);
+    MPID_nem_ib_ncqe += post_num;
+#else
     ibcom_errno =
         MPID_nem_ib_com_lrecv(vc_ib->sc->fd, (uint64_t) req, raddr, req->ch.lmt_data_sz, rkey,
                               write_to_buf);
     MPID_nem_ib_ncqe += 1;
+#endif
     //dprintf("start_recv,ncqe=%d\n", MPID_nem_ib_ncqe);
     MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_lrecv");
     dprintf("lmt_start_recv_core,MPID_nem_ib_ncqe=%d\n", MPID_nem_ib_ncqe);
@@ -163,7 +172,11 @@ int MPID_nem_ib_lmt_start_recv_core(struct MPID_Request *req, void *raddr, uint3
     //fflush(stdout);
 
 #ifdef MPID_NEM_IB_LMT_GET_CQE
+#if 1
+    MPID_nem_ib_ncqe_to_drain += post_num;      /* use CQE instead of polling */
+#else
     MPID_nem_ib_ncqe_to_drain += 1;     /* use CQE instead of polling */
+#endif
 #else
     /* drain_scq and ib_poll is not ordered, so both can decrement ref_count */
     MPIR_Request_add_ref(req);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
index fb3141b..1fa2525 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
@@ -100,7 +100,9 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
         int req_type, msg_type;
 
         /* Obtain sreq */
-        req = (MPID_Request *) cqe[i].wr_id;
+        //req = (MPID_Request *) cqe[i].wr_id;
+        MPID_nem_ib_rc_send_request *req_wrap = (MPID_nem_ib_rc_send_request *) cqe[i].wr_id;
+        req = (MPID_Request *) req_wrap->wr_id;
 
         kind = req->kind;
         req_type = MPIDI_Request_get_type(req);
@@ -244,6 +246,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
 
             dprintf("drain_scq,eager-send,next\n");
 
+            MPIU_Free(req_wrap);
         }
         else if (req_type == MPIDI_REQUEST_TYPE_GET_RESP && msg_type == MPIDI_REQUEST_EAGER_MSG) {
             dprintf("drain_scq,GET_RESP,eager,req_type=%d,,comm=%p,opcode=%d\n", req_type,
@@ -294,6 +297,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
 
             dprintf("drain_scq,GET_RESP,next\n");
 
+            MPIU_Free(req_wrap);
         }
         else if (req_type == MPIDI_REQUEST_TYPE_RECV && msg_type == MPIDI_REQUEST_RNDV_MSG &&
                  cqe[i].opcode == IBV_WC_RDMA_READ) {
@@ -309,42 +313,45 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
             MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
 #if defined(MPID_NEM_IB_LMT_GET_CQE)
 
-            /* unpack non-contiguous dt */
-            int is_contig;
-            MPID_Datatype_is_contig(req->dev.datatype, &is_contig);
-            if (!is_contig) {
-                dprintf("drain_scq,lmt,GET_CQE,unpack noncontiguous data to user buffer\n");
-
-                /* see MPIDI_CH3U_Request_unpack_uebuf (in /src/mpid/ch3/src/ch3u_request.c) */
-                /* or MPIDI_CH3U_Receive_data_found (in src/mpid/ch3/src/ch3u_handle_recv_pkt.c) */
-                MPIDI_msg_sz_t unpack_sz = req->ch.lmt_data_sz;
-                MPID_Segment seg;
-                MPI_Aint last;
-
-                MPID_Segment_init(req->dev.user_buf, req->dev.user_count, req->dev.datatype, &seg,
-                                  0);
-                last = unpack_sz;
-                MPID_Segment_unpack(&seg, 0, &last, REQ_FIELD(req, lmt_pack_buf));
-                if (last != unpack_sz) {
-                    /* --BEGIN ERROR HANDLING-- */
-                    /* received data was not entirely consumed by unpack()
-                     * because too few bytes remained to fill the next basic
-                     * datatype */
-                    MPIR_STATUS_SET_COUNT(req->status, last);
-                    req->status.MPI_ERROR =
-                        MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__,
-                                             MPI_ERR_TYPE, "**MPID_nem_ib_poll", 0);
-                    /* --END ERROR HANDLING-- */
+            /* end of packet */
+            if (req_wrap->mf == 0) {
+                /* unpack non-contiguous dt */
+                int is_contig;
+                MPID_Datatype_is_contig(req->dev.datatype, &is_contig);
+                if (!is_contig) {
+                    dprintf("drain_scq,lmt,GET_CQE,unpack noncontiguous data to user buffer\n");
+
+                    /* see MPIDI_CH3U_Request_unpack_uebuf (in /src/mpid/ch3/src/ch3u_request.c) */
+                    /* or MPIDI_CH3U_Receive_data_found (in src/mpid/ch3/src/ch3u_handle_recv_pkt.c) */
+                    MPIDI_msg_sz_t unpack_sz = req->ch.lmt_data_sz;
+                    MPID_Segment seg;
+                    MPI_Aint last;
+
+                    MPID_Segment_init(req->dev.user_buf, req->dev.user_count, req->dev.datatype,
+                                      &seg, 0);
+                    last = unpack_sz;
+                    MPID_Segment_unpack(&seg, 0, &last, REQ_FIELD(req, lmt_pack_buf));
+                    if (last != unpack_sz) {
+                        /* --BEGIN ERROR HANDLING-- */
+                        /* received data was not entirely consumed by unpack()
+                         * because too few bytes remained to fill the next basic
+                         * datatype */
+                        MPIR_STATUS_SET_COUNT(req->status, last);
+                        req->status.MPI_ERROR =
+                            MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME,
+                                                 __LINE__, MPI_ERR_TYPE, "**MPID_nem_ib_poll", 0);
+                        /* --END ERROR HANDLING-- */
+                    }
+                    dprintf("drain_scq,lmt,GET_CQE,ref_count=%d,lmt_pack_buf=%p\n", req->ref_count,
+                            REQ_FIELD(req, lmt_pack_buf));
+                    MPID_nem_ib_stfree(REQ_FIELD(req, lmt_pack_buf), (size_t) req->ch.lmt_data_sz);
                 }
-                dprintf("drain_scq,lmt,GET_CQE,ref_count=%d,lmt_pack_buf=%p\n", req->ref_count,
-                        REQ_FIELD(req, lmt_pack_buf));
-                MPID_nem_ib_stfree(REQ_FIELD(req, lmt_pack_buf), (size_t) req->ch.lmt_data_sz);
-            }
-            dprintf("drain_scq,lmt,GET_CQE,lmt_send_GET_DONE,rsr_seq_num_tail=%d\n",
-                    vc_ib->ibcom->rsr_seq_num_tail);
+                dprintf("drain_scq,lmt,GET_CQE,lmt_send_GET_DONE,rsr_seq_num_tail=%d\n",
+                        vc_ib->ibcom->rsr_seq_num_tail);
 
-            /* send done to sender. vc is stashed in MPID_nem_ib_lmt_start_recv (in ib_lmt.c) */
-            MPID_nem_ib_lmt_send_GET_DONE(req->ch.vc, req);
+                /* send done to sender. vc is stashed in MPID_nem_ib_lmt_start_recv (in ib_lmt.c) */
+                MPID_nem_ib_lmt_send_GET_DONE(req->ch.vc, req);
+            }
 #endif
             /* unmark "lmt is going on" */
 
@@ -356,10 +363,12 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
             dprintf("drain_scq,rdma-read,ncqe=%d\n", MPID_nem_ib_ncqe);
 
 #ifdef MPID_NEM_IB_LMT_GET_CQE
-            dprintf("drain_scq,GET_CQE,Request_complete\n");
-            /* mark completion on rreq */
-            MPIDI_CH3U_Request_complete(req);
-            dprintf("drain_scq,complete,req=%p\n", req);
+            if (req_wrap->mf == 0) {
+                dprintf("drain_scq,GET_CQE,Request_complete\n");
+                /* mark completion on rreq */
+                MPIDI_CH3U_Request_complete(req);
+                dprintf("drain_scq,complete,req=%p\n", req);
+            }
 #else /* GET, and !GET_CQE */
 
             int is_contig;
@@ -382,6 +391,8 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
             MPID_Request_release(req);
             dprintf("drain_scq,relese,req=%p\n", req);
 #endif
+            MPIU_Free(req_wrap);
+
             /* try to send from sendq */
             if (!MPID_nem_ib_sendq_empty(vc_ib->sendq)) {
                 dprintf("drain_scq,GET,ncom=%d,ncqe=%d,diff=%d\n",
@@ -412,6 +423,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
 #else
             //printf("kind=%d\n", kind);
 #endif
+            MPIU_Free(req_wrap);
         }
     }
     if (!dont_call_progress) {
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
index 61792c1..f4a930d 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
@@ -106,7 +106,7 @@ struct MPID_nem_ib_com_reg_mr_cache_entry_t {
 
     struct ibv_mr *mr;
     void *addr;
-    int len;
+    long len;
     int refc;
 };
 
@@ -165,7 +165,7 @@ static inline void __lru_queue_display()
     }
 }
 
-struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, int len,
+struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
                                             enum ibv_access_flags additional_flags)
 {
 #if 0   /* debug */
@@ -189,17 +189,17 @@ struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, int len,
     /* we can't change addr because ibv_post_send assumes mr->host_addr (output of this function)
      * must have an exact mirror value of addr (input of this function) */
     void *addr_aligned = addr;
-    int len_aligned = len;
+    long len_aligned = len;
 #else
     void *addr_aligned = (void *) ((unsigned long) addr & ~(MPID_NEM_IB_COM_REG_MR_SZPAGE - 1));
-    int len_aligned =
+    long len_aligned =
         ((((unsigned long) addr + len) - (unsigned long) addr_aligned +
           MPID_NEM_IB_COM_REG_MR_SZPAGE - 1) & ~(MPID_NEM_IB_COM_REG_MR_SZPAGE - 1));
 #endif
     key = MPID_nem_ib_com_hash_func(addr);
 
     dprintf("[MrCache] addr=%p, len=%d\n", addr, len);
-    dprintf("[MrCache] aligned addr=%p, len=%d\n", addr_aligned, len_aligned);
+    dprintf("[MrCache] aligned addr=%p, len=%ld\n", addr_aligned, len_aligned);
 
     //__lru_queue_display();
     int way = 0;
@@ -241,7 +241,7 @@ struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, int len,
     /* reference counter is used when evicting entry */
     e->refc = 1;
 
-    dprintf("MPID_nem_ib_com_reg_mr_fetch,miss,addr=%p,len=%d\n", addr_aligned, len_aligned);
+    dprintf("MPID_nem_ib_com_reg_mr_fetch,miss,addr=%p,len=%ld\n", addr_aligned, len_aligned);
     /* register memory */
     ibcom_errno = MPID_nem_ib_com_reg_mr(addr_aligned, len_aligned, &e->mr, additional_flags);
     if (ibcom_errno != 0) {

http://git.mpich.org/mpich.git/commitdiff/2f25f42745d009e86a0eb0c4fe21f04d76d7121a

commit 2f25f42745d009e86a0eb0c4fe21f04d76d7121a
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date:   Fri May 30 09:44:07 2014 +0900

    Fix some conditional judgments in ib_drain_scq
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
index ad297aa..fb3141b 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
@@ -135,7 +135,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
          */
         if (
                //req_type == MPIDI_REQUEST_TYPE_SEND
-               (req_type == MPIDI_REQUEST_TYPE_SEND ||
+               (req_type == MPIDI_REQUEST_TYPE_SEND || req_type == MPIDI_REQUEST_TYPE_RSEND ||
                 req_type == MPIDI_REQUEST_TYPE_RECV || req_type == MPIDI_REQUEST_TYPE_SSEND)
                && msg_type == MPIDI_REQUEST_EAGER_MSG) {
             dprintf("drain_scq,send/recv,eager,req_type=%d,,comm=%p,opcode=%d\n", req_type,
@@ -174,7 +174,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
                     REQ_FIELD(req, buf_from_sz));
 
             dprintf("drain_scq,eager-send,ncqe=%d\n", MPID_nem_ib_ncqe);
-            MPIU_Assert(req->ref_count == 1 || req->ref_count == 2);
+            MPIU_Assert(req->ref_count >= 1 && req->ref_count <= 3);
 
             /* ref_count is decremented in drain_scq and wait */
             if (*req->cc_ptr > 0) {

http://git.mpich.org/mpich.git/commitdiff/d5c2a5dab82547764974b27bdb695901f59801f7

commit d5c2a5dab82547764974b27bdb695901f59801f7
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date:   Thu May 29 16:17:14 2014 +0900

    Delete some unnecessary increments
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
index 0904b97..c1b766d 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
@@ -229,7 +229,7 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
 
     /* increment cc because PktHandler_EagerSyncAck, ssend.c, drain_scq decrement it */
     if (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_EAGER_SYNC_SEND) {
-        MPIR_Request_add_ref(sreq);
+        //MPIR_Request_add_ref(sreq);
     }
     if (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_GET) {
         //printf("isendcontig_core,MPIDI_CH3_PKT_GET,ref_count=%d\n", sreq->ref_count);
@@ -708,7 +708,7 @@ static int MPID_nem_ib_SendNoncontig_core(MPIDI_VC_t * vc, MPID_Request * sreq,
 
     /* increment cc because PktHandler_EagerSyncAck, ssend.c, drain_scq decrement it */
     if (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_EAGER_SYNC_SEND) {
-        MPIR_Request_add_ref(sreq);
+        //MPIR_Request_add_ref(sreq);
     }
 
     if (sizeof(MPIDI_CH3_Pkt_t) != hdr_sz) {

http://git.mpich.org/mpich.git/commitdiff/5bfff7d3bccfff88d89e4e56565184a8ebf077b0

commit 5bfff7d3bccfff88d89e4e56565184a8ebf077b0
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date:   Wed May 28 13:49:47 2014 +0900

    Add a command to notify of outstanding_tx_empty
    
    Add a command for reporting that outstanding_tx is empty, and confirm that
    outstanding_tx is empty on both sides before closing a connection.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt b/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
index 0bfa8ce..210c2a5 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
@@ -2,6 +2,7 @@
 **MPIDI_PG_GetConnKVSname:MPIDI_PG_GetConnKVSname failed
 **MPID_nem_ib_cm_cas:MPID_nem_ib_cm_cas failed
 **MPID_nem_ib_cm_connect_cas_core:MPID_nem_ib_cm_connect_cas_core failed
+**MPID_nem_ib_cm_drain_rcq:MPID_nem_ib_cm_drain_rcq failed
 **MPID_nem_ib_cm_drain_scq:MPID_nem_ib_cm_drain_scq failed
 **MPID_nem_ib_cm_poll:MPID_nem_ib_cm_poll failed
 **MPID_nem_ib_cm_poll_syn:MPID_nem_ib_cm_poll_syn failed
@@ -30,6 +31,7 @@
 **MPID_nem_ib_com_reg_mr_fetch:MPID_nem_ib_com_reg_mr_fetch failed
 **MPID_nem_ib_com_rts:MPID_nem_ib_com_rts failed
 **MPID_nem_ib_com_sq_occupancy_notify_rate_get:MPID_nem_ib_com_sq_occupancy_notify_rate_get failed
+**MPID_nem_ib_com_wr_scratch_pad:MPID_nem_ib_com_wr_scratch_pad failed
 **MPID_nem_ib_drain_scq:MPID_nem_ib_drain_scq failed
 **MPID_nem_ib_drain_scq_scratch_pad:MPID_nem_ib_drain_scq_scratch_pad failed
 **MPID_nem_ib_kvs_put_binary:MPID_nem_ib_kvs_put_binary failed
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
index 8ff798b..0e4effe 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
@@ -47,7 +47,7 @@ static struct ibv_cq *MPID_nem_ib_ud_shared_scq;
 static int MPID_nem_ib_ud_shared_scq_ref_count;
 static struct ibv_cq *MPID_nem_ib_rc_shared_rcq;
 static int MPID_nem_ib_rc_shared_rcq_ref_count;
-static struct ibv_cq *MPID_nem_ib_rc_shared_rcq_scratch_pad;
+struct ibv_cq *MPID_nem_ib_rc_shared_rcq_scratch_pad;
 static int MPID_nem_ib_rc_shared_rcq_scratch_pad_ref_count;
 struct ibv_cq *MPID_nem_ib_ud_shared_rcq;
 static int MPID_nem_ib_ud_shared_rcq_ref_count;
@@ -449,8 +449,13 @@ static int MPID_nem_ib_com_clean(MPID_nem_ib_com_t * conp)
             MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list);
             MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list);
             MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list);
+            MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].sg_list);
 #endif
             MPIU_Free(conp->icom_sr);
+#ifndef HAVE_LIBDCFA
+            MPIU_Free(conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER].sg_list);
+#endif
+            MPIU_Free(conp->icom_rr);
             break;
         case MPID_NEM_IB_COM_OPEN_UD:
             MPIU_Assert(MPID_nem_ib_ud_shared_scq_ref_count > 0);
@@ -1060,6 +1065,44 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
             conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].num_sge = 1;
             conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].opcode = IBV_WR_ATOMIC_CMP_AND_SWP;
             conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].send_flags = IBV_SEND_SIGNALED;
+
+#ifdef HAVE_LIBDCFA
+            memset(&(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].sg_list[0]),
+                   0, sizeof(struct ibv_sge) * WR_SG_NUM);
+#else
+            sge = (struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge));
+            memset(sge, 0, sizeof(struct ibv_sge));
+#endif
+            conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].next = NULL;
+#ifdef HAVE_LIBDCFA
+#else
+            conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].sg_list = sge;
+#endif
+            conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].num_sge = 1;
+            conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].opcode = IBV_WR_SEND;
+            conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].send_flags = IBV_SEND_SIGNALED;
+
+            /* RR (receive request) template */
+            conp->icom_rr =
+                (struct ibv_recv_wr *) MPIU_Malloc(sizeof(struct ibv_recv_wr) *
+                                                   MPID_NEM_IB_COM_SCRATCH_PAD_RR_NTEMPLATE);
+            memset(conp->icom_rr, 0,
+                   sizeof(struct ibv_recv_wr) * MPID_NEM_IB_COM_SCRATCH_PAD_RR_NTEMPLATE);
+
+            /* RR (receive request) template for MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER */
+#ifdef HAVE_LIBDCFA
+            memset(&(conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER].sg_list[0]), 0,
+                   sizeof(struct ibv_sge) * WR_SG_NUM);
+#else
+            sge = (struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge));
+            memset(sge, 0, sizeof(struct ibv_sge));
+#endif
+            conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER].next = NULL;
+#ifdef HAVE_LIBDCFA
+#else
+            conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER].sg_list = sge;
+#endif
+            conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER].num_sge = 1;
             break;
         }
 
@@ -2022,6 +2065,49 @@ int MPID_nem_ib_com_put_lmt(int condesc, uint64_t wr_id, void *raddr, int sz_dat
     goto fn_exit;
 }
 
+int MPID_nem_ib_com_scratch_pad_recv(int condesc, int sz_data)
+{
+    MPID_nem_ib_com_t *conp;
+    struct ibv_recv_wr *bad_wr;
+    int ibcom_errno = 0, ib_errno;
+
+    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
+
+    void *buf_to = MPID_nem_ib_rdmawr_from_alloc(sz_data);
+    struct ibv_mr *mr_buf_to = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR(buf_to);
+
+    /* Create RR */
+
+#ifdef HAVE_LIBDCFA
+    conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER].sg_list[0].mic_addr = (uint64_t) buf_to;
+    conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER].sg_list[0].addr =
+        mr_buf_to->host_addr + ((uint64_t) buf_to -
+                                (uint64_t) MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(buf_to));
+#else
+    conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER].sg_list[0].addr = (uint64_t) buf_to;
+#endif
+
+    conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER].sg_list[0].length = sz_data;
+    conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER].sg_list[0].lkey = mr_buf_to->lkey;
+
+    conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER].wr_id = (uint64_t) buf_to;
+
+    /* Post RR to RQ */
+#ifdef HAVE_LIBDCFA
+    ib_errno = ibv_post_recv(conp->icom_qp, &conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER]);
+#else
+    ib_errno =
+        ibv_post_recv(conp->icom_qp, &conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER],
+                      &bad_wr);
+#endif
+    MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_post_recv ib_errno=%d\n", ib_errno));
+
+  fn_exit:
+    return ibcom_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
 int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset, int sz,
                                     void *laddr, void **buf_from_out, uint32_t * buf_from_sz_out)
 {
@@ -2236,6 +2322,60 @@ int MPID_nem_ib_com_cas_scratch_pad(int condesc,
     goto fn_exit;
 }
 
+int MPID_nem_ib_com_wr_scratch_pad(int condesc, uint64_t wr_id,
+                                   void *buf_from, uint32_t buf_from_sz)
+{
+    MPID_nem_ib_com_t *conp;
+    int ibcom_errno = 0;
+    struct ibv_send_wr *bad_wr;
+    int ib_errno;
+
+    dprintf("MPID_nem_ib_com_wr_scratch_pad,enter,wr_id=%llx,buf=%llx,sz=%d\n",
+            (unsigned long long) wr_id, (unsigned long long) buf_from, buf_from_sz);
+
+    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
+
+    struct ibv_mr *mr_rdmawr_from = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR(buf_from);
+
+#ifdef HAVE_LIBDCFA
+    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].sg_list[0].mic_addr = (uint64_t) buf_from;
+    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].sg_list[0].addr =
+        mr_rdmawr_from->host_addr + ((uint64_t) buf_from - (uint64_t) buf_from);
+#else
+    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].sg_list[0].addr = (uint64_t) buf_from;
+#endif
+    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].sg_list[0].length = buf_from_sz;
+    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].sg_list[0].lkey = mr_rdmawr_from->lkey;
+
+    /* num_sge is defined in MPID_nem_ib_com_open */
+    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].wr_id = wr_id;
+
+    dprintf("MPID_nem_ib_com_wr_scratch_pad,wr.rdma.remote_addr=%llx\n",
+            (unsigned long long) conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].wr.rdma.remote_addr);
+
+#ifdef HAVE_LIBDCFA
+    ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR]);
+    MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
+                                   dprintf
+                                   ("MPID_nem_ib_com_wr_scratch_pad, ibv_post_send, rc=%d\n",
+                                    ib_errno));
+#else
+    ib_errno =
+        ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR], &bad_wr);
+    MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
+                                   dprintf
+                                   ("MPID_nem_ib_com_wr_scratch_pad, ibv_post_send, rc=%d, bad_wr=%p\n",
+                                    ib_errno, bad_wr));
+#endif
+
+    conp->ncom_scratch_pad += 1;
+
+  fn_exit:
+    return ibcom_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
 /* poll completion queue */
 int MPID_nem_ib_com_poll_cq(int which_cq, struct ibv_wc *wc, int *result)
 {
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
index ada7053..182d39e 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
@@ -174,6 +174,7 @@ static inline unsigned long long MPID_nem_ib_rdtsc_cpuid(void)
 
 extern struct ibv_cq *MPID_nem_ib_rc_shared_scq;
 extern struct ibv_cq *MPID_nem_ib_rc_shared_scq_scratch_pad;
+extern struct ibv_cq *MPID_nem_ib_rc_shared_rcq_scratch_pad;
 extern struct ibv_cq *MPID_nem_ib_ud_shared_rcq;
 extern uint8_t *MPID_nem_ib_scratch_pad;
 extern int MPID_nem_ib_scratch_pad_ref_count;
@@ -270,11 +271,12 @@ extern uint8_t *MPID_nem_ib_rdmawr_to_alloc_free_list;
 #define MPID_NEM_IB_COM_UD_INITIATOR 0  /* index to send request template */
 #define MPID_NEM_IB_COM_UD_RESPONDER 0  /* index to recv request template */
 
-#define MPID_NEM_IB_COM_SCRATCH_PAD_SR_NTEMPLATE 3
+#define MPID_NEM_IB_COM_SCRATCH_PAD_SR_NTEMPLATE 4
 #define MPID_NEM_IB_COM_SCRATCH_PAD_RR_NTEMPLATE 1
 #define MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR 0 /* index to send request template */
 #define MPID_NEM_IB_COM_SCRATCH_PAD_CAS       1
 #define MPID_NEM_IB_COM_SCRATCH_PAD_GET       2
+#define MPID_NEM_IB_COM_SCRATCH_PAD_WR        3
 #define MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER 0 /* index to recv request template */
 
 /* Header prepended to the MPI packet */
@@ -485,6 +487,7 @@ typedef struct MPID_nem_ib_com {
      * freeing scratch-pad QP. */
     int outstanding_connection_tx;
     int incoming_connection_tx;
+    int notify_outstanding_tx_empty;
 
 } MPID_nem_ib_com_t;
 
@@ -522,6 +525,8 @@ extern int MPID_nem_ib_com_get_scratch_pad(int condesc, uint64_t wr_id, uint64_t
 extern int MPID_nem_ib_com_cas_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset,
                                            uint64_t compare, uint64_t swap, void **buf_from_out,
                                            uint32_t * buf_from_sz_out);
+extern int MPID_nem_ib_com_wr_scratch_pad(int condesc, uint64_t wr_id,
+                                          void *buf_from, uint32_t buf_from_sz);
 
 //extern int MPID_nem_ib_com_isend(int condesc, uint64_t wr_id, void* hdr, int sz_hdr, void* data, int sz_data);
 extern int MPID_nem_ib_com_irecv(int condesc, uint64_t wr_id);
@@ -532,6 +537,7 @@ extern int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, int s
                                  uint32_t rkey, void *laddr);
 extern int MPID_nem_ib_com_put_lmt(int condesc, uint64_t wr_id, void *raddr, int sz_data,
                                    uint32_t rkey, void *laddr);
+extern int MPID_nem_ib_com_scratch_pad_recv(int condesc, int sz_data);
 extern int MPID_nem_ib_com_poll_cq(int which_cq, struct ibv_wc *wc, int *result);
 
 extern int MPID_nem_ib_com_obtain_pointer(int condesc, MPID_nem_ib_com_t ** MPID_nem_ib_com);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
index a42df5d..74bc2d2 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
@@ -126,9 +126,14 @@ enum MPID_nem_ib_cm_cmd_types {
     MPID_NEM_IB_RINGBUF_ASK_CAS,
     MPID_NEM_IB_CM_CAS_RELEASE,
     MPID_NEM_IB_CM_ALREADY_ESTABLISHED,
-    MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING
+    MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING,
+    MPID_NEM_IB_NOTIFY_OUTSTANDING_TX_EMPTY
 };
 
+#define NOTIFY_OUTSTANDING_TX_SCQ  (1 << 0)
+#define NOTIFY_OUTSTANDING_TX_RCQ  (1 << 1)
+#define NOTIFY_OUTSTANDING_TX_COMP (NOTIFY_OUTSTANDING_TX_SCQ | NOTIFY_OUTSTANDING_TX_RCQ)
+
 /* Packet types of connection protocol */
 struct MPID_nem_ib_cm_req;
 
@@ -221,6 +226,11 @@ typedef struct {
     uint32_t buf_from_sz;
 } MPID_nem_ib_cm_cmd_shadow_t;
 
+typedef struct {
+    MPID_nem_ib_cm_cmd_type_t type;
+    int initiator_rank;
+} MPID_nem_ib_cm_wr_send_t;
+
 #define MPID_NEM_IB_CM_RELEASED ((uint64_t)(-1))
 #define MPID_NEM_IB_CM_OFF_SYN (256)    /* Align for 256-byte-write PCI command */
 #define MPID_NEM_IB_CM_OFF_CMD (256*2)  /* Align for 256-byte-write PCI command */
@@ -527,6 +537,7 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t * ringbuf);
 int MPID_nem_ib_ring_alloc(MPIDI_VC_t * vc);
 
 int MPID_nem_ib_cm_drain_scq(void);
+int MPID_nem_ib_cm_drain_rcq(void);
 int MPID_nem_ib_cm_poll_syn(void);
 int MPID_nem_ib_cm_poll(void);
 
@@ -558,6 +569,7 @@ int MPID_nem_ib_ringbuf_ask_fetch(MPIDI_VC_t * vc);
 int MPID_nem_ib_ringbuf_ask_cas_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t * shadow,
                                      uint64_t head);
 int MPID_nem_ib_ringbuf_progress(void);
+int MPID_nem_ib_cm_wr_send(int pg_rank, int myrank);
 
 int MPID_nem_ib_ringbuf_alloc(MPIDI_VC_t * vc);
 int MPID_nem_ib_ringbuf_free(MPIDI_VC_t * vc);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
index cc64921..21bd140 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
@@ -488,6 +488,16 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
                         MPID_nem_ib_nranks * sizeof(MPID_nem_ib_conn_t), mpi_errno,
                         "connection table");
     memset(MPID_nem_ib_conns, 0, MPID_nem_ib_nranks * sizeof(MPID_nem_ib_conn_t));
+
+    /* post receive request */
+    for (i = 0; i < MPID_nem_ib_nranks; i++) {
+        if (i != MPID_nem_ib_myrank) {
+            for (j = 0; j < MPID_NEM_IB_COM_MAX_RQ_CAPACITY; j++) {
+                MPID_nem_ib_com_scratch_pad_recv(MPID_nem_ib_scratch_pad_fds[i], sizeof(MPID_nem_ib_cm_wr_send_t));
+            }
+        }
+    }
+
 #if 0
     MPIU_CHKPMEM_MALLOC(MPID_nem_ib_pollingset, MPIDI_VC_t **,
                         MPID_NEM_IB_MAX_POLLINGSET * sizeof(MPIDI_VC_t *), mpi_errno,
@@ -995,6 +1005,7 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
     int ibcom_errno;
     int req_errno = MPI_SUCCESS;
     int i;
+    int send_empty = 0;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_VC_TERMINATE);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_VC_TERMINATE);
 
@@ -1052,11 +1063,16 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
     }
 #endif
 
+#if 1
+    /* Wait until transmission and reception of NOTIFY_OUTSTANDING_TX_COMP are completed. */
+    while (1) {
+#else
     /* Empty sendq */
     while (!MPID_nem_ib_sendq_empty(vc_ib->sendq) ||
            VC_FIELD(vc, pending_sends) > 0 ||
            MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->outstanding_connection_tx > 0 ||
            MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->incoming_connection_tx > 0) {
+#endif
         /* mimic ib_poll because vc_terminate might be called from ib_poll_eager */
         mpi_errno = MPID_nem_ib_send_progress(vc);
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_progress");
@@ -1071,6 +1087,19 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
         MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_progress");
         ibcom_errno = MPID_nem_ib_cm_drain_scq();
         MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_scq");
+
+        if ((send_empty == 0) &&
+            (MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->outstanding_connection_tx == 0)) {
+            MPID_nem_ib_cm_wr_send(vc->pg_rank, MPID_nem_ib_myrank);
+
+            send_empty = 1;
+        }
+        MPID_nem_ib_cm_drain_rcq();
+
+        if (MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->notify_outstanding_tx_empty ==
+            NOTIFY_OUTSTANDING_TX_COMP) {
+            break;
+        }
 #endif
         ibcom_errno = MPID_nem_ib_ringbuf_progress();
         MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
index a7858d7..ad297aa 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
@@ -2327,6 +2327,13 @@ int MPID_nem_ib_cm_drain_scq()
                 MPIU_Free(shadow_ringbuf);
                 break;
             }
+        case MPID_NEM_IB_NOTIFY_OUTSTANDING_TX_EMPTY:
+            shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
+            shadow_cm->req->ibcom->notify_outstanding_tx_empty |= NOTIFY_OUTSTANDING_TX_SCQ;
+            MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
+            MPIU_Free(shadow_cm->req);
+            MPIU_Free(shadow_cm);
+            break;
         default:
             printf("unknown type=%d\n", *type);
             MPIU_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_scq");
@@ -2347,6 +2354,77 @@ int MPID_nem_ib_cm_drain_scq()
     goto fn_exit;
 }
 
+int MPID_nem_ib_cm_drain_rcq(void)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int result;
+    int i;
+    struct ibv_wc cqe[MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN];
+    MPID_nem_ib_cm_wr_send_t *shadow_cm;
+
+    if (!MPID_nem_ib_rc_shared_rcq_scratch_pad) {
+        dprintf("cm_drain_rcq,CQ is null\n");
+        goto fn_exit;
+    }
+
+    result =
+        ibv_poll_cq(MPID_nem_ib_rc_shared_rcq_scratch_pad, MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN,
+                    &cqe[0]);
+    MPIU_ERR_CHKANDJUMP(result < 0, mpi_errno, MPI_ERR_OTHER, "**netmod,ib,ibv_poll_cq");
+
+    if (result > 0) {
+        dprintf("cm_drain_rcq,found,result=%d\n", result);
+    }
+    for (i = 0; i < result; i++) {
+
+        dprintf("cm_drain_rcq,wr_id=%p\n", (void *) cqe[i].wr_id);
+
+#ifdef HAVE_LIBDCFA
+        if (cqe[i].status != IBV_WC_SUCCESS) {
+            dprintf("cm_drain_rcq,status=%08x\n", cqe[i].status);
+            MPID_nem_ib_segv;
+        }
+#else
+        if (cqe[i].status != IBV_WC_SUCCESS) {
+            dprintf("cm_drain_rcq,status=%08x,%s\n", cqe[i].status,
+                    ibv_wc_status_str(cqe[i].status));
+            MPID_nem_ib_segv;
+        }
+#endif
+        MPIU_ERR_CHKANDJUMP(cqe[i].status != IBV_WC_SUCCESS, mpi_errno, MPI_ERR_OTHER,
+                            "**MPID_nem_ib_cm_drain_rcq");
+
+        MPID_nem_ib_cm_cmd_type_t *type = (MPID_nem_ib_cm_cmd_type_t *) cqe[i].wr_id;
+        switch (*type) {
+        case MPID_NEM_IB_NOTIFY_OUTSTANDING_TX_EMPTY:{
+                int initiator_rank;
+                MPID_nem_ib_com_t *ibcom;
+
+                dprintf("cm_drain_rcq,notify_outstanding_tx_empty\n");
+                shadow_cm = (MPID_nem_ib_cm_wr_send_t *) cqe[i].wr_id;
+                initiator_rank = shadow_cm->initiator_rank;
+
+                MPID_nem_ib_rdmawr_from_free(shadow_cm, sizeof(MPID_nem_ib_cm_wr_send_t));
+
+                MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[initiator_rank], &ibcom);
+                ibcom->notify_outstanding_tx_empty |= NOTIFY_OUTSTANDING_TX_RCQ;
+                MPID_nem_ib_com_scratch_pad_recv(MPID_nem_ib_scratch_pad_fds[initiator_rank],
+                                                 sizeof(MPID_nem_ib_cm_wr_send_t));
+            }
+            break;
+        default:
+            printf("unknown type=%d\n", *type);
+            MPIU_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_rcq");
+            break;
+        }
+    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
 #undef FUNCNAME
 #define FUNCNAME MPID_nem_ib_cm_poll_syn
 #undef FCNAME
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
index 1932198..0904b97 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
@@ -1978,3 +1978,40 @@ int MPID_nem_ib_ringbuf_progress()
   fn_fail:
     goto fn_exit;
 }
+
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_cm_wr_send
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_cm_wr_send(int pg_rank, int myrank)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int ibcom_errno;
+
+    MPID_nem_ib_cm_cmd_shadow_t *shadow =
+        (MPID_nem_ib_cm_cmd_shadow_t *) MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+    MPID_nem_ib_cm_wr_send_t *buf_from = (MPID_nem_ib_cm_wr_send_t *)
+        MPID_nem_ib_rdmawr_from_alloc(sizeof(MPID_nem_ib_cm_wr_send_t));
+    MPID_nem_ib_cm_req_t *req = MPIU_Malloc(sizeof(MPID_nem_ib_cm_req_t));
+
+    shadow->type = MPID_NEM_IB_NOTIFY_OUTSTANDING_TX_EMPTY;
+
+    buf_from->type = MPID_NEM_IB_NOTIFY_OUTSTANDING_TX_EMPTY;
+    buf_from->initiator_rank = myrank;
+    shadow->req = req;
+    shadow->buf_from = (void *) buf_from;
+    shadow->buf_from_sz = sizeof(MPID_nem_ib_cm_wr_send_t);
+
+    shadow->req->ibcom = MPID_nem_ib_scratch_pad_ibcoms[pg_rank];
+
+    ibcom_errno =
+        MPID_nem_ib_com_wr_scratch_pad(MPID_nem_ib_scratch_pad_fds[pg_rank],
+                                       (uint64_t) shadow, shadow->buf_from, shadow->buf_from_sz);
+
+    MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_wr_scratch_pad");
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}

-----------------------------------------------------------------------

Summary of changes:
 .../ch3/channels/nemesis/netmod/ib/errnames.txt    |    4 +
 .../ch3/channels/nemesis/netmod/ib/ib_finalize.c   |    2 +
 src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c |  306 ++++++-
 src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h |   46 +-
 src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h  |   82 ++-
 src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c  |   52 +-
 src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c   |   31 +-
 .../ch3/channels/nemesis/netmod/ib/ib_malloc.c     |  121 ++-
 src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c  |  934 +++++++++++++++-----
 .../ch3/channels/nemesis/netmod/ib/ib_reg_mr.c     |   88 ++-
 src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c  |  247 +++++-
 11 files changed, 1538 insertions(+), 375 deletions(-)


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list