[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.2a1-3-gc99d346

Service Account noreply at mpich.org
Mon Sep 8 08:20:31 CDT 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  c99d3467e2d96ecfd143921740eaa60cec222494 (commit)
       via  dc5df5b0d05cf5ab313a1386ca18b709069876b5 (commit)
      from  79444598bb51599702cabcecab341b5a20847124 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/c99d3467e2d96ecfd143921740eaa60cec222494

commit c99d3467e2d96ecfd143921740eaa60cec222494
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date:   Fri Sep 5 14:50:35 2014 +0900

    Modify the method of IB-dereg_mr
    
    Deregister unused memory regions until the total amount released exceeds
    the requested length. If registration still fails after deregistering some
    memory regions, deregister the next memory region and try to register again.

diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
index 59b967d..30bc222 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
@@ -1475,12 +1475,13 @@ int MPID_nem_ib_com_isend(int condesc,
     conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].lkey = mr_rdmawr_from->lkey;
     num_sge += 1;
 
+    struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache = NULL;
     if (sz_data) {
         //dprintf("MPID_nem_ib_com_isend,data=%p,sz_data=%d\n", data, sz_data);
-        struct ibv_mr *mr_data =
-            MPID_nem_ib_com_reg_mr_fetch(data, sz_data, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
-        MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_data, -1,
+        mr_cache = MPID_nem_ib_com_reg_mr_fetch(data, sz_data, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
+        MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_cache, -1,
                                        printf("MPID_nem_ib_com_isend,ibv_reg_mr_fetch failed\n"));
+        struct ibv_mr *mr_data = mr_cache->mr;
 #ifdef HAVE_LIBDCFA
         conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].mic_addr = (uint64_t) data;
         conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr =
@@ -1524,6 +1525,7 @@ int MPID_nem_ib_com_isend(int condesc,
     MPID_nem_ib_rc_send_request *wrap_wr_id = MPIU_Malloc(sizeof(MPID_nem_ib_rc_send_request));
     wrap_wr_id->wr_id = wr_id;
     wrap_wr_id->mf = MPID_NEM_IB_LAST_PKT;
+    wrap_wr_id->mr_cache = (void *) mr_cache;
 
     conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr_id = (uint64_t) wrap_wr_id;
 #else
@@ -1964,14 +1966,16 @@ int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, long sz_data
     MPID_NEM_IB_COM_ERR_CHKANDJUMP(!sz_data, -1, dprintf("MPID_nem_ib_com_lrecv,sz_data==0\n"));
 
     /* register memory area containing data */
-    struct ibv_mr *mr_data =
+    struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
         MPID_nem_ib_com_reg_mr_fetch(laddr, sz_data, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_data, -1,
+    MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_cache, -1,
                                    dprintf("MPID_nem_ib_com_lrecv,ibv_reg_mr_fetch failed\n"));
+    struct ibv_mr *mr_data = mr_cache->mr;
 
     MPID_nem_ib_rc_send_request *wrap_wr_id = MPIU_Malloc(sizeof(MPID_nem_ib_rc_send_request));
     wrap_wr_id->wr_id = wr_id;
     wrap_wr_id->mf = last;
+    wrap_wr_id->mr_cache = (void *) mr_cache;
 
     num_sge = 0;
 
@@ -2039,10 +2043,11 @@ int MPID_nem_ib_com_put_lmt(int condesc, uint64_t wr_id, void *raddr, int sz_dat
     num_sge = 0;
 
     /* register memory area containing data */
-    struct ibv_mr *mr_data =
+    struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
         MPID_nem_ib_com_reg_mr_fetch(laddr, sz_data, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_data, -1,
+    MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_cache, -1,
                                    dprintf("MPID_nem_ib_com_put_lmt,ibv_reg_mr_fetch failed\n"));
+    struct ibv_mr *mr_data = mr_cache->mr;
 
 #ifdef HAVE_LIBDCFA
     conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].sg_list[num_sge].mic_addr = (uint64_t) laddr;
@@ -2060,6 +2065,7 @@ int MPID_nem_ib_com_put_lmt(int condesc, uint64_t wr_id, void *raddr, int sz_dat
     MPID_nem_ib_rc_send_request *wrap_wr_id = MPIU_Malloc(sizeof(MPID_nem_ib_rc_send_request));
     wrap_wr_id->wr_id = wr_id;
     wrap_wr_id->mf = MPID_NEM_IB_LAST_PKT;
+    wrap_wr_id->mr_cache = (void *) mr_cache;
 
     conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].wr_id = (uint64_t) wrap_wr_id;
 #else
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
index 88e871f..ae28f55 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
@@ -356,6 +356,7 @@ MPID_nem_ib_rdmawr_to_alloc_hdr_t;
 typedef struct {
     uint64_t wr_id;             /* address of MPID_Request */
     int mf;                     /* more fragment (0 means the end of packet) */
+    void *mr_cache;             /* address of mr_cache_entry; its refc is decremented in drain_scq */
 } MPID_nem_ib_rc_send_request;
 
 #define MPID_NEM_IB_LMT_LAST_PKT        0
@@ -572,11 +573,27 @@ extern int MPID_nem_ib_com_mem_udwr_from(int condesc, void **out);
 extern int MPID_nem_ib_com_mem_udwr_to(int condesc, void **out);
 
 /* ib_reg_mr.c */
+struct MPID_nem_ib_com_reg_mr_listnode_t {      /* doubly linked list node used by the MR cache lists */
+    struct MPID_nem_ib_com_reg_mr_listnode_t *lru_next;        /* next node in the list */
+    struct MPID_nem_ib_com_reg_mr_listnode_t *lru_prev;        /* previous node; the ENOMEM eviction loop follows this link */
+};
+
+struct MPID_nem_ib_com_reg_mr_cache_entry_t {
+    /* The first two members mirror MPID_nem_ib_com_reg_mr_listnode_t so an entry can be cast to a list node */
+    struct MPID_nem_ib_com_reg_mr_listnode_t *lru_next;
+    struct MPID_nem_ib_com_reg_mr_listnode_t *lru_prev;
+    struct MPID_nem_ib_com_reg_mr_listnode_t g_lru;    /* node linking this entry into the global cache list */
+
+    struct ibv_mr *mr;          /* registered memory region; deregistered when the entry is evicted */
+    void *addr;                 /* presumably the start address of the region -- TODO confirm in ib_reg_mr.c */
+    long len;                   /* length added to the released total when the entry is evicted */
+    int refc;                   /* reference count; 0 means the entry is not in use and may be evicted */
+};
 extern int MPID_nem_ib_com_register_cache_init(void);
 extern int MPID_nem_ib_com_register_cache_release(void);
-extern struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
-                                                   enum ibv_access_flags additional_flags,
-                                                   int mode);
+extern void *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
+                                          enum ibv_access_flags additional_flags, int mode);
+extern void MPID_nem_ib_com_reg_mr_release(struct MPID_nem_ib_com_reg_mr_cache_entry_t *entry);
 #define MPID_NEM_IB_COM_REG_MR_GLOBAL (0)
 #define MPID_NEM_IB_COM_REG_MR_STICKY (1)
 
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
index 8f878fb..4941ccb 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
@@ -137,9 +137,11 @@ int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_p
         length = data_sz;
     }
     /* put IB rkey */
-    struct ibv_mr *mr =
+    struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
         MPID_nem_ib_com_reg_mr_fetch(write_from_buf, length, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
-    MPIU_ERR_CHKANDJUMP(!mr, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
+    MPIU_ERR_CHKANDJUMP(!mr_cache, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
+    struct ibv_mr *mr = mr_cache->mr;
+    REQ_FIELD(req, lmt_mr_cache) = (void *) mr_cache;
 #ifdef HAVE_LIBDCFA
     s_cookie_buf->addr = (void *) mr->host_addr;
     dprintf("lmt_initiate_lmt,s_cookie_buf->addr=%p\n", s_cookie_buf->addr);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
index 050aefd..9106eb9 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
@@ -107,6 +107,13 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
         MPID_nem_ib_rc_send_request *req_wrap = (MPID_nem_ib_rc_send_request *) cqe[i].wr_id;
         req = (MPID_Request *) req_wrap->wr_id;
 
+        /* decrement reference counter of mr_cache_entry registered by ib_com_isend or ib_com_lrecv */
+        struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
+            (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) req_wrap->mr_cache;
+        if (mr_cache) {
+            MPID_nem_ib_com_reg_mr_release(mr_cache);
+        }
+
         kind = req->kind;
         req_type = MPIDI_Request_get_type(req);
         msg_type = MPIDI_Request_get_msg_type(req);
@@ -2227,6 +2234,10 @@ int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc,
 #endif
         //dprintf("lmt_start_recv,reply_seq_num,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
 #endif
+
+        /* decrement reference counter of mr_cache_entry */
+        MPID_nem_ib_com_reg_mr_release(REQ_FIELD(req, lmt_mr_cache));
+
         /* try to send from sendq because at least one RDMA-write-to buffer has been released */
         //dprintf("lmt_start_recv,reply_seq_num,send_progress\n");
         if (!MPID_nem_ib_sendq_empty(vc_ib->sendq)) {
@@ -2261,9 +2272,13 @@ int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc,
             void *addr =
                 (void *) ((char *) REQ_FIELD(req, buf.from) +
                           (long) (next_seg_seq_num - 1) * REQ_FIELD(req, max_msg_sz));
-            struct ibv_mr *mr =
+            struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
                 MPID_nem_ib_com_reg_mr_fetch(addr, length, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
-            MPIU_ERR_CHKANDJUMP(!mr, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
+            MPIU_ERR_CHKANDJUMP(!mr_cache, mpi_errno, MPI_ERR_OTHER,
+                                "**MPID_nem_ib_com_reg_mr_fetch");
+            struct ibv_mr *mr = mr_cache->mr;
+            /* store new cache entry */
+            REQ_FIELD(req, lmt_mr_cache) = (void *) mr_cache;
 
 #ifdef HAVE_LIBDCFA
             void *_addr = mr->host_addr;
@@ -2492,6 +2507,9 @@ int MPID_nem_ib_pkt_rma_lmt_getdone(MPIDI_VC_t * vc,
 
     MPIU_THREAD_CS_ENTER(LMT,);
 
+    /* decrement reference counter of mr_cache_entry */
+    MPID_nem_ib_com_reg_mr_release(REQ_FIELD(req, lmt_mr_cache));
+
     req_type = MPIDI_Request_get_type(req);
     /* free memory area for cookie */
     if (!req->ch.s_cookie) {
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
index bbabe11..2a2c8b6 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
@@ -96,23 +96,6 @@ static inline void afree(const void *p, int id)
 #endif
 }
 
-struct MPID_nem_ib_com_reg_mr_listnode_t {
-    struct MPID_nem_ib_com_reg_mr_listnode_t *lru_next;
-    struct MPID_nem_ib_com_reg_mr_listnode_t *lru_prev;
-};
-
-struct MPID_nem_ib_com_reg_mr_cache_entry_t {
-    /* : public MPID_nem_ib_com_reg_mr_listnode_t */
-    struct MPID_nem_ib_com_reg_mr_listnode_t *lru_next;
-    struct MPID_nem_ib_com_reg_mr_listnode_t *lru_prev;
-    struct MPID_nem_ib_com_reg_mr_listnode_t g_lru;
-
-    struct ibv_mr *mr;
-    void *addr;
-    long len;
-    int refc;
-};
-
 static struct MPID_nem_ib_com_reg_mr_listnode_t MPID_nem_ib_com_reg_mr_global_cache;
 static struct MPID_nem_ib_com_reg_mr_listnode_t
     MPID_nem_ib_com_reg_mr_cache[MPID_NEM_IB_COM_REG_MR_NLINE];
@@ -169,8 +152,8 @@ static inline void __lru_queue_display()
     }
 }
 
-struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
-                                            enum ibv_access_flags additional_flags, int mode)
+void *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
+                                   enum ibv_access_flags additional_flags, int mode)
 {
 #if 0   /* debug */
     struct ibv_mr *mr;
@@ -254,10 +237,61 @@ struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
     if (ibcom_errno != 0) {
         /* ib_com_reg_mr returns the errno of ibv_reg_mr */
         if (ibcom_errno == ENOMEM) {
+#if 1
+            /* deregister memory region unused and re-register new one */
+            struct MPID_nem_ib_com_reg_mr_listnode_t *ptr;
+            struct MPID_nem_ib_com_reg_mr_cache_entry_t *victim;
+            unsigned long long dereg_total = 0;
+            int reg_success = 0;
+            for (ptr = MPID_nem_ib_com_reg_mr_global_cache.lru_prev;
+                 ptr !=
+                 (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_global_cache;)
+            {
+                victim = list_entry(ptr, struct MPID_nem_ib_com_reg_mr_cache_entry_t, g_lru);
+                ptr = ptr->lru_prev;
+                /* 'refc == 0' means this cache_entry is not used */
+                if (victim && victim->addr && (victim->refc == 0)) {
+                    MPID_nem_ib_com_reg_mr_unlink((struct MPID_nem_ib_com_reg_mr_listnode_t *)
+                                                  victim);
+                    MPID_nem_ib_com_reg_mr_unlink(&(victim->g_lru));
+
+                    ibcom_errno = MPID_nem_ib_com_dereg_mr(victim->mr);
+                    if (ibcom_errno) {
+                        printf("mrcache,MPID_nem_ib_com_dereg_mr\n");
+                        afree(e, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
+                        goto fn_fail;
+                    }
+                    dereg_total += (unsigned long long) victim->len;
+                    afree(victim, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
+                    num_global_cache--;
+
+                    /* end loop if the total length released exceeds the requested */
+                    if (dereg_total > len_aligned) {
+                        dprintf("ib_com_reg_mr_fetch,dereg=%llu,len=%ld\n", dereg_total,
+                                len_aligned);
+                        /* re-registration */
+                        ibcom_errno =
+                            MPID_nem_ib_com_reg_mr(addr_aligned, len_aligned, &e->mr,
+                                                   additional_flags);
+                        if (ibcom_errno == 0) {
+                            /* ibv_reg_mr success */
+                            reg_success = 1;
+                            break;
+                        }
+                    }
+                }
+            }
+
+            if (reg_success == 0) {
+                fprintf(stderr, "mrcache,MPID_nem_ib_com_reg_mr,failed\n");
+                afree(e, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
+                goto fn_fail;
+            }
+#else
             /* deregister memory region. The value of 'num_global_cache' means the number of global-cached.
              * delete 5 percents of global-cached */
             int i;
-            int del_num = num_global_cache / 20;
+            int del_num = (num_global_cache + 19) / 20;
             struct MPID_nem_ib_com_reg_mr_cache_entry_t *victim;
 
             dprintf("mrcache,MPID_nem_ib_com_reg_mr,ENOMEM,del_num(%d)\n", del_num);
@@ -286,6 +320,7 @@ struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
                 afree(e, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
                 goto fn_fail;
             }
+#endif
         }
         else {
             /* errno is not ENOMEM */
@@ -338,7 +373,10 @@ struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
     //__lru_queue_display();
 
   fn_exit:
-    return e->mr;
+    if (mode == MPID_NEM_IB_COM_REG_MR_STICKY)
+        return e->mr;
+    else
+        return e;
   fn_fail:
     goto fn_exit;
 #endif
@@ -358,6 +396,11 @@ static void MPID_nem_ib_com_reg_mr_dereg(struct ibv_mr *mr)
     //e->refc, offset);
 }
 #endif
+void MPID_nem_ib_com_reg_mr_release(struct MPID_nem_ib_com_reg_mr_cache_entry_t *entry)
+{       /* Drop one reference on a cache entry; an entry with refc == 0 may be evicted by reg_mr_fetch. */
+    entry->refc--;
+    MPIU_Assert(entry->refc >= 0);      /* check this entry's own counter, which was just decremented */
+}
 
 int MPID_nem_ib_com_register_cache_init()
 {
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
index 69615c1..152f33c 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
@@ -253,10 +253,14 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
             sreq->ch.s_cookie = s_cookie_buf;
 
             s_cookie_buf->tail = *((uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint8_t)));
-
-            struct ibv_mr *mr =
-                MPID_nem_ib_com_reg_mr_fetch(write_from_buf, data_sz, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
-            MPIU_ERR_CHKANDJUMP(!mr, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
+            /* put IB rkey */
+            struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
+                MPID_nem_ib_com_reg_mr_fetch(write_from_buf, data_sz, 0,
+                                             MPID_NEM_IB_COM_REG_MR_GLOBAL);
+            MPIU_ERR_CHKANDJUMP(!mr_cache, mpi_errno, MPI_ERR_OTHER,
+                                "**MPID_nem_ib_com_reg_mr_fetch");
+            struct ibv_mr *mr = mr_cache->mr;
+            REQ_FIELD(sreq, lmt_mr_cache) = (void *) mr_cache;
 #ifdef HAVE_LIBDCFA
             s_cookie_buf->addr = (void *) mr->host_addr;
 #else
@@ -816,9 +820,14 @@ static int MPID_nem_ib_SendNoncontig_core(MPIDI_VC_t * vc, MPID_Request * sreq,
             sreq->ch.s_cookie = s_cookie_buf;
 
             s_cookie_buf->tail = *((uint8_t *) ((uint8_t *) write_from_buf + last - sizeof(uint8_t)));
-
-            struct ibv_mr *mr = MPID_nem_ib_com_reg_mr_fetch(write_from_buf, last, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
-            MPIU_ERR_CHKANDJUMP(!mr, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
+            /* put IB rkey */
+            struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
+                MPID_nem_ib_com_reg_mr_fetch(write_from_buf, last, 0,
+                                             MPID_NEM_IB_COM_REG_MR_GLOBAL);
+            MPIU_ERR_CHKANDJUMP(!mr_cache, mpi_errno, MPI_ERR_OTHER,
+                                "**MPID_nem_ib_com_reg_mr_fetch");
+            struct ibv_mr *mr = mr_cache->mr;
+            REQ_FIELD(sreq, lmt_mr_cache) = (void *) mr_cache;
 #ifdef HAVE_LIBDCFA
             s_cookie_buf->addr = (void *) mr->host_addr;
 #else

http://git.mpich.org/mpich.git/commitdiff/dc5df5b0d05cf5ab313a1386ca18b709069876b5

commit dc5df5b0d05cf5ab313a1386ca18b709069876b5
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date:   Fri Sep 5 13:39:59 2014 +0900

    Divide memory area to segments when registering
    
    There is a maximum message size supported by an HCA port, so it is necessary
    to divide a memory area into segments when the message size exceeds that maximum.

diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt b/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
index a7f7adc..f11ac62 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
@@ -42,6 +42,7 @@
 **MPID_nem_ib_lmt_done_recv:MPID_nem_ib_lmt_done_recv failed
 **MPID_nem_ib_lmt_done_send:MPID_nem_ib_lmt_done_send failed
 **MPID_nem_ib_lmt_send_GET_DONE:MPID_nem_ib_lmt_send_GET_DONE failed
+**MPID_nem_ib_lmt_send_RTS:MPID_nem_ib_lmt_send_RTS failed
 **MPID_nem_ib_npollingset:MPID_nem_ib_npollingset failed
 **MPID_nem_ib_poll:MPID_nem_ib_poll failed
 **MPID_nem_ib_poll_eager:MPID_nem_ib_poll_eager failed
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
index 5f4ec31..59b967d 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
@@ -1523,7 +1523,7 @@ int MPID_nem_ib_com_isend(int condesc,
 #if 1
     MPID_nem_ib_rc_send_request *wrap_wr_id = MPIU_Malloc(sizeof(MPID_nem_ib_rc_send_request));
     wrap_wr_id->wr_id = wr_id;
-    wrap_wr_id->mf = 0;
+    wrap_wr_id->mf = MPID_NEM_IB_LAST_PKT;
 
     conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr_id = (uint64_t) wrap_wr_id;
 #else
@@ -1948,14 +1948,13 @@ int MPID_nem_ib_com_udrecv(int condesc)
 }
 
 int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, long sz_data, uint32_t rkey,
-                          void *laddr, int *post_num)
+                          void *laddr, int last)
 {
     MPID_nem_ib_com_t *conp;
     int ibcom_errno = 0;
     struct ibv_send_wr *bad_wr;
     int ib_errno;
     int num_sge = 0;
-    int i;
 
     dprintf("MPID_nem_ib_com_lrecv,enter,raddr=%p,sz_data=%ld,laddr=%p\n", raddr, sz_data, laddr);
 
@@ -1970,79 +1969,12 @@ int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, long sz_data
     MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_data, -1,
                                    dprintf("MPID_nem_ib_com_lrecv,ibv_reg_mr_fetch failed\n"));
 
-#if 1
-    *post_num = 1;
-
-    /* Type of max_msg_sz is uint32_t. */
-    if (sz_data > (long) conp->icom_pattr.max_msg_sz) {
-        *post_num +=  sz_data / (long)conp->icom_pattr.max_msg_sz;
-    }
-
-    for (i = 0; i < *post_num; i++) {
-        MPID_nem_ib_rc_send_request *wrap_wr_id = MPIU_Malloc(sizeof(MPID_nem_ib_rc_send_request));
-        wrap_wr_id->wr_id = wr_id;
-
-        if (i == *post_num - 1)
-            wrap_wr_id->mf = 0; /* end of packet */
-        else
-            wrap_wr_id->mf = 1;
-
-        dprintf("MPID_nem_ib_com_lrecv,mf=%d,post=%d/%d\n", wrap_wr_id->mf, i + 1, *post_num);
-
-        num_sge = 0;
-
-#ifdef HAVE_LIBDCFA
-        conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].mic_addr =
-            (uint64_t) laddr + (i * conp->icom_pattr.max_msg_sz);
-        conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].addr =
-            mr_data->host_addr + (i * conp->icom_pattr.max_msg_sz) + ((uint64_t) laddr -
-                                                                      (uint64_t) laddr);
-#else
-        conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].addr =
-            (uint64_t) laddr + (i * conp->icom_pattr.max_msg_sz);
-#endif
-        if (sz_data > (long) conp->icom_pattr.max_msg_sz) {
-            if (i == *post_num - 1) {
-                conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].length =
-                    (uint32_t) (sz_data - (long) conp->icom_pattr.max_msg_sz * i);
-            }
-            else {
-                conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].length =
-                    conp->icom_pattr.max_msg_sz;
-            }
-        }
-        else {
-            conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].length = sz_data;
-        }
-        conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].lkey = mr_data->lkey;
-        num_sge += 1;
-
-        conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].num_sge = num_sge;
-        conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].wr_id = (uint64_t) wrap_wr_id;
-        conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].wr.rdma.remote_addr =
-            (uint64_t) raddr + (i * conp->icom_pattr.max_msg_sz);
-        conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].wr.rdma.rkey = rkey;
-
-#ifdef HAVE_LIBDCFA
-        ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR]);
-        MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
-                                       dprintf("MPID_nem_ib_com_lrecv, ibv_post_send, rc=%d\n",
-                                               ib_errno));
-#else
-        ib_errno =
-            ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR], &bad_wr);
-        MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
-                                       printf
-                                       ("MPID_nem_ib_com_lrecv, ibv_post_send, rc=%d, bad_wr=%p\n",
-                                        ib_errno, bad_wr));
-#endif
+    MPID_nem_ib_rc_send_request *wrap_wr_id = MPIU_Malloc(sizeof(MPID_nem_ib_rc_send_request));
+    wrap_wr_id->wr_id = wr_id;
+    wrap_wr_id->mf = last;
 
-        /* other commands can be executed before RDMA-rd command */
-        /* see the "Ordering and the Fence Indicator" section in "InfiniBand Architecture" by William T. Futral */
+    num_sge = 0;
 
-        conp->ncom += 1;
-    }
-#else
     /* Erase magic, super bug!! */
     //((MPID_nem_ib_netmod_trailer_t*)(laddr + sz_data - sizeof(MPID_nem_ib_netmod_trailer_t)))->magic = 0;
 #ifdef HAVE_LIBDCFA
@@ -2057,7 +1989,7 @@ int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, long sz_data
     num_sge += 1;
 
     conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].num_sge = num_sge;
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].wr_id = wr_id;
+    conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].wr_id = (uint64_t) wrap_wr_id;
     conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].wr.rdma.remote_addr = (uint64_t) raddr;
     conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].wr.rdma.rkey = rkey;
 
@@ -2080,7 +2012,6 @@ int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, long sz_data
     conp->after_rdma_rd = 1;
 #endif
     conp->ncom += 1;
-#endif
 
   fn_exit:
     return ibcom_errno;
@@ -2128,7 +2059,7 @@ int MPID_nem_ib_com_put_lmt(int condesc, uint64_t wr_id, void *raddr, int sz_dat
 #if 1
     MPID_nem_ib_rc_send_request *wrap_wr_id = MPIU_Malloc(sizeof(MPID_nem_ib_rc_send_request));
     wrap_wr_id->wr_id = wr_id;
-    wrap_wr_id->mf = 0;
+    wrap_wr_id->mf = MPID_NEM_IB_LAST_PKT;
 
     conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].wr_id = (uint64_t) wrap_wr_id;
 #else
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
index 180a1fc..88e871f 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
@@ -358,6 +358,11 @@ typedef struct {
     int mf;                     /* more fragment (0 means the end of packet) */
 } MPID_nem_ib_rc_send_request;
 
+#define MPID_NEM_IB_LMT_LAST_PKT        0
+#define MPID_NEM_IB_LMT_SEGMENT_LAST    1
+#define MPID_NEM_IB_LMT_PART_OF_SEGMENT 2
+#define MPID_NEM_IB_LAST_PKT            MPID_NEM_IB_LMT_LAST_PKT
+
 /* Ring-buffer to which a remote note RDMA-writes */
 #define MPID_NEM_IB_NRINGBUF 64
 #define MPID_NEM_IB_RINGBUF_NSLOT 16
@@ -539,7 +544,7 @@ extern int MPID_nem_ib_com_udsend(int condesc, union ibv_gid *remote_gid, uint16
                                   uint32_t remote_qpn, uint32_t imm_data, uint64_t wr_id);
 extern int MPID_nem_ib_com_udrecv(int condesc);
 extern int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, long sz_data,
-                                 uint32_t rkey, void *laddr, int *post_num);
+                                 uint32_t rkey, void *laddr, int last);
 extern int MPID_nem_ib_com_put_lmt(int condesc, uint64_t wr_id, void *raddr, int sz_data,
                                    uint32_t rkey, void *laddr);
 extern int MPID_nem_ib_com_scratch_pad_recv(int condesc, int sz_data);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
index 2d462eb..576c61f 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
@@ -67,7 +67,7 @@ typedef struct {
     struct MPID_Request *sendq_next;    /* for sendq */
     void *lmt_raddr;            /* remember this for sendq, it might be better to use sreq->dev.iov[0].MPID_IOV_BUF instead */
     uint32_t lmt_rkey;          /* remember this for sendq, survive over lrecv and referenced when dequeueing from sendq */
-    uint32_t lmt_szsend;        /* remember this for sendq */
+    long lmt_szsend;            /* remember this for sendq */
     uint8_t lmt_tail, lmt_sender_tail, lmt_receiver_tail;       /* survive over lrecv and referenced when polling */
     MPI_Aint lmt_dt_true_lb;    /* to locate the last byte of receive buffer */
     void *lmt_write_to_buf;     /* user buffer or temporary buffer for pack and remember it for lmt_orderq */
@@ -75,6 +75,16 @@ typedef struct {
     void *buf_from;             /* address of RDMA write from buffer */
     uint32_t buf_from_sz;       /* size of RDMA write from buffer. It's set on sending, referenced on freeing */
     uint8_t ask;                /* Issued ask or not on send */
+    union {
+        void *from;
+        void *to;
+    } buf;
+    uint32_t max_msg_sz;        /* remember this for sendq, max message size */
+    MPIDI_msg_sz_t data_sz;
+    int seg_seq_num;            /* sequence number of segments */
+    int seg_num;                /* number of segments */
+    int last;                   /* flag for last packet or not */
+    void *lmt_mr_cache;         /* address of mr_cache_entry */
 } MPID_nem_ib_req_area;
 
 /* macro for secret area in req */
@@ -602,8 +612,8 @@ int MPID_nem_ib_send_progress(MPIDI_VC_t * vc);
 /* CH3--lmt send/recv functions */
 int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_pkt,
                                  struct MPID_Request *req);
-int MPID_nem_ib_lmt_start_recv_core(struct MPID_Request *req, void *raddr, uint32_t rkey,
-                                    void *write_to_buf);
+int MPID_nem_ib_lmt_start_recv_core(struct MPID_Request *req, void *raddr, uint32_t rkey, long len,
+                                    void *write_to_buf, uint32_t max_msg_sz, int end);
 int MPID_nem_ib_lmt_start_recv(struct MPIDI_VC *vc, struct MPID_Request *req, MPID_IOV s_cookie);
 int MPID_nem_ib_lmt_handle_cookie(struct MPIDI_VC *vc, struct MPID_Request *req, MPID_IOV cookie);
 int MPID_nem_ib_lmt_switch_send(struct MPIDI_VC *vc, struct MPID_Request *req);
@@ -692,6 +702,9 @@ typedef struct {
     int seq_num_tail;           /* notify RDMA-write-to buffer occupation */
 #endif
     uint8_t tail;               /* last word of payload */
+    uint32_t max_msg_sz;        /* max message size */
+    int seg_seq_num;
+    int seg_num;
 } MPID_nem_ib_lmt_cookie_t;
 
 typedef struct {
@@ -700,6 +713,7 @@ typedef struct {
     uint8_t tail;               /* last word of payload */
     int len;
     MPI_Request sender_req_id;  /* request id of sender side */
+    uint32_t max_msg_sz;        /* max message size */
 } MPID_nem_ib_rma_lmt_cookie_t;
 
 typedef enum MPID_nem_ib_pkt_subtype {
@@ -712,6 +726,7 @@ typedef enum MPID_nem_ib_pkt_subtype {
     MPIDI_NEM_IB_PKT_GET,
     MPIDI_NEM_IB_PKT_GET_RESP,
     MPIDI_NEM_IB_PKT_LMT_GET_DONE,
+    MPIDI_NEM_IB_PKT_LMT_RTS,
     MPIDI_NEM_IB_PKT_REQ_SEQ_NUM,
     MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM,
     MPIDI_NEM_IB_PKT_CHG_RDMABUF_OCC_NOTIFY_STATE,
@@ -734,8 +749,20 @@ typedef struct MPID_nem_ib_pkt_lmt_get_done {
     /* additional field */
     MPI_Request req_id;
     int16_t seq_num_tail;
+    MPI_Request receiver_req_id;
 } MPID_nem_ib_pkt_lmt_get_done_t;
 
+typedef struct MPID_nem_ib_pkt_lmt_rts {       /* packet type cast by the MPID_nem_ib_lmt_send_RTS macro */
+    MPIDI_CH3_Pkt_type_t type;  /* the send macro initializes the packet with MPIDI_NEM_PKT_NETMOD */
+    unsigned subtype;           /* presumably MPIDI_NEM_IB_PKT_LMT_RTS -- TODO confirm in the send macro */
+    /* additional fields */
+    MPI_Request req_id;         /* NOTE(review): which side's request this identifies is not visible here -- confirm */
+    int16_t seq_num_tail;       /* presumably SR occupancy information, as in the GET_DONE packet -- confirm */
+    void *addr;                 /* presumably the address of the segment to transfer -- confirm */
+    uint32_t rkey;              /* presumably the IB rkey for addr -- confirm */
+    int seg_seq_num;            /* presumably the sequence number of the segment -- confirm */
+} MPID_nem_ib_pkt_lmt_rts_t;
+
 /* derived from MPID_nem_pkt_netmod_t */
 typedef struct MPID_nem_ib_pkt_req_seq_num_t {
     MPIDI_CH3_Pkt_type_t type;
@@ -784,6 +811,8 @@ int MPID_nem_ib_PktHandler_lmt_done(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                                     MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp);
 int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                                      MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp);
+int MPID_nem_ib_pkt_RTS_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
+                                MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp);
 int MPID_nem_ib_PktHandler_req_seq_num(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                                        MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp);
 int MPID_nem_ib_PktHandler_reply_seq_num(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
@@ -886,6 +915,7 @@ int pkt_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * bu
         MPIDI_Pkt_init(_done_pkt, MPIDI_NEM_PKT_NETMOD); \
         _done_pkt->subtype = MPIDI_NEM_IB_PKT_LMT_GET_DONE;\
         _done_pkt->req_id = (rreq)->ch.lmt_req_id; \
+        _done_pkt->receiver_req_id = (rreq)->handle; \
             /* embed SR occupancy information */ \
         _done_pkt->seq_num_tail = VC_FIELD(vc, ibcom->rsr_seq_num_tail); \
  \
@@ -902,6 +932,33 @@ int pkt_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * bu
         }                                                                                                       \
     } while (0)
 
+#define MPID_nem_ib_lmt_send_RTS(vc, _req_id, _addr, _rkey, _seg_seq_num) do {          \
+        MPID_PKT_DECL_CAST(_upkt, MPID_nem_ib_pkt_lmt_rts_t, _rts_pkt);                                        \
+        MPID_Request *_rts_req;                                                                                \
+        /* expands in a scope that provides mpi_errno; errors leave via MPIU_ERR_CHKANDJUMP */                 \
+        MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending rndv RTS segment packet"); \
+        MPIDI_Pkt_init(_rts_pkt, MPIDI_NEM_PKT_NETMOD); \
+        _rts_pkt->subtype = MPIDI_NEM_IB_PKT_LMT_RTS;\
+        _rts_pkt->req_id = _req_id; \
+        _rts_pkt->addr = _addr; \
+        _rts_pkt->rkey = _rkey; \
+        _rts_pkt->seg_seq_num = _seg_seq_num; \
+            /* embed SR occupancy information (the current rsr_seq_num_tail) */ \
+        _rts_pkt->seq_num_tail = VC_FIELD(vc, ibcom->rsr_seq_num_tail); \
+ \
+            /* remember the tail value that was just sent */ \
+        VC_FIELD(vc, ibcom->rsr_seq_num_tail_last_sent) = VC_FIELD(vc, ibcom->rsr_seq_num_tail); \
+                                                                                                                \
+        mpi_errno = MPIDI_CH3_iStartMsg((vc), _rts_pkt, sizeof(*_rts_pkt), &_rts_req);                       \
+        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_lmt_send_RTS");                                  \
+        if (_rts_req != NULL)                                                                                  \
+        {                                                                                                       \
+            MPIU_ERR_CHKANDJUMP(_rts_req->status.MPI_ERROR, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_lmt_send_RTS");            \
+            MPID_Request_release(_rts_req);                                                                    \
+            dprintf("send_rts,release,req=%p\n", _rts_req);       \
+        }                                                                                                       \
+    } while (0)
+
 #define MPID_nem_ib_lmt_send_PKT_LMT_DONE(vc, rreq) do {                                                                   \
         MPID_PKT_DECL_CAST(_upkt, MPID_nem_ib_pkt_lmt_get_done_t, _done_pkt);                                          \
         MPID_Request *_done_req;                                                                                \
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
index 4f8b5bc..a8382fa 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
@@ -959,6 +959,7 @@ int MPID_nem_ib_vc_init(MPIDI_VC_t * vc)
     MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_GET_RESP] = MPID_nem_ib_PktHandler_GetResp;
     MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_ACCUMULATE] = MPID_nem_ib_PktHandler_Accumulate;
     MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_LMT_GET_DONE] = MPID_nem_ib_pkt_GET_DONE_handler;
+    MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_LMT_RTS] = MPID_nem_ib_pkt_RTS_handler;
     MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_REQ_SEQ_NUM] = MPID_nem_ib_PktHandler_req_seq_num;
     MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM] = MPID_nem_ib_PktHandler_reply_seq_num;
     MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_CHG_RDMABUF_OCC_NOTIFY_STATE] =
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
index ed40e64..8f878fb 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
@@ -110,9 +110,35 @@ int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_p
     vc_ib->ibcom->rsr_seq_num_tail_last_sent = vc_ib->ibcom->rsr_seq_num_tail;
 #endif
 
+    int post_num;
+    uint32_t max_msg_sz;
+    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
+    MPID_nem_ib_com_get_info_conn(vc_ib->sc->fd, MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ,
+                                  &max_msg_sz, sizeof(uint32_t));
+
+    /* Type of max_msg_sz is uint32_t. */
+    post_num = (data_sz + (long) max_msg_sz - 1) / (long) max_msg_sz;
+
+    s_cookie_buf->max_msg_sz = max_msg_sz;
+    s_cookie_buf->seg_seq_num = 1;
+    s_cookie_buf->seg_num = post_num;
+
+    REQ_FIELD(req, buf.from) = write_from_buf;
+    REQ_FIELD(req, data_sz) = data_sz;
+    REQ_FIELD(req, seg_seq_num) = 1;    // only send 1st-segment, even if there are some segments.
+    REQ_FIELD(req, seg_num) = post_num;
+    REQ_FIELD(req, max_msg_sz) = max_msg_sz;
+
+    long length;
+    if (post_num > 1) {
+        length = max_msg_sz;
+    }
+    else {
+        length = data_sz;
+    }
     /* put IB rkey */
     struct ibv_mr *mr =
-        MPID_nem_ib_com_reg_mr_fetch(write_from_buf, data_sz, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
+        MPID_nem_ib_com_reg_mr_fetch(write_from_buf, length, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
     MPIU_ERR_CHKANDJUMP(!mr, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
 #ifdef HAVE_LIBDCFA
     s_cookie_buf->addr = (void *) mr->host_addr;
@@ -139,32 +165,73 @@ int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_p
 #define FUNCNAME MPID_nem_ib_lmt_start_recv_core
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_lmt_start_recv_core(struct MPID_Request *req, void *raddr, uint32_t rkey,
-                                    void *write_to_buf)
+int MPID_nem_ib_lmt_start_recv_core(struct MPID_Request *req, void *raddr, uint32_t rkey, long len,
+                                    void *write_to_buf, uint32_t max_msg_sz, int end)
 {
     int mpi_errno = MPI_SUCCESS;
     int ibcom_errno;
     struct MPIDI_VC *vc = req->ch.vc;
     MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
+    int i;
+    int divide;
+    int posted_num;
+    int last;
+    uint32_t r_max_msg_sz;      /* responder's max_msg_sz */
+    void *write_pos;
+    void *addr;
+    long data_sz;
+    MPIDI_msg_sz_t rest_data_sz;
 
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);
 
-#if 1
-    int post_num = 1;
+    MPID_nem_ib_com_get_info_conn(vc_ib->sc->fd, MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ,
+                                  &r_max_msg_sz, sizeof(uint32_t));
 
-    ibcom_errno =
-        MPID_nem_ib_com_lrecv(vc_ib->sc->fd, (uint64_t) req, raddr, req->ch.lmt_data_sz, rkey,
-                              write_to_buf, &post_num);
-    MPID_nem_ib_ncqe += post_num;
-#else
-    ibcom_errno =
-        MPID_nem_ib_com_lrecv(vc_ib->sc->fd, (uint64_t) req, raddr, req->ch.lmt_data_sz, rkey,
-                              write_to_buf);
-    MPID_nem_ib_ncqe += 1;
-#endif
+    divide = (max_msg_sz + r_max_msg_sz - 1) / r_max_msg_sz;
+
+    write_pos = write_to_buf;
+    posted_num = 0;
+    last = MPID_NEM_IB_LMT_PART_OF_SEGMENT;
+    rest_data_sz = len;
+    addr = raddr;
+
+    for (i = 0; i < divide; i++) {
+        if (i == divide - 1)
+            data_sz = max_msg_sz - i * r_max_msg_sz;
+        else
+            data_sz = r_max_msg_sz;
+
+        if (i == divide - 1) {
+            if (end)
+                last = MPID_NEM_IB_LMT_LAST_PKT;        /* last part of last segment packet */
+            else
+                last = MPID_NEM_IB_LMT_SEGMENT_LAST;    /* last part of this segment */
+
+            /* last data may be smaller than initiator's max_msg_sz */
+            if (rest_data_sz < max_msg_sz)
+                data_sz = rest_data_sz;
+        }
+
+        ibcom_errno =
+            MPID_nem_ib_com_lrecv(vc_ib->sc->fd, (uint64_t) req, addr, data_sz, rkey,
+                                  write_pos, last);
+        MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_lrecv");
+
+        /* update position */
+        write_pos = (void *) ((char *) write_pos + data_sz);
+        addr = (void *) ((char *) addr + data_sz);
+
+        /* update rest data size */
+        rest_data_sz -= data_sz;
+
+        /* count request number */
+        posted_num++;
+    }
+
+    MPIU_Assert(rest_data_sz == 0);
+    MPID_nem_ib_ncqe += posted_num;
     //dprintf("start_recv,ncqe=%d\n", MPID_nem_ib_ncqe);
-    MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_lrecv");
     dprintf("lmt_start_recv_core,MPID_nem_ib_ncqe=%d\n", MPID_nem_ib_ncqe);
     dprintf
         ("lmt_start_recv_core,req=%p,sz=%ld,write_to_buf=%p,lmt_pack_buf=%p,user_buf=%p,raddr=%p,rkey=%08x,tail=%p=%02x\n",
@@ -174,11 +241,7 @@ int MPID_nem_ib_lmt_start_recv_core(struct MPID_Request *req, void *raddr, uint3
     //fflush(stdout);
 
 #ifdef MPID_NEM_IB_LMT_GET_CQE
-#if 1
-    MPID_nem_ib_ncqe_to_drain += post_num;      /* use CQE instead of polling */
-#else
-    MPID_nem_ib_ncqe_to_drain += 1;     /* use CQE instead of polling */
-#endif
+    MPID_nem_ib_ncqe_to_drain += posted_num;    /* use CQE instead of polling */
 #else
     /* drain_scq and ib_poll is not ordered, so both can decrement ref_count */
     MPIR_Request_add_ref(req);
@@ -242,6 +305,8 @@ int MPID_nem_ib_lmt_start_recv(struct MPIDI_VC *vc, struct MPID_Request *req, MP
         write_to_buf = REQ_FIELD(req, lmt_pack_buf);
     }
 
+    REQ_FIELD(req, buf.to) = write_to_buf;
+
 #ifdef MPID_NEM_IB_LMT_GET_CQE
 #else
     /* unmark magic */
@@ -260,12 +325,25 @@ int MPID_nem_ib_lmt_start_recv(struct MPIDI_VC *vc, struct MPID_Request *req, MP
 
     //dprintf("lmt_start_recv,sendq_empty=%d,ncom=%d,ncqe=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY, MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
 
+    int last = 1;
+    long length = req->ch.lmt_data_sz;
+
+    if (s_cookie_buf->seg_seq_num != s_cookie_buf->seg_num) {
+        last = 0;
+        length = s_cookie_buf->max_msg_sz;
+    }
+
+    REQ_FIELD(req, max_msg_sz) = s_cookie_buf->max_msg_sz; /* store initiator's max_msg_sz */
+    REQ_FIELD(req, seg_num) = s_cookie_buf->seg_num; /* store number of segments */
+
     /* try to issue RDMA-read command */
     int slack = 1;              /* slack for control packet bringing sequence number */
     if (MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
         vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
         MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack) {
-        mpi_errno = MPID_nem_ib_lmt_start_recv_core(req, s_cookie_buf->addr, s_cookie_buf->rkey, write_to_buf); /* fast path not storing raddr and rkey */
+        mpi_errno =
+            MPID_nem_ib_lmt_start_recv_core(req, s_cookie_buf->addr, s_cookie_buf->rkey, length,
+                                            write_to_buf, s_cookie_buf->max_msg_sz, last);
         if (mpi_errno) {
             MPIU_ERR_POP(mpi_errno);
         }
@@ -281,6 +359,8 @@ int MPID_nem_ib_lmt_start_recv(struct MPIDI_VC *vc, struct MPID_Request *req, MP
         REQ_FIELD(req, lmt_raddr) = s_cookie_buf->addr;
         REQ_FIELD(req, lmt_rkey) = s_cookie_buf->rkey;
         REQ_FIELD(req, lmt_write_to_buf) = write_to_buf;
+        REQ_FIELD(req, lmt_szsend) = length;
+        REQ_FIELD(req, last) = last;
 
         MPID_nem_ib_sendq_enqueue(&vc_ib->sendq, req);
     }
@@ -322,6 +402,7 @@ int MPID_nem_ib_lmt_start_recv(struct MPIDI_VC *vc, struct MPID_Request *req, MP
     goto fn_exit;
 }
 
+#if 0   /* unused function */
 /* fall-back to lmt-get if end-flag of send-buf has the same value as the end-flag of recv-buf */
 #undef FUNCNAME
 #define FUNCNAME MPID_nem_ib_lmt_switch_send
@@ -396,6 +477,7 @@ int MPID_nem_ib_lmt_switch_send(struct MPIDI_VC *vc, struct MPID_Request *req)
   fn_fail:
     goto fn_exit;
 }
+#endif
 
 /* when cookie is received in the middle of the lmt */
 #undef FUNCNAME
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
index 25b9d57..050aefd 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
@@ -332,7 +332,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
 #if defined(MPID_NEM_IB_LMT_GET_CQE)
 
             /* end of packet */
-            if (req_wrap->mf == 0) {
+            if (req_wrap->mf == MPID_NEM_IB_LMT_LAST_PKT) {
                 /* unpack non-contiguous dt */
                 int is_contig;
                 MPID_Datatype_is_contig(req->dev.datatype, &is_contig);
@@ -370,6 +370,9 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
                 /* send done to sender. vc is stashed in MPID_nem_ib_lmt_start_recv (in ib_lmt.c) */
                 MPID_nem_ib_lmt_send_GET_DONE(req->ch.vc, req);
             }
+            else if (req_wrap->mf == MPID_NEM_IB_LMT_SEGMENT_LAST) {
+                MPID_nem_ib_lmt_send_GET_DONE(req->ch.vc, req);
+            }
 #endif
             /* unmark "lmt is going on" */
 
@@ -381,7 +384,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
             dprintf("drain_scq,rdma-read,ncqe=%d\n", MPID_nem_ib_ncqe);
 
 #ifdef MPID_NEM_IB_LMT_GET_CQE
-            if (req_wrap->mf == 0) {
+            if (req_wrap->mf == MPID_NEM_IB_LMT_LAST_PKT) {
                 dprintf("drain_scq,GET_CQE,Request_complete\n");
                 /* mark completion on rreq */
                 MPIDI_CH3U_Request_complete(req);
@@ -437,7 +440,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
             MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
 
             /* end of packet */
-            if (req_wrap->mf == 0) {
+            if (req_wrap->mf == MPID_NEM_IB_LMT_LAST_PKT) {
                 MPIDI_msg_sz_t data_len = req->ch.lmt_data_sz;
                 MPI_Aint type_size;
 
@@ -468,7 +471,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
             MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
 
             /* end of packet */
-            if (req_wrap->mf == 0) {
+            if (req_wrap->mf == MPID_NEM_IB_LMT_LAST_PKT) {
                 MPIDI_msg_sz_t data_len = req->ch.lmt_data_sz;
                 int complete = 0;
                 mpi_errno =
@@ -500,7 +503,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
                  cqe[i].opcode == IBV_WC_RDMA_READ) {
             MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
             /* end of packet */
-            if (req_wrap->mf == 0) {
+            if (req_wrap->mf == MPID_NEM_IB_LMT_LAST_PKT) {
                 MPIDI_msg_sz_t buflen = req->ch.lmt_data_sz;
                 char *buf = (char *) REQ_FIELD(req, lmt_pack_buf);
                 int complete = 0;
@@ -541,7 +544,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
             MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
 
             /* end of packet */
-            if (req_wrap->mf == 0) {
+            if (req_wrap->mf == MPID_NEM_IB_LMT_LAST_PKT) {
                 MPIDI_msg_sz_t data_len = req->ch.lmt_data_sz;
                 int complete = 0;
                 mpi_errno =
@@ -573,7 +576,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
                  cqe[i].opcode == IBV_WC_RDMA_READ) {
             MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
             /* end of packet */
-            if (req_wrap->mf == 0) {
+            if (req_wrap->mf == MPID_NEM_IB_LMT_LAST_PKT) {
                 MPIDI_msg_sz_t buflen = req->ch.lmt_data_sz;
                 char *buf = (char *) REQ_FIELD(req, lmt_pack_buf);
                 int complete = 0;
@@ -1856,7 +1859,10 @@ int MPID_nem_ib_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     if (MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
         vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
         MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack) {
-        mpi_errno = MPID_nem_ib_lmt_start_recv_core(req, s_cookie_buf->addr, s_cookie_buf->rkey, write_to_buf); /* fast path not storing raddr and rkey */
+        mpi_errno =
+            MPID_nem_ib_lmt_start_recv_core(req, s_cookie_buf->addr, s_cookie_buf->rkey,
+                                            s_cookie_buf->len, write_to_buf,
+                                            s_cookie_buf->max_msg_sz, 1);
         if (mpi_errno) {
             MPIU_ERR_POP(mpi_errno);
         }
@@ -1872,6 +1878,9 @@ int MPID_nem_ib_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         REQ_FIELD(req, lmt_raddr) = s_cookie_buf->addr;
         REQ_FIELD(req, lmt_rkey) = s_cookie_buf->rkey;
         REQ_FIELD(req, lmt_write_to_buf) = write_to_buf;
+        REQ_FIELD(req, lmt_szsend) = s_cookie_buf->len;
+        REQ_FIELD(req, max_msg_sz) = s_cookie_buf->max_msg_sz;
+        REQ_FIELD(req, last) = 1;       /* not support segmentation */
 
         /* set for send_progress */
         MPIDI_Request_set_msg_type(req, MPIDI_REQUEST_RNDV_MSG);
@@ -1992,7 +2001,10 @@ int MPID_nem_ib_PktHandler_Accumulate(MPIDI_VC_t * vc,
     if (MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
         vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
         MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack) {
-        mpi_errno = MPID_nem_ib_lmt_start_recv_core(req, s_cookie_buf->addr, s_cookie_buf->rkey, write_to_buf); /* fast path not storing raddr and rkey */
+        mpi_errno =
+            MPID_nem_ib_lmt_start_recv_core(req, s_cookie_buf->addr, s_cookie_buf->rkey,
+                                            s_cookie_buf->len, write_to_buf,
+                                            s_cookie_buf->max_msg_sz, 1);
         if (mpi_errno) {
             MPIU_ERR_POP(mpi_errno);
         }
@@ -2008,6 +2020,9 @@ int MPID_nem_ib_PktHandler_Accumulate(MPIDI_VC_t * vc,
         REQ_FIELD(req, lmt_raddr) = s_cookie_buf->addr;
         REQ_FIELD(req, lmt_rkey) = s_cookie_buf->rkey;
         REQ_FIELD(req, lmt_write_to_buf) = write_to_buf;
+        REQ_FIELD(req, lmt_szsend) = s_cookie_buf->len;
+        REQ_FIELD(req, max_msg_sz) = s_cookie_buf->max_msg_sz;
+        REQ_FIELD(req, last) = 1;       /* not support segmentation */
 
         /* set for send_progress */
         MPIDI_Request_set_msg_type(req, MPIDI_REQUEST_RNDV_MSG);
@@ -2129,7 +2144,10 @@ int MPID_nem_ib_PktHandler_GetResp(MPIDI_VC_t * vc,
     if (MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
         vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
         MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack) {
-        mpi_errno = MPID_nem_ib_lmt_start_recv_core(req, s_cookie_buf->addr, s_cookie_buf->rkey, write_to_buf); /* fast path not storing raddr and rkey */
+        mpi_errno =
+            MPID_nem_ib_lmt_start_recv_core(req, s_cookie_buf->addr, s_cookie_buf->rkey,
+                                            s_cookie_buf->len, write_to_buf,
+                                            s_cookie_buf->max_msg_sz, 1);
         if (mpi_errno) {
             MPIU_ERR_POP(mpi_errno);
         }
@@ -2145,6 +2163,9 @@ int MPID_nem_ib_PktHandler_GetResp(MPIDI_VC_t * vc,
         REQ_FIELD(req, lmt_raddr) = s_cookie_buf->addr;
         REQ_FIELD(req, lmt_rkey) = s_cookie_buf->rkey;
         REQ_FIELD(req, lmt_write_to_buf) = write_to_buf;
+        REQ_FIELD(req, lmt_szsend) = s_cookie_buf->len;
+        REQ_FIELD(req, max_msg_sz) = s_cookie_buf->max_msg_sz;
+        REQ_FIELD(req, last) = 1;       /* not support segmentation */
 
         MPID_nem_ib_sendq_enqueue(&vc_ib->sendq, req);
     }
@@ -2219,9 +2240,39 @@ int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc,
         }
         dprintf("get_done_handler,send_progress\n");
         fflush(stdout);
-        MPID_NEM_IB_CHECK_AND_SEND_PROGRESS mpi_errno = vc->ch.lmt_done_send(vc, req);
-        if (mpi_errno)
-            MPIU_ERR_POP(mpi_errno);
+
+        if (REQ_FIELD(req, seg_seq_num) == REQ_FIELD(req, seg_num)) {
+            /* last packet of segments */
+            MPID_NEM_IB_CHECK_AND_SEND_PROGRESS mpi_errno = vc->ch.lmt_done_send(vc, req);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+        }
+        else {
+            /* Send RTS for next segment */
+            REQ_FIELD(req, seg_seq_num) += 1;   /* next segment number */
+            int next_seg_seq_num = REQ_FIELD(req, seg_seq_num);
+
+            uint32_t length;
+            if (next_seg_seq_num == REQ_FIELD(req, seg_num))
+                length = REQ_FIELD(req, data_sz) - (long) (next_seg_seq_num - 1) * REQ_FIELD(req, max_msg_sz);  //length of last segment
+            else
+                length = REQ_FIELD(req, max_msg_sz);
+
+            void *addr =
+                (void *) ((char *) REQ_FIELD(req, buf.from) +
+                          (long) (next_seg_seq_num - 1) * REQ_FIELD(req, max_msg_sz));
+            struct ibv_mr *mr =
+                MPID_nem_ib_com_reg_mr_fetch(addr, length, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
+            MPIU_ERR_CHKANDJUMP(!mr, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
+
+#ifdef HAVE_LIBDCFA
+            void *_addr = mr->host_addr;
+#else
+            void *_addr = addr;
+#endif
+            MPID_nem_ib_lmt_send_RTS(vc, done_pkt->receiver_req_id, _addr, mr->rkey,
+                                     next_seg_seq_num);
+        }
         break;
     default:
         MPIU_ERR_INTERNALANDJUMP(mpi_errno, "unexpected request type");
@@ -2238,6 +2289,78 @@ int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc,
 }
 
 #undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_pkt_RTS_handler
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_pkt_RTS_handler(MPIDI_VC_t * vc,
+                                MPIDI_CH3_Pkt_t * pkt,
+                                MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
+{   /* handles MPIDI_NEM_IB_PKT_LMT_RTS: starts, or queues, the transfer of one follow-up segment */
+    int mpi_errno = MPI_SUCCESS;
+    MPID_nem_ib_pkt_lmt_rts_t *const rts_pkt = (MPID_nem_ib_pkt_lmt_rts_t *) pkt;
+    MPID_Request *req;
+    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
+    dprintf("ib_pkt_RTS_handler,enter\n");
+    *buflen = sizeof(MPIDI_CH3_Pkt_t);  /* NOTE(review): presumably the number of bytes consumed -- confirm */
+    MPID_Request_get_ptr(rts_pkt->req_id, req); /* req_id carries the handle of the local request */
+    MPIU_THREAD_CS_ENTER(LMT,);
+    /* segment k (1-based) is written at offset (k - 1) * max_msg_sz from buf.to */
+    void *write_to_buf =
+        (void *) ((char *) REQ_FIELD(req, buf.to) +
+                  (long) (rts_pkt->seg_seq_num - 1) * REQ_FIELD(req, max_msg_sz));
+
+    int last;
+    long length;
+
+    /* last segment: its length is what remains of lmt_data_sz after the earlier segments */
+    if (rts_pkt->seg_seq_num == REQ_FIELD(req, seg_num)) {
+        last = 1;
+        length =
+            req->ch.lmt_data_sz - (long) (rts_pkt->seg_seq_num - 1) * REQ_FIELD(req, max_msg_sz);
+    }
+    else {      /* any earlier segment is exactly max_msg_sz bytes long */
+        last = 0;
+        length = REQ_FIELD(req, max_msg_sz);
+    }
+    /* try to issue RDMA-read command now: needs an empty sendq and ncom/ncqe below capacity - slack */
+    int slack = 1;              /* slack for control packet bringing sequence number */
+    if (MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
+        vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
+        MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack) {
+        mpi_errno =
+            MPID_nem_ib_lmt_start_recv_core(req, rts_pkt->addr, rts_pkt->rkey, length,
+                                            write_to_buf, REQ_FIELD(req, max_msg_sz),
+                                            last);
+        if (mpi_errno) {
+            MPIU_ERR_POP(mpi_errno);
+        }
+    }
+    else {
+        /* enqueue command into send_queue */
+        dprintf("ib_pkt_RTS_handler, enqueuing,sendq_empty=%d,ncom=%d,ncqe=%d\n",
+                MPID_nem_ib_sendq_empty(vc_ib->sendq),
+                vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
+                MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
+
+        /* stash the MPID_nem_ib_lmt_start_recv_core arguments in the request; MPID_nem_ib_send_progress reads them back after dequeue */
+        REQ_FIELD(req, lmt_raddr) = rts_pkt->addr;
+        REQ_FIELD(req, lmt_rkey) = rts_pkt->rkey;
+        REQ_FIELD(req, lmt_write_to_buf) = write_to_buf;
+        REQ_FIELD(req, lmt_szsend) = length;
+        REQ_FIELD(req, last) = last;
+
+        MPID_nem_ib_sendq_enqueue(&vc_ib->sendq, req);
+    }
+
+    *rreqp = NULL;              /* no request is handed back to the caller on success */
+  fn_exit:
+    MPIU_THREAD_CS_EXIT(LMT,);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
 #define FUNCNAME MPID_nem_ib_PktHandler_req_seq_num
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
index 1ab307b..69615c1 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
@@ -241,12 +241,19 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
 
             void *write_from_buf = data;
 
+            uint32_t max_msg_sz;
+            MPID_nem_ib_com_get_info_conn(vc_ib->sc->fd, MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ,
+                                          &max_msg_sz, sizeof(uint32_t));
+
+            /* RMA : Netmod IB supports only smaller size than max_msg_sz. */
+            MPIU_Assert(data_sz <= max_msg_sz);
+
             MPID_nem_ib_rma_lmt_cookie_t *s_cookie_buf = (MPID_nem_ib_rma_lmt_cookie_t *) MPIU_Malloc(sizeof(MPID_nem_ib_rma_lmt_cookie_t));
 
             sreq->ch.s_cookie = s_cookie_buf;
 
             s_cookie_buf->tail = *((uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint8_t)));
-            /* put IB rkey */
+
             struct ibv_mr *mr =
                 MPID_nem_ib_com_reg_mr_fetch(write_from_buf, data_sz, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
             MPIU_ERR_CHKANDJUMP(!mr, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
@@ -258,6 +265,7 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
             s_cookie_buf->rkey = mr->rkey;
             s_cookie_buf->len = data_sz;
             s_cookie_buf->sender_req_id = sreq->handle;
+            s_cookie_buf->max_msg_sz = max_msg_sz;
 
 	    /* set for ib_com_isend */
 	    prefix = (void *)&pkt_netmod;
@@ -796,14 +804,20 @@ static int MPID_nem_ib_SendNoncontig_core(MPIDI_VC_t * vc, MPID_Request * sreq,
 
             void *write_from_buf = REQ_FIELD(sreq, lmt_pack_buf);
 
+            uint32_t max_msg_sz;
+            MPID_nem_ib_com_get_info_conn(vc_ib->sc->fd, MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ,
+                                          &max_msg_sz, sizeof(uint32_t));
+
+            /* RMA : Netmod IB supports only smaller size than max_msg_sz. */
+            MPIU_Assert(data_sz <= max_msg_sz);
+
             MPID_nem_ib_rma_lmt_cookie_t *s_cookie_buf = (MPID_nem_ib_rma_lmt_cookie_t *) MPIU_Malloc(sizeof(MPID_nem_ib_rma_lmt_cookie_t));
 
             sreq->ch.s_cookie = s_cookie_buf;
 
             s_cookie_buf->tail = *((uint8_t *) ((uint8_t *) write_from_buf + last - sizeof(uint8_t)));
-            /* put IB rkey */
-            struct ibv_mr *mr =
-                MPID_nem_ib_com_reg_mr_fetch(write_from_buf, last, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
+
+            struct ibv_mr *mr = MPID_nem_ib_com_reg_mr_fetch(write_from_buf, last, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
             MPIU_ERR_CHKANDJUMP(!mr, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
 #ifdef HAVE_LIBDCFA
             s_cookie_buf->addr = (void *) mr->host_addr;
@@ -813,6 +827,7 @@ static int MPID_nem_ib_SendNoncontig_core(MPIDI_VC_t * vc, MPID_Request * sreq,
             s_cookie_buf->rkey = mr->rkey;
             s_cookie_buf->len = last;
             s_cookie_buf->sender_req_id = sreq->handle;
+            s_cookie_buf->max_msg_sz = max_msg_sz;
 
 	    /* set for ib_com_isend */
 	    prefix = (void *)&pkt_netmod;
@@ -1321,7 +1336,10 @@ int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
                 mpi_errno =
                     MPID_nem_ib_lmt_start_recv_core(sreq, REQ_FIELD(sreq, lmt_raddr),
                                                     REQ_FIELD(sreq, lmt_rkey), REQ_FIELD(sreq,
-                                                                                         lmt_write_to_buf));
+                                                                                         lmt_szsend),
+                                                    REQ_FIELD(sreq, lmt_write_to_buf),
+                                                    REQ_FIELD(sreq, max_msg_sz), REQ_FIELD(sreq,
+                                                                                           last));
                 if (mpi_errno) {
                     MPIU_ERR_POP(mpi_errno);
                 }

-----------------------------------------------------------------------

Summary of changes:
 .../ch3/channels/nemesis/netmod/ib/errnames.txt    |    1 +
 src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c |  105 +++----------
 src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h |   30 +++-
 src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h  |   63 +++++++-
 src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c  |    1 +
 src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c   |  132 +++++++++++++---
 src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c  |  167 ++++++++++++++++++--
 .../ch3/channels/nemesis/netmod/ib/ib_reg_mr.c     |   85 ++++++++---
 src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c  |   41 ++++-
 9 files changed, 469 insertions(+), 156 deletions(-)


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list