[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.1.1-55-g7d91972
Service Account
noreply at mpich.org
Fri Jul 4 15:37:54 CDT 2014
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".
The branch, master, has been updated
via 7d91972ad4c39e50d7971592625a6ae23a598ae8 (commit)
via c3e1d60bdd4f5f94b357f46e2bba7ba64b2d1971 (commit)
via 6caab150572e0ea65ade269096ec9a28525e68dc (commit)
via 85231ee6e0e1c8b2b69039500a8a864abe608b12 (commit)
via a069e97f945a199ee5737fc727819fd93384a69a (commit)
via 92c811d33ba9d12fbf32aacd032c53f1d2934256 (commit)
via 0e7e956869725cf55227711d155b7680c8b783b8 (commit)
via bb280027d7271cf501df7fddaca66c2b3cc18d7a (commit)
via b5c0c7ef288e9d4fb66cb834bbfe6a1250f5bca5 (commit)
via 8aaede8f58bd07ea790ef112131633a96528ec9d (commit)
via 0361551c718eee9d3bd1514f0e18a4e270e75bea (commit)
via d384cbab8bd27a1c46a3f13891b11f8152d92a14 (commit)
from 283319f5d9d1cdbcf016dc0ff8ab284114c0be7e (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/7d91972ad4c39e50d7971592625a6ae23a598ae8
commit 7d91972ad4c39e50d7971592625a6ae23a598ae8
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date: Mon May 26 08:34:51 2014 +0900
white space fixup
Signed-off-by: Pavan Balaji <balaji at anl.gov>
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
index 6f954a2..7732be8 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
@@ -38,24 +38,25 @@ static struct ibv_device **ib_devlist;
static struct ibv_context *ib_ctx;
struct ibv_context *MPID_nem_ib_ctx_export; /* for SC13 demo connector */
static struct ibv_pd *ib_pd;
-struct ibv_pd *MPID_nem_ib_pd_export; /* for SC13 demo connector */
-struct ibv_cq *MPID_nem_ib_rc_shared_scq;
-static int MPID_nem_ib_rc_shared_scq_ref_count;
-struct ibv_cq *MPID_nem_ib_rc_shared_scq_scratch_pad;
-static int MPID_nem_ib_rc_shared_scq_scratch_pad_ref_count;
+struct ibv_pd *MPID_nem_ib_pd_export; /* for SC13 demo connector */
+struct ibv_cq *MPID_nem_ib_rc_shared_scq;
+static int MPID_nem_ib_rc_shared_scq_ref_count;
+struct ibv_cq *MPID_nem_ib_rc_shared_scq_scratch_pad;
+static int MPID_nem_ib_rc_shared_scq_scratch_pad_ref_count;
static struct ibv_cq *MPID_nem_ib_ud_shared_scq;
-static int MPID_nem_ib_ud_shared_scq_ref_count;
+static int MPID_nem_ib_ud_shared_scq_ref_count;
static struct ibv_cq *MPID_nem_ib_rc_shared_rcq;
-static int MPID_nem_ib_rc_shared_rcq_ref_count;
+static int MPID_nem_ib_rc_shared_rcq_ref_count;
static struct ibv_cq *MPID_nem_ib_rc_shared_rcq_scratch_pad;
-static int MPID_nem_ib_rc_shared_rcq_scratch_pad_ref_count;
-struct ibv_cq *MPID_nem_ib_ud_shared_rcq;
-static int MPID_nem_ib_ud_shared_rcq_ref_count;
+static int MPID_nem_ib_rc_shared_rcq_scratch_pad_ref_count;
+struct ibv_cq *MPID_nem_ib_ud_shared_rcq;
+static int MPID_nem_ib_ud_shared_rcq_ref_count;
uint8_t *MPID_nem_ib_scratch_pad = 0;
int MPID_nem_ib_scratch_pad_ref_count;
char *MPID_nem_ib_rdmawr_from_alloc_free_list_front[MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID] = { 0 };
char *MPID_nem_ib_rdmawr_from_alloc_arena_free_list[MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID] = { 0 };
-struct ibv_mr* MPID_nem_ib_rdmawr_to_alloc_mr;
+
+struct ibv_mr *MPID_nem_ib_rdmawr_to_alloc_mr;
uint8_t *MPID_nem_ib_rdmawr_to_alloc_start;
uint8_t *MPID_nem_ib_rdmawr_to_alloc_free_list;
@@ -69,12 +70,12 @@ uint8_t *MPID_nem_ib_rdmawr_to_alloc_free_list;
#define MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp) \
{ \
if (condesc < 0 || condesc >= MPID_NEM_IB_COM_SIZE) { \
- dprintf("condesc=%d\n", condesc);\
-MPID_nem_ib_segv; \
- return -1; \
- } \
- conp = &contab[condesc]; \
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_used != 1, -1, dprintf("MPID_NEM_IB_RANGE_CHECK_WITH_ERROR,conp->icom_used=%d\n", conp->icom_used)); \
+ dprintf("condesc=%d\n", condesc); \
+ MPID_nem_ib_segv; \
+ return -1; \
+ } \
+ conp = &contab[condesc]; \
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_used != 1, -1, dprintf("MPID_NEM_IB_RANGE_CHECK_WITH_ERROR,conp->icom_used=%d\n", conp->icom_used)); \
}
/* Allocator for RDMA write to buffer
@@ -95,25 +96,25 @@ static int MPID_nem_ib_rdmawr_to_init(uint64_t sz)
int ibcom_errno = 0;
void *start;
void *cur;
- start = (void *) mmap(0, sz, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE,
- -1, 0);
+ start = (void *) mmap(0, sz, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(start == (void *) -1, -1, printf("mmap failed\n"));
dprintf("rdmawr_to_init,sz=%ld,start=%p\n", sz, start);
-
+
memset(start, 0, sz);
-
+
MPID_nem_ib_rdmawr_to_alloc_mr = MPID_nem_ib_com_reg_mr_fetch(start, sz, 0);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(!MPID_nem_ib_rdmawr_to_alloc_mr, -1,
printf("MPID_nem_ib_com_reg_mr_fetchibv_reg_mr failed\n"));
dprintf("rdmawr_to_init,rkey=%08x\n", MPID_nem_ib_rdmawr_to_alloc_mr->rkey);
-
+
MPID_nem_ib_rdmawr_to_alloc_start = start;
MPID_nem_ib_rdmawr_to_alloc_free_list = start;
for (cur = start;
- cur < (uint8_t *)start + sz - MPID_NEM_IB_COM_RDMABUF_SZSEG;
- cur = (uint8_t *)cur + MPID_NEM_IB_COM_RDMABUF_SZSEG) {
+ cur < (uint8_t *) start + sz - MPID_NEM_IB_COM_RDMABUF_SZSEG;
+ cur = (uint8_t *) cur + MPID_NEM_IB_COM_RDMABUF_SZSEG) {
//dprintf("rdmawr_to_init,cur=%p\n", cur);
- ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) cur)->next = (uint8_t*)cur + MPID_NEM_IB_COM_RDMABUF_SZSEG;
+ ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) cur)->next =
+ (uint8_t *) cur + MPID_NEM_IB_COM_RDMABUF_SZSEG;
}
((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) cur)->next = 0;
@@ -128,13 +129,13 @@ void *MPID_nem_ib_rdmawr_to_alloc(int nslots)
dprintf("rdmawr_to_alloc,nslots=%d\n", nslots);
void *start;
int i;
- for(i = 0; i < nslots; i++) {
+ for (i = 0; i < nslots; i++) {
//dprintf("MPID_nem_ib_rdmawr_to_alloc,free_list=%p\n", MPID_nem_ib_rdmawr_to_alloc_free_list);
if (MPID_nem_ib_rdmawr_to_alloc_free_list) {
- if(i == 0) {
+ if (i == 0) {
start = MPID_nem_ib_rdmawr_to_alloc_free_list;
}
- MPID_nem_ib_rdmawr_to_alloc_free_list =
+ MPID_nem_ib_rdmawr_to_alloc_free_list =
((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) MPID_nem_ib_rdmawr_to_alloc_free_list)->next;
}
else {
@@ -149,12 +150,12 @@ void MPID_nem_ib_rdmawr_to_free(void *p, int nslots)
{
void *q;
((MPID_nem_ib_rdmawr_to_alloc_hdr_t *)
- ((uint8_t*)p + MPID_NEM_IB_COM_RDMABUF_SZSEG * (nslots-1)))->next =
+ ((uint8_t *) p + MPID_NEM_IB_COM_RDMABUF_SZSEG * (nslots - 1)))->next =
MPID_nem_ib_rdmawr_to_alloc_free_list;
- for (q = (uint8_t *)p + MPID_NEM_IB_COM_RDMABUF_SZSEG * (nslots-2);
- q >= p;
- q = (uint8_t *)q - MPID_NEM_IB_COM_RDMABUF_SZSEG) {
- ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) q)->next = (uint8_t *)q + MPID_NEM_IB_COM_RDMABUF_SZSEG;
+ for (q = (uint8_t *) p + MPID_NEM_IB_COM_RDMABUF_SZSEG * (nslots - 2);
+ q >= p; q = (uint8_t *) q - MPID_NEM_IB_COM_RDMABUF_SZSEG) {
+ ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) q)->next =
+ (uint8_t *) q + MPID_NEM_IB_COM_RDMABUF_SZSEG;
}
MPID_nem_ib_rdmawr_to_alloc_free_list = p;
}
@@ -374,28 +375,29 @@ static int MPID_nem_ib_com_clean(MPID_nem_ib_com_t * conp)
switch (conp->open_flag) {
case MPID_NEM_IB_COM_OPEN_RC:
MPIU_Assert(MPID_nem_ib_rc_shared_scq_ref_count > 0);
- if(--MPID_nem_ib_rc_shared_scq_ref_count == 0) {
+ if (--MPID_nem_ib_rc_shared_scq_ref_count == 0) {
dprintf("ibcom,destroy MPID_nem_ib_rc_shared_scq\n");
ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_scq);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq failed\n"));
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq failed\n"));
/* Tell drain_scq that CQ is destroyed because
- drain_scq is called after poll_eager calls vc_terminate */
+ * drain_scq is called after poll_eager calls vc_terminate */
MPID_nem_ib_rc_shared_scq = NULL;
}
MPIU_Assert(MPID_nem_ib_rc_shared_rcq_ref_count > 0);
- if(--MPID_nem_ib_rc_shared_rcq_ref_count == 0) {
+ if (--MPID_nem_ib_rc_shared_rcq_ref_count == 0) {
dprintf("ibcom,destroy MPID_nem_ib_rc_shared_rcq\n");
ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_rcq);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq failed\n"));
MPID_nem_ib_rc_shared_rcq = NULL;
}
-#if 0 /* It's not used */
- retval = munmap(conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM], MPID_NEM_IB_COM_RDMABUF_SZ);
+#if 0 /* It's not used */
+ retval =
+ munmap(conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM], MPID_NEM_IB_COM_RDMABUF_SZ);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, dprintf("munmap"));
#endif
-#if 0 /* Don't free it because it's managed through VC_FILED(vc, ibcom->remote_ringbuf) */
+#if 0 /* Don't free it because it's managed through VC_FILED(vc, ibcom->remote_ringbuf) */
retval = munmap(conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO], MPID_NEM_IB_COM_RDMABUF_SZ);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, dprintf("munmap"));
#endif
@@ -419,15 +421,15 @@ static int MPID_nem_ib_com_clean(MPID_nem_ib_com_t * conp)
break;
case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
MPIU_Assert(MPID_nem_ib_rc_shared_scq_scratch_pad_ref_count > 0);
- if(--MPID_nem_ib_rc_shared_scq_scratch_pad_ref_count == 0) {
+ if (--MPID_nem_ib_rc_shared_scq_scratch_pad_ref_count == 0) {
ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_scq_scratch_pad);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq failed\n"));
/* Tell drain_scq that CQ is destroyed because
- drain_scq is called after poll_eager calls vc_terminate */
+ * drain_scq is called after poll_eager calls vc_terminate */
MPID_nem_ib_rc_shared_scq_scratch_pad = NULL;
}
MPIU_Assert(MPID_nem_ib_rc_shared_rcq_scratch_pad_ref_count > 0);
- if(--MPID_nem_ib_rc_shared_rcq_scratch_pad_ref_count == 0) {
+ if (--MPID_nem_ib_rc_shared_rcq_scratch_pad_ref_count == 0) {
ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_rcq_scratch_pad);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq failed\n"));
}
@@ -452,15 +454,15 @@ static int MPID_nem_ib_com_clean(MPID_nem_ib_com_t * conp)
break;
case MPID_NEM_IB_COM_OPEN_UD:
MPIU_Assert(MPID_nem_ib_ud_shared_scq_ref_count > 0);
- if(--MPID_nem_ib_ud_shared_scq_ref_count == 0) {
+ if (--MPID_nem_ib_ud_shared_scq_ref_count == 0) {
ib_errno = ibv_destroy_cq(MPID_nem_ib_ud_shared_scq);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq"));
/* Tell drain_scq that CQ is destroyed because
- drain_scq is called after poll_eager calls vc_terminate */
+ * drain_scq is called after poll_eager calls vc_terminate */
MPID_nem_ib_ud_shared_scq = NULL;
}
MPIU_Assert(MPID_nem_ib_ud_shared_rcq_ref_count > 0);
- if(--MPID_nem_ib_ud_shared_rcq_ref_count == 0) {
+ if (--MPID_nem_ib_ud_shared_rcq_ref_count == 0) {
ib_errno = ibv_destroy_cq(MPID_nem_ib_ud_shared_rcq);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq"));
}
@@ -486,9 +488,9 @@ static int MPID_nem_ib_com_clean(MPID_nem_ib_com_t * conp)
}
memset(conp, 0, sizeof(MPID_nem_ib_com_t));
- fn_exit:
+ fn_exit:
return ibcom_errno;
- fn_fail:
+ fn_fail:
goto fn_exit;
}
@@ -516,8 +518,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
/* Increment reference counter of ibv_reg_mr cache */
ibcom_errno = MPID_nem_ib_com_register_cache_init();
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ibcom_errno, -1,
- dprintf("MPID_nem_ib_com_register_cache_init"));
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ibcom_errno, -1, dprintf("MPID_nem_ib_com_register_cache_init"));
/* device open error */
if (MPID_nem_ib_com_device_init() < 0) {
@@ -736,23 +737,23 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
goto fn_fail;
}
#else
- /* ibv_reg_mr all memory area for all ring buffers
- including shared and exclusive ones */
- if(!MPID_nem_ib_rdmawr_to_alloc_start) {
+ /* ibv_reg_mr all memory area for all ring buffers
+ * including shared and exclusive ones */
+ if (!MPID_nem_ib_rdmawr_to_alloc_start) {
ibcom_errno =
MPID_nem_ib_rdmawr_to_init(MPID_NEM_IB_COM_RDMABUF_SZ * MPID_NEM_IB_NRINGBUF);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(ibcom_errno, -1, printf("MPID_nem_ib_rdmawr_to_init"));
- dprintf("ib_com_open,MPID_nem_ib_rdmawr_to_alloc_free_list=%p\n", MPID_nem_ib_rdmawr_to_alloc_free_list);
+ dprintf("ib_com_open,MPID_nem_ib_rdmawr_to_alloc_free_list=%p\n",
+ MPID_nem_ib_rdmawr_to_alloc_free_list);
}
- conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO] =
- MPID_nem_ib_rdmawr_to_alloc_start;
- //mmap(0, MPID_NEM_IB_COM_RDMABUF_SZ, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE,
- //-1, 0);
+ conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO] = MPID_nem_ib_rdmawr_to_alloc_start;
+ //mmap(0, MPID_NEM_IB_COM_RDMABUF_SZ, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE,
+ //-1, 0);
dprintf("MPID_nem_ib_com_open,mmap=%p,len=%d\n", conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO],
MPID_NEM_IB_COM_RDMABUF_SZ);
#endif
-
+
#ifdef HAVE_LIBDCFA
dprintf("MPID_nem_ib_com_open,fd=%d,rmem=%p\n", *condesc,
MPID_nem_ib_rdmawr_to_alloc_mr->buf);
@@ -781,7 +782,9 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
break;
case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
/* RDMA-write-from and -to local memory area */
- conp->icom_mrlist = (struct ibv_mr **) MPIU_Malloc(sizeof(struct ibv_mr *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
+ conp->icom_mrlist =
+ (struct ibv_mr **) MPIU_Malloc(sizeof(struct ibv_mr *) *
+ MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
memset(conp->icom_mrlist, 0, sizeof(struct ibv_mr *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
conp->icom_mrlen = MPID_NEM_IB_COM_NBUF_SCRATCH_PAD;
conp->icom_mem = (void **) MPIU_Malloc(sizeof(void *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
@@ -792,15 +795,15 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
/* RDMA-write-from local memory area */
conp->icom_msize[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] = MPID_NEM_IB_COM_SCRATCH_PAD_FROM_SZ;
- conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] =
- mmap(0, MPID_NEM_IB_COM_SCRATCH_PAD_FROM_SZ, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE,
- -1, 0);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] == (void*)-1, -1,
- printf("mmap failed\n"));
+ conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] =
+ mmap(0, MPID_NEM_IB_COM_SCRATCH_PAD_FROM_SZ, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] ==
+ (void *) -1, -1, printf("mmap failed\n"));
- conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] =
+ conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] =
MPID_nem_ib_com_reg_mr_fetch(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM],
- conp->icom_msize[MPID_NEM_IB_COM_SCRATCH_PAD_FROM], 0);
+ conp->icom_msize[MPID_NEM_IB_COM_SCRATCH_PAD_FROM], 0);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_FROM], -1,
printf("ibv_reg_mr failed\n"));
@@ -809,7 +812,8 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_rmem == 0, -1, dprintf("malloc failed\n"));
memset(conp->icom_rmem, 0, sizeof(void *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
- conp->icom_rsize = (size_t *) MPIU_Malloc(sizeof(size_t) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
+ conp->icom_rsize =
+ (size_t *) MPIU_Malloc(sizeof(size_t) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_rsize == 0, -1, dprintf("malloc failed\n"));
memset(conp->icom_rsize, 0, sizeof(size_t) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
@@ -889,7 +893,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
/* SR (send request) template */
conp->icom_sr =
(struct ibv_send_wr *) MPIU_Malloc(sizeof(struct ibv_send_wr) *
- MPID_NEM_IB_COM_RC_SR_NTEMPLATE);
+ MPID_NEM_IB_COM_RC_SR_NTEMPLATE);
memset(conp->icom_sr, 0, sizeof(struct ibv_send_wr) * MPID_NEM_IB_COM_RC_SR_NTEMPLATE);
for (i = 0; i < MPID_NEM_IB_COM_SMT_INLINE_NCHAIN; i++) {
@@ -900,7 +904,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
#else
sge =
(struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge) *
- MPID_NEM_IB_COM_SMT_INLINE_INITIATOR_NSGE);
+ MPID_NEM_IB_COM_SMT_INLINE_INITIATOR_NSGE);
memset(sge, 0, sizeof(struct ibv_sge) * MPID_NEM_IB_COM_SMT_INLINE_INITIATOR_NSGE);
#endif
conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].next =
@@ -923,7 +927,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
#else
sge =
(struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge) *
- MPID_NEM_IB_COM_SMT_NOINLINE_INITIATOR_NSGE);
+ MPID_NEM_IB_COM_SMT_NOINLINE_INITIATOR_NSGE);
memset(sge, 0, sizeof(struct ibv_sge) * MPID_NEM_IB_COM_SMT_NOINLINE_INITIATOR_NSGE);
#endif
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].next = NULL;
@@ -942,7 +946,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
#else
sge =
(struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge) *
- MPID_NEM_IB_COM_LMT_INITIATOR_NSGE);
+ MPID_NEM_IB_COM_LMT_INITIATOR_NSGE);
memset(sge, 0, sizeof(struct ibv_sge) * MPID_NEM_IB_COM_LMT_INITIATOR_NSGE);
#endif
conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].next = NULL;
@@ -979,7 +983,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
/* RR (receive request) template for MPID_NEM_IB_COM_RDMAWR_RESPONDER */
conp->icom_rr =
(struct ibv_recv_wr *) MPIU_Malloc(sizeof(struct ibv_recv_wr) *
- MPID_NEM_IB_COM_RC_RR_NTEMPLATE);
+ MPID_NEM_IB_COM_RC_RR_NTEMPLATE);
memset(conp->icom_rr, 0, sizeof(struct ibv_recv_wr) * MPID_NEM_IB_COM_RC_RR_NTEMPLATE);
/* create one dummy RR to ibv_post_recv */
@@ -995,7 +999,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
/* SR (send request) template */
conp->icom_sr =
(struct ibv_send_wr *) MPIU_Malloc(sizeof(struct ibv_send_wr) *
- MPID_NEM_IB_COM_SCRATCH_PAD_SR_NTEMPLATE);
+ MPID_NEM_IB_COM_SCRATCH_PAD_SR_NTEMPLATE);
memset(conp->icom_sr, 0,
sizeof(struct ibv_send_wr) * MPID_NEM_IB_COM_SCRATCH_PAD_SR_NTEMPLATE);
@@ -1006,7 +1010,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
#else
sge =
(struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge) *
- MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR_NSGE);
+ MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR_NSGE);
memset(sge, 0, sizeof(struct ibv_sge) * MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR_NSGE);
#endif
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].next = NULL;
@@ -1025,7 +1029,8 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
0, sizeof(struct ibv_sge) * WR_SG_NUM);
#else
sge =
- (struct ibv_sge*) MPIU_Malloc(sizeof(struct ibv_sge) * MPID_NEM_IB_COM_SCRATCH_PAD_GET_NSGE);
+ (struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge) *
+ MPID_NEM_IB_COM_SCRATCH_PAD_GET_NSGE);
memset(sge, 0, sizeof(struct ibv_sge) * MPID_NEM_IB_COM_SCRATCH_PAD_GET_NSGE);
#endif
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].next = NULL;
@@ -1043,7 +1048,8 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
0, sizeof(struct ibv_sge) * WR_SG_NUM);
#else
sge =
- (struct ibv_sge*) MPIU_Malloc(sizeof(struct ibv_sge) * MPID_NEM_IB_COM_SCRATCH_PAD_CAS_NSGE);
+ (struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge) *
+ MPID_NEM_IB_COM_SCRATCH_PAD_CAS_NSGE);
memset(sge, 0, sizeof(struct ibv_sge) * MPID_NEM_IB_COM_SCRATCH_PAD_CAS_NSGE);
#endif
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].next = NULL;
@@ -1073,7 +1079,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
conp->icom_ah_attr =
(struct ibv_ah_attr *) MPIU_Calloc(MPID_NEM_IB_COM_UD_SR_NTEMPLATE,
- sizeof(struct ibv_ah_attr));
+ sizeof(struct ibv_ah_attr));
conp->icom_ah_attr[MPID_NEM_IB_COM_UD_INITIATOR].sl = 0;
conp->icom_ah_attr[MPID_NEM_IB_COM_UD_INITIATOR].src_path_bits = 0;
@@ -1092,7 +1098,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
/* SR (send request) template for MPID_NEM_IB_COM_UD_INITIATOR */
conp->icom_sr =
(struct ibv_send_wr *) MPIU_Calloc(MPID_NEM_IB_COM_UD_SR_NTEMPLATE,
- sizeof(struct ibv_send_wr));
+ sizeof(struct ibv_send_wr));
conp->icom_sr[MPID_NEM_IB_COM_UD_INITIATOR].next = NULL;
#ifdef HAVE_LIBDCFA
@@ -1119,7 +1125,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
/* RR (receive request) template for MPID_NEM_IB_COM_UD_RESPONDER */
conp->icom_rr =
(struct ibv_recv_wr *) MPIU_Calloc(MPID_NEM_IB_COM_UD_RR_NTEMPLATE,
- sizeof(struct ibv_recv_wr));
+ sizeof(struct ibv_recv_wr));
/* create one dummy RR to ibv_post_recv */
conp->icom_rr[MPID_NEM_IB_COM_UD_RESPONDER].next = NULL;
@@ -1158,10 +1164,11 @@ int MPID_nem_ib_com_alloc(int condesc, int sz)
/* RDMA-write-to local memory area */
MPID_nem_ib_scratch_pad_ref_count++;
if (!MPID_nem_ib_scratch_pad) {
- MPID_nem_ib_scratch_pad = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ MPID_nem_ib_scratch_pad =
+ mmap(0, sz, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
dprintf("MPID_nem_ib_com_alloc,mmap=%p,len=%d\n", MPID_nem_ib_scratch_pad, sz);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(MPID_nem_ib_scratch_pad == (void *) -1, -1,
- dprintf("failed to allocate buffer\n"));
+ dprintf("failed to allocate buffer\n"));
dprintf("MPID_nem_ib_com_alloc,MPID_nem_ib_scratch_pad=%p\n", MPID_nem_ib_scratch_pad);
memset(MPID_nem_ib_scratch_pad, 0, sz);
}
@@ -1198,7 +1205,8 @@ int MPID_nem_ib_com_alloc(int condesc, int sz)
goto fn_exit;
}
-int MPID_nem_ib_com_free(int condesc, int sz) {
+int MPID_nem_ib_com_free(int condesc, int sz)
+{
MPID_nem_ib_com_t *conp;
int ibcom_errno = 0;
int retval;
@@ -1209,7 +1217,7 @@ int MPID_nem_ib_com_free(int condesc, int sz) {
case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
MPIU_Assert(MPID_nem_ib_scratch_pad_ref_count > 0);
- if(--MPID_nem_ib_scratch_pad_ref_count == 0) {
+ if (--MPID_nem_ib_scratch_pad_ref_count == 0) {
retval = munmap(MPID_nem_ib_scratch_pad, sz);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, dprintf("munmap"));
MPID_nem_ib_scratch_pad = NULL;
@@ -1222,10 +1230,10 @@ int MPID_nem_ib_com_free(int condesc, int sz) {
conp->open_flag));
break;
}
-
- fn_exit:
+
+ fn_exit:
return ibcom_errno;
- fn_fail:
+ fn_fail:
goto fn_exit;
}
@@ -1242,7 +1250,7 @@ int MPID_nem_ib_com_close(int condesc)
MPID_NEM_IB_COM_ERR_CHKANDJUMP(ibcom_errno, -1,
printf("MPID_nem_ib_com_register_cache_release"));
--maxcon;
-
+
fn_exit:
return ibcom_errno;
fn_fail:
@@ -1269,7 +1277,7 @@ int MPID_nem_ib_com_rts(int condesc, int remote_qpnum, uint16_t remote_lid,
case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
/* Init QP */
ib_errno = modify_qp_to_init(conp->icom_qp, conp->icom_port, IBV_ACCESS_REMOTE_ATOMIC);
- if(ib_errno) {
+ if (ib_errno) {
fprintf(stderr, "change QP state to INIT failed\n");
ibcom_errno = ib_errno;
goto fn_fail;
@@ -1283,7 +1291,7 @@ int MPID_nem_ib_com_rts(int condesc, int remote_qpnum, uint16_t remote_lid,
ibcom_errno = ib_errno;
goto fn_fail;
}
- common_tail:
+ common_tail:
/* Modify QP TO RTR status */
ib_errno =
modify_qp_to_rtr(conp->icom_qp, remote_qpnum, remote_lid, remote_gid, conp->icom_port,
@@ -1346,7 +1354,7 @@ int MPID_nem_ib_com_isend(int condesc,
void *data, int sz_data,
int *copied,
uint32_t local_ringbuf_type, uint32_t remote_ringbuf_type,
- void** buf_from_out, uint32_t* buf_from_sz_out)
+ void **buf_from_out, uint32_t * buf_from_sz_out)
{
MPID_nem_ib_com_t *conp;
int ibcom_errno = 0;
@@ -1354,21 +1362,25 @@ int MPID_nem_ib_com_isend(int condesc,
int ib_errno;
int num_sge;
- dprintf("MPID_nem_ib_com_isend,prefix=%p,sz_prefix=%d,hdr=%p,sz_hdr=%d,data=%p,sz_data=%d,local_ringbuf_type=%d,remote_ringbuf_type=%d\n",
- prefix, sz_prefix, hdr, sz_hdr, data, sz_data, local_ringbuf_type, remote_ringbuf_type);
+ dprintf
+ ("MPID_nem_ib_com_isend,prefix=%p,sz_prefix=%d,hdr=%p,sz_hdr=%d,data=%p,sz_data=%d,local_ringbuf_type=%d,remote_ringbuf_type=%d\n",
+ prefix, sz_prefix, hdr, sz_hdr, data, sz_data, local_ringbuf_type, remote_ringbuf_type);
MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
if (conp->icom_connected == 0) {
return -1;
}
-
+
int off_pow2_aligned;
- MPID_NEM_IB_OFF_POW2_ALIGNED(MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr + sz_data);
+ MPID_NEM_IB_OFF_POW2_ALIGNED(MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix +
+ sz_hdr + sz_data);
uint32_t sumsz = off_pow2_aligned + sizeof(MPID_nem_ib_netmod_trailer_t);
- int sz_pad = off_pow2_aligned - (MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr + sz_data);
+ int sz_pad =
+ off_pow2_aligned - (MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr +
+ sz_data);
- uint32_t buf_from_sz = MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr +
+ uint32_t buf_from_sz = MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr +
sz_pad + sizeof(MPID_nem_ib_netmod_trailer_t);
*buf_from_sz_out = buf_from_sz;
void *buf_from = MPID_nem_ib_rdmawr_from_alloc(buf_from_sz);
@@ -1382,23 +1394,22 @@ int MPID_nem_ib_com_isend(int condesc,
num_sge = 0;
uint32_t hdr_ringbuf_type = local_ringbuf_type;
- MPID_NEM_IB_NETMOD_HDR_SZ_SET(buf_from,
+ MPID_NEM_IB_NETMOD_HDR_SZ_SET(buf_from,
MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) +
sz_prefix + sz_hdr + sz_data +
sizeof(MPID_nem_ib_netmod_trailer_t));
- if(remote_ringbuf_type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
+ if (remote_ringbuf_type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
hdr_ringbuf_type |= MPID_NEM_IB_RINGBUF_RELINDEX;
MPID_NEM_IB_NETMOD_HDR_RELINDEX_SET(buf_from, conp->rsr_seq_num_tail);
conp->rsr_seq_num_tail_last_sent = conp->rsr_seq_num_tail;
dprintf("isend,rsr_seq_num_tail=%d\n", MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf_from));
}
- if(local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED) {
+ if (local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED) {
MPID_NEM_IB_NETMOD_HDR_VC_SET(buf_from, conp->remote_vc);
dprintf("isend,remote_vc=%p\n", MPID_NEM_IB_NETMOD_HDR_VC_GET(buf_from));
}
MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_SET(buf_from, hdr_ringbuf_type);
- dprintf("isend,hdr_ringbuf_type=%08x\n",
- MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf_from));
+ dprintf("isend,hdr_ringbuf_type=%08x\n", MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf_from));
/* memcpy hdr is needed because hdr resides in stack when sending close-VC command */
/* memcpy is performed onto MPID_NEM_IB_COM_RDMAWR_FROM buffer */
@@ -1408,17 +1419,15 @@ int MPID_nem_ib_com_isend(int condesc,
#ifdef HAVE_LIBDCFA
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].mic_addr = (uint64_t) buf_from;
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr =
- mr_rdmawr_from->host_addr +
- ((uint64_t) buf_from -
- (uint64_t) MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(buf_from));
+ mr_rdmawr_from->host_addr +
+ ((uint64_t) buf_from - (uint64_t) MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(buf_from));
#else
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr = (uint64_t) buf_from;
#endif
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].length =
MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr;
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].lkey =
- mr_rdmawr_from->lkey;
+ conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].lkey = mr_rdmawr_from->lkey;
num_sge += 1;
if (sz_data) {
@@ -1439,24 +1448,27 @@ int MPID_nem_ib_com_isend(int condesc,
}
MPID_nem_ib_netmod_trailer_t *netmod_trailer =
- (MPID_nem_ib_netmod_trailer_t *) ((uint8_t *) buf_from + MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) +
- sz_prefix + sz_hdr + sz_pad);
+ (MPID_nem_ib_netmod_trailer_t *) ((uint8_t *) buf_from +
+ MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) +
+ sz_prefix + sz_hdr + sz_pad);
netmod_trailer->tail_flag = MPID_NEM_IB_COM_MAGIC;
#ifdef HAVE_LIBDCFA
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].mic_addr =
- (uint64_t) buf_from + MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr;
+ (uint64_t) buf_from + MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix +
+ sz_hdr;
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr =
- mr_rdmawr_from->host_addr +
- ((uint64_t) buf_from + MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr -
- (uint64_t) MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(buf_from));
+ mr_rdmawr_from->host_addr + ((uint64_t) buf_from +
+ MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix +
+ sz_hdr - (uint64_t)
+ MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(buf_from));
#else
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr =
- (uint64_t) buf_from + MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr;
+ (uint64_t) buf_from + MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix +
+ sz_hdr;
#endif
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].length =
sz_pad + sizeof(MPID_nem_ib_netmod_trailer_t);
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].lkey =
- mr_rdmawr_from->lkey;
+ conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].lkey = mr_rdmawr_from->lkey;
num_sge += 1;
dprintf("MPID_nem_ib_com_isend,sz_data=%d,pow2=%d,sz_pad=%d,num_sge=%d\n", sz_data,
off_pow2_aligned, sz_pad, num_sge);
@@ -1465,16 +1477,16 @@ int MPID_nem_ib_com_isend(int condesc,
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr_id = wr_id;
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr =
(uint64_t) conp->local_ringbuf_start +
- MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t)(conp->sseq_num % conp->local_ringbuf_nslot));
+ MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t) (conp->sseq_num % conp->local_ringbuf_nslot));
dprintf("isend,ringbuf_start=%p,local_head=%04ux,nslot=%d,rkey=%08x,remote_addr=%lx\n",
conp->local_ringbuf_start, conp->sseq_num, conp->local_ringbuf_nslot,
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.rkey,
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr
- );
- if(conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr <
- (uint64_t) conp->local_ringbuf_start ||
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr >=
- (uint64_t) conp->local_ringbuf_start + MPID_NEM_IB_COM_RDMABUF_SZSEG * conp->local_ringbuf_nslot) {
+ conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr);
+ if (conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr <
+ (uint64_t) conp->local_ringbuf_start ||
+ conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr >=
+ (uint64_t) conp->local_ringbuf_start +
+ MPID_NEM_IB_COM_RDMABUF_SZSEG * conp->local_ringbuf_nslot) {
MPID_nem_ib_segv;
}
/* rkey is defined in MPID_nem_ib_com_connect_ringbuf */
@@ -1551,7 +1563,8 @@ int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_h
void *buf_from =
(uint8_t *) conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM] +
- MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t)(conp->sseq_num % MPID_NEM_IB_COM_RDMABUF_NSEG));
+ MPID_NEM_IB_COM_RDMABUF_SZSEG *
+ ((uint16_t) (conp->sseq_num % MPID_NEM_IB_COM_RDMABUF_NSEG));
/* make a tail-magic position is in a fixed set */
int off_pow2_aligned;
@@ -1560,7 +1573,8 @@ int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_h
/* let the last command icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAIN-1] which has IBV_WR_RDMA_WRITE_WITH_IMM */
int s =
MPID_NEM_IB_COM_SMT_INLINE_NCHAIN - (sizeof(MPID_nem_ib_netmod_hdr_t) + sz_hdr +
- off_pow2_aligned + sizeof(MPID_nem_ib_netmod_trailer_t) +
+ off_pow2_aligned +
+ sizeof(MPID_nem_ib_netmod_trailer_t) +
MPID_NEM_IB_COM_INLINE_DATA -
1) / MPID_NEM_IB_COM_INLINE_DATA;
MPID_NEM_IB_COM_ERR_CHKANDJUMP((sizeof(MPID_nem_ib_netmod_hdr_t) + sz_hdr +
@@ -1716,7 +1730,8 @@ int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_h
conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].wr_id = wr_id;
conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].wr.rdma.remote_addr =
(uint64_t) conp->icom_rmem[MPID_NEM_IB_COM_RDMAWR_TO] +
- MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t)(conp->sseq_num % MPID_NEM_IB_COM_RDMABUF_NSEG)) +
+ MPID_NEM_IB_COM_RDMABUF_SZSEG *
+ ((uint16_t) (conp->sseq_num % MPID_NEM_IB_COM_RDMABUF_NSEG)) +
MPID_NEM_IB_COM_INLINE_DATA * (i - s);
}
#if 0
@@ -2008,7 +2023,7 @@ int MPID_nem_ib_com_put_lmt(int condesc, uint64_t wr_id, void *raddr, int sz_dat
}
int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset, int sz,
- void *laddr, void **buf_from_out, uint32_t* buf_from_sz_out)
+ void *laddr, void **buf_from_out, uint32_t * buf_from_sz_out)
{
MPID_nem_ib_com_t *conp;
int ibcom_errno = 0;
@@ -2035,28 +2050,26 @@ int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset
// memcpy(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM], laddr, sz);
/* Instead of using the pre-mmaped memory (comp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM]),
- we allocate a memory. */
+ * we allocate a memory. */
void *buf_from = MPID_nem_ib_rdmawr_from_alloc(sz);
memcpy(buf_from, laddr, sz);
dprintf("put_scratch_pad,rdmawr_from_alloc=%p,sz=%d\n", buf_from, sz);
struct ibv_mr *mr_rdmawr_from = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR(buf_from);
- *buf_from_out = buf_from;
- *buf_from_sz_out = sz;
+ *buf_from_out = buf_from;
+ *buf_from_sz_out = sz;
void *from = (uint8_t *) buf_from;
#ifdef HAVE_LIBDCFA
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].mic_addr = (uint64_t) from;
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].addr =
- mr_rdmawr_from->host_addr +
- ((uint64_t) from - (uint64_t) from);
+ mr_rdmawr_from->host_addr + ((uint64_t) from - (uint64_t) from);
#else
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].addr = (uint64_t) from;
#endif
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].length = sz;
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].lkey =
- mr_rdmawr_from->lkey;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].lkey = mr_rdmawr_from->lkey;
/* num_sge is defined in MPID_nem_ib_com_open */
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].wr_id = wr_id;
@@ -2065,8 +2078,8 @@ int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset
/* rkey is defined in MPID_nem_ib_com_reg_mr_connect */
dprintf("MPID_nem_ib_com_put_scratch_pad,wr.rdma.remote_addr=%llx\n",
- (unsigned long long) conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].wr.rdma.
- remote_addr);
+ (unsigned long long) conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].wr.
+ rdma.remote_addr);
#ifdef HAVE_LIBDCFA
ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR]);
@@ -2095,7 +2108,7 @@ int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset
int MPID_nem_ib_com_get_scratch_pad(int condesc,
uint64_t wr_id,
uint64_t offset, int sz,
- void** buf_from_out, uint32_t * buf_from_sz_out)
+ void **buf_from_out, uint32_t * buf_from_sz_out)
{
MPID_nem_ib_com_t *conp;
int ibcom_errno = 0;
@@ -2116,15 +2129,12 @@ int MPID_nem_ib_com_get_scratch_pad(int condesc,
#ifdef HAVE_LIBDCFA
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0].mic_addr = (uint64_t) buf_from;
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0].addr =
- mr_rdmawr_from->host_addr +
- ((uint64_t) buf_from -
- (uint64_t) buf_from);
+ mr_rdmawr_from->host_addr + ((uint64_t) buf_from - (uint64_t) buf_from);
#else
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0].addr = (uint64_t) buf_from;
#endif
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0].length = sz;
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0].lkey =
- mr_rdmawr_from->lkey;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0].lkey = mr_rdmawr_from->lkey;
/* num_sge is defined in MPID_nem_ib_com_open */
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].wr_id = wr_id;
@@ -2133,8 +2143,8 @@ int MPID_nem_ib_com_get_scratch_pad(int condesc,
/* rkey is defined in MPID_nem_ib_com_reg_mr_connect */
dprintf("MPID_nem_ib_com_get_scratch_pad,wr.rdma.remote_addr=%llx\n",
- (unsigned long long) conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].wr.rdma.
- remote_addr);
+ (unsigned long long) conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].wr.
+ rdma.remote_addr);
#ifdef HAVE_LIBDCFA
ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET]);
@@ -2144,8 +2154,7 @@ int MPID_nem_ib_com_get_scratch_pad(int condesc,
ib_errno));
#else
ib_errno =
- ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET],
- &bad_wr);
+ ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET], &bad_wr);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
dprintf
("MPID_nem_ib_com_put_scratch_pad, ibv_post_send, rc=%d, bad_wr=%p\n",
@@ -2163,7 +2172,7 @@ int MPID_nem_ib_com_get_scratch_pad(int condesc,
int MPID_nem_ib_com_cas_scratch_pad(int condesc,
uint64_t wr_id, uint64_t offset,
uint64_t compare, uint64_t swap,
- void** buf_from_out, uint32_t * buf_from_sz_out)
+ void **buf_from_out, uint32_t * buf_from_sz_out)
{
MPID_nem_ib_com_t *conp;
int ibcom_errno = 0;
@@ -2185,14 +2194,12 @@ int MPID_nem_ib_com_cas_scratch_pad(int condesc,
#ifdef HAVE_LIBDCFA
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0].mic_addr = (uint64_t) buf_from;
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0].addr =
- mr_rdmawr_from->host_addr +
- ((uint64_t) buf_from - (uint64_t) buf_from);
+ mr_rdmawr_from->host_addr + ((uint64_t) buf_from - (uint64_t) buf_from);
#else
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0].addr = (uint64_t) buf_from;
#endif
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0].length = sizeof(uint64_t);
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0].lkey =
- mr_rdmawr_from->lkey;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0].lkey = mr_rdmawr_from->lkey;
/* num_sge is defined in MPID_nem_ib_com_open */
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].wr_id = wr_id;
@@ -2203,8 +2210,8 @@ int MPID_nem_ib_com_cas_scratch_pad(int condesc,
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].wr.atomic.swap = swap;
dprintf("MPID_nem_ib_com_cas_scratch_pad,wr.rdma.remote_addr=%llx\n",
- (unsigned long long) conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].wr.rdma.
- remote_addr);
+ (unsigned long long) conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].wr.
+ rdma.remote_addr);
#ifdef HAVE_LIBDCFA
ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS]);
@@ -2214,8 +2221,7 @@ int MPID_nem_ib_com_cas_scratch_pad(int condesc,
ib_errno));
#else
ib_errno =
- ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS],
- &bad_wr);
+ ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS], &bad_wr);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
dprintf
("MPID_nem_ib_com_cas_scratch_pad, ibv_post_send, rc=%d, bad_wr=%p\n",
@@ -2309,13 +2315,12 @@ int MPID_nem_ib_com_reg_mr_connect(int condesc, void *rmem, int rkey)
0: The new ring buffer is located in the same IB Memory Region as
the previous ring buffer is located in.
This happens when making the connection switch to smaller ring buffer.
- 1: The new ring buffer is located in the new IB Memory Region
+ 1: The new ring buffer is located in the new IB Memory Region
This happens when memory area shrunk then has grown. */
int MPID_nem_ib_com_connect_ringbuf(int condesc,
uint32_t ringbuf_type,
void *start, int rkey, int nslot,
- MPIDI_VC_t *remote_vc,
- uint32_t alloc_new_mr)
+ MPIDI_VC_t * remote_vc, uint32_t alloc_new_mr)
{
int ibcom_errno = 0;
MPID_nem_ib_com_t *conp;
@@ -2329,13 +2334,13 @@ int MPID_nem_ib_com_connect_ringbuf(int condesc,
/* Address and size */
conp->local_ringbuf_start = start;
conp->local_ringbuf_nslot = nslot;
- switch(conp->local_ringbuf_type) {
+ switch (conp->local_ringbuf_type) {
case MPID_NEM_IB_RINGBUF_EXCLUSIVE:
/* Head and tail pointers */
conp->sseq_num = 0;
conp->lsr_seq_num_tail = -1;
break;
- case MPID_NEM_IB_RINGBUF_SHARED:
+ case MPID_NEM_IB_RINGBUF_SHARED:
/* Mark as full to make the sender ask */
conp->lsr_seq_num_tail = conp->sseq_num - conp->local_ringbuf_nslot;
conp->remote_vc = remote_vc;
@@ -2344,15 +2349,15 @@ int MPID_nem_ib_com_connect_ringbuf(int condesc,
printf("unknown ringbuf type");
break;
}
- if(alloc_new_mr) {
+ if (alloc_new_mr) {
conp->local_ringbuf_rkey = rkey;
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.rkey = rkey;
}
- dprintf("connect_ringbuf,ringbuf_type=%d,rkey=%08x,start=%p,nslot=%d,sseq_num=%d,lsr_seq_num_tail=%d,remote_vc=%lx,alloc_new_mr=%d\n",
- conp->local_ringbuf_type, conp->local_ringbuf_rkey, conp->local_ringbuf_start,
- conp->local_ringbuf_nslot, conp->sseq_num,
- conp->lsr_seq_num_tail, conp->remote_vc,
- alloc_new_mr);
+ dprintf
+ ("connect_ringbuf,ringbuf_type=%d,rkey=%08x,start=%p,nslot=%d,sseq_num=%d,lsr_seq_num_tail=%d,remote_vc=%lx,alloc_new_mr=%d\n",
+ conp->local_ringbuf_type, conp->local_ringbuf_rkey, conp->local_ringbuf_start,
+ conp->local_ringbuf_nslot, conp->sseq_num, conp->lsr_seq_num_tail, conp->remote_vc,
+ alloc_new_mr);
fn_exit:
return ibcom_errno;
@@ -2455,7 +2460,8 @@ int MPID_nem_ib_com_mem_rdmawr_from(int condesc, void **out)
MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
*out =
(uint8_t *) conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM] +
- MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t)(conp->sseq_num % MPID_NEM_IB_COM_RDMABUF_NSEG));
+ MPID_NEM_IB_COM_RDMABUF_SZSEG *
+ ((uint16_t) (conp->sseq_num % MPID_NEM_IB_COM_RDMABUF_NSEG));
fn_exit:
return ibcom_errno;
@@ -2622,7 +2628,8 @@ char *MPID_nem_ib_com_strerror(int errno)
goto fn_exit;
}
-int MPID_nem_ib_com_reg_mr(void *addr, int len, struct ibv_mr **mr, enum ibv_access_flags additional_flags)
+int MPID_nem_ib_com_reg_mr(void *addr, int len, struct ibv_mr **mr,
+ enum ibv_access_flags additional_flags)
{
int ibcom_errno = 0;
dprintf("MPID_nem_ib_com_reg_mr,addr=%p,len=%d,mr=%p\n", addr, len, mr);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
index 4cea111..ada7053 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
@@ -179,11 +179,11 @@ extern uint8_t *MPID_nem_ib_scratch_pad;
extern int MPID_nem_ib_scratch_pad_ref_count;
extern char *MPID_nem_ib_rdmawr_from_alloc_free_list_front[MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID];
extern char *MPID_nem_ib_rdmawr_from_alloc_arena_free_list[MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID];
-extern struct ibv_mr* MPID_nem_ib_rdmawr_to_alloc_mr;
+extern struct ibv_mr *MPID_nem_ib_rdmawr_to_alloc_mr;
extern uint8_t *MPID_nem_ib_rdmawr_to_alloc_start;
extern uint8_t *MPID_nem_ib_rdmawr_to_alloc_free_list;
-#define MPID_NEM_IB_COM_SIZE (65536*2) /* Maxiumum number of QPs. One process uses 2 QPs. */
+#define MPID_NEM_IB_COM_SIZE (65536*2) /* Maximum number of QPs. One process uses 2 QPs. */
#define MPID_NEM_IB_COM_INLINE_DATA (512-64) /* experimented max is 884 */ /* this is lower bound and more than this value is set. the more this value is, the more the actual value set is. you need to check it */
#define MPID_NEM_IB_COM_MAX_SQ_CAPACITY (256/1)
@@ -239,10 +239,10 @@ extern uint8_t *MPID_nem_ib_rdmawr_to_alloc_free_list;
#define MPID_NEM_IB_COM_UDBUF_SZSEG (128)
#define MPID_NEM_IB_COM_UDBUF_NSEG (MPID_NEM_IB_COM_UDBUF_SZ / MPID_NEM_IB_COM_UDBUF_SZSEG)
-#define MPID_NEM_IB_COM_NBUF_SCRATCH_PAD 2 /* number of <addr, sz, lkey, rkey> */
+#define MPID_NEM_IB_COM_NBUF_SCRATCH_PAD 2 /* number of <addr, sz, lkey, rkey> */
#define MPID_NEM_IB_COM_SCRATCH_PAD_FROM_SZ 4096
#define MPID_NEM_IB_COM_SCRATCH_PAD_FROM 0
-#define MPID_NEM_IB_COM_SCRATCH_PAD_TO 1 /* index to RDMA-write-to buffer */
+#define MPID_NEM_IB_COM_SCRATCH_PAD_TO 1 /* index to RDMA-write-to buffer */
/* send command templates */
#define MPID_NEM_IB_COM_RC_SR_NTEMPLATE (8+1+2) /* number of request templates, 8 for inline-chained-smt, 1 for smt, 1 for lmt */
@@ -308,21 +308,21 @@ extern uint8_t *MPID_nem_ib_rdmawr_to_alloc_free_list;
#define MPID_NEM_IB_NETMOD_HDR_HEAD_FLAG_SET(buf, val) ((MPID_nem_ib_netmod_hdr_shared_t *)(buf))->first = (val);
typedef struct MPID_nem_ib_netmod_hdr_exclusive {
- /*
- [63:61] ring buffer type
- remote is exclusive:
- [47:32] largest index of contiguous released slots 16-bit
- reply to slot request:
- [51:20] Start address of acquired slots, MSB part
- [19:16] Log_2 of amount of acquired slots
- [15:0] Packet size without padding
- */
+ /*
+ * [63:61] ring buffer type
+ * remote is exclusive:
+ * [47:32] largest index of contiguous released slots 16-bit
+ * reply to slot request:
+ * [51:20] Start address of acquired slots, MSB part
+ * [19:16] Log_2 of amount of acquired slots
+ * [15:0] Packet size without padding
+ */
uint64_t first;
/* jump case:
- [31:0] Start address of acquired slots, LSB part
- */
+ * [31:0] Start address of acquired slots, LSB part
+ */
uint32_t second;
-
+
} MPID_nem_ib_netmod_hdr_exclusive_t;
typedef struct MPID_nem_ib_netmod_hdr_shared {
@@ -330,11 +330,11 @@ typedef struct MPID_nem_ib_netmod_hdr_shared {
uint32_t second;
/* remote is one slot:
- [31:0] VC pointer in remote node, MSB part */
+ * [31:0] VC pointer in remote node, MSB part */
uint32_t third;
/* remote is one slot:
- [31:0] VC pointer in remote node, LSB part */
+ * [31:0] VC pointer in remote node, LSB part */
uint32_t forth;
} MPID_nem_ib_netmod_hdr_shared_t;
@@ -346,9 +346,10 @@ typedef struct MPID_nem_ib_netmod_trailer {
/* Allocator for RDMA write to buffer */
typedef struct {
/* Avoid polluting netmod_hdr and trailer */
- uint8_t padding[sizeof(MPID_nem_ib_netmod_hdr_shared_t)];
+ uint8_t padding[sizeof(MPID_nem_ib_netmod_hdr_shared_t)];
uint8_t *next;
-} MPID_nem_ib_rdmawr_to_alloc_hdr_t;
+}
+MPID_nem_ib_rdmawr_to_alloc_hdr_t;
/* Ring-buffer to which a remote note RDMA-writes */
#define MPID_NEM_IB_NRINGBUF 64
@@ -364,12 +365,12 @@ typedef struct {
#define MPID_NEM_IB_RINGBUF_RELINDEX 4
typedef struct {
- uint32_t type; /* acquiring contiguous slots or a single slot */
- void* start;
+ uint32_t type; /* acquiring contiguous slots or a single slot */
+ void *start;
int nslot;
- MPIDI_VC_t * vc;
+ MPIDI_VC_t *vc;
uint64_t remote_released[(MPID_NEM_IB_COM_RDMABUF_NSEG + 63) / 64];
- int ref_count; /* number of VCs sharing the ring-buffer */
+ int ref_count; /* number of VCs sharing the ring-buffer */
} MPID_nem_ib_ringbuf_t;
/* Represent a ring-buffer is exclusively acquired */
@@ -381,22 +382,22 @@ extern uint64_t MPID_nem_ib_ringbuf_allocated[(MPID_NEM_IB_NRINGBUF + 63) / 64];
extern MPID_nem_ib_ringbuf_t *MPID_nem_ib_ringbuf;
-/* Next ring-buffer type and slots
+/* Next ring-buffer type and slots
Exclusive slots are sticky.
Shared slot is consumed.
Use the type described here because we need to
- use up acquired slots of shared ring-buffer when
+ use up acquired slots of shared ring-buffer when
transitioning from share to exclusive.
The next type is absent means we're transitioning
from exclusive to shared. */
typedef struct MPID_nem_ib_ringbuf_sector {
uint32_t type;
- void* start;
+ void *start;
int nslot;
uint16_t head;
uint16_t tail;
- struct MPID_nem_ib_ringbuf_sector * sectorq_next;
+ struct MPID_nem_ib_ringbuf_sector *sectorq_next;
} MPID_nem_ib_ringbuf_sector_t;
typedef GENERIC_Q_DECL(MPID_nem_ib_ringbuf_sector_t) MPID_nem_ib_ringbuf_sectorq_t;
@@ -434,12 +435,12 @@ typedef struct MPID_nem_ib_com {
size_t *icom_rsize;
uint16_t sseq_num;
uint16_t rsr_seq_num_poll;
- uint16_t rsr_seq_num_tail; /* occupation status of remote Send Request (SR) queue (it covers occupation status of local RDMA-wr-to buffer) */
- uint16_t rsr_seq_num_tail_last_sent; /* latest one sent to remote rank */
- uint16_t lsr_seq_num_tail; /* occupation status of local Send Request (SR) queue */
+ uint16_t rsr_seq_num_tail; /* occupation status of remote Send Request (SR) queue (it covers occupation status of local RDMA-wr-to buffer) */
+ uint16_t rsr_seq_num_tail_last_sent; /* latest one sent to remote rank */
+ uint16_t lsr_seq_num_tail; /* occupation status of local Send Request (SR) queue */
int lsr_seq_num_tail_last_requested; /* value when lmt_start_send issued req_seq_num */
int rdmabuf_occupancy_notify_rstate, rdmabuf_occupancy_notify_lstate;
- int ncom, ncom_scratch_pad; /* number of entries in the command queue */
+ int ncom, ncom_scratch_pad; /* number of entries in the command queue */
uint32_t max_inline_data; /* actual value obtained after ibv_create_qp */
uint32_t max_send_wr;
@@ -453,36 +454,36 @@ typedef struct MPID_nem_ib_com {
uint16_t after_rdma_rd;
/* Ring-buffer information on the receiver side.
- It's allocated on the receiver side. */
- MPID_nem_ib_ringbuf_t* remote_ringbuf;
+ * It's allocated on the receiver side. */
+ MPID_nem_ib_ringbuf_t *remote_ringbuf;
/* Ring buffer information on the sender side.
- The information is passed from the receiver side on connection. */
- uint32_t local_ringbuf_type;
- void* local_ringbuf_start;
+ * The information is passed from the receiver side on connection. */
+ uint32_t local_ringbuf_type;
+ void *local_ringbuf_start;
int local_ringbuf_rkey;
uint16_t local_ringbuf_nslot;
/* VC of remote node. It's embedded in a packet going to the
- shared ring buffer because no VC information is available on
- the receiver side in the shared case. c.f. They are stored in
- the individual exclusive ring-buffers in the exclusive case. */
- MPIDI_VC_t *remote_vc;
+ * shared ring buffer because no VC information is available on
+ * the receiver side in the shared case. c.f. They are stored in
+ * the individual exclusive ring-buffers in the exclusive case. */
+ MPIDI_VC_t *remote_vc;
/* Delay the fetch of the second ask until the first issues CAS */
uint8_t ask_guard;
-
+
/* Ring buffer sectors obtained through ask-send protocol */
- MPID_nem_ib_ringbuf_sectorq_t sectorq;
+ MPID_nem_ib_ringbuf_sectorq_t sectorq;
/* Two transactions from the both ends for a connection
- can be outstanding at the same time when they were initiated
- at the same time. This makes one end try to send ACK2 after
- freeing scratch-pad QP for the connection. So we must monitor and
- wait until all the onnection request transactions ends before
- freeing scratch-pad QP.*/
- int outstanding_connection_tx;
+ * can be outstanding at the same time when they were initiated
+ * at the same time. This makes one end try to send ACK2 after
+ * freeing scratch-pad QP for the connection. So we must monitor and
+ * wait until all the connection request transactions end before
+ * freeing scratch-pad QP. */
+ int outstanding_connection_tx;
int incoming_connection_tx;
} MPID_nem_ib_com_t;
@@ -499,31 +500,28 @@ extern int MPID_nem_ib_com_rts(int condesc, int remote_qpnum, uint16_t remote_li
extern int MPID_nem_ib_com_reg_mr_connect(int condesc, void *rmem, int rkey);
extern int MPID_nem_ib_com_connect_ringbuf(int condesc,
- uint32_t ringbuf_type,
- void *start, int rkey, int nslot,
- MPIDI_VC_t * remote_vc,
- uint32_t alloc_new_mr);
+ uint32_t ringbuf_type,
+ void *start, int rkey, int nslot,
+ MPIDI_VC_t * remote_vc, uint32_t alloc_new_mr);
extern int MPID_nem_ib_com_isend(int condesc,
- uint64_t wr_id,
- void *prefix, int sz_prefix,
- void *hdr, int sz_hdr,
- void *data, int sz_data,
- int *copied,
- uint32_t local_ringbuf_type, uint32_t remote_ringbuf_type,
- void** buf_from_out, uint32_t* buf_from_sz_out);
+ uint64_t wr_id,
+ void *prefix, int sz_prefix,
+ void *hdr, int sz_hdr,
+ void *data, int sz_data,
+ int *copied,
+ uint32_t local_ringbuf_type, uint32_t remote_ringbuf_type,
+ void **buf_from_out, uint32_t * buf_from_sz_out);
extern int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_hdr,
void *data, int sz_data);
extern int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset, int sz,
- void *laddr, void **buf_from_out, uint32_t* buf_from_sz_out);
-extern int MPID_nem_ib_com_get_scratch_pad(int condesc,
- uint64_t wr_id,
- uint64_t offset, int sz,
- void** buf_from_out, uint32_t * buf_from_sz_out);
-extern int MPID_nem_ib_com_cas_scratch_pad(int condesc,
- uint64_t wr_id, uint64_t offset,
- uint64_t compare, uint64_t swap,
- void** buf_from_out, uint32_t * buf_from_sz_out);
+ void *laddr, void **buf_from_out,
+ uint32_t * buf_from_sz_out);
+extern int MPID_nem_ib_com_get_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset, int sz,
+ void **buf_from_out, uint32_t * buf_from_sz_out);
+extern int MPID_nem_ib_com_cas_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset,
+ uint64_t compare, uint64_t swap, void **buf_from_out,
+ uint32_t * buf_from_sz_out);
//extern int MPID_nem_ib_com_isend(int condesc, uint64_t wr_id, void* hdr, int sz_hdr, void* data, int sz_data);
extern int MPID_nem_ib_com_irecv(int condesc, uint64_t wr_id);
@@ -560,7 +558,8 @@ extern int MPID_nem_ib_com_mem_udwr_to(int condesc, void **out);
/* ib_reg_mr.c */
extern int MPID_nem_ib_com_register_cache_init(void);
extern int MPID_nem_ib_com_register_cache_release(void);
-extern struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, int len, enum ibv_access_flags additional_flags);
+extern struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, int len,
+ enum ibv_access_flags additional_flags);
extern int MPID_nem_ib_com_udbuf_init(void *q);
@@ -623,7 +622,7 @@ typedef struct {
uint32_t ref_count;
char *next;
} first;
- struct ibv_mr* mr;
+ struct ibv_mr *mr;
} MPID_nem_ib_rdmawr_from_alloc_hdr_t;
#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA 65536
#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_ROUNDUP64(addr, align) ((addr + align - 1) & ~((unsigned long)align - 1))
@@ -632,7 +631,7 @@ typedef struct {
#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(p) ((void *) ((uint64_t) (p) & ~(MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA - 1)))
#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR(p) (((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) ((uint64_t) (p) & ~(MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA - 1)))->mr)
#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_PREPROCESS_SZ \
- if(_sz < 256) { \
+ if (_sz < 256) { \
clz = 23; \
sz = 256; \
} else { \
@@ -646,7 +645,7 @@ static inline void *MPID_nem_ib_rdmawr_from_alloc(uint32_t _sz)
int retval;
int clz;
uint32_t sz;
- assert(_sz <= (1ULL<<31));
+ assert(_sz <= (1ULL << 31));
MPID_NEM_IB_RDMAWR_FROM_ALLOC_PREPROCESS_SZ;
char *p = MPID_nem_ib_rdmawr_from_alloc_free_list_front[clz];
if ((unsigned long) p & (MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA - 1)) {
@@ -657,19 +656,25 @@ static inline void *MPID_nem_ib_rdmawr_from_alloc(uint32_t _sz)
char *q, r;
if (MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz]) {
q = MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz];
- MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz] = ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz])->first.next;
+ MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz] =
+ ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *)
+ MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz])->first.next;
}
else {
- unsigned long sz_clust = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ROUNDUP64(MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA * MPID_NEM_IB_RDMAWR_FROM_ALLOC_NCLUST_SLAB, 4096);
- char* unaligned = mmap(NULL,
- sz_clust + MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA,
- PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (unaligned == (void *)-1) {
+ unsigned long sz_clust =
+ MPID_NEM_IB_RDMAWR_FROM_ALLOC_ROUNDUP64(MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA *
+ MPID_NEM_IB_RDMAWR_FROM_ALLOC_NCLUST_SLAB,
+ 4096);
+ char *unaligned = mmap(NULL,
+ sz_clust + MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA,
+ PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (unaligned == (void *) -1) {
printf("mmap failed\n");
MPID_nem_ib_segv;
}
- q = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ROUNDUP64_ADDR(unaligned, MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA);
+ q = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ROUNDUP64_ADDR(unaligned,
+ MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA);
retval = munmap(unaligned, q - unaligned);
if (q - unaligned != 0 && retval) {
printf("munmap failed\n");
@@ -681,28 +686,36 @@ static inline void *MPID_nem_ib_rdmawr_from_alloc(uint32_t _sz)
MPID_nem_ib_segv;
}
- ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->mr = MPID_nem_ib_com_reg_mr_fetch(q, MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA, 0);
+ ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->mr =
+ MPID_nem_ib_com_reg_mr_fetch(q, MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA, 0);
if (!((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->mr) {
printf("ibv_reg_mr failed\n");
MPID_nem_ib_segv;
}
-
+
#if MPID_NEM_IB_RDMAWR_FROM_ALLOC_NCLUST_SLAB > 1
- MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz] = q + MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA;
- for (p = q + MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA; p < q + (MPID_NEM_IB_RDMAWR_FROM_ALLOC_NCLUST_SLAB - 1) * MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA;
+ MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz] =
+ q + MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA;
+ for (p = q + MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA;
+ p <
+ q + (MPID_NEM_IB_RDMAWR_FROM_ALLOC_NCLUST_SLAB -
+ 1) * MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA;
p += MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA) {
- ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) p)->mr = MPID_nem_ib_com_reg_mr_fetch(q, MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA, 0);
+ ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) p)->mr =
+ MPID_nem_ib_com_reg_mr_fetch(q, MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA, 0);
if (!((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) p)->mr) {
printf("ibv_reg_mr failed\n");
MPID_nem_ib_segv;
}
-
- ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) p)->first.next = p + MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA;
+
+ ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) p)->first.next =
+ p + MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA;
}
((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) p)->first.next = 0;
#endif
}
- ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->first.ref_count = MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA / sz - 1;
+ ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->first.ref_count =
+ MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA / sz - 1;
q += sz + (MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA % sz);
MPID_nem_ib_rdmawr_from_alloc_free_list_front[clz] = q + sz;
return q;
@@ -713,12 +726,12 @@ static inline void MPID_nem_ib_rdmawr_from_free(const void *p, uint32_t _sz)
{
int clz;
uint32_t sz;
- assert(_sz <= (1ULL<<31));
+ assert(_sz <= (1ULL << 31));
MPID_NEM_IB_RDMAWR_FROM_ALLOC_PREPROCESS_SZ;
void *q = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(p);
if (!(--(((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->first.ref_count))) {
- ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->first.next = MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz];
+ ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->first.next =
+ MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz];
MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz] = (char *) q;
}
}
-
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
index 4f4d99f..0a94f8d 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
@@ -45,12 +45,12 @@ typedef struct {
int pending_sends; /* number of send in flight */
MPID_nem_ib_com_t *ibcom;
MPID_nem_ib_sendq_t sendq; /* overflow queue for IB commands */
- int connection_state; /* dynamic connection, checked in iSendContig, protocol processed there and in progress engine */
+ int connection_state; /* dynamic connection, checked in iSendContig, protocol processed there and in progress engine */
/* Number of outstanding connection sequence started to eliminate
- duplicated connection reuests */
- uint8_t connection_guard;
- void *vc_terminate_buf; /* address of ringbuffer which calls vc_terminate */
+ * duplicated connection requests */
+ uint8_t connection_guard;
+ void *vc_terminate_buf; /* address of ringbuffer which calls vc_terminate */
} MPID_nem_ib_vc_area;
/* macro for secret area in vc */
@@ -72,9 +72,9 @@ typedef struct {
MPI_Aint lmt_dt_true_lb; /* to locate the last byte of receive buffer */
void *lmt_write_to_buf; /* user buffer or temporary buffer for pack and remember it for lmt_orderq */
void *lmt_pack_buf; /* to pack non-contiguous data */
- void *buf_from; /* address of RDMA write from buffer */
- uint32_t buf_from_sz; /* size of RDMA write from buffer. It's set on sending, referenced on freeing */
- uint8_t ask; /* Issued ask or not on send */
+ void *buf_from; /* address of RDMA write from buffer */
+ uint32_t buf_from_sz; /* size of RDMA write from buffer. It's set on sending, referenced on freeing */
+ uint8_t ask; /* Issued ask or not on send */
} MPID_nem_ib_req_area;
/* macro for secret area in req */
@@ -115,7 +115,7 @@ typedef struct {
} MPID_nem_ib_cm_map_t;
/* Types of connection protocol packets */
-enum MPID_nem_ib_cm_cmd_types {
+enum MPID_nem_ib_cm_cmd_types {
MPID_NEM_IB_CM_HEAD_FLAG_ZERO = 0,
MPID_NEM_IB_CM_CAS,
MPID_NEM_IB_CM_SYN,
@@ -132,7 +132,7 @@ enum MPID_nem_ib_cm_cmd_types {
/* Packet types of connection protocol */
struct MPID_nem_ib_cm_req;
-/* They should have the same type because
+/* They should have the same type because
cm commands and ring buffer commands share one CQ */
typedef uint8_t MPID_nem_ib_cm_ringbuf_cmd_type_t;
typedef MPID_nem_ib_cm_ringbuf_cmd_type_t MPID_nem_ib_ringbuf_cmd_type_t;
@@ -143,23 +143,23 @@ typedef struct {
struct MPID_nem_ib_cm_req *initiator_req;
uint16_t responder_ringbuf_index;
int initiator_rank;
- MPID_nem_ib_netmod_trailer_t tail_flag;
+ MPID_nem_ib_netmod_trailer_t tail_flag;
} MPID_nem_ib_cm_cmd_syn_t;
typedef struct {
- MPID_nem_ib_cm_cmd_type_t type; /* this is used as head flag as well */
+ MPID_nem_ib_cm_cmd_type_t type; /* this is used as head flag as well */
uint32_t qpnum;
uint16_t lid;
union ibv_gid gid;
void *rmem;
uint32_t rkey;
int ringbuf_nslot;
- uint32_t ringbuf_type; /* Ring buffer information sent from receiver side to sender side */
+ uint32_t ringbuf_type; /* Ring buffer information sent from receiver side to sender side */
struct MPID_nem_ib_cm_req *initiator_req;
struct MPID_nem_ib_cm_req *responder_req;
- uint16_t initiator_ringbuf_index; /* index to connection protocol ring buffer */
- MPIDI_VC_t * remote_vc;
- MPID_nem_ib_netmod_trailer_t tail_flag;
+ uint16_t initiator_ringbuf_index; /* index to connection protocol ring buffer */
+ MPIDI_VC_t *remote_vc;
+ MPID_nem_ib_netmod_trailer_t tail_flag;
} MPID_nem_ib_cm_cmd_synack_t;
typedef struct {
@@ -170,17 +170,17 @@ typedef struct {
void *rmem;
uint32_t rkey;
int ringbuf_nslot;
- uint32_t ringbuf_type; /* Ring buffer information sent from sender side to receiver side */
+ uint32_t ringbuf_type; /* Ring buffer information sent from sender side to receiver side */
struct MPID_nem_ib_cm_req *initiator_req;
struct MPID_nem_ib_cm_req *responder_req;
- MPIDI_VC_t * remote_vc;
- MPID_nem_ib_netmod_trailer_t tail_flag;
+ MPIDI_VC_t *remote_vc;
+ MPID_nem_ib_netmod_trailer_t tail_flag;
} MPID_nem_ib_cm_cmd_ack1_t;
typedef struct {
MPID_nem_ib_cm_cmd_type_t type;
struct MPID_nem_ib_cm_req *initiator_req;
- MPID_nem_ib_netmod_trailer_t tail_flag;
+ MPID_nem_ib_netmod_trailer_t tail_flag;
} MPID_nem_ib_cm_cmd_ack2_t;
/* Base class for branching on type
@@ -196,35 +196,35 @@ typedef union {
/* State store for connection protocol */
typedef struct MPID_nem_ib_cm_req {
MPID_nem_ib_cm_cmd_type_t state;
- MPID_nem_ib_com_t *ibcom; /* Referenced in drain_scq */
- uint64_t retry_decided; /* Virtual time when CAS retry is decided */
- uint64_t retry_backoff; /* Back-off duration of retry */
- uint16_t ringbuf_index; /* index of slot where responder writes responds */
+ MPID_nem_ib_com_t *ibcom; /* Referenced in drain_scq */
+ uint64_t retry_decided; /* Virtual time when CAS retry is decided */
+ uint64_t retry_backoff; /* Back-off duration of retry */
+ uint16_t ringbuf_index; /* index of slot where responder writes responds */
int initiator_rank;
int responder_rank;
- uint16_t initiator_ringbuf_index; /* responder stores it when acquiring it */
- uint16_t responder_ringbuf_index; /* initiator stores it when acquiring it */
+ uint16_t initiator_ringbuf_index; /* responder stores it when acquiring it */
+ uint16_t responder_ringbuf_index; /* initiator stores it when acquiring it */
struct MPID_nem_ib_cm_req *sendq_next;
- MPID_nem_ib_cm_cmd_t cmd; /* buf used only when enqueued */
- uint32_t ask_on_connect; /* Ask ring-buffer slot when connected */
+ MPID_nem_ib_cm_cmd_t cmd; /* buf used only when enqueued */
+ uint32_t ask_on_connect; /* Ask ring-buffer slot when connected */
/* We need to track reference count because the last reference of state
- is non-deterministic. i.e. it happens either on receiving packet and draining SCQ */
+ * is non-deterministic. i.e. it happens either on receiving packet and draining SCQ */
uint32_t ref_count;
} MPID_nem_ib_cm_req_t;
/* Track identity of a packet */
typedef struct {
- MPID_nem_ib_cm_cmd_type_t type; /* Type referenced in drain_scq */
+ MPID_nem_ib_cm_cmd_type_t type; /* Type referenced in drain_scq */
MPID_nem_ib_cm_req_t *req;
- void* buf_from;
+ void *buf_from;
uint32_t buf_from_sz;
} MPID_nem_ib_cm_cmd_shadow_t;
#define MPID_NEM_IB_CM_RELEASED ((uint64_t)(-1))
-#define MPID_NEM_IB_CM_OFF_SYN (256) /* Align for 256-byte-write PCI command */
-#define MPID_NEM_IB_CM_OFF_CMD (256*2) /* Align for 256-byte-write PCI command */
-#define MPID_NEM_IB_CM_NSEG 16 /* number of slots to which responder writes its response */
+#define MPID_NEM_IB_CM_OFF_SYN (256) /* Align for 256-byte-write PCI command */
+#define MPID_NEM_IB_CM_OFF_CMD (256*2) /* Align for 256-byte-write PCI command */
+#define MPID_NEM_IB_CM_NSEG 16 /* number of slots to which responder writes its response */
typedef GENERIC_Q_DECL(MPID_nem_ib_cm_req_t) MPID_nem_ib_cm_sendq_t;
@@ -234,7 +234,7 @@ typedef GENERIC_Q_DECL(MPID_nem_ib_cm_req_t) MPID_nem_ib_cm_sendq_t;
#define MPID_nem_ib_cm_sendq_next(ep) ((ep)->sendq_next)
#define MPID_nem_ib_cm_sendq_enqueue(qp, ep) GENERICM_Q_ENQUEUE (qp, ep, MPID_nem_ib_cm_sendq_next_field, sendq_next);
-#ifdef HAVE_LIBDCFA
+#ifdef HAVE_LIBDCFA
#define MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO_MR_ADDR host_adddr
#else
#define MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO_MR_ADDR addr
@@ -265,7 +265,7 @@ typedef GENERIC_Q_DECL(MPID_nem_ib_cm_req_t) MPID_nem_ib_cm_sendq_t;
#define MPID_NEM_IB_CM_COMPOSE_SYN(cmd, req) { \
(cmd)->type = MPID_NEM_IB_CM_SYN; \
- (cmd)->initiator_req = (req); \
+ (cmd)->initiator_req = (req); \
(cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
}
@@ -279,10 +279,10 @@ typedef GENERIC_Q_DECL(MPID_nem_ib_cm_req_t) MPID_nem_ib_cm_sendq_t;
(cmd)->type = MPID_NEM_IB_CM_SYNACK; \
MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO((cmd), (req)->initiator_rank); \
(cmd)->ringbuf_type = VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, ibcom->remote_ringbuf->type); \
- (cmd)->initiator_req = (_initiator_req); \
- (cmd)->responder_req = (req); \
- (cmd)->remote_vc = MPID_nem_ib_conns[req->initiator_rank].vc; \
- (cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
+ (cmd)->initiator_req = (_initiator_req); \
+ (cmd)->responder_req = (req); \
+ (cmd)->remote_vc = MPID_nem_ib_conns[req->initiator_rank].vc; \
+ (cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
}
#define MPID_NEM_IB_CM_COMPOSE_END_CM(cmd, req, _initiator_req, _type) { \
@@ -296,15 +296,15 @@ typedef GENERIC_Q_DECL(MPID_nem_ib_cm_req_t) MPID_nem_ib_cm_sendq_t;
(cmd)->type = MPID_NEM_IB_CM_ACK1; \
MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO((cmd), (req)->responder_rank); \
(cmd)->ringbuf_type = VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, ibcom->remote_ringbuf->type); \
- (cmd)->initiator_req = (req); \
- (cmd)->responder_req = (_responder_req); \
- (cmd)->remote_vc = MPID_nem_ib_conns[req->responder_rank].vc; \
+ (cmd)->initiator_req = (req); \
+ (cmd)->responder_req = (_responder_req); \
+ (cmd)->remote_vc = MPID_nem_ib_conns[req->responder_rank].vc; \
(cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
}
#define MPID_NEM_IB_CM_COMPOSE_ACK2(cmd, _initiator_req) { \
(cmd)->type = MPID_NEM_IB_CM_ACK2; \
- (cmd)->initiator_req = (_initiator_req); \
+ (cmd)->initiator_req = (_initiator_req); \
(cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
}
@@ -314,11 +314,12 @@ typedef GENERIC_Q_DECL(MPID_nem_ib_cm_req_t) MPID_nem_ib_cm_sendq_t;
((MPID_nem_ib_cm_cmd_ack2_t *)(buf))->tail_flag.tail_flag = 0; \
}
-static inline void MPID_nem_ib_cm_request_release(MPID_nem_ib_cm_req_t * req) {
- if(req->ref_count == 0) {
+static inline void MPID_nem_ib_cm_request_release(MPID_nem_ib_cm_req_t * req)
+{
+ if (req->ref_count == 0) {
MPID_nem_ib_segv;
}
- if(--req->ref_count == 0) {
+ if (--req->ref_count == 0) {
MPIU_Free(req);
}
}
@@ -327,7 +328,7 @@ int MPID_nem_ib_cm_progress(void);
int MPID_nem_ib_cm_release(uint16_t index);
#endif
-/* Ring buffer protocol
+/* Ring buffer protocol
including Ask-Send protocol */
uint32_t MPID_nem_ib_ringbuf_local_shared_nseg;
@@ -335,7 +336,7 @@ uint32_t MPID_nem_ib_ringbuf_local_shared_nseg;
/* It's on the scratch pad, RDMA-read by a process which performs ask-send */
typedef struct {
- uint64_t head; /* CAS size is 64-bit */
+ uint64_t head; /* CAS size is 64-bit */
uint16_t tail;
} MPID_nem_ib_ringbuf_headtail_t;
@@ -345,24 +346,24 @@ typedef struct {
/* State store for connection protocol */
typedef struct MPID_nem_ib_ringbuf_req {
MPID_nem_ib_ringbuf_cmd_type_t state;
- MPIDI_VC_t * vc; /* You can eliminate this. */
- MPID_nem_ib_com_t *ibcom; /* ibcom of scratch pad, referenced in drain_scq */
+ MPIDI_VC_t *vc; /* You can eliminate this. */
+ MPID_nem_ib_com_t *ibcom; /* ibcom of scratch pad, referenced in drain_scq */
- /* fetch the head and compare-and-swap head and head + 1
- to prevent the case 2^32-1 contiguos fetches while assuming
- the ring buffer isn't full corrupt the head pointer */
- MPID_nem_ib_ringbuf_headtail_t fetched;
+ /* fetch the head and compare-and-swap head and head + 1
+ * to prevent the case 2^32-1 contiguos fetches while assuming
+ * the ring buffer isn't full corrupt the head pointer */
+ MPID_nem_ib_ringbuf_headtail_t fetched;
- uint64_t retry_decided; /* Virtual time when CAS retry is decided */
- uint64_t retry_backoff; /* Back-off duration of retry */
+ uint64_t retry_decided; /* Virtual time when CAS retry is decided */
+ uint64_t retry_backoff; /* Back-off duration of retry */
struct MPID_nem_ib_ringbuf_req *sendq_next;
} MPID_nem_ib_ringbuf_req_t;
/* Track identity of a packet */
typedef struct {
- MPID_nem_ib_ringbuf_cmd_type_t type; /* Type referenced in drain_scq */
+ MPID_nem_ib_ringbuf_cmd_type_t type; /* Type referenced in drain_scq */
MPID_nem_ib_ringbuf_req_t *req;
- void* buf_from;
+ void *buf_from;
uint32_t buf_from_sz;
} MPID_nem_ib_ringbuf_cmd_shadow_t;
@@ -522,7 +523,7 @@ int MPID_nem_ib_finalize(void);
int MPID_nem_ib_drain_scq(int dont_call_progress);
int MPID_nem_ib_drain_scq_scratch_pad(void);
int MPID_nem_ib_poll(int in_blocking_poll);
-int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf);
+int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t * ringbuf);
int MPID_nem_ib_ring_alloc(MPIDI_VC_t * vc);
int MPID_nem_ib_cm_drain_scq(void);
@@ -546,13 +547,16 @@ int MPID_nem_ib_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
int MPID_nem_ib_iStartContigMsg(MPIDI_VC_t * vc, void *hdr, MPIDI_msg_sz_t hdr_sz, void *data,
MPIDI_msg_sz_t data_sz, MPID_Request ** sreq_ptr);
-int MPID_nem_ib_cm_cas_core(int rank, MPID_nem_ib_cm_cmd_shadow_t* shadow);
+int MPID_nem_ib_cm_cas_core(int rank, MPID_nem_ib_cm_cmd_shadow_t * shadow);
int MPID_nem_ib_cm_cas(MPIDI_VC_t * vc, uint32_t ask_on_connect);
-int MPID_nem_ib_cm_cmd_core(int rank, MPID_nem_ib_cm_cmd_shadow_t* shadow, void* buf, MPIDI_msg_sz_t sz, uint32_t syn, uint16_t ringbuf_index);
-int MPID_nem_ib_ringbuf_ask_cas(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_req_t* req);
-int MPID_nem_ib_ringbuf_ask_fetch_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t* shadow, MPIDI_msg_sz_t sz);
+int MPID_nem_ib_cm_cmd_core(int rank, MPID_nem_ib_cm_cmd_shadow_t * shadow, void *buf,
+ MPIDI_msg_sz_t sz, uint32_t syn, uint16_t ringbuf_index);
+int MPID_nem_ib_ringbuf_ask_cas(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_req_t * req);
+int MPID_nem_ib_ringbuf_ask_fetch_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t * shadow,
+ MPIDI_msg_sz_t sz);
int MPID_nem_ib_ringbuf_ask_fetch(MPIDI_VC_t * vc);
-int MPID_nem_ib_ringbuf_ask_cas_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t* shadow, uint64_t head);
+int MPID_nem_ib_ringbuf_ask_cas_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t * shadow,
+ uint64_t head);
int MPID_nem_ib_ringbuf_progress(void);
int MPID_nem_ib_ringbuf_alloc(MPIDI_VC_t * vc);
@@ -595,7 +599,7 @@ extern MPID_nem_ib_conn_ud_t *MPID_nem_ib_conn_ud;
extern MPID_nem_ib_conn_t *MPID_nem_ib_conns;
extern int MPID_nem_ib_conns_ref_count;
//extern MPIDI_VC_t **MPID_nem_ib_pollingset;
-extern int *MPID_nem_ib_scratch_pad_fds; /* TODO: create structure including fds and ibcoms */
+extern int *MPID_nem_ib_scratch_pad_fds; /* TODO: create structure including fds and ibcoms */
extern int MPID_nem_ib_scratch_pad_fds_ref_count;
extern MPID_nem_ib_com_t **MPID_nem_ib_scratch_pad_ibcoms;
//extern int MPID_nem_ib_npollingset;
@@ -608,14 +612,14 @@ extern int MPID_nem_ib_ncqe; /* for lazy poll scq */
extern uint64_t MPID_nem_ib_progress_engine_vt; /* virtual time stamp counter */
extern uint16_t MPID_nem_ib_remote_poll_shared; /* index to poll for shared ring buffer */
#ifdef MPID_NEM_IB_ONDEMAND
-extern uint16_t MPID_nem_ib_cm_ringbuf_head; /* head is incremented after assigned */
+extern uint16_t MPID_nem_ib_cm_ringbuf_head; /* head is incremented after assigned */
extern uint16_t MPID_nem_ib_cm_ringbuf_tail;
extern uint64_t MPID_nem_ib_cm_ringbuf_released[(MPID_NEM_IB_CM_NSEG + 63) / 64];
/* overflow queue when no more slots for responder to write on are available */
-extern MPID_nem_ib_cm_sendq_t MPID_nem_ib_cm_sendq;
+extern MPID_nem_ib_cm_sendq_t MPID_nem_ib_cm_sendq;
-extern MPID_nem_ib_ringbuf_sendq_t MPID_nem_ib_ringbuf_sendq;
+extern MPID_nem_ib_ringbuf_sendq_t MPID_nem_ib_ringbuf_sendq;
#endif
extern int MPID_nem_ib_ncqe_scratch_pad;
@@ -642,14 +646,14 @@ extern uint8_t MPID_nem_ib_lmt_tail_addr_cbf[MPID_nem_ib_cbf_nslot *
#define MPID_NEM_IB_SYNC_SYNACK 1
#define MPID_NEM_IB_SYNC_NACK 2
-#define MPID_NEM_IB_EAGER_MAX_MSG_SZ (MPID_NEM_IB_COM_RDMABUF_SZSEG/*1024*/-sizeof(MPIDI_CH3_Pkt_t)+sizeof(MPIDI_CH3_Pkt_eager_send_t)-sizeof(MPID_nem_ib_netmod_hdr_shared_t)-sizeof(MPID_nem_ib_pkt_prefix_t)-sizeof(MPID_nem_ib_netmod_trailer_t)) /* when > this size, lmt is used. see src/mpid/ch3/src/mpid_isend.c */
+#define MPID_NEM_IB_EAGER_MAX_MSG_SZ (MPID_NEM_IB_COM_RDMABUF_SZSEG/*1024*/-sizeof(MPIDI_CH3_Pkt_t)+sizeof(MPIDI_CH3_Pkt_eager_send_t)-sizeof(MPID_nem_ib_netmod_hdr_shared_t)-sizeof(MPID_nem_ib_pkt_prefix_t)-sizeof(MPID_nem_ib_netmod_trailer_t)) /* when > this size, lmt is used. see src/mpid/ch3/src/mpid_isend.c */
#define MPID_NEM_IB_POLL_PERIOD_RECV_POSTED 2000 /* minimum period from previous ib_poll to ib_poll in recv_posted */
#define MPID_NEM_IB_POLL_PERIOD_SEND_POSTED 2000
typedef struct {
void *addr;
uint32_t rkey;
-#if 0 /* moving to packet header */
+#if 0 /* moving to packet header */
int seq_num_tail; /* notify RDMA-write-to buffer occupation */
#endif
uint8_t tail; /* last word of payload */
@@ -657,7 +661,7 @@ typedef struct {
typedef enum MPID_nem_ib_pkt_subtype {
MPIDI_NEM_IB_PKT_EAGER_SEND,
-#if 0 /* modification of mpid_nem_lmt.c is required */
+#if 0 /* modification of mpid_nem_lmt.c is required */
MPIDI_NEM_IB_PKT_LMT_RTS,
#endif
MPIDI_NEM_IB_PKT_PUT,
@@ -715,9 +719,9 @@ typedef struct MPID_nem_ib_pkt_change_rdmabuf_occupancy_notify_state_t {
int MPID_nem_ib_PktHandler_EagerSend(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
MPIDI_msg_sz_t * buflen /* out */ ,
MPID_Request ** rreqp /* out */);
-#if 0 /* modification of mpid_nem_lmt.c is required */
+#if 0 /* modification of mpid_nem_lmt.c is required */
int MPID_nem_ib_pkt_RTS_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen /* out */ ,
+ MPIDI_msg_sz_t * buflen /* out */ ,
MPID_Request ** rreqp /* out */);
#endif
int MPID_nem_ib_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
@@ -860,8 +864,8 @@ int pkt_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * bu
- Slow when first-time allocs occur
- Free list is linked lists and prepared for 2^n sizes.
- Fast to find a empty slot (one load instruction)
- - Use mmap and munmap for requests of larger than or
- equal to 4KB buffers
+ - Use mmap and munmap for requests of larger than or
+ equal to 4KB buffers
- No unused slots for large requests */
static inline void *MPID_nem_ib_stmalloc(size_t _sz)
{
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
index 8014b1c..cc64921 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
@@ -86,10 +86,11 @@ uint16_t MPID_nem_ib_remote_poll_shared;
uint16_t MPID_nem_ib_cm_ringbuf_head;
uint16_t MPID_nem_ib_cm_ringbuf_tail;
uint64_t MPID_nem_ib_cm_ringbuf_released[(MPID_NEM_IB_CM_NSEG + 63) / 64];
-MPID_nem_ib_cm_sendq_t MPID_nem_ib_cm_sendq = { NULL, NULL };
+MPID_nem_ib_cm_sendq_t MPID_nem_ib_cm_sendq = { NULL, NULL };
+
int MPID_nem_ib_ncqe_scratch_pad_to_drain;
#endif
-MPID_nem_ib_ringbuf_sendq_t MPID_nem_ib_ringbuf_sendq = { NULL, NULL };
+MPID_nem_ib_ringbuf_sendq_t MPID_nem_ib_ringbuf_sendq = { NULL, NULL };
int MPID_nem_ib_ncqe_scratch_pad;
int MPID_nem_ib_ncqe_to_drain;
@@ -232,7 +233,7 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
MPID_nem_ib_remote_poll_shared = 0;
#ifdef MPID_NEM_IB_ONDEMAND
MPID_nem_ib_cm_ringbuf_head = 0;
- MPID_nem_ib_cm_ringbuf_tail = -1; /* it means slot 0 is not acquired */
+ MPID_nem_ib_cm_ringbuf_tail = -1; /* it means slot 0 is not acquired */
memset(MPID_nem_ib_cm_ringbuf_released, 0, (MPID_NEM_IB_CM_NSEG + 63) / 64);
#endif
@@ -249,11 +250,11 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
MPIU_CHKPMEM_MALLOC(MPID_nem_ib_scratch_pad_ibcoms, MPID_nem_ib_com_t **,
MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_t *),
mpi_errno, "connection table");
- memset(MPID_nem_ib_scratch_pad_ibcoms, 0, MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_t*));
+ memset(MPID_nem_ib_scratch_pad_ibcoms, 0, MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_t *));
/* prepare scrath-pad QP and malloc scratch-pad */
for (i = 0; i < MPID_nem_ib_nranks; i++) {
- if(i == MPID_nem_ib_myrank) {
+ if (i == MPID_nem_ib_myrank) {
continue;
}
dprintf("init,MPID_nem_ib_myrank=%d,i=%d\n", MPID_nem_ib_myrank, i);
@@ -270,14 +271,13 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
"**MPID_nem_ib_com_obtain_pointer");
- ibcom_errno =
- MPID_nem_ib_com_alloc(MPID_nem_ib_scratch_pad_fds[i],
+ ibcom_errno = MPID_nem_ib_com_alloc(MPID_nem_ib_scratch_pad_fds[i],
#ifdef MPID_NEM_IB_ONDEMAND
- MPID_NEM_IB_CM_OFF_CMD +
- MPID_NEM_IB_CM_NSEG * sizeof(MPID_nem_ib_cm_cmd_t) +
- sizeof(MPID_nem_ib_ringbuf_headtail_t)
+ MPID_NEM_IB_CM_OFF_CMD +
+ MPID_NEM_IB_CM_NSEG * sizeof(MPID_nem_ib_cm_cmd_t) +
+ sizeof(MPID_nem_ib_ringbuf_headtail_t)
#else
- MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_qp_state_t)
+ MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_qp_state_t)
#endif
);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_alloc");
@@ -287,8 +287,9 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
*((uint64_t *) MPID_nem_ib_scratch_pad) = MPID_NEM_IB_CM_RELEASED;
#endif
/* Initialize head and tail pointer of shared ring buffer */
- MPID_nem_ib_ringbuf_headtail_t * headtail =
- (MPID_nem_ib_ringbuf_headtail_t *) ((uint8_t*)MPID_nem_ib_scratch_pad + MPID_NEM_IB_RINGBUF_OFF_HEAD);
+ MPID_nem_ib_ringbuf_headtail_t *headtail =
+ (MPID_nem_ib_ringbuf_headtail_t *) ((uint8_t *) MPID_nem_ib_scratch_pad +
+ MPID_NEM_IB_RINGBUF_OFF_HEAD);
headtail->head = 0;
headtail->tail = -1;
@@ -790,8 +791,7 @@ int MPID_nem_ib_vc_onconnect(MPIDI_VC_t * vc)
/* store pointer to MPID_nem_ib_com */
ibcom_errno =
- MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[vc->pg_rank].fd,
- &VC_FIELD(vc, ibcom));
+ MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[vc->pg_rank].fd, &VC_FIELD(vc, ibcom));
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
#if 0
@@ -804,7 +804,7 @@ int MPID_nem_ib_vc_onconnect(MPIDI_VC_t * vc)
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_VC_ONCONNECT);
return mpi_errno;
- fn_fail:
+ fn_fail:
goto fn_exit;
}
@@ -860,8 +860,9 @@ int MPID_nem_ib_vc_init(MPIDI_VC_t * vc)
int ntrial = 0;
volatile MPID_nem_ib_com_qp_state_t *rstate =
- (MPID_nem_ib_com_qp_state_t *) ((uint8_t *) MPID_nem_ib_com_scratch_pad->
- icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_TO] +
+ (MPID_nem_ib_com_qp_state_t *) ((uint8_t *)
+ MPID_nem_ib_com_scratch_pad->icom_mem
+ [MPID_NEM_IB_COM_SCRATCH_PAD_TO] +
vc->pg_rank * sizeof(MPID_nem_ib_com_qp_state_t));
dprintf("ib_init,rstate=%p,*rstate=%08x\n", rstate, *((uint32_t *) rstate));
while (rstate->state != MPID_NEM_IB_COM_QP_STATE_RTR) {
@@ -887,7 +888,7 @@ int MPID_nem_ib_vc_init(MPIDI_VC_t * vc)
#endif
MPIDI_CHANGE_VC_STATE(vc, ACTIVE);
-#if 0 /* dead code */
+#if 0 /* dead code */
uint32_t max_msg_sz;
MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conns[vc->pg_rank].fd,
MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ, &max_msg_sz,
@@ -896,7 +897,7 @@ int MPID_nem_ib_vc_init(MPIDI_VC_t * vc)
VC_FIELD(vc, pending_sends) = 0;
//MPIU_Assert(sizeof(MPID_nem_ib_netmod_hdr_t) == 8); /* assumption in ib_ibcom.h */
- MPIU_Assert(sizeof(MPID_nem_ib_netmod_trailer_t) == 1); /* assumption in ib_ibcom.h */
+ MPIU_Assert(sizeof(MPID_nem_ib_netmod_trailer_t) == 1); /* assumption in ib_ibcom.h */
uint32_t sz;
#if 0
@@ -935,7 +936,7 @@ int MPID_nem_ib_vc_init(MPIDI_VC_t * vc)
vc_ch->pkt_handler = MPID_nem_ib_pkt_handler;
vc_ch->num_pkt_handlers = MPIDI_NEM_IB_PKT_NUM_PKT_HANDLERS;
MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_EAGER_SEND] = MPID_nem_ib_PktHandler_EagerSend;
-#if 0 /* modification of mpid_nem_lmt.c is required */
+#if 0 /* modification of mpid_nem_lmt.c is required */
MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_LMT_RTS] = MPID_nem_ib_pkt_RTS_handler;
#endif
MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_PUT] = MPID_nem_ib_PktHandler_Put;
@@ -1016,7 +1017,7 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
case MPID_NEM_IB_RINGBUF_SHARED:
remote_poll = MPID_nem_ib_remote_poll_shared;
break;
- default: /* FIXME */
+ default: /* FIXME */
printf("unknown ringbuf->type\n");
break;
}
@@ -1026,14 +1027,15 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
VC_FIELD(vc, vc_terminate_buf) =
(uint8_t *) ringbuf->start +
- MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t)(remote_poll % ringbuf->nslot));
+ MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t) (remote_poll % ringbuf->nslot));
- dprintf("vc_terminate,before,%d->%d,diff-rsr=%d,l diff-lsr=%d,sendq_empty=%d,ncqe=%d,pending_sends=%d\n",
- MPID_nem_ib_myrank, vc->pg_rank,
- MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
- vc_ib->ibcom->rsr_seq_num_tail_last_sent),
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
- MPID_nem_ib_sendq_empty(vc_ib->sendq), MPID_nem_ib_ncqe, VC_FIELD(vc, pending_sends));
+ dprintf
+ ("vc_terminate,before,%d->%d,diff-rsr=%d,l diff-lsr=%d,sendq_empty=%d,ncqe=%d,pending_sends=%d\n",
+ MPID_nem_ib_myrank, vc->pg_rank, MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
+ vc_ib->ibcom->
+ rsr_seq_num_tail_last_sent),
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
+ MPID_nem_ib_sendq_empty(vc_ib->sendq), MPID_nem_ib_ncqe, VC_FIELD(vc, pending_sends));
/* update remote RDMA-write-to buffer occupancy */
#if 0 /* we can't send it when the other party has closed QP */
@@ -1071,9 +1073,10 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_scq");
#endif
ibcom_errno = MPID_nem_ib_ringbuf_progress();
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_progress");
-
- MPID_nem_ib_progress_engine_vt += 1; /* Progress virtual time */
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_ringbuf_progress");
+
+ MPID_nem_ib_progress_engine_vt += 1; /* Progress virtual time */
}
dprintf("init,middle,%d->%d,r rdmaocc=%d,l rdmaocc=%d,sendq=%d,ncqe=%d,pending_sends=%d\n",
@@ -1121,68 +1124,62 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
/* drain scratch-pad scq */
#ifdef MPID_NEM_IB_ONDEMAND
ibcom_errno = MPID_nem_ib_cm_drain_scq();
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_drain_scq");
- dprintf("init,scratch_pad,ncqe=%d,to_drain=%d\n", MPID_nem_ib_ncqe_scratch_pad, MPID_nem_ib_ncqe_scratch_pad_to_drain);
- dprintf("init,scratch_pad,ncom_scratch_pad=%d\n", MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->ncom_scratch_pad);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_scq");
+ dprintf("init,scratch_pad,ncqe=%d,to_drain=%d\n", MPID_nem_ib_ncqe_scratch_pad,
+ MPID_nem_ib_ncqe_scratch_pad_to_drain);
+ dprintf("init,scratch_pad,ncom_scratch_pad=%d\n",
+ MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->ncom_scratch_pad);
#else
ibcom_errno = MPID_nem_ib_drain_scq_scratch_pad();
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
"**MPID_nem_ib_drain_scq_scratch_pad");
-#endif
+#endif
mpi_errno = MPIDI_CH3U_Handle_connection(vc, MPIDI_VC_EVENT_TERMINATED);
if (mpi_errno) {
MPIU_ERR_POP(mpi_errno);
}
-#if 0 /* We move this code to the end of poll_eager. */
+#if 0 /* We move this code to the end of poll_eager. */
/* Destroy VC QP */
/* Destroy ring-buffer */
ibcom_errno = MPID_nem_ib_ringbuf_free(vc);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_ringbuf_free");
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_free");
/* Check connection status stored in VC when on-demand connection is used */
dprintf("vc_terminate,%d->%d,close\n", MPID_nem_ib_myrank, vc->pg_rank);
ibcom_errno = MPID_nem_ib_com_close(vc_ib->sc->fd);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_close");
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_close");
/* Destroy array of scratch-pad QPs */
MPIU_Assert(MPID_nem_ib_conns_ref_count > 0);
- if(--MPID_nem_ib_conns_ref_count == 0) {
+ if (--MPID_nem_ib_conns_ref_count == 0) {
MPIU_Free(MPID_nem_ib_conns);
}
/* TODO don't create them for shared memory vc */
/* Destroy scratch-pad */
- ibcom_errno =
- MPID_nem_ib_com_free(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
+ ibcom_errno = MPID_nem_ib_com_free(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
#ifdef MPID_NEM_IB_ONDEMAND
- MPID_NEM_IB_CM_OFF_CMD +
- MPID_NEM_IB_CM_NSEG * sizeof(MPID_nem_ib_cm_cmd_t) +
- sizeof(MPID_nem_ib_ringbuf_headtail_t)
+ MPID_NEM_IB_CM_OFF_CMD +
+ MPID_NEM_IB_CM_NSEG * sizeof(MPID_nem_ib_cm_cmd_t) +
+ sizeof(MPID_nem_ib_ringbuf_headtail_t)
#else
- MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_qp_state_t)
-
+ MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_qp_state_t)
#endif
- );
+);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_free");
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_free");
/* Destroy scratch-pad QP */
- ibcom_errno =
- MPID_nem_ib_com_close(MPID_nem_ib_scratch_pad_fds[vc->pg_rank]);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_close");
+ ibcom_errno = MPID_nem_ib_com_close(MPID_nem_ib_scratch_pad_fds[vc->pg_rank]);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_close");
/* Destroy array of scratch-pad QPs */
MPIU_Assert(MPID_nem_ib_scratch_pad_fds_ref_count > 0);
- if(--MPID_nem_ib_scratch_pad_fds_ref_count == 0) {
+ if (--MPID_nem_ib_scratch_pad_fds_ref_count == 0) {
MPIU_Free(MPID_nem_ib_scratch_pad_fds);
MPIU_Free(MPID_nem_ib_scratch_pad_ibcoms);
}
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
index a777293..123dde4 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
@@ -100,7 +100,7 @@ int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_p
/* prepare magic */
//*((uint32_t*)(write_from_buf + data_sz - sizeof(tailmagic_t))) = MPID_NEM_IB_COM_MAGIC;
-#if 0 /* moving to packet header */ /* embed RDMA-write-to buffer occupancy information */
+#if 0 /* moving to packet header */ /* embed RDMA-write-to buffer occupancy information */
dprintf("lmt_initiate_lmt,rsr_seq_num_tail=%d\n", vc_ib->ibcom->rsr_seq_num_tail);
/* embed RDMA-write-to buffer occupancy information */
s_cookie_buf->seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail;
@@ -272,7 +272,7 @@ int MPID_nem_ib_lmt_start_recv(struct MPIDI_VC *vc, struct MPID_Request *req, MP
MPID_nem_ib_sendq_enqueue(&vc_ib->sendq, req);
}
-#if 0 /* moving to packet header */
+#if 0 /* moving to packet header */
/* extract embeded RDMA-write-to buffer occupancy information */
dprintf("lmt_start_recv,old lsr_seq_num=%d,s_cookie_buf->seq_num_tail=%d\n",
vc_ib->ibcom->lsr_seq_num_tail, s_cookie_buf->seq_num_tail);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
index 951768a..06db998 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
@@ -72,9 +72,9 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
entered_drain_scq = 1;
#ifdef MPID_NEM_IB_ONDEMAND
- /* drain_scq is called after poll_eager calls vc_terminate
- or nobody created QP */
- if(!MPID_nem_ib_rc_shared_scq) {
+ /* drain_scq is called after poll_eager calls vc_terminate
+ * or nobody created QP */
+ if (!MPID_nem_ib_rc_shared_scq) {
dprintf("drain_scq,CQ is null\n");
goto fn_exit;
}
@@ -116,17 +116,18 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
#ifdef HAVE_LIBDCFA
if (cqe[i].status != IBV_WC_SUCCESS) {
printf("drain_scq,kind=%d,req_type=%d,msg_type=%d,cqe.status=%08x\n", kind, req_type,
- msg_type, cqe[i].status);
+ msg_type, cqe[i].status);
}
#else
if (cqe[i].status != IBV_WC_SUCCESS) {
- printf("drain_scq,kind=%d,req_type=%d,msg_type=%d,comm=%p,cqe.status=%08x,%s,sseq_num=%d\n", kind,
- req_type, msg_type, req->comm, cqe[i].status, ibv_wc_status_str(cqe[i].status),
- VC_FIELD(req->ch.vc, ibcom->sseq_num));
+ printf
+ ("drain_scq,kind=%d,req_type=%d,msg_type=%d,comm=%p,cqe.status=%08x,%s,sseq_num=%d\n",
+ kind, req_type, msg_type, req->comm, cqe[i].status,
+ ibv_wc_status_str(cqe[i].status), VC_FIELD(req->ch.vc, ibcom->sseq_num));
}
#endif
MPID_NEM_IB_ERR_FATAL(cqe[i].status != IBV_WC_SUCCESS, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_drain_scq");
+ "**MPID_nem_ib_drain_scq");
/*
* packets generated by MPIDI_CH3_iStartMsgv has req_type of RECV
@@ -169,7 +170,8 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
vc_ib->ibcom->ncom -= 1;
MPID_nem_ib_ncqe -= 1;
MPID_nem_ib_rdmawr_from_free(REQ_FIELD(req, buf_from), REQ_FIELD(req, buf_from_sz));
- dprintf("drain_scq,afree=%p,sz=%d\n", REQ_FIELD(req, buf_from), REQ_FIELD(req, buf_from_sz));
+ dprintf("drain_scq,afree=%p,sz=%d\n", REQ_FIELD(req, buf_from),
+ REQ_FIELD(req, buf_from_sz));
dprintf("drain_scq,eager-send,ncqe=%d\n", MPID_nem_ib_ncqe);
MPIU_Assert(req->ref_count == 1 || req->ref_count == 2);
@@ -253,7 +255,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
/* decrement the number of entries in IB command queue */
vc_ib->ibcom->ncom -= 1;
MPID_nem_ib_ncqe -= 1;
- MPID_nem_ib_rdmawr_from_free(REQ_FIELD(req, buf_from) , REQ_FIELD(req, buf_from_sz));
+ MPID_nem_ib_rdmawr_from_free(REQ_FIELD(req, buf_from), REQ_FIELD(req, buf_from_sz));
dprintf("drain_scq,GET_RESP,ncqe=%d\n", MPID_nem_ib_ncqe);
MPIU_Assert(req->ref_count == 1 || req->ref_count == 2);
@@ -439,7 +441,7 @@ int MPID_nem_ib_drain_scq_scratch_pad()
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_DRAIN_SCQ_SCRATCH_PAD);
/* drain_scq_scratch_pad is called after poll_eager calls vc_terminate */
- if(!MPID_nem_ib_rc_shared_scq_scratch_pad) {
+ if (!MPID_nem_ib_rc_shared_scq_scratch_pad) {
dprintf("drain_scq_scratch_pad,CQ is null\n");
goto fn_exit;
}
@@ -488,13 +490,13 @@ int MPID_nem_ib_drain_scq_scratch_pad()
#define FUNCNAME MPID_nem_ib_poll_eager
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
+int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t * ringbuf)
{
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
- struct MPIDI_VC * vc;
- MPID_nem_ib_vc_area * vc_ib;
+ struct MPIDI_VC *vc;
+ MPID_nem_ib_vc_area *vc_ib;
int result;
struct ibv_wc cqe[MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN];
uint64_t tscs, tsce;
@@ -504,8 +506,8 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
//MPID_nem_ib_tsc_poll = MPID_nem_ib_rdtsc();
- uint16_t * remote_poll;
- switch(ringbuf->type) {
+ uint16_t *remote_poll;
+ switch (ringbuf->type) {
case MPID_NEM_IB_RINGBUF_EXCLUSIVE:
remote_poll = &VC_FIELD(ringbuf->vc, ibcom->rsr_seq_num_poll);
break;
@@ -518,18 +520,20 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
void *buf =
(uint8_t *) ringbuf->start +
- MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t)(*remote_poll % ringbuf->nslot));
+ MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t) (*remote_poll % ringbuf->nslot));
volatile uint64_t *head_flag = MPID_NEM_IB_NETMOD_HDR_HEAD_FLAG_PTR(buf);
if (*head_flag == 0) {
goto fn_exit;
}
- dprintf("ib_poll_eager,remote_poll=%d,buf=%p,sz=%d\n", *remote_poll, buf, MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf));
+ dprintf("ib_poll_eager,remote_poll=%d,buf=%p,sz=%d\n", *remote_poll, buf,
+ MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf));
#if 0
ibcom_errno = MPID_nem_ib_com_poll_cq(MPID_NEM_IB_COM_RC_SHARED_RCQ, &cqe, &result);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_poll_cq");
#endif
- dprintf("ib_poll_eager,eager-send,found\n");fflush(stdout);
+ dprintf("ib_poll_eager,eager-send,found\n");
+ fflush(stdout);
//MPIU_ERR_CHKANDJUMP1(cqe.status != IBV_WC_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_poll_cq", "**MPID_nem_ib_com_poll_cq %s", MPID_nem_ib_com_strerror(ibcom_errno));
@@ -537,7 +541,8 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
MPID_NEM_IB_OFF_POW2_ALIGNED(MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf));
volatile MPID_nem_ib_netmod_trailer_t *netmod_trailer =
(MPID_nem_ib_netmod_trailer_t *) ((uint8_t *) buf + off_pow2_aligned);
- dprintf("poll,off_pow2_aligned=%d,netmod_trailer=%p,sz=%d\n", off_pow2_aligned, netmod_trailer, MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf));
+ dprintf("poll,off_pow2_aligned=%d,netmod_trailer=%p,sz=%d\n", off_pow2_aligned, netmod_trailer,
+ MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf));
int k = 0;
//tsce = MPID_nem_ib_rdtsc(); printf("9,%ld\n", tsce - tscs); // 55 for 512-byte
//tscs = MPID_nem_ib_rdtsc();
@@ -550,10 +555,9 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
#if 0 /* pre-fetch next RDMA-write-buf slot to cover TLB miss latency */
__asm__ __volatile__
("movq %0, %%rsi;"
- "movq 0(%%rsi), %%rsi;"
- :
- :"r"(ringbuf->start + MPID_NEM_IB_COM_RDMABUF_SZSEG *
- ((uint16_t)((*remote_poll + 1) % MPID_NEM_IB_COM_RDMABUF_NSEG)))
+ "movq 0(%%rsi), %%rsi;"::"r"(ringbuf->start + MPID_NEM_IB_COM_RDMABUF_SZSEG *
+ ((uint16_t)
+ ((*remote_poll + 1) % MPID_NEM_IB_COM_RDMABUF_NSEG)))
:"%rsi");
#endif
#ifdef MPID_NEM_IB_TLBPREF_POLL
@@ -582,22 +586,15 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
__asm__ __volatile__
("movq %0, %%rsi;"
"vprefetch0 0x00(%%rsi);"
- "vprefetch0 0x40(%%rsi);"
- "vprefetch0 0x80(%%rsi);"
- "vprefetch0 0xc0(%%rsi);"
- :
- : "r"(rsi)
- : "%rsi");
+ "vprefetch0 0x40(%%rsi);" "vprefetch0 0x80(%%rsi);" "vprefetch0 0xc0(%%rsi);"::"r"(rsi)
+ :"%rsi");
#else
__asm__ __volatile__
("movq %0, %%rsi;"
"prefetchnta 0x00(%%rsi);"
"prefetchnta 0x40(%%rsi);"
- "prefetchnta 0x80(%%rsi);"
- "prefetchnta 0xc0(%%rsi);"
- :
- : "r"(rsi)
- : "%rsi");
+ "prefetchnta 0x80(%%rsi);" "prefetchnta 0xc0(%%rsi);"::"r"(rsi)
+ :"%rsi");
#endif
}
#endif
@@ -607,7 +604,7 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
dprintf("ib_poll,inc,remote_poll=%d\n", *remote_poll);
/* VC is stored in the packet for shared ring buffer */
- switch(ringbuf->type) {
+ switch (ringbuf->type) {
case MPID_NEM_IB_RINGBUF_EXCLUSIVE:
vc = ringbuf->vc;
break;
@@ -619,15 +616,13 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
}
vc_ib = VC_IB(vc);
dprintf("poll_eager,vc=%p\n", vc);
-
+
/* Save it because handle_pkt frees buf when the packet is MPIDI_CH3_PKT_CLOSE */
ssize_t sz_pkt = MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf);
- MPIDI_CH3_Pkt_eager_send_t *pkt =
- (MPIDI_CH3_Pkt_eager_send_t *) ((uint8_t *) buf + sz_pkt);
+ MPIDI_CH3_Pkt_eager_send_t *pkt = (MPIDI_CH3_Pkt_eager_send_t *) ((uint8_t *) buf + sz_pkt);
dprintf("pkt=%p,sizeof=%ld\n", pkt, sz_pkt);
MPIU_Assert(MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) >=
- sz_pkt + sizeof(MPIDI_CH3_Pkt_t) +
- sizeof(MPID_nem_ib_netmod_trailer_t));
+ sz_pkt + sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPID_nem_ib_netmod_trailer_t));
dprintf
("handle_pkt,before,%d<-%d,id=%d,pkt->type=%d,pcc=%d,MPIDI_NEM_PKT_END=%d,pkt=%p,subtype=%d\n",
MPID_nem_ib_myrank, vc->pg_rank, *remote_poll, pkt->type,
@@ -637,8 +632,7 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
mpi_errno =
MPID_nem_handle_pkt(vc, (char *) ((uint8_t *) buf + sz_pkt),
(MPIDI_msg_sz_t) (MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) -
- sz_pkt -
- sizeof(MPID_nem_ib_netmod_trailer_t)));
+ sz_pkt - sizeof(MPID_nem_ib_netmod_trailer_t)));
if (mpi_errno) {
MPIU_ERR_POP(mpi_errno);
}
@@ -649,30 +643,29 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
* because MPID_nem_handle_pkt releases RDMA-wr-to buf by copying data out */
/* responder releases resource and then embed largest sequence number into MPI message bound to initiator */
#if 1
- if((vc->state != MPIDI_VC_STATE_INACTIVE) ||
- (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
- dprintf
- ("handle_pkt,after,%d<-%d,id=%d,pkt->type=%d,eagershort=%d,close=%d,rts=%d,piggy-backed-eagersend=%d\n",
- MPID_nem_ib_myrank, vc->pg_rank, *remote_poll, pkt->type,
- MPIDI_CH3_PKT_EAGERSHORT_SEND, MPIDI_CH3_PKT_CLOSE, MPIDI_NEM_PKT_LMT_RTS,
- MPIDI_NEM_IB_PKT_EAGER_SEND);
+ if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
+ (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
+ dprintf
+ ("handle_pkt,after,%d<-%d,id=%d,pkt->type=%d,eagershort=%d,close=%d,rts=%d,piggy-backed-eagersend=%d\n",
+ MPID_nem_ib_myrank, vc->pg_rank, *remote_poll, pkt->type,
+ MPIDI_CH3_PKT_EAGERSHORT_SEND, MPIDI_CH3_PKT_CLOSE, MPIDI_NEM_PKT_LMT_RTS,
+ MPIDI_NEM_IB_PKT_EAGER_SEND);
int notify_rate;
- if((vc->state != MPIDI_VC_STATE_INACTIVE) ||
- (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf)) {
- ibcom_errno =
- MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(MPID_nem_ib_conns[vc->pg_rank].fd,
- ¬ify_rate);
- dprintf("poll_eager,sendq=%d,ncom=%d,ncqe=%d,ldiff=%d(%d-%d),rate=%d\n",
- MPID_nem_ib_sendq_empty(vc_ib->sendq),
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
- vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail,
- notify_rate);
+ if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
+ (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf)) {
+ ibcom_errno =
+ MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(MPID_nem_ib_conns[vc->pg_rank].fd,
+ ¬ify_rate);
+ dprintf("poll_eager,sendq=%d,ncom=%d,ncqe=%d,ldiff=%d(%d-%d),rate=%d\n",
+ MPID_nem_ib_sendq_empty(vc_ib->sendq),
+ vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
+ MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
+ vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail, notify_rate);
}
- if(ringbuf->type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
+ if (ringbuf->type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
dprintf("poll_eager,rdiff=%d(%d-%d)\n",
MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
vc_ib->ibcom->rsr_seq_num_tail_last_sent),
@@ -689,21 +682,19 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
* because there is no way to trace the RDMA-write-to buffer addr
* because rreq->dev.tmpbuf is set to zero in ch3_eager.c
*/
- if((vc->state != MPIDI_VC_STATE_INACTIVE) ||
- (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
- dprintf("poll_eager,released,type=%d,MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM=%d\n", pkt->type,
- MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM);
- if((vc->state != MPIDI_VC_STATE_INACTIVE) ||
- (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
- MPID_nem_ib_recv_buf_released(vc,
- (void *) ((uint8_t *) buf +
- sz_pkt +
- sizeof(MPIDI_CH3_Pkt_t)));
+ if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
+ (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
+ dprintf("poll_eager,released,type=%d,MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM=%d\n", pkt->type,
+ MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM);
+ if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
+ (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
+ MPID_nem_ib_recv_buf_released(vc,
+ (void *) ((uint8_t *) buf +
+ sz_pkt + sizeof(MPIDI_CH3_Pkt_t)));
}
else {
if (MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) ==
- sz_pkt + sizeof(MPIDI_CH3_Pkt_t) +
- sizeof(MPID_nem_ib_netmod_trailer_t)) {
+ sz_pkt + sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPID_nem_ib_netmod_trailer_t)) {
if (pkt->type == MPIDI_CH3_PKT_EAGERSHORT_SEND
//|| pkt->type == MPIDI_CH3_PKT_GET
) {
@@ -717,71 +708,65 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
}
#endif
- if((vc->state != MPIDI_VC_STATE_INACTIVE) ||
- (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
- dprintf("ib_poll,hdr_ringbuf_type=%d\n", MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf));
+ if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
+ (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
+ dprintf("ib_poll,hdr_ringbuf_type=%d\n", MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf));
- if((vc->state != MPIDI_VC_STATE_INACTIVE) ||
- (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
- if(MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf) & MPID_NEM_IB_RINGBUF_RELINDEX) {
- vc_ib->ibcom->lsr_seq_num_tail = MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf);
- dprintf("ib_poll,local_tail is updated to %d\n",
- MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf));
- }
+ if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
+ (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
+ if (MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf) & MPID_NEM_IB_RINGBUF_RELINDEX) {
+ vc_ib->ibcom->lsr_seq_num_tail = MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf);
+ dprintf("ib_poll,local_tail is updated to %d\n",
+ MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf));
+ }
/* Clear flag */
- if((vc->state != MPIDI_VC_STATE_INACTIVE) ||
- (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
- MPID_NEM_IB_NETMOD_HDR_HEAD_FLAG_SET(buf, 0);
+ if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
+ (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
+ MPID_NEM_IB_NETMOD_HDR_HEAD_FLAG_SET(buf, 0);
-#if 1 /* We move this code from the end of vc_terminate. */
+#if 1 /* We move this code from the end of vc_terminate. */
if (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf) {
/* clear stored data */
vc_ib->vc_terminate_buf = NULL;
/* Destroy ring-buffer */
ibcom_errno = MPID_nem_ib_ringbuf_free(vc);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_ringbuf_free");
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_free");
/* Check connection status stored in VC when on-demand connection is used */
dprintf("vc_terminate,%d->%d,close\n", MPID_nem_ib_myrank, vc->pg_rank);
ibcom_errno = MPID_nem_ib_com_close(vc_ib->sc->fd);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_close");
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_close");
/* Destroy array of scratch-pad QPs */
MPIU_Assert(MPID_nem_ib_conns_ref_count > 0);
- if(--MPID_nem_ib_conns_ref_count == 0) {
+ if (--MPID_nem_ib_conns_ref_count == 0) {
MPIU_Free(MPID_nem_ib_conns);
}
/* TODO don't create them for shared memory vc */
/* Destroy scratch-pad */
- ibcom_errno =
- MPID_nem_ib_com_free(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
+ ibcom_errno = MPID_nem_ib_com_free(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
#ifdef MPID_NEM_IB_ONDEMAND
- MPID_NEM_IB_CM_OFF_CMD +
- MPID_NEM_IB_CM_NSEG * sizeof(MPID_nem_ib_cm_cmd_t) +
- sizeof(MPID_nem_ib_ringbuf_headtail_t)
+ MPID_NEM_IB_CM_OFF_CMD +
+ MPID_NEM_IB_CM_NSEG * sizeof(MPID_nem_ib_cm_cmd_t) +
+ sizeof(MPID_nem_ib_ringbuf_headtail_t)
#else
- MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_qp_state_t)
+ MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_qp_state_t)
#endif
- );
+);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_free");
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_free");
/* Destroy scratch-pad QP */
- ibcom_errno =
- MPID_nem_ib_com_close(MPID_nem_ib_scratch_pad_fds[vc->pg_rank]);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_close");
+ ibcom_errno = MPID_nem_ib_com_close(MPID_nem_ib_scratch_pad_fds[vc->pg_rank]);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_close");
/* Destroy array of scratch-pad QPs */
MPIU_Assert(MPID_nem_ib_scratch_pad_fds_ref_count > 0);
- if(--MPID_nem_ib_scratch_pad_fds_ref_count == 0) {
+ if (--MPID_nem_ib_scratch_pad_fds_ref_count == 0) {
MPIU_Free(MPID_nem_ib_scratch_pad_fds);
MPIU_Free(MPID_nem_ib_scratch_pad_ibcoms);
}
@@ -971,7 +956,8 @@ int MPID_nem_ib_poll(int in_blocking_poll)
/* [MPID_NEM_IB_NRINGBUF-1] stores shared ring buffer */
for (i = 0; i < MPID_NEM_IB_NRINGBUF; i++) {
- if ((((MPID_nem_ib_ringbuf_allocated[i / 64] >> (i & 63)) & 1) == 0) || !MPID_nem_ib_ringbuf) {
+ if ((((MPID_nem_ib_ringbuf_allocated[i / 64] >> (i & 63)) & 1) == 0) ||
+ !MPID_nem_ib_ringbuf) {
//dprintf("poll,cont\n");
continue;
}
@@ -990,17 +976,19 @@ int MPID_nem_ib_poll(int in_blocking_poll)
* to perform send_progress
* when send and progress_send call drain_scq asking it
* for not performing send_progress and make the CQ empty */
- if(MPID_nem_ib_ringbuf[i].type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
+ if (MPID_nem_ib_ringbuf[i].type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
mpi_errno = MPID_nem_ib_send_progress(MPID_nem_ib_ringbuf[i].vc);
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_progress");
- ncom_almost_full |= (VC_FIELD(MPID_nem_ib_ringbuf[i].vc, ibcom->ncom) >= MPID_NEM_IB_COM_MAX_SQ_HEIGHT_DRAIN);
+ ncom_almost_full |=
+ (VC_FIELD(MPID_nem_ib_ringbuf[i].vc, ibcom->ncom) >=
+ MPID_NEM_IB_COM_MAX_SQ_HEIGHT_DRAIN);
}
#if 0
/* aggressively perform drain_scq */
- ncom_almost_full |= !(MPID_nem_ib_sendq_empty(VC_FIELD(MPID_nem_ib_ringbuf[i].vc, sendq));
+ ncom_almost_full |= !(MPID_nem_ib_sendq_empty(VC_FIELD(MPID_nem_ib_ringbuf[i].vc, sendq)));
#endif
}
#if defined (MPID_NEM_IB_TIMER_WAIT_IB_POLL)
@@ -1015,8 +1003,8 @@ int MPID_nem_ib_poll(int in_blocking_poll)
MPID_nem_ib_ncqe >= MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN || ncom_almost_full)
#endif
#if !defined (MPID_NEM_IB_LMT_GET_CQE)
- if (/*(in_blocking_poll && result == 0) || */ MPID_nem_ib_ncqe_nces > 0 ||
- MPID_nem_ib_ncqe >= MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN || ncom_almost_full)
+ if (/*(in_blocking_poll && result == 0) || */ MPID_nem_ib_ncqe_nces >
+ 0 || MPID_nem_ib_ncqe >= MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN || ncom_almost_full)
#endif
{
#if defined (MPID_NEM_IB_TIMER_WAIT_IB_POLL)
@@ -1038,16 +1026,14 @@ int MPID_nem_ib_poll(int in_blocking_poll)
ibcom_errno = MPID_nem_ib_drain_scq(0);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_drain_scq");
#endif
-
#ifdef MPID_NEM_IB_ONDEMAND
/* process incoming connection request */
MPID_nem_ib_cm_poll_syn();
MPID_nem_ib_cm_poll();
-
//dprintf("ib_poll,MPID_nem_ib_ncqe_scratch_pad_to_drain=%d\n",
//MPID_nem_ib_ncqe_scratch_pad_to_drain);
/* process outgoing conncetion request */
- if (MPID_nem_ib_ncqe_scratch_pad_to_drain > 0 ||
+ if (MPID_nem_ib_ncqe_scratch_pad_to_drain > 0 ||
MPID_nem_ib_ncqe_scratch_pad >= MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN) {
ibcom_errno = MPID_nem_ib_cm_drain_scq();
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_scq");
@@ -1057,9 +1043,7 @@ int MPID_nem_ib_poll(int in_blocking_poll)
MPID_nem_ib_cm_progress();
#endif
MPID_nem_ib_ringbuf_progress();
-
- MPID_nem_ib_progress_engine_vt += 1; /* Progress virtual time */
-
+ MPID_nem_ib_progress_engine_vt += 1; /* Progress virtual time */
#if 1
/* if polling on eager-send and lmt would repeat frequently, perform "pause" to yield instruction issue bandwitdh to other logical-core */
if (in_blocking_poll && progress_completion_count_old == MPIDI_CH3I_progress_completion_count.v) {
@@ -1088,11 +1072,9 @@ int MPID_nem_ib_recv_posted(struct MPIDI_VC *vc, struct MPID_Request *req)
int mpi_errno = MPI_SUCCESS;
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
-
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RECV_POSTED);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RECV_POSTED);
dprintf("recv_posted,enter,%d->%d,req=%p\n", MPID_nem_ib_myrank, vc->pg_rank, req);
-
#ifdef MPID_NEM_IB_ONDEMAND
if (vc_ib->connection_state != MPID_NEM_IB_CM_ESTABLISHED) {
goto fn_exit;
@@ -1104,44 +1086,43 @@ int MPID_nem_ib_recv_posted(struct MPIDI_VC *vc, struct MPID_Request *req)
ibcom_errno = MPID_nem_ib_com_irecv(vc_ib->sc->fd, (uint64_t) vc->pg_rank);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_irecv");
#endif
-
#if 1 /*takagi */
MPIDI_msg_sz_t data_sz;
int dt_contig;
MPI_Aint dt_true_lb;
MPID_Datatype *dt_ptr;
- MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr,
- dt_true_lb);
-
+ MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype,
+ dt_contig, data_sz, dt_ptr, dt_true_lb);
/* poll when rreq is for lmt */
/* anticipating received message finds maching request in the posted-queue */
if (data_sz + sizeof(MPIDI_CH3_Pkt_eager_send_t) > vc->eager_max_msg_sz) {
//if (MPID_nem_ib_tsc_poll - MPID_nem_ib_rdtsc() > MPID_NEM_IB_POLL_PERIOD_RECV_POSTED) {
-#if 1
- if(VC_FIELD(vc, ibcom->remote_ringbuf)) {
+//#if 1
+ if (VC_FIELD(vc, ibcom->remote_ringbuf)) {
mpi_errno = MPID_nem_ib_poll_eager(VC_FIELD(vc, ibcom->remote_ringbuf));
-#else
- mpi_errno = MPID_nem_ib_poll(0);
-#endif
+//#else
+// mpi_errno = MPID_nem_ib_poll(0);
+//#endif
if (mpi_errno) {
MPIU_ERR_POP(mpi_errno);
}
}
//}
}
+
else {
#if 1
/* anticipating received message finds maching request in the posted-queue */
//if (MPID_nem_ib_tsc_poll - MPID_nem_ib_rdtsc() > MPID_NEM_IB_POLL_PERIOD_RECV_POSTED) {
- if(VC_FIELD(vc, ibcom->remote_ringbuf)) {
+ if (VC_FIELD(vc, ibcom->remote_ringbuf)) {
#if 1
- mpi_errno = MPID_nem_ib_poll_eager(VC_FIELD(vc, ibcom->remote_ringbuf));
+ mpi_errno = MPID_nem_ib_poll_eager(VC_FIELD(vc, ibcom->remote_ringbuf));
#else
- mpi_errno = MPID_nem_ib_poll(0);
+ mpi_errno = MPID_nem_ib_poll(0);
#endif
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
+ if (mpi_errno) {
+ MPIU_ERR_POP(mpi_errno);
+ }
}
//}
#endif
@@ -1171,14 +1152,12 @@ int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data)
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
-
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RECV_BUF_RELEASED);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RECV_BUF_RELEASED);
dprintf("recv_buf_released,%d<-%d,user_data=%p\n", MPID_nem_ib_myrank, vc->pg_rank, user_data);
#if 1 /* moving from ib_poll */
/* Clear all possible tail flag slots */
/* tail flag is located at MPID_NEM_IB_COM_INLINE_DATA boundary and variable length entails multiple prospective locations for the future use */
-
/* see MPIDI_CH3_PktHandler_EagerShortSend (in src/mpid/ch3/src/ch3u_eager.c */
/* eager-send with zero-length data is released in poll
* because there is no way to trace the RDMA-write-to buffer addr
@@ -1188,19 +1167,17 @@ int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data)
goto fn_exit;
}
- if(MPID_nem_ib_rdmawr_to_alloc_start > user_data &&
- user_data >= MPID_nem_ib_rdmawr_to_alloc_start +
- MPID_NEM_IB_COM_RDMABUF_SZ * MPID_NEM_IB_NRINGBUF) {
+ if (MPID_nem_ib_rdmawr_to_alloc_start > user_data &&
+ user_data >= MPID_nem_ib_rdmawr_to_alloc_start +
+ MPID_NEM_IB_COM_RDMABUF_SZ * MPID_NEM_IB_NRINGBUF) {
MPID_nem_ib_segv;
}
unsigned long mod =
(unsigned long) ((uint8_t *) user_data -
(uint8_t *) vc_ib->ibcom->remote_ringbuf->start) &
(MPID_NEM_IB_COM_RDMABUF_SZSEG - 1);
-
void *buf = (void *) ((uint8_t *) user_data - mod);
//dprintf("recv_buf_released,clearing,buf=%p\n", buf);
-
int off_pow2_aligned;
MPID_NEM_IB_OFF_POW2_ALIGNED(MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf));
//dprintf("recv_buf_released,sz=%d,pow2=%d\n", MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf), off_pow2_aligned);
@@ -1208,15 +1185,13 @@ int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data)
uint32_t offset;
for (offset = 15;;
offset =
- (((offset + 1) << 1) - 1) > MPID_NEM_IB_MAX_OFF_POW2_ALIGNED ?
- MPID_NEM_IB_MAX_OFF_POW2_ALIGNED :
- (((offset + 1) << 1) - 1) ) {
+ (((offset + 1) << 1) - 1) > MPID_NEM_IB_MAX_OFF_POW2_ALIGNED ?
+ MPID_NEM_IB_MAX_OFF_POW2_ALIGNED : (((offset + 1) << 1) - 1)) {
MPID_nem_ib_netmod_trailer_t *netmod_trailer =
(MPID_nem_ib_netmod_trailer_t *) ((uint8_t *) buf + offset);
- if(MPID_nem_ib_rdmawr_to_alloc_start > (uint8_t *) netmod_trailer &&
- (uint8_t *) netmod_trailer >=
- MPID_nem_ib_rdmawr_to_alloc_start +
- MPID_NEM_IB_COM_RDMABUF_SZ * MPID_NEM_IB_NRINGBUF) {
+ if (MPID_nem_ib_rdmawr_to_alloc_start > (uint8_t *) netmod_trailer &&
+ (uint8_t *) netmod_trailer >=
+ MPID_nem_ib_rdmawr_to_alloc_start + MPID_NEM_IB_COM_RDMABUF_SZ * MPID_NEM_IB_NRINGBUF) {
MPID_nem_ib_segv;
}
netmod_trailer->tail_flag = 0;
@@ -1234,36 +1209,46 @@ int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data)
(uint8_t *) vc_ib->ibcom->remote_ringbuf->start) /
MPID_NEM_IB_COM_RDMABUF_SZSEG;
MPIU_Assert(0 <= index_slot && index_slot < vc_ib->ibcom->remote_ringbuf->nslot);
- dprintf("released,user_data=%p,mem=%p,sub=%08lx,index_slot=%d\n", user_data, vc_ib->ibcom->remote_ringbuf->start, (unsigned long)user_data - (unsigned long)vc_ib->ibcom->remote_ringbuf->start, index_slot);
- dprintf("released,index_slot=%d,released=%016lx\n", index_slot, vc_ib->ibcom->remote_ringbuf->remote_released[index_slot / 64]);
+ dprintf("released,user_data=%p,mem=%p,sub=%08lx,index_slot=%d\n",
+ user_data, vc_ib->ibcom->remote_ringbuf->start,
+ (unsigned long) user_data -
+ (unsigned long) vc_ib->ibcom->remote_ringbuf->start, index_slot);
+ dprintf("released,index_slot=%d,released=%016lx\n", index_slot,
+ vc_ib->ibcom->remote_ringbuf->remote_released[index_slot / 64]);
vc_ib->ibcom->remote_ringbuf->remote_released[index_slot / 64] |= (1ULL << (index_slot & 63));
- dprintf("released,after bitset,%016lx\n", vc_ib->ibcom->remote_ringbuf->remote_released[index_slot / 64]);
+ dprintf("released,after bitset,%016lx\n",
+ vc_ib->ibcom->remote_ringbuf->remote_released[index_slot / 64]);
// int index_tail = (vc_ib->ibcom->rsr_seq_num_tail + 1) & (vc_ib->ibcom->local_ringbuf_nslot-1);
-
- MPID_nem_ib_ringbuf_headtail_t * headtail =
+ MPID_nem_ib_ringbuf_headtail_t *headtail =
(MPID_nem_ib_ringbuf_headtail_t *) ((uint8_t *) MPID_nem_ib_scratch_pad +
MPID_NEM_IB_RINGBUF_OFF_HEAD);
-
- uint16_t index_tail = vc_ib->ibcom->remote_ringbuf->type == MPID_NEM_IB_RINGBUF_EXCLUSIVE ?
- ((uint16_t)(vc_ib->ibcom->rsr_seq_num_tail + 1) % vc_ib->ibcom->remote_ringbuf->nslot) :
- ((uint16_t)(headtail->tail + 1) % vc_ib->ibcom->remote_ringbuf->nslot);
+ uint16_t index_tail =
+ vc_ib->ibcom->remote_ringbuf->type ==
+ MPID_NEM_IB_RINGBUF_EXCLUSIVE ? ((uint16_t) (vc_ib->ibcom->rsr_seq_num_tail + 1) %
+ vc_ib->ibcom->
+ remote_ringbuf->nslot) : ((uint16_t) (headtail->tail +
+ 1) %
+ vc_ib->ibcom->remote_ringbuf->
+ nslot);
dprintf("released,index_tail=%d\n", index_tail);
dprintf("released,%016lx\n", vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64]);
- if (1 || (index_tail & 7) || MPID_nem_ib_diff16(index_slot, index_tail) >= vc_ib->ibcom->remote_ringbuf->nslot - 8) { /* avoid wrap-around */
+ if (1 || (index_tail & 7) || MPID_nem_ib_diff16(index_slot, index_tail) >= vc_ib->ibcom->remote_ringbuf->nslot - 8) { /* avoid wrap-around */
while (1) {
- if (((vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64] >> (index_tail & 63)) & 1) ==
- 1) {
- if(vc_ib->ibcom->remote_ringbuf->type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
+ if (((vc_ib->ibcom->remote_ringbuf->
+ remote_released[index_tail / 64] >> (index_tail & 63)) & 1) == 1) {
+ if (vc_ib->ibcom->remote_ringbuf->type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
vc_ib->ibcom->rsr_seq_num_tail += 1;
- dprintf("exclusive ringbuf,remote_tail,incremented to %d\n", vc_ib->ibcom->rsr_seq_num_tail);
- } else {
+ dprintf("exclusive ringbuf,remote_tail,incremented to %d\n",
+ vc_ib->ibcom->rsr_seq_num_tail);
+ }
+ else {
headtail->tail += 1;
dprintf("shared ringbuf,tail,incremented to %d,head=%ld\n",
headtail->tail, headtail->head);
}
- vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64] &= ~(1ULL << (index_tail & 63));
-
- index_tail = (uint16_t)(index_tail + 1) % vc_ib->ibcom->remote_ringbuf->nslot;
+ vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64] &=
+ ~(1ULL << (index_tail & 63));
+ index_tail = (uint16_t) (index_tail + 1) % vc_ib->ibcom->remote_ringbuf->nslot;
}
else {
break;
@@ -1271,10 +1256,12 @@ int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data)
}
}
else {
- if (((vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64] >> (index_tail & 63)) & 0xff) ==
+ if (((vc_ib->ibcom->remote_ringbuf->remote_released[index_tail /
+ 64] >> (index_tail & 63)) & 0xff) ==
0xff) {
vc_ib->ibcom->rsr_seq_num_tail += 8;
- vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64] &= ~(0xffULL << (index_tail & 63));
+ vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64] &=
+ ~(0xffULL << (index_tail & 63));
//dprintf("released[index_tail/64]=%016lx\n", vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64]);
}
}
@@ -1283,16 +1270,15 @@ int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data)
int notify_rate;
ibcom_errno =
- MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(MPID_nem_ib_conns[vc->pg_rank].fd,
- ¬ify_rate);
+ MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(MPID_nem_ib_conns
+ [vc->pg_rank].fd, ¬ify_rate);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
"**MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get");
-
/* if you missed the chance to make eager-send message piggy-back it */
- if (vc_ib->ibcom->remote_ringbuf->type == MPID_NEM_IB_RINGBUF_EXCLUSIVE &&
- MPID_nem_ib_diff16
- (vc_ib->ibcom->rsr_seq_num_tail,
- vc_ib->ibcom->rsr_seq_num_tail_last_sent) >
+ if (vc_ib->ibcom->remote_ringbuf->type ==
+ MPID_NEM_IB_RINGBUF_EXCLUSIVE &&
+ MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
+ vc_ib->ibcom->rsr_seq_num_tail_last_sent) >
MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_RATE_DELAY_MULTIPLIER(notify_rate)
//|| MPID_nem_ib_diff16(lsr_seq_num_head, vc_ib->ibcom->lsr_seq_num_tail_last_sent) == vc_ib->ibcom->local_ringbuf_nslot
) {
@@ -1301,7 +1287,10 @@ int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data)
if (sreq) {
int msg_type = MPIDI_Request_get_msg_type(sreq);
if (msg_type == MPIDI_REQUEST_EAGER_MSG && /* guard for the following pointer dereference */
- ((MPIDI_CH3_Pkt_t *) sreq->dev.iov[0].MPID_IOV_BUF)->type == MPIDI_NEM_PKT_NETMOD &&
+ ((MPIDI_CH3_Pkt_t
+ *) sreq->dev.iov[0].MPID_IOV_BUF)->type ==
+ MPIDI_NEM_PKT_NETMOD
+ &&
((MPID_nem_pkt_netmod_t *) sreq->dev.iov[0].MPID_IOV_BUF)->subtype ==
MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM) {
goto skip;
@@ -1327,7 +1316,8 @@ int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data)
#define FUNCNAME MPID_nem_ib_PktHandler_lmt_done
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_PktHandler_lmt_done(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
+int MPID_nem_ib_PktHandler_lmt_done(MPIDI_VC_t * vc,
+ MPIDI_CH3_Pkt_t * pkt,
MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
{
int mpi_errno = MPI_SUCCESS;
@@ -1336,7 +1326,6 @@ int MPID_nem_ib_PktHandler_lmt_done(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
MPID_Request *req;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_LMT_DONE);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_LMT_DONE);
-
/* Check the assumption on sizeof(MPIDI_CH3_Pkt_t).
* It is utilized in pkt_DONE_handler (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c)
* that must be larger than sizeof(MPID_nem_ib_pkt_lmt_done_t) */
@@ -1374,8 +1363,8 @@ int MPID_nem_ib_PktHandler_lmt_done(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
#define FUNCNAME MPID_nem_ib_PktHandler_EagerSend
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_PktHandler_EagerSend(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen /* out */ ,
+int MPID_nem_ib_PktHandler_EagerSend(MPIDI_VC_t * vc,
+ MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * buflen /* out */ ,
MPID_Request ** rreqp /* out */)
{
MPID_nem_ib_pkt_prefix_t *netmod_pkt = (MPID_nem_ib_pkt_prefix_t *) pkt;
@@ -1388,36 +1377,34 @@ int MPID_nem_ib_PktHandler_EagerSend(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
MPIDI_msg_sz_t data_len;
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
-
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_EAGERSEND);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_EAGERSEND);
-
dprintf("ib_pkthandler_eagersend,tag=%d\n", ch3_pkt->match.parts.tag);
-
/* Check the assumption on sizeof(MPIDI_CH3_Pkt_t).
* It is utilized to point the payload location in MPIDI_CH3_PktHandler_EagerSend
* (src/mpid/ch3/src/ch3u_eager.c) that must be larger than sizeof(MPID_nem_ib_pkt_eager_send_t) */
//if (sizeof(MPID_nem_ib_pkt_eager_send_t) > sizeof(MPIDI_CH3_Pkt_t)) {
//MPIU_ERR_SETFATALANDJUMP(mpi_errno, MPI_ERR_INTERN, "**sizeof(MPIDI_CH3_Pkt_t)");
//}
-
/* Update occupation status of local SR (send request) queue */
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- dprintf("MPID_nem_ib_PktHandler_EagerSend,lsr_seq_num_tail=%d,netmod_pkt->seq_num_tail=%d\n",
- vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
+ dprintf
+ ("MPID_nem_ib_PktHandler_EagerSend,lsr_seq_num_tail=%d,netmod_pkt->seq_num_tail=%d\n",
+ vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
vc_ib->ibcom->lsr_seq_num_tail = netmod_pkt->seq_num_tail;
- dprintf("MPID_nem_ib_PktHandler_EagerSend,lsr_seq_num_tail updated to %d\n", vc_ib->ibcom->lsr_seq_num_tail);
-
+ dprintf
+ ("MPID_nem_ib_PktHandler_EagerSend,lsr_seq_num_tail updated to %d\n",
+ vc_ib->ibcom->lsr_seq_num_tail);
#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
/* change remote notification policy of RDMA-write-to buf */
dprintf("pkthandler,eagersend,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, lsr_seq_num_tail);
dprintf("pkthandler,eagersend,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
#endif
-
- dprintf("pkthandler,eagersend,sendq_empty=%d,ncom=%d,rdmabuf_occ=%d\n",
- MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
+ dprintf
+ ("pkthandler,eagersend,sendq_empty=%d,ncom=%d,rdmabuf_occ=%d\n",
+ MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom,
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
/* try to send from sendq because at least one RDMA-write-to buffer has been released */
/* calling drain_scq from progress_send derpives of chance
* for ib_poll to drain sendq using ncqe
@@ -1455,45 +1442,43 @@ int MPID_nem_ib_PktHandler_EagerSend(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
goto fn_exit;
}
-#if 0 /* modification of mpid_nem_lmt.c is required */
+#if 0 /* modification of mpid_nem_lmt.c is required */
/* Temporary fix because it's static */
-int pkt_RTS_handler(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt, MPIDI_msg_sz_t *buflen, MPID_Request **rreqp);
-
+int pkt_RTS_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
+ MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp);
/* packet handler for wrapper packet of MPIDI_NEM_PKT_LMT_RTS */
/* see pkt_RTS_handler (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c) */
#undef FUNCNAME
#define FUNCNAME MPID_nem_ib_pkt_RTS_handler
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_pkt_RTS_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen /* out */ ,
- MPID_Request ** rreqp /* out */)
+int MPID_nem_ib_pkt_RTS_handler(MPIDI_VC_t * vc,
+ MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * buflen /* out */ ,
+ MPID_Request ** rreqp /* out */)
{
MPID_nem_ib_pkt_prefix_t *netmod_pkt = (MPID_nem_ib_pkt_prefix_t *) pkt;
MPIDI_CH3_Pkt_t *ch3_pkt =
(MPIDI_CH3_Pkt_t *) ((uint8_t *) pkt + sizeof(MPID_nem_ib_pkt_prefix_t));
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
-
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKT_RTS_HANDLER);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKT_RTS_HANDLER);
-
/* Update occupation status of local SR (send request) queue */
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- dprintf("MPID_nem_ib_pkt_RTS_handler,lsr_seq_num_tail=%d,netmod_pkt->seq_num_tail=%d\n",
- vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
+ dprintf
+ ("MPID_nem_ib_pkt_RTS_handler,lsr_seq_num_tail=%d,netmod_pkt->seq_num_tail=%d\n",
+ vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
vc_ib->ibcom->lsr_seq_num_tail = netmod_pkt->seq_num_tail;
- dprintf("MPID_nem_ib_pkt_RTS_handler,lsr_seq_num_tail updated to %d\n",
- vc_ib->ibcom->lsr_seq_num_tail);
-
+ dprintf
+ ("MPID_nem_ib_pkt_RTS_handler,lsr_seq_num_tail updated to %d\n",
+ vc_ib->ibcom->lsr_seq_num_tail);
#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
/* change remote notification policy of RDMA-write-to buf */
dprintf("pkthandler,rts,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, &vc_ib->ibcom->lsr_seq_num_tail);
dprintf("pkthandler,rts,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
#endif
-
dprintf("pkthandler,rts,sendq_empty=%d,ncom=%d,rdmabuf_occ=%d\n",
MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom,
MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
@@ -1501,7 +1486,6 @@ int MPID_nem_ib_pkt_RTS_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
dprintf("pkthandler,eagersend,send_progress\n");
fflush(stdout);
MPID_NEM_IB_CHECK_AND_SEND_PROGRESS;
-
/* fall back to the original handler */
/* we don't need to worry about the difference caused by embedding seq_num
* because size of MPI-header of MPIDI_CH3_PKT_EAGER_SEND equals to sizeof(MPIDI_CH3_Pkt_t)
@@ -1543,18 +1527,16 @@ int MPID_nem_ib_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
MPIDI_msg_sz_t data_len;
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
-
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_PUT);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_PUT);
-
/* Update occupation status of local SR (send request) queue */
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- dprintf("MPID_nem_ib_Pkthandler_Put,lsr_seq_num_tail=%d,put_pkt->seq_num_tail=%d\n",
- vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
+ dprintf
+ ("MPID_nem_ib_Pkthandler_Put,lsr_seq_num_tail=%d,put_pkt->seq_num_tail=%d\n",
+ vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
vc_ib->ibcom->lsr_seq_num_tail = netmod_pkt->seq_num_tail;
dprintf("MPID_nem_ib_Pkthandler_Put,lsr_seq_num_tail updated to %d\n",
vc_ib->ibcom->lsr_seq_num_tail);
-
#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
/* change remote notification policy of RDMA-write-to buf */
dprintf("pkthandler,put,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
@@ -1568,7 +1550,6 @@ int MPID_nem_ib_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
dprintf("pkthandler,put,send_progress\n");
fflush(stdout);
MPID_NEM_IB_CHECK_AND_SEND_PROGRESS;
-
/* fall back to the original handler */
/* we don't need to worry about the difference caused by embedding seq_num
* because size of MPI-header of MPIDI_CH3_PKT_PUT equals to sizeof(MPIDI_CH3_Pkt_t)
@@ -1595,8 +1576,8 @@ int MPID_nem_ib_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
#define FUNCNAME MPID_nem_ib_PktHandler_Accumulate
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen /* out */ ,
+int MPID_nem_ib_PktHandler_Accumulate(MPIDI_VC_t * vc,
+ MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * buflen /* out */ ,
MPID_Request ** rreqp /* out */)
{
MPID_nem_ib_pkt_prefix_t *netmod_pkt = (MPID_nem_ib_pkt_prefix_t *) pkt;
@@ -1609,18 +1590,17 @@ int MPID_nem_ib_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
MPIDI_msg_sz_t data_len;
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
-
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_ACCUMULATE);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_ACCUMULATE);
-
/* Update occupation status of local SR (send request) queue */
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- dprintf("MPID_nem_ib_Pkthandler_Accumulate,lsr_seq_num_tail=%d,accum_pkt->seq_num_tail=%d\n",
- vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
+ dprintf
+ ("MPID_nem_ib_Pkthandler_Accumulate,lsr_seq_num_tail=%d,accum_pkt->seq_num_tail=%d\n",
+ vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
vc_ib->ibcom->lsr_seq_num_tail = netmod_pkt->seq_num_tail;
- dprintf("MPID_nem_ib_Pkthandler_Accumulate,lsr_seq_num_tail updated to %d\n",
- vc_ib->ibcom->lsr_seq_num_tail);
-
+ dprintf
+ ("MPID_nem_ib_Pkthandler_Accumulate,lsr_seq_num_tail updated to %d\n",
+ vc_ib->ibcom->lsr_seq_num_tail);
#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
/* change remote notification policy of RDMA-write-to buf */
dprintf("pkthandler,put,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
@@ -1674,18 +1654,16 @@ int MPID_nem_ib_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
MPIDI_msg_sz_t data_len;
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
-
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_GET);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_GET);
-
/* Update occupation status of local SR (send request) queue */
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- dprintf("MPID_nem_ib_Pkthandler_Get,lsr_seq_num_tail=%d,get_pkt->seq_num_tail=%d\n",
- vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
+ dprintf
+ ("MPID_nem_ib_Pkthandler_Get,lsr_seq_num_tail=%d,get_pkt->seq_num_tail=%d\n",
+ vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
vc_ib->ibcom->lsr_seq_num_tail = netmod_pkt->seq_num_tail;
dprintf("MPID_nem_ib_Pkthandler_Get,lsr_seq_num_tail updated to %d\n",
vc_ib->ibcom->lsr_seq_num_tail);
-
#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
/* change remote notification policy of RDMA-write-to buf */
dprintf("pkthandler,put,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
@@ -1724,8 +1702,8 @@ int MPID_nem_ib_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
#define FUNCNAME MPID_nem_ib_PktHandler_GetResp
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_PktHandler_GetResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen /* out */ ,
+int MPID_nem_ib_PktHandler_GetResp(MPIDI_VC_t * vc,
+ MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * buflen /* out */ ,
MPID_Request ** rreqp /* out */)
{
MPID_nem_ib_pkt_prefix_t *netmod_pkt = (MPID_nem_ib_pkt_prefix_t *) pkt;
@@ -1738,18 +1716,17 @@ int MPID_nem_ib_PktHandler_GetResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
MPIDI_msg_sz_t data_len;
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
-
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_GETRESP);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_GETRESP);
-
/* Update occupation status of local SR (send request) queue */
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- dprintf("MPID_nem_ib_Pkthandler_GetResp,lsr_seq_num_tail=%d,get_pkt->seq_num_tail=%d\n",
- vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
+ dprintf
+ ("MPID_nem_ib_Pkthandler_GetResp,lsr_seq_num_tail=%d,get_pkt->seq_num_tail=%d\n",
+ vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
vc_ib->ibcom->lsr_seq_num_tail = netmod_pkt->seq_num_tail;
- dprintf("MPID_nem_ib_Pkthandler_GetResp,lsr_seq_num_tail updated to %d\n",
- vc_ib->ibcom->lsr_seq_num_tail);
-
+ dprintf
+ ("MPID_nem_ib_Pkthandler_GetResp,lsr_seq_num_tail updated to %d\n",
+ vc_ib->ibcom->lsr_seq_num_tail);
#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
/* change remote notification policy of RDMA-write-to buf */
dprintf("pkthandler,put,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
@@ -1787,7 +1764,8 @@ int MPID_nem_ib_PktHandler_GetResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
#define FUNCNAME MPID_nem_ib_pkt_GET_DONE_handler
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
+int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc,
+ MPIDI_CH3_Pkt_t * pkt,
MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
{
int mpi_errno = MPI_SUCCESS;
@@ -1797,14 +1775,10 @@ int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKT_GET_DONE_HANDLER);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKT_GET_DONE_HANDLER);
-
dprintf("get_done_handler,enter\n");
-
*buflen = sizeof(MPIDI_CH3_Pkt_t);
MPID_Request_get_ptr(done_pkt->req_id, req);
-
MPIU_THREAD_CS_ENTER(LMT,);
-
switch (MPIDI_Request_get_type(req)) {
/* MPIDI_Request_set_type is not performed when
* MPID_Isend --> FDU_or_AEP --> recv_posted --> ib_poll --> PUTCTS packet-handler */
@@ -1815,13 +1789,13 @@ int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
case MPIDI_REQUEST_TYPE_RSEND:
case MPIDI_REQUEST_TYPE_SSEND:
case MPIDI_REQUEST_TYPE_BSEND:
-#if 0 /* obsolete, it's in netmod header now */
+#if 0 /* obsolete, it's in netmod header now */
/* extract embeded RDMA-write-to buffer occupancy information */
- dprintf("get_done_handler,old lsr_seq_num_tail=%d,done_pkt->seq_num_tail=%d\n",
- vc_ib->ibcom->lsr_seq_num_tail, done_pkt->seq_num_tail);
+ dprintf
+ ("get_done_handler,old lsr_seq_num_tail=%d,done_pkt->seq_num_tail=%d\n",
+ vc_ib->ibcom->lsr_seq_num_tail, done_pkt->seq_num_tail);
vc_ib->ibcom->lsr_seq_num_tail = done_pkt->seq_num_tail;
//dprintf("lmt_start_recv,new lsr_seq_num=%d\n", vc_ib->ibcom->lsr_seq_num_tail);
-
#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
/* change remote notification policy of RDMA-write-to buf */
//dprintf("lmt_start_recv,reply_seq_num,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
@@ -1830,7 +1804,6 @@ int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
//dprintf("lmt_start_recv,reply_seq_num,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
#endif
//dprintf("lmt_start_recv,reply_seq_num,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
-
#endif
/* try to send from sendq because at least one RDMA-write-to buffer has been released */
//dprintf("lmt_start_recv,reply_seq_num,send_progress\n");
@@ -1855,7 +1828,6 @@ int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
}
*rreqp = NULL;
-
fn_exit:
MPIU_THREAD_CS_EXIT(LMT,);
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_PKT_GET_DONE_HANDLER);
@@ -1868,7 +1840,8 @@ int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
#define FUNCNAME MPID_nem_ib_PktHandler_req_seq_num
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_PktHandler_req_seq_num(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
+int MPID_nem_ib_PktHandler_req_seq_num(MPIDI_VC_t * vc,
+ MPIDI_CH3_Pkt_t * pkt,
MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
{
int mpi_errno = MPI_SUCCESS;
@@ -1877,30 +1850,25 @@ int MPID_nem_ib_PktHandler_req_seq_num(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
MPID_Request *req;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REQ_SEQ_NUM);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REQ_SEQ_NUM);
-
/* mark as all of the message is read */
*buflen = sizeof(MPIDI_CH3_Pkt_t);
-
/* mark as I don't need continuation read request */
*rreqp = NULL;
-
/* update occupancy info of SR */
/* request piggy-backs seq_num although it's requesting responder's seq_num */
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
vc_ib->ibcom->lsr_seq_num_tail = req_pkt->seq_num_tail;
-
- dprintf("PktHandler_req_seq_num,sendq=%d,ncom=%d,ncqe=%d,diff=%d(%d-%d)\n",
- MPID_nem_ib_sendq_empty(vc_ib->sendq),
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) < vc_ib->ibcom->local_ringbuf_nslot,
- vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail);
-
+ dprintf
+ ("PktHandler_req_seq_num,sendq=%d,ncom=%d,ncqe=%d,diff=%d(%d-%d)\n",
+ MPID_nem_ib_sendq_empty(vc_ib->sendq),
+ vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
+ MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
+ vc_ib->ibcom->lsr_seq_num_tail) <
+ vc_ib->ibcom->local_ringbuf_nslot, vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail);
/* send reply */
dprintf("PktHandler_req_seq_num,sending reply_seq_num,id=%d\n", vc_ib->ibcom->sseq_num);
MPID_nem_ib_send_reply_seq_num(vc);
-
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REQ_SEQ_NUM);
return mpi_errno;
@@ -1912,7 +1880,8 @@ int MPID_nem_ib_PktHandler_req_seq_num(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
#define FUNCNAME MPID_nem_ib_PktHandler_reply_seq_num
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_PktHandler_reply_seq_num(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
+int MPID_nem_ib_PktHandler_reply_seq_num(MPIDI_VC_t * vc,
+ MPIDI_CH3_Pkt_t * pkt,
MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
{
int mpi_errno = MPI_SUCCESS;
@@ -1921,34 +1890,29 @@ int MPID_nem_ib_PktHandler_reply_seq_num(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
MPID_Request *req;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REPLY_SEQ_NUM);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REPLY_SEQ_NUM);
-
-
/* mark as all of the message is consumed */
*buflen = sizeof(MPIDI_CH3_Pkt_t);
-
/* mark as I don't need continuation read request */
*rreqp = NULL;
-
/* update occupancy info of RDMA-write-buf */
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- dprintf("pkthandler,reply_seq_num,old lsr_seq_num=%d,reply_pkt->seq_num_tail=%d\n",
- vc_ib->ibcom->lsr_seq_num_tail, reply_pkt->seq_num_tail);
+ dprintf
+ ("pkthandler,reply_seq_num,old lsr_seq_num=%d,reply_pkt->seq_num_tail=%d\n",
+ vc_ib->ibcom->lsr_seq_num_tail, reply_pkt->seq_num_tail);
vc_ib->ibcom->lsr_seq_num_tail = reply_pkt->seq_num_tail;
//dprintf("pkthandler,reply_seq_num,new lsr_seq_num=%d\n", vc_ib->ibcom->lsr_seq_num_tail);
-
#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
/* change remote notification policy of RDMA-write-to buf */
//dprintf("pkthandler,reply_seq_num,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, &(vc_ib->ibcom->lsr_seq_num_tail));
//dprintf("pkthandler,reply_seq_num,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
#endif
-
//dprintf("pkthandler,reply_seq_num,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
/* try to send from sendq because at least one RDMA-write-to buffer has been released */
//dprintf("pkthandler,reply_seq_num,send_progress\n");
dprintf("pkthandler,reply_seq_num,send_progress\n");
- MPID_NEM_IB_CHECK_AND_SEND_PROGRESS fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REPLY_SEQ_NUM);
+ MPID_NEM_IB_CHECK_AND_SEND_PROGRESS fn_exit:MPIDI_FUNC_EXIT
+ (MPID_STATE_MPID_NEM_IB_PKTHANDLER_REPLY_SEQ_NUM);
return mpi_errno;
//fn_fail:
goto fn_exit;
@@ -1958,11 +1922,8 @@ int MPID_nem_ib_PktHandler_reply_seq_num(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
#define FUNCNAME MPID_nem_ib_PktHandler_change_rdmabuf_occupancy_notify_state
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_PktHandler_change_rdmabuf_occupancy_notify_state(MPIDI_VC_t * vc,
- MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen,
- MPID_Request ** rreqp)
-{
+int MPID_nem_ib_PktHandler_change_rdmabuf_occupancy_notify_state
+ (MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp) {
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
MPID_nem_ib_pkt_change_rdmabuf_occupancy_notify_state_t *const reply_pkt =
@@ -1970,14 +1931,10 @@ int MPID_nem_ib_PktHandler_change_rdmabuf_occupancy_notify_state(MPIDI_VC_t * vc
MPID_Request *req;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_CHANGE_RDMABUF_OCCUPANCY_NOTIFY_STATE);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_CHANGE_RDMABUF_OCCUPANCY_NOTIFY_STATE);
-
/* mark as all of the message is read */
*buflen = sizeof(MPIDI_CH3_Pkt_t);
-
/* mark as I don't need continuation read request */
*rreqp = NULL;
-
-
/* update occupancy info of SR */
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
dprintf("pkthandler,change notify state,old lstate=%d,pkt->state=%d\n",
@@ -1991,7 +1948,6 @@ int MPID_nem_ib_PktHandler_change_rdmabuf_occupancy_notify_state(MPIDI_VC_t * vc
*rdmabuf_occupancy_notify_lstate = reply_pkt->state;
dprintf("pkthandler,change notify state,new lstate=%d\n",
vc_ib->ibcom->rdmabuf_occupancy_notify_lstate);
-
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_PKTHANDLER_CHANGE_RDMABUF_OCCUPANCY_NOTIFY_STATE);
return mpi_errno;
@@ -2011,33 +1967,27 @@ int MPID_nem_ib_cm_drain_scq()
int result;
int i;
struct ibv_wc cqe[MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN];
- MPID_nem_ib_cm_cmd_shadow_t* shadow_cm;
- MPID_nem_ib_ringbuf_cmd_shadow_t * shadow_ringbuf;
-
+ MPID_nem_ib_cm_cmd_shadow_t *shadow_cm;
+ MPID_nem_ib_ringbuf_cmd_shadow_t *shadow_ringbuf;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_DRAIN_SCQ);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_DRAIN_SCQ);
-
//dprintf("cm_drain_scq,enter\n");
-
-
/* cm_drain_scq is called after poll_eager calls vc_terminate */
- if(!MPID_nem_ib_rc_shared_scq_scratch_pad) {
+ if (!MPID_nem_ib_rc_shared_scq_scratch_pad) {
dprintf("cm_drain_scq,CQ is null\n");
goto fn_exit;
}
result =
- ibv_poll_cq(MPID_nem_ib_rc_shared_scq_scratch_pad, MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN,
- &cqe[0]);
+ ibv_poll_cq(MPID_nem_ib_rc_shared_scq_scratch_pad,
+ MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN, &cqe[0]);
MPIU_ERR_CHKANDJUMP(result < 0, mpi_errno, MPI_ERR_OTHER, "**netmod,ib,ibv_poll_cq");
-
if (result > 0) {
dprintf("cm_drain_scq,found,result=%d\n", result);
}
for (i = 0; i < result; i++) {
dprintf("cm_drain_scq,wr_id=%p\n", (void *) cqe[i].wr_id);
-
#ifdef HAVE_LIBDCFA
if (cqe[i].status != IBV_WC_SUCCESS) {
dprintf("cm_drain_scq,status=%08x\n", cqe[i].status);
@@ -2050,135 +2000,144 @@ int MPID_nem_ib_cm_drain_scq()
MPID_nem_ib_segv;
}
#endif
- MPIU_ERR_CHKANDJUMP(cqe[i].status != IBV_WC_SUCCESS, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_drain_scq");
-
- MPID_nem_ib_cm_ringbuf_cmd_type_t * type = (MPID_nem_ib_cm_ringbuf_cmd_type_t *) cqe[i].wr_id;
- switch(*type) {
- case MPID_NEM_IB_CM_CAS: {
- shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
-
- dprintf("cm_drain_scq,cm_cas,req=%p,responder_rank=%d\n",
- shadow_cm->req, shadow_cm->req->responder_rank);
-
- /* Check if CAS have succeeded */
- uint64_t* cas_retval = (uint64_t *) shadow_cm->buf_from;
- if(*cas_retval == MPID_NEM_IB_CM_RELEASED) {
- /* CAS succeeded, so write command */
-
- dprintf("cm_drain_scq,cm_cas,succeeded\n");
-
- if (is_conn_established(shadow_cm->req->responder_rank)) {
- /* Connection is already established.
- * In this case, responder may already have performed vc_terminate.
- * However, since initiator has to release responder's CAS word,
- * initiator sends CM_CAS_RELEASE. */
-
- shadow_cm->req->state = MPID_NEM_IB_CM_CAS_RELEASE;
- if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
- shadow_cm->req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY) {
-
- MPID_nem_ib_cm_cmd_syn_t *cmd = (MPID_nem_ib_cm_cmd_syn_t *) shadow_cm->req->ibcom->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
- MPID_NEM_IB_CM_COMPOSE_CAS_RELEASE(cmd, shadow_cm->req);
- cmd->initiator_rank = MPID_nem_ib_myrank;
-
- MPID_nem_ib_cm_cmd_shadow_t * shadow_syn =
- (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
- shadow_syn->type = shadow_cm->req->state;
- shadow_syn->req = shadow_cm->req;
- dprintf("shadow_syn=%p,shadow_syn->req=%p\n", shadow_syn, shadow_syn->req);
- mpi_errno =
- MPID_nem_ib_cm_cmd_core(shadow_cm->req->responder_rank, shadow_syn,
- (void *) cmd, sizeof(MPID_nem_ib_cm_cmd_syn_t),
- 1 /* syn:1 */, 0);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_send_core");
- } else {
- MPID_NEM_IB_CM_COMPOSE_CAS_RELEASE((MPID_nem_ib_cm_cmd_syn_t *)&(shadow_cm->req->cmd), shadow_cm->req);
- MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, shadow_cm->req);
+ MPIU_ERR_CHKANDJUMP(cqe[i].status != IBV_WC_SUCCESS, mpi_errno,
+ MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_scq");
+ MPID_nem_ib_cm_ringbuf_cmd_type_t *type =
+ (MPID_nem_ib_cm_ringbuf_cmd_type_t *) cqe[i].wr_id;
+ switch (*type) {
+ case MPID_NEM_IB_CM_CAS:{
+ shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
+ dprintf("cm_drain_scq,cm_cas,req=%p,responder_rank=%d\n",
+ shadow_cm->req, shadow_cm->req->responder_rank);
+ /* Check if CAS have succeeded */
+ uint64_t *cas_retval = (uint64_t *) shadow_cm->buf_from;
+ if (*cas_retval == MPID_NEM_IB_CM_RELEASED) {
+ /* CAS succeeded, so write command */
+
+ dprintf("cm_drain_scq,cm_cas,succeeded\n");
+ if (is_conn_established(shadow_cm->req->responder_rank)) {
+ /* Connection is already established.
+ * In this case, responder may already have performed vc_terminate.
+ * However, since initiator has to release responder's CAS word,
+ * initiator sends CM_CAS_RELEASE. */
+
+ shadow_cm->req->state = MPID_NEM_IB_CM_CAS_RELEASE;
+ if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
+ shadow_cm->req->ibcom->ncom_scratch_pad <
+ MPID_NEM_IB_COM_MAX_SQ_CAPACITY) {
+
+ MPID_nem_ib_cm_cmd_syn_t *cmd =
+ (MPID_nem_ib_cm_cmd_syn_t *) shadow_cm->req->ibcom->
+ icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
+ MPID_NEM_IB_CM_COMPOSE_CAS_RELEASE(cmd, shadow_cm->req);
+ cmd->initiator_rank = MPID_nem_ib_myrank;
+ MPID_nem_ib_cm_cmd_shadow_t *shadow_syn =
+ (MPID_nem_ib_cm_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow_syn->type = shadow_cm->req->state;
+ shadow_syn->req = shadow_cm->req;
+ dprintf("shadow_syn=%p,shadow_syn->req=%p\n", shadow_syn,
+ shadow_syn->req);
+ mpi_errno =
+ MPID_nem_ib_cm_cmd_core(shadow_cm->req->responder_rank, shadow_syn,
+ (void *) cmd,
+ sizeof(MPID_nem_ib_cm_cmd_syn_t),
+ 1 /* syn:1 */ , 0);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
+ }
+ else {
+ MPID_NEM_IB_CM_COMPOSE_CAS_RELEASE((MPID_nem_ib_cm_cmd_syn_t *) &
+ (shadow_cm->req->cmd),
+ shadow_cm->req);
+ MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, shadow_cm->req);
+ }
+ }
+ else {
+ /* Increment receiving transaction counter. Initiator receives SYNACK and ACK2 */
+ shadow_cm->req->ibcom->incoming_connection_tx += 2;
+ shadow_cm->req->state = MPID_NEM_IB_CM_SYN;
+ if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
+ shadow_cm->req->ibcom->ncom_scratch_pad <
+ MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
+ MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
+ MPID_nem_ib_cm_ringbuf_tail) < MPID_NEM_IB_CM_NSEG) {
+
+ MPID_nem_ib_cm_cmd_syn_t *cmd =
+ (MPID_nem_ib_cm_cmd_syn_t *) shadow_cm->req->ibcom->
+ icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
+ MPID_NEM_IB_CM_COMPOSE_SYN(cmd, shadow_cm->req);
+ cmd->responder_ringbuf_index =
+ shadow_cm->req->responder_ringbuf_index =
+ MPID_nem_ib_cm_ringbuf_head;
+ dprintf("cm_drain_scq,giving ringbuf_index=%d\n",
+ cmd->responder_ringbuf_index);
+ MPID_nem_ib_cm_ringbuf_head++;
+ cmd->initiator_rank = MPID_nem_ib_myrank;
+ MPID_nem_ib_cm_cmd_shadow_t *shadow_syn =
+ (MPID_nem_ib_cm_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow_syn->type = shadow_cm->req->state;
+ shadow_syn->req = shadow_cm->req;
+ dprintf("shadow_syn=%p,shadow_syn->req=%p\n", shadow_syn,
+ shadow_syn->req);
+ mpi_errno =
+ MPID_nem_ib_cm_cmd_core(shadow_cm->req->responder_rank, shadow_syn,
+ (void *) cmd,
+ sizeof(MPID_nem_ib_cm_cmd_syn_t),
+ 1 /* syn:1 */ , 0);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
+ }
+ else {
+ MPID_NEM_IB_CM_COMPOSE_SYN((MPID_nem_ib_cm_cmd_syn_t *) &
+ (shadow_cm->req->cmd), shadow_cm->req);
+ MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, shadow_cm->req);
+ }
}
}
else {
- /* Increment receiving transaction counter. Initiator receives SYNACK and ACK2 */
- shadow_cm->req->ibcom->incoming_connection_tx += 2;
+ if (is_conn_established(shadow_cm->req->responder_rank)) {
+ /* CAS is failed, and connection is already established */
+
+ dprintf("cm_drain_scq,cm_cas,connection is already established\n");
+ MPID_nem_ib_ncqe_scratch_pad_to_drain -= 1;
+ shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
+ shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
+ MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
+ /* Let the guard down to let the following connection request go. */
+ VC_FIELD(MPID_nem_ib_conns[shadow_cm->req->responder_rank].vc,
+ connection_guard) = 0;
+ /* free memory : req->ref_count is 2, so call MPIU_Free() directly */
+ //MPID_nem_ib_cm_request_release(shadow_cm->req);
+ MPIU_Free(shadow_cm->req);
+ MPIU_Free(shadow_cm);
+ break;
+ }
- shadow_cm->req->state = MPID_NEM_IB_CM_SYN;
- if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
- shadow_cm->req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
- MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
- MPID_nem_ib_cm_ringbuf_tail) < MPID_NEM_IB_CM_NSEG) {
-
- MPID_nem_ib_cm_cmd_syn_t *cmd = (MPID_nem_ib_cm_cmd_syn_t *) shadow_cm->req->ibcom->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
- MPID_NEM_IB_CM_COMPOSE_SYN(cmd, shadow_cm->req);
- cmd->responder_ringbuf_index = shadow_cm->req->responder_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
- dprintf("cm_drain_scq,giving ringbuf_index=%d\n", cmd->responder_ringbuf_index);
- MPID_nem_ib_cm_ringbuf_head++;
- cmd->initiator_rank = MPID_nem_ib_myrank;
-
- MPID_nem_ib_cm_cmd_shadow_t * shadow_syn =
- (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
- shadow_syn->type = shadow_cm->req->state;
- shadow_syn->req = shadow_cm->req;
- dprintf("shadow_syn=%p,shadow_syn->req=%p\n", shadow_syn, shadow_syn->req);
- mpi_errno =
- MPID_nem_ib_cm_cmd_core(shadow_cm->req->responder_rank, shadow_syn,
- (void *) cmd, sizeof(MPID_nem_ib_cm_cmd_syn_t),
- 1 /* syn:1 */, 0);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_send_core");
- } else {
- MPID_NEM_IB_CM_COMPOSE_SYN((MPID_nem_ib_cm_cmd_syn_t *)&(shadow_cm->req->cmd), shadow_cm->req);
+ dprintf("cm_drain_scq,cm_cas,retval=%016lx,backoff=%ld\n", *cas_retval,
+ shadow_cm->req->retry_backoff);
+ shadow_cm->req->retry_backoff =
+ shadow_cm->req->retry_backoff ? (shadow_cm->req->retry_backoff << 1) : 1;
+ shadow_cm->req->retry_decided = MPID_nem_ib_progress_engine_vt; /* Schedule retry */
MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, shadow_cm->req);
+ dprintf("cm_drain_scq,cm_cas,failed,decided=%ld,backoff=%ld\n",
+ shadow_cm->req->retry_decided, shadow_cm->req->retry_backoff);
}
- }
- } else {
- if (is_conn_established(shadow_cm->req->responder_rank)) {
- /* CAS is failed, and connection is already established */
-
- dprintf("cm_drain_scq,cm_cas,connection is already established\n");
-
- MPID_nem_ib_ncqe_scratch_pad_to_drain -= 1;
- shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
- shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
- MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-
- /* Let the guard down to let the following connection request go. */
- VC_FIELD(MPID_nem_ib_conns[shadow_cm->req->responder_rank].vc, connection_guard) = 0;
-
- /* free memory : req->ref_count is 2, so call MPIU_Free() directly */
- //MPID_nem_ib_cm_request_release(shadow_cm->req);
- MPIU_Free(shadow_cm->req);
-
- MPIU_Free(shadow_cm);
- break;
- }
-
- dprintf("cm_drain_scq,cm_cas,retval=%016lx,backoff=%ld\n",
- *cas_retval, shadow_cm->req->retry_backoff);
- shadow_cm->req->retry_backoff =
- shadow_cm->req->retry_backoff ?
- (shadow_cm->req->retry_backoff << 1) :
- 1;
- shadow_cm->req->retry_decided = MPID_nem_ib_progress_engine_vt; /* Schedule retry */
- MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, shadow_cm->req);
- dprintf("cm_drain_scq,cm_cas,failed,decided=%ld,backoff=%ld\n",
- shadow_cm->req->retry_decided, shadow_cm->req->retry_backoff);
+ MPID_nem_ib_ncqe_scratch_pad_to_drain -= 1;
+ shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
+ MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
+ MPIU_Free(shadow_cm);
+ break;
}
- MPID_nem_ib_ncqe_scratch_pad_to_drain -= 1;
- shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
- MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
- MPIU_Free(shadow_cm);
- break; }
case MPID_NEM_IB_CM_SYN:
dprintf("cm_drain_scq,syn sent\n");
shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
dprintf("cm_drain_scq,tx=%d\n", shadow_cm->req->ibcom->outstanding_connection_tx);
-
- dprintf("cm_drain_scq,syn,buf_from=%p,sz=%d\n",
- shadow_cm->buf_from, shadow_cm->buf_from_sz);
+ dprintf("cm_drain_scq,syn,buf_from=%p,sz=%d\n", shadow_cm->buf_from,
+ shadow_cm->buf_from_sz);
MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-
MPIU_Free(shadow_cm);
break;
case MPID_NEM_IB_CM_CAS_RELEASE:
@@ -2187,28 +2146,23 @@ int MPID_nem_ib_cm_drain_scq()
shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
dprintf("cm_drain_scq,tx=%d\n", shadow_cm->req->ibcom->outstanding_connection_tx);
-
- dprintf("cm_drain_scq,syn,buf_from=%p,sz=%d\n",
- shadow_cm->buf_from, shadow_cm->buf_from_sz);
+ dprintf("cm_drain_scq,syn,buf_from=%p,sz=%d\n", shadow_cm->buf_from,
+ shadow_cm->buf_from_sz);
MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-
/* free memory : req->ref_count is 2, so call MPIU_Free() directly */
//MPID_nem_ib_cm_request_release(shadow_cm->req);
MPIU_Free(shadow_cm->req);
-
MPIU_Free(shadow_cm);
break;
case MPID_NEM_IB_CM_SYNACK:
shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
- dprintf("cm_drain_scq,synack sent,req=%p,initiator_rank=%d\n",
- shadow_cm->req, shadow_cm->req->initiator_rank);
+ dprintf("cm_drain_scq,synack sent,req=%p,initiator_rank=%d\n", shadow_cm->req,
+ shadow_cm->req->initiator_rank);
shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
dprintf("cm_drain_scq,tx=%d\n", shadow_cm->req->ibcom->outstanding_connection_tx);
-
- dprintf("cm_drain_scq,synack,buf_from=%p,sz=%d\n",
- shadow_cm->buf_from, shadow_cm->buf_from_sz);
+ dprintf("cm_drain_scq,synack,buf_from=%p,sz=%d\n", shadow_cm->buf_from,
+ shadow_cm->buf_from_sz);
MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-
MPIU_Free(shadow_cm);
break;
case MPID_NEM_IB_CM_ACK1:
@@ -2217,33 +2171,29 @@ int MPID_nem_ib_cm_drain_scq()
shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
dprintf("cm_drain_scq,tx=%d\n", shadow_cm->req->ibcom->outstanding_connection_tx);
-
- dprintf("cm_drain_scq,ack1,buf_from=%p,sz=%d\n",
- shadow_cm->buf_from, shadow_cm->buf_from_sz);
+ dprintf("cm_drain_scq,ack1,buf_from=%p,sz=%d\n", shadow_cm->buf_from,
+ shadow_cm->buf_from_sz);
MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-
/* Finalize protocol because there is no referer in cm_drain_scq and sendq.
- Note that there might be one in cm_poll.*/
+ * Note that there might be one in cm_poll. */
MPID_nem_ib_cm_request_release(shadow_cm->req);
MPIU_Free(shadow_cm);
break;
case MPID_NEM_IB_CM_ACK2:
shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
dprintf("cm_drain_scq,ack2 sent,req=%p,initiator_rank=%p=%d\n",
- shadow_cm->req, &shadow_cm->req->initiator_rank, shadow_cm->req->initiator_rank);
+ shadow_cm->req, &shadow_cm->req->initiator_rank,
+ shadow_cm->req->initiator_rank);
shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
dprintf("cm_drain_scq,tx=%d\n", shadow_cm->req->ibcom->outstanding_connection_tx);
-
- dprintf("cm_drain_scq,ack2,buf_from=%p,sz=%d\n",
- shadow_cm->buf_from, shadow_cm->buf_from_sz);
+ dprintf("cm_drain_scq,ack2,buf_from=%p,sz=%d\n", shadow_cm->buf_from,
+ shadow_cm->buf_from_sz);
MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-
/* Let the guard down to let the following connection request go. */
VC_FIELD(MPID_nem_ib_conns[shadow_cm->req->initiator_rank].vc, connection_guard) = 0;
-
/* Finalize protocol because there is no referer in cm_drain_scq, sendq
- and cm_poll because cm_poll sent ACK2. */
+ * and cm_poll because cm_poll sent ACK2. */
MPID_nem_ib_cm_request_release(shadow_cm->req);
MPIU_Free(shadow_cm);
break;
@@ -2251,34 +2201,30 @@ int MPID_nem_ib_cm_drain_scq()
case MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING:
/* These cases mean the end of CM-op, so we do the almost same operation as ack2 */
shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
- dprintf("cm_drain_scq,established or connecting sent,req=%p,initiator_rank=%p=%d\n",
- shadow_cm->req, &shadow_cm->req->initiator_rank, shadow_cm->req->initiator_rank);
+ dprintf
+ ("cm_drain_scq,established or connecting sent,req=%p,initiator_rank=%p=%d\n",
+ shadow_cm->req, &shadow_cm->req->initiator_rank, shadow_cm->req->initiator_rank);
shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
dprintf("cm_drain_scq,tx=%d\n", shadow_cm->req->ibcom->outstanding_connection_tx);
-
shadow_cm->req->ibcom->incoming_connection_tx -= 1;
-
MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-
/* Let the guard down to let the following connection request go. */
VC_FIELD(MPID_nem_ib_conns[shadow_cm->req->initiator_rank].vc, connection_guard) = 0;
-
/* Finalize protocol because there is no referer in cm_drain_scq, sendq
- and cm_poll because cm_poll sent ACK2. */
+ * and cm_poll because cm_poll sent ACK2. */
MPID_nem_ib_cm_request_release(shadow_cm->req);
MPIU_Free(shadow_cm);
break;
case MPID_NEM_IB_RINGBUF_ASK_FETCH:
shadow_ringbuf = (MPID_nem_ib_ringbuf_cmd_shadow_t *) cqe[i].wr_id;
memcpy(&shadow_ringbuf->req->fetched,
- shadow_ringbuf->buf_from,
- sizeof(MPID_nem_ib_ringbuf_headtail_t));
- dprintf("cm_drain_scq,ask_fetch sent,%d->%d,req=%p,fetched->head=%ld,tail=%d\n",
- MPID_nem_ib_myrank,
- shadow_ringbuf->req->vc->pg_rank,
- shadow_ringbuf->req, shadow_ringbuf->req->fetched.head,
- shadow_ringbuf->req->fetched.tail);
+ shadow_ringbuf->buf_from, sizeof(MPID_nem_ib_ringbuf_headtail_t));
+ dprintf
+ ("cm_drain_scq,ask_fetch sent,%d->%d,req=%p,fetched->head=%ld,tail=%d\n",
+ MPID_nem_ib_myrank, shadow_ringbuf->req->vc->pg_rank,
+ shadow_ringbuf->req, shadow_ringbuf->req->fetched.head,
+ shadow_ringbuf->req->fetched.tail);
/* Proceed to cas */
MPID_nem_ib_ringbuf_ask_cas(shadow_ringbuf->req->vc, shadow_ringbuf->req);
MPID_nem_ib_ncqe_scratch_pad_to_drain -= 1;
@@ -2286,94 +2232,101 @@ int MPID_nem_ib_cm_drain_scq()
MPID_nem_ib_rdmawr_from_free(shadow_ringbuf->buf_from, shadow_ringbuf->buf_from_sz);
MPIU_Free(shadow_ringbuf);
break;
- case MPID_NEM_IB_RINGBUF_ASK_CAS: {
- shadow_ringbuf = (MPID_nem_ib_ringbuf_cmd_shadow_t *) cqe[i].wr_id;
- /* Check if CAS have succeeded */
- MPID_nem_ib_ringbuf_headtail_t* cas_retval =
- (MPID_nem_ib_ringbuf_headtail_t *) shadow_ringbuf->buf_from;
- dprintf("cm_drain_scq,ask_cas sent,req=%p,fetched.head=%lx,retval=%lx\n",
- shadow_ringbuf->req, shadow_ringbuf->req->fetched.head, cas_retval->head);
- if(cas_retval->head == shadow_ringbuf->req->fetched.head) {
- /* CAS succeeded */
- dprintf("cm_drain_scq,ask_cas,cas succeeded,%d->%d,local_head=%d,local_tail=%d,nslot=%d\n",
- MPID_nem_ib_myrank,
- shadow_ringbuf->req->vc->pg_rank,
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num),
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail),
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot));
- if(MPID_nem_ib_diff16(VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num),
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail)) >=
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot)) {
- dprintf("cm_drain_scq,ask_cas,refill fast path\n");
- /* Refill now when we don't have any slots */
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num) =
- (uint16_t)shadow_ringbuf->req->fetched.head;
- /* Move tail pointer to indicate only one slot is available to us */
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail) =
- (uint16_t)
- (VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num) -
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot) + 1);
- dprintf("cm_drain_scq,ask_cas,after refill,local_head=%d,local_tail=%d,nslot=%d\n",
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num),
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail),
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot));
- } else {
- dprintf("cm_drain_scq,ask_cas,refill slow path\n");
- /* Enqueue slots to avoid overwriting the slots when we have some slots.
- This happens when two or more asks succeeded before
- the first queued send is issued. */
- MPID_nem_ib_ringbuf_sector_t * sector =
- (MPID_nem_ib_ringbuf_sector_t *) MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_sector_t));
- MPIU_ERR_CHKANDJUMP(!sector, mpi_errno, MPI_ERR_OTHER, "**malloc");
- sector->type = MPID_NEM_IB_RINGBUF_SHARED;
- sector->start = VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_start);
- sector->nslot = VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot);
- sector->head = (uint16_t)shadow_ringbuf->req->fetched.head;
- sector->tail = sector->head -
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot) + 1;
- MPID_nem_ib_ringbuf_sectorq_enqueue(&VC_FIELD(shadow_ringbuf->req->vc, ibcom->sectorq),
- sector);
+ case MPID_NEM_IB_RINGBUF_ASK_CAS:{
+ shadow_ringbuf = (MPID_nem_ib_ringbuf_cmd_shadow_t *) cqe[i].wr_id;
+ /* Check if CAS have succeeded */
+ MPID_nem_ib_ringbuf_headtail_t *cas_retval =
+ (MPID_nem_ib_ringbuf_headtail_t *) shadow_ringbuf->buf_from;
+ dprintf
+ ("cm_drain_scq,ask_cas sent,req=%p,fetched.head=%lx,retval=%lx\n",
+ shadow_ringbuf->req, shadow_ringbuf->req->fetched.head, cas_retval->head);
+ if (cas_retval->head == shadow_ringbuf->req->fetched.head) {
+ /* CAS succeeded */
+ dprintf
+ ("cm_drain_scq,ask_cas,cas succeeded,%d->%d,local_head=%d,local_tail=%d,nslot=%d\n",
+ MPID_nem_ib_myrank, shadow_ringbuf->req->vc->pg_rank,
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num),
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail),
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot));
+ if (MPID_nem_ib_diff16
+ (VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num),
+ VC_FIELD(shadow_ringbuf->req->vc,
+ ibcom->lsr_seq_num_tail)) >=
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot)) {
+ dprintf("cm_drain_scq,ask_cas,refill fast path\n");
+ /* Refill now when we don't have any slots */
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num) =
+ (uint16_t) shadow_ringbuf->req->fetched.head;
+ /* Move tail pointer to indicate only one slot is available to us */
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail) = (uint16_t)
+ (VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num) -
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot) + 1);
+ dprintf
+ ("cm_drain_scq,ask_cas,after refill,local_head=%d,local_tail=%d,nslot=%d\n",
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num),
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail),
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot));
+ }
+ else {
+ dprintf("cm_drain_scq,ask_cas,refill slow path\n");
+ /* Enqueue slots to avoid overwriting the slots when we have some slots.
+ * This happens when two or more asks succeeded before
+ * the first queued send is issued. */
+ MPID_nem_ib_ringbuf_sector_t *sector = (MPID_nem_ib_ringbuf_sector_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_sector_t));
+ MPIU_ERR_CHKANDJUMP(!sector, mpi_errno, MPI_ERR_OTHER, "**malloc");
+ sector->type = MPID_NEM_IB_RINGBUF_SHARED;
+ sector->start =
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_start);
+ sector->nslot =
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot);
+ sector->head = (uint16_t) shadow_ringbuf->req->fetched.head;
+ sector->tail =
+ sector->head - VC_FIELD(shadow_ringbuf->req->vc,
+ ibcom->local_ringbuf_nslot) + 1;
+ MPID_nem_ib_ringbuf_sectorq_enqueue(&VC_FIELD
+ (shadow_ringbuf->req->vc,
+ ibcom->sectorq), sector);
+ }
+ /* Let the guard down so that the following ask-fetch can be issued */
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->ask_guard) = 0;
+ /* Kick progress engine */
+ dprintf
+ ("cm_drain_scq,call send_progress for %d,ncom=%d,ncqe=%d,local_head=%d,local_tail=%d,nslot=%d\n",
+ shadow_ringbuf->req->vc->pg_rank, VC_FIELD(shadow_ringbuf->req->vc,
+ ibcom->ncom),
+ MPID_nem_ib_ncqe, VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num),
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail),
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot)
+);
+ MPID_nem_ib_send_progress(shadow_ringbuf->req->vc);
+ MPIU_Free(shadow_ringbuf->req);
}
- /* Let the guard down so that the following ask-fetch can be issued */
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->ask_guard) = 0;
-
- /* Kick progress engine */
- dprintf("cm_drain_scq,call send_progress for %d,ncom=%d,ncqe=%d,local_head=%d,local_tail=%d,nslot=%d\n",
- shadow_ringbuf->req->vc->pg_rank,
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->ncom),
- MPID_nem_ib_ncqe,
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num),
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail),
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot)
- );
- MPID_nem_ib_send_progress(shadow_ringbuf->req->vc);
-
- MPIU_Free(shadow_ringbuf->req);
- } else {
- /* CAS failed */
- printf("ask-cas,failed\n");MPID_nem_ib_segv;
- /* Let the guard down so that this ask-fetch can be issued in ringbuf_progress */
- VC_FIELD(shadow_ringbuf->req->vc, ibcom->ask_guard) = 0;
-
- /* Retry from fetch */
- shadow_ringbuf->req->state = MPID_NEM_IB_RINGBUF_ASK_FETCH;
-
- /* Schedule retry */
- dprintf("cm_drain_scq,retval=%08lx,backoff=%ld\n",
- cas_retval->head, shadow_ringbuf->req->retry_backoff);
- MPID_NEM_IB_RINGBUF_UPDATE_BACKOFF(shadow_ringbuf->req->retry_backoff);
- shadow_ringbuf->req->retry_decided = MPID_nem_ib_progress_engine_vt;
-
- /* Make the ask-fetch in order */
- MPID_nem_ib_ringbuf_sendq_enqueue_at_head(&MPID_nem_ib_ringbuf_sendq, shadow_ringbuf->req);
- dprintf("cm_drain_scq,ask_cas,cas failed,decided=%ld,backoff=%ld\n",
- shadow_ringbuf->req->retry_decided, shadow_ringbuf->req->retry_backoff);
+ else {
+ /* CAS failed */
+ printf("ask-cas,failed\n");
+ MPID_nem_ib_segv;
+ /* Let the guard down so that this ask-fetch can be issued in ringbuf_progress */
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->ask_guard) = 0;
+ /* Retry from fetch */
+ shadow_ringbuf->req->state = MPID_NEM_IB_RINGBUF_ASK_FETCH;
+ /* Schedule retry */
+ dprintf("cm_drain_scq,retval=%08lx,backoff=%ld\n",
+ cas_retval->head, shadow_ringbuf->req->retry_backoff);
+ MPID_NEM_IB_RINGBUF_UPDATE_BACKOFF(shadow_ringbuf->req->retry_backoff);
+ shadow_ringbuf->req->retry_decided = MPID_nem_ib_progress_engine_vt;
+ /* Make the ask-fetch in order */
+ MPID_nem_ib_ringbuf_sendq_enqueue_at_head(&MPID_nem_ib_ringbuf_sendq,
+ shadow_ringbuf->req);
+ dprintf("cm_drain_scq,ask_cas,cas failed,decided=%ld,backoff=%ld\n",
+ shadow_ringbuf->req->retry_decided, shadow_ringbuf->req->retry_backoff);
+ }
+ MPID_nem_ib_ncqe_scratch_pad_to_drain -= 1;
+ shadow_ringbuf->req->ibcom->ncom_scratch_pad -= 1;
+ MPID_nem_ib_rdmawr_from_free(shadow_ringbuf->buf_from, shadow_ringbuf->buf_from_sz);
+ MPIU_Free(shadow_ringbuf);
+ break;
}
- MPID_nem_ib_ncqe_scratch_pad_to_drain -= 1;
- shadow_ringbuf->req->ibcom->ncom_scratch_pad -= 1;
- MPID_nem_ib_rdmawr_from_free(shadow_ringbuf->buf_from, shadow_ringbuf->buf_from_sz);
- MPIU_Free(shadow_ringbuf);
- break; }
default:
printf("unknown type=%d\n", *type);
MPIU_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_scq");
@@ -2382,9 +2335,9 @@ int MPID_nem_ib_cm_drain_scq()
MPID_nem_ib_ncqe_scratch_pad -= 1;
}
/* The number of CQE is reduced or a slot of the ringbuf is released, so kick progress engine */
- if(result > 0) {
- MPID_nem_ib_cm_progress();
- MPID_nem_ib_ringbuf_progress();
+ if (result > 0) {
+ MPID_nem_ib_cm_progress();
+ MPID_nem_ib_ringbuf_progress();
}
fn_exit:
@@ -2404,167 +2357,178 @@ int MPID_nem_ib_cm_poll_syn()
int ibcom_errno;
int ib_port = 1;
int i;
-
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_POLL_SYN);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_POLL_SYN);
-
/* scratch pad is freed after receiving CLOSE */
- if(!MPID_nem_ib_scratch_pad) {
+ if (!MPID_nem_ib_scratch_pad) {
dprintf("cm_poll_syn,MPID_nem_ib_scratch_pad is zero\n");
goto fn_exit;
}
/* Memory layout is (CAS-word:SYN#0:SYN#1:...:SYN#N:CMD#0:CMD#1:...CMD#M) */
- void* slot = (MPID_nem_ib_scratch_pad + MPID_NEM_IB_CM_OFF_SYN +
+ void *slot = (MPID_nem_ib_scratch_pad + MPID_NEM_IB_CM_OFF_SYN +
sizeof(MPID_nem_ib_cm_cmd_t) * (0 % MPID_NEM_IB_CM_NSEG));
-
volatile uint8_t *head_flag = (uint8_t *) slot;
if (*head_flag == MPID_NEM_IB_CM_HEAD_FLAG_ZERO) {
goto fn_exit;
} /* Incoming message hasn't arrived */
-
+
switch (*head_flag) {
- case MPID_NEM_IB_CM_SYN: {
- int is_synack = 0;
- volatile MPID_nem_ib_cm_cmd_syn_t *syn_tail_flag =
- (MPID_nem_ib_cm_cmd_syn_t *) slot;
- while (syn_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
- /* __asm__ __volatile__("pause;":::"memory"); */
- }
-
- volatile uint64_t *cas_word = (uint64_t *) (MPID_nem_ib_scratch_pad);
- MPID_nem_ib_cm_cmd_syn_t *syn = (MPID_nem_ib_cm_cmd_syn_t *) slot;
-
- dprintf("cm_poll_syn,syn detected!,initiator_rank=%d,ringbuf_index=%d\n",
- syn->initiator_rank, syn->responder_ringbuf_index);
-
- MPID_nem_ib_cm_req_t* req = MPIU_Malloc(sizeof(MPID_nem_ib_cm_req_t));
- MPIU_ERR_CHKANDJUMP(!req, mpi_errno, MPI_ERR_OTHER, "**malloc");
- req->ref_count = 1; /* Released when draining SCQ of ACK2 */
- req->ringbuf_index = syn->responder_ringbuf_index;
- req->initiator_rank = syn->initiator_rank;
- req->responder_rank = MPID_nem_ib_myrank;
- ibcom_errno =
- MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[req->initiator_rank],
- &req->ibcom);
-
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
- if (is_conn_established(syn->initiator_rank)) {
- req->state = MPID_NEM_IB_CM_ALREADY_ESTABLISHED;
- }
- else if ((MPID_nem_ib_myrank > syn->initiator_rank) && (req->ibcom->outstanding_connection_tx == 1)) {
- req->state = MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING;
- }
- else {
- /* Skip QP createion on race condition */
- if(!(VC_FIELD(MPID_nem_ib_conns[syn->initiator_rank].vc, connection_state) &
- MPID_NEM_IB_CM_LOCAL_QP_RESET)) {
- ibcom_errno = MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC, &MPID_nem_ib_conns[syn->initiator_rank].fd);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
-
- /* store pointer to MPID_nem_ib_com */
- dprintf("cm_poll_syn,initiator fd=%d\n", MPID_nem_ib_conns[syn->initiator_rank].fd);
- ibcom_errno = MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[syn->initiator_rank].fd,
- &VC_FIELD(MPID_nem_ib_conns[syn->initiator_rank].vc, ibcom));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
-
- /* Allocate RDMA-write-to ring-buf for remote */
- mpi_errno = MPID_nem_ib_ringbuf_alloc(MPID_nem_ib_conns[syn->initiator_rank].vc);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_alloc");
-
- /* Record state transition for race condition detection */
- VC_FIELD(MPID_nem_ib_conns[syn->initiator_rank].vc, connection_state) |=
- MPID_NEM_IB_CM_LOCAL_QP_RESET;
+ case MPID_NEM_IB_CM_SYN:{
+ int is_synack = 0;
+ volatile MPID_nem_ib_cm_cmd_syn_t *syn_tail_flag = (MPID_nem_ib_cm_cmd_syn_t *) slot;
+ while (syn_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
+ /* __asm__ __volatile__("pause;":::"memory"); */
}
- req->state = MPID_NEM_IB_CM_SYNACK;
- is_synack = 1;
- }
-
- /* Increment transaction counter here because this path is executed only once */
- req->ibcom->outstanding_connection_tx += 1;
- dprintf("cm_poll_syn,tx=%d\n", req->ibcom->outstanding_connection_tx);
+ volatile uint64_t *cas_word = (uint64_t *) (MPID_nem_ib_scratch_pad);
+ MPID_nem_ib_cm_cmd_syn_t *syn = (MPID_nem_ib_cm_cmd_syn_t *) slot;
+ dprintf("cm_poll_syn,syn detected!,initiator_rank=%d,ringbuf_index=%d\n",
+ syn->initiator_rank, syn->responder_ringbuf_index);
+ MPID_nem_ib_cm_req_t *req = MPIU_Malloc(sizeof(MPID_nem_ib_cm_req_t));
+ MPIU_ERR_CHKANDJUMP(!req, mpi_errno, MPI_ERR_OTHER, "**malloc");
+ req->ref_count = 1; /* Released when draining SCQ of ACK2 */
+ req->ringbuf_index = syn->responder_ringbuf_index;
+ req->initiator_rank = syn->initiator_rank;
+ req->responder_rank = MPID_nem_ib_myrank;
+ ibcom_errno =
+ MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds
+ [req->initiator_rank], &req->ibcom);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_com_obtain_pointer");
+ if (is_conn_established(syn->initiator_rank)) {
+ req->state = MPID_NEM_IB_CM_ALREADY_ESTABLISHED;
+ }
+ else if ((MPID_nem_ib_myrank > syn->initiator_rank) &&
+ (req->ibcom->outstanding_connection_tx == 1)) {
+ req->state = MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING;
+ }
+ else {
+ /* Skip QP createion on race condition */
+ if (!
+ (VC_FIELD
+ (MPID_nem_ib_conns[syn->initiator_rank].vc,
+ connection_state) & MPID_NEM_IB_CM_LOCAL_QP_RESET)) {
+ ibcom_errno =
+ MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC,
+ &MPID_nem_ib_conns[syn->initiator_rank].fd);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_com_open");
+ /* store pointer to MPID_nem_ib_com */
+ dprintf("cm_poll_syn,initiator fd=%d\n",
+ MPID_nem_ib_conns[syn->initiator_rank].fd);
+ ibcom_errno =
+ MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[syn->initiator_rank].fd,
+ &VC_FIELD(MPID_nem_ib_conns
+ [syn->initiator_rank].vc, ibcom));
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_com_obtain_pointer");
+ /* Allocate RDMA-write-to ring-buf for remote */
+ mpi_errno =
+ MPID_nem_ib_ringbuf_alloc(MPID_nem_ib_conns[syn->initiator_rank].vc);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_ringbuf_alloc");
+ /* Record state transition for race condition detection */
+ VC_FIELD(MPID_nem_ib_conns[syn->initiator_rank].vc,
+ connection_state) |= MPID_NEM_IB_CM_LOCAL_QP_RESET;
+ }
- /* Increment receiving transaction counter.
- * In the case of SYNACK, Responder receives ack1
- * In the case of ALREADY_ESTABLISHED or RESPONDER_IS_CONNECTING,
- * decrement in cm_drain_scq.
- */
- req->ibcom->incoming_connection_tx += 1;
+ req->state = MPID_NEM_IB_CM_SYNACK;
+ is_synack = 1;
+ }
- if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
- req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
+ /* Increment transaction counter here because this path is executed only once */
+ req->ibcom->outstanding_connection_tx += 1;
+ dprintf("cm_poll_syn,tx=%d\n", req->ibcom->outstanding_connection_tx);
+ /* Increment receiving transaction counter.
+ * In the case of SYNACK, Responder receives ack1
+ * In the case of ALREADY_ESTABLISHED or RESPONDER_IS_CONNECTING,
+ * decrement in cm_drain_scq.
+ */
+ req->ibcom->incoming_connection_tx += 1;
+ if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
+ req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
MPID_nem_ib_cm_ringbuf_tail) < MPID_NEM_IB_CM_NSEG) {
-
- MPID_nem_ib_cm_cmd_synack_t *cmd = (MPID_nem_ib_cm_cmd_synack_t *) req->ibcom->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
- if (is_synack) {
- MPID_NEM_IB_CM_COMPOSE_SYNACK(cmd, req, syn->initiator_req);
- dprintf("cm_poll_syn,composing synack,responder_req=%p,cmd->rmem=%lx,rkey=%08x,ringbuf_nslot=%d,remote_vc=%lx\n",
- cmd->responder_req, cmd->rmem, cmd->rkey, cmd->ringbuf_nslot, cmd->remote_vc);
- cmd->initiator_ringbuf_index = req->initiator_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
- dprintf("cm_poll_syn,giving ringbuf_index=%d\n", cmd->initiator_ringbuf_index);
- MPID_nem_ib_cm_ringbuf_head++;
- }
- else {
- MPID_NEM_IB_CM_COMPOSE_END_CM(cmd, req, syn->initiator_req, req->state);
- }
- MPID_nem_ib_cm_cmd_shadow_t * shadow =
- (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
- shadow->type = req->state;
- shadow->req = req;
- dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
- mpi_errno = MPID_nem_ib_cm_cmd_core(req->initiator_rank, shadow, (void *) cmd, sizeof(MPID_nem_ib_cm_cmd_synack_t), 0, req->ringbuf_index);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_send_core");
-
- } else {
- dprintf("cm_poll_syn,enqueue,ncqe=%d,ncom=%d,head=%d,tail=%d\n", MPID_nem_ib_ncqe_scratch_pad, req->ibcom->ncom_scratch_pad, MPID_nem_ib_cm_ringbuf_head, MPID_nem_ib_cm_ringbuf_tail);
- if (is_synack) {
- MPID_NEM_IB_CM_COMPOSE_SYNACK((MPID_nem_ib_cm_cmd_synack_t *)&(req->cmd), req, syn->initiator_req);
+
+ MPID_nem_ib_cm_cmd_synack_t *cmd =
+ (MPID_nem_ib_cm_cmd_synack_t *) req->ibcom->
+ icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
+ if (is_synack) {
+ MPID_NEM_IB_CM_COMPOSE_SYNACK(cmd, req, syn->initiator_req);
+ dprintf
+ ("cm_poll_syn,composing synack,responder_req=%p,cmd->rmem=%lx,rkey=%08x,ringbuf_nslot=%d,remote_vc=%lx\n",
+ cmd->responder_req, cmd->rmem, cmd->rkey, cmd->ringbuf_nslot,
+ cmd->remote_vc);
+ cmd->initiator_ringbuf_index = req->initiator_ringbuf_index =
+ MPID_nem_ib_cm_ringbuf_head;
+ dprintf("cm_poll_syn,giving ringbuf_index=%d\n", cmd->initiator_ringbuf_index);
+ MPID_nem_ib_cm_ringbuf_head++;
+ }
+ else {
+ MPID_NEM_IB_CM_COMPOSE_END_CM(cmd, req, syn->initiator_req, req->state);
+ }
+ MPID_nem_ib_cm_cmd_shadow_t *shadow = (MPID_nem_ib_cm_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow->type = req->state;
+ shadow->req = req;
+ dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
+ mpi_errno =
+ MPID_nem_ib_cm_cmd_core(req->initiator_rank, shadow, (void *) cmd,
+ sizeof(MPID_nem_ib_cm_cmd_synack_t), 0,
+ req->ringbuf_index);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
}
else {
- MPID_NEM_IB_CM_COMPOSE_END_CM((MPID_nem_ib_cm_cmd_synack_t *)&(req->cmd), req, syn->initiator_req, req->state);
+ dprintf("cm_poll_syn,enqueue,ncqe=%d,ncom=%d,head=%d,tail=%d\n",
+ MPID_nem_ib_ncqe_scratch_pad, req->ibcom->ncom_scratch_pad,
+ MPID_nem_ib_cm_ringbuf_head, MPID_nem_ib_cm_ringbuf_tail);
+ if (is_synack) {
+ MPID_NEM_IB_CM_COMPOSE_SYNACK((MPID_nem_ib_cm_cmd_synack_t *) &
+ (req->cmd), req, syn->initiator_req);
+ }
+ else {
+ MPID_NEM_IB_CM_COMPOSE_END_CM((MPID_nem_ib_cm_cmd_synack_t *) &
+ (req->cmd), req, syn->initiator_req, req->state);
+ }
+ MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
}
- MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
+ /* Release CAS word because there's no next write on this syn slot */
+ *cas_word = MPID_NEM_IB_CM_RELEASED;
}
- /* Release CAS word because there's no next write on this syn slot */
- *cas_word = MPID_NEM_IB_CM_RELEASED;
- }
goto common_tail;
break;
- case MPID_NEM_IB_CM_CAS_RELEASE: {
- /* Initiator requests to release CAS word.
- * Because connection is already established.
- * In this case, responder may already have performed vc_terminate. */
-
- volatile MPID_nem_ib_cm_cmd_syn_t *syn_tail_flag =
- (MPID_nem_ib_cm_cmd_syn_t *) slot;
- while (syn_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
- /* __asm__ __volatile__("pause;":::"memory"); */
- }
+ case MPID_NEM_IB_CM_CAS_RELEASE:{
+ /* Initiator requests to release CAS word.
+ * Because connection is already established.
+ * In this case, responder may already have performed vc_terminate. */
- volatile uint64_t *cas_word = (uint64_t *) (MPID_nem_ib_scratch_pad);
+ volatile MPID_nem_ib_cm_cmd_syn_t *syn_tail_flag = (MPID_nem_ib_cm_cmd_syn_t *) slot;
+ while (syn_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
+ /* __asm__ __volatile__("pause;":::"memory"); */
+ }
- /* release */
- *cas_word = MPID_NEM_IB_CM_RELEASED;
- }
+ volatile uint64_t *cas_word = (uint64_t *) (MPID_nem_ib_scratch_pad);
+ /* release */
+ *cas_word = MPID_NEM_IB_CM_RELEASED;
+ }
- common_tail:
- *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
-
+ common_tail:
+ *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
/* Clear all possible tail-flag slots */
- ((MPID_nem_ib_cm_cmd_syn_t *)slot)->tail_flag.tail_flag = 0;
+ ((MPID_nem_ib_cm_cmd_syn_t *) slot)->tail_flag.tail_flag = 0;
break;
default:
printf("unknown connection command\n");
MPIU_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_poll");
break;
}
-
- fn_exit:
+
+ fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_POLL_SYN);
return mpi_errno;
- fn_fail:
+ fn_fail:
goto fn_exit;
}
@@ -2572,29 +2536,26 @@ int MPID_nem_ib_cm_poll_syn()
#undef FUNCNAME
#define FUNCNAME MPID_nem_ib_cm_release
#undef FCNAME
-int MPID_nem_ib_cm_release(uint16_t index) {
+int MPID_nem_ib_cm_release(uint16_t index)
+{
int mpi_errno = MPI_SUCCESS;
int old_ringbuf_tail = MPID_nem_ib_cm_ringbuf_tail;
uint16_t index_slot = index % MPID_NEM_IB_CM_NSEG;
-
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_RELEASE);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_RELEASE);
-
//dprintf("user_data=%p,mem=%p,sub=%08lx,index=%d\n", user_data, vc_ib->ibcom->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO], (unsigned long)user_data - (unsigned long)vc_ib->ibcom->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO], index);
//dprintf("index=%d,released=%016lx\n", index, vc_ib->ibcom->remote_ringbuf->remote_released[index / 64]);
MPID_nem_ib_cm_ringbuf_released[index_slot / 64] |= (1ULL << (index_slot & 63));
//dprintf("released[index/64]=%016lx\n", vc_ib->ibcom->remote_ringbuf->remote_released[index / 64]);
- uint16_t index_tail = ((uint16_t)(MPID_nem_ib_cm_ringbuf_tail + 1) % MPID_NEM_IB_CM_NSEG);
+ uint16_t index_tail = ((uint16_t) (MPID_nem_ib_cm_ringbuf_tail + 1) % MPID_NEM_IB_CM_NSEG);
//dprintf("tail+1=%d,index_tail=%d\n", vc_ib->ibcom->rsr_seq_num_tail + 1, index_tail);
//dprintf("released=%016lx\n", vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64]);
while (1) {
- if (((MPID_nem_ib_cm_ringbuf_released[index_tail / 64] >> (index_tail & 63)) & 1) ==
- 1) {
+ if (((MPID_nem_ib_cm_ringbuf_released[index_tail / 64] >> (index_tail & 63)) & 1) == 1) {
MPID_nem_ib_cm_ringbuf_tail++;
MPID_nem_ib_cm_ringbuf_released[index_tail / 64] &= ~(1ULL << (index_tail & 63));
dprintf("MPID_nem_ib_cm_ringbuf_tail,incremented to %d\n", MPID_nem_ib_cm_ringbuf_tail);
-
- index_tail = (uint16_t)(index_tail + 1) % MPID_NEM_IB_CM_NSEG;
+ index_tail = (uint16_t) (index_tail + 1) % MPID_NEM_IB_CM_NSEG;
}
else {
break;
@@ -2602,16 +2563,15 @@ int MPID_nem_ib_cm_release(uint16_t index) {
}
/* A slot of the ringbuf is released, so kick progress engine */
- if(MPID_nem_ib_cm_ringbuf_tail != old_ringbuf_tail) {
+ if (MPID_nem_ib_cm_ringbuf_tail != old_ringbuf_tail) {
MPID_nem_ib_cm_progress();
}
- fn_exit:
+ fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_RELEASE);
return mpi_errno;
//fn_fail:
goto fn_exit;
-
}
#undef FUNCNAME
@@ -2623,298 +2583,305 @@ int MPID_nem_ib_cm_poll()
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
uint16_t i;
-
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_POLL);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_POLL);
-
/* scratch pad is freed after receiving CLOSE */
- if(!MPID_nem_ib_scratch_pad) {
+ if (!MPID_nem_ib_scratch_pad) {
dprintf("cm_poll,MPID_nem_ib_scratch_pad is zero\n");
goto fn_exit;
}
/* Wrap-around tolerant by using "!=" */
- for(i = MPID_nem_ib_cm_ringbuf_tail + 1; i != MPID_nem_ib_cm_ringbuf_head; i++) {
+ for (i = MPID_nem_ib_cm_ringbuf_tail + 1; i != MPID_nem_ib_cm_ringbuf_head; i++) {
/* Memory layout is (CAS-word:SYN#0:SYN#1:...:SYN#N:CMD#0:CMD#1:...CMD#M) */
- void* slot = (MPID_nem_ib_scratch_pad + MPID_NEM_IB_CM_OFF_CMD +
- sizeof(MPID_nem_ib_cm_cmd_t) *
- ((uint16_t)(i % MPID_NEM_IB_CM_NSEG)));
-
+ void *slot = (MPID_nem_ib_scratch_pad + MPID_NEM_IB_CM_OFF_CMD +
+ sizeof(MPID_nem_ib_cm_cmd_t) * ((uint16_t) (i % MPID_NEM_IB_CM_NSEG)));
volatile uint8_t *head_flag = (uint8_t *) slot;
if (*head_flag == MPID_NEM_IB_CM_HEAD_FLAG_ZERO) {
continue;
- } /* Incoming message hasn't arrived */
+ } /* Incoming message hasn't arrived */
switch (*head_flag) {
- case MPID_NEM_IB_CM_SYNACK: {
- volatile MPID_nem_ib_cm_cmd_synack_t *synack_tail_flag =
- (MPID_nem_ib_cm_cmd_synack_t *) slot;
- while (synack_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
- /* __asm__ __volatile__("pause;":::"memory"); */
- }
-
- MPID_nem_ib_cm_cmd_synack_t *synack = (MPID_nem_ib_cm_cmd_synack_t *) slot;
- MPID_nem_ib_cm_req_t* req = (MPID_nem_ib_cm_req_t *) synack->initiator_req;
- req->ringbuf_index = synack->initiator_ringbuf_index;
-
- dprintf("cm_poll,synack detected!,responder_req=%p,responder_rank=%d,ringbuf_index=%d,tx=%d\n",
- synack->responder_req, req->responder_rank, synack->initiator_ringbuf_index,
- req->ibcom->outstanding_connection_tx);
-
- req->ibcom->incoming_connection_tx -= 1; /* SYNACK */
-
- /* Deduct it from the packet */
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_state) |=
- MPID_NEM_IB_CM_REMOTE_QP_RESET;
-
- /* Skip QP state transition on race condition */
- if(!(VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_state) &
- MPID_NEM_IB_CM_LOCAL_QP_RTS)) {
- ibcom_errno =
- MPID_nem_ib_com_rts(MPID_nem_ib_conns[req->responder_rank].fd, synack->qpnum, synack->lid,
- &(synack->gid));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_rts");
-
- /* Connect ring buffer */
- ibcom_errno =
- MPID_nem_ib_com_connect_ringbuf(MPID_nem_ib_conns[req->responder_rank].fd,
- synack->ringbuf_type,
- synack->rmem, synack->rkey, synack->ringbuf_nslot,
- synack->remote_vc,
- 1);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_connect_ringbuf");
- dprintf("connect_ringbuf,%d-%d=%d\n",
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, ibcom->sseq_num),
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, ibcom->lsr_seq_num_tail),
- MPID_nem_ib_diff16(VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, ibcom->sseq_num),
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, ibcom->lsr_seq_num_tail))
- );
-
- /* Record state transition for race condition detection */
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_state) |=
- MPID_NEM_IB_CM_LOCAL_QP_RTS;
- }
+ case MPID_NEM_IB_CM_SYNACK:{
+ volatile MPID_nem_ib_cm_cmd_synack_t *synack_tail_flag =
+ (MPID_nem_ib_cm_cmd_synack_t *) slot;
+ while (synack_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
+ /* __asm__ __volatile__("pause;":::"memory"); */
+ }
- req->state = MPID_NEM_IB_CM_ACK1;
- if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
- req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY) {
-
- dprintf("cm_poll,sending ack1,req=%p,ringbuf_index=%d\n", req, req->ringbuf_index);
- MPID_nem_ib_cm_cmd_ack1_t *cmd = (MPID_nem_ib_cm_cmd_ack1_t *) req->ibcom->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
- MPID_NEM_IB_CM_COMPOSE_ACK1(cmd, req, synack->responder_req);
- dprintf("cm_poll,composing ack1,cmd->responder_req=%p,cmd->rmem=%lx,rkey=%08x,ringbuf_nslot=%d,remote_vc=%lx\n",
- cmd->responder_req, cmd->rmem, cmd->rkey, cmd->ringbuf_nslot, cmd->remote_vc);
- MPID_nem_ib_cm_cmd_shadow_t * shadow =
- (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
- shadow->type = req->state;
- shadow->req = req;
- dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
- mpi_errno = MPID_nem_ib_cm_cmd_core(req->responder_rank, shadow, (void *) cmd, sizeof(MPID_nem_ib_cm_cmd_ack1_t), 0, req->ringbuf_index);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_send_core");
- } else {
- MPID_NEM_IB_CM_COMPOSE_ACK1((MPID_nem_ib_cm_cmd_ack1_t *)&(req->cmd), req, synack->responder_req);
- MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
+ MPID_nem_ib_cm_cmd_synack_t *synack = (MPID_nem_ib_cm_cmd_synack_t *) slot;
+ MPID_nem_ib_cm_req_t *req = (MPID_nem_ib_cm_req_t *) synack->initiator_req;
+ req->ringbuf_index = synack->initiator_ringbuf_index;
+ dprintf
+ ("cm_poll,synack detected!,responder_req=%p,responder_rank=%d,ringbuf_index=%d,tx=%d\n",
+ synack->responder_req, req->responder_rank, synack->initiator_ringbuf_index,
+ req->ibcom->outstanding_connection_tx);
+ req->ibcom->incoming_connection_tx -= 1; /* SYNACK */
+ /* Deduct it from the packet */
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc,
+ connection_state) |= MPID_NEM_IB_CM_REMOTE_QP_RESET;
+ /* Skip QP state transition on race condition */
+ if (!
+ (VC_FIELD
+ (MPID_nem_ib_conns[req->responder_rank].vc,
+ connection_state) & MPID_NEM_IB_CM_LOCAL_QP_RTS)) {
+ ibcom_errno =
+ MPID_nem_ib_com_rts(MPID_nem_ib_conns[req->responder_rank].fd,
+ synack->qpnum, synack->lid, &(synack->gid));
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_com_rts");
+ /* Connect ring buffer */
+ ibcom_errno =
+ MPID_nem_ib_com_connect_ringbuf(MPID_nem_ib_conns
+ [req->responder_rank].fd,
+ synack->ringbuf_type, synack->rmem,
+ synack->rkey, synack->ringbuf_nslot,
+ synack->remote_vc, 1);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_com_connect_ringbuf");
+ dprintf("connect_ringbuf,%d-%d=%d\n",
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc,
+ ibcom->sseq_num),
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc,
+ ibcom->lsr_seq_num_tail),
+ MPID_nem_ib_diff16(VC_FIELD
+ (MPID_nem_ib_conns[req->responder_rank].vc,
+ ibcom->sseq_num),
+ VC_FIELD(MPID_nem_ib_conns
+ [req->responder_rank].vc,
+ ibcom->lsr_seq_num_tail))
+);
+ /* Record state transition for race condition detection */
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc,
+ connection_state) |= MPID_NEM_IB_CM_LOCAL_QP_RTS;
+ }
+
+ req->state = MPID_NEM_IB_CM_ACK1;
+ if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
+ req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY) {
+
+ dprintf("cm_poll,sending ack1,req=%p,ringbuf_index=%d\n", req,
+ req->ringbuf_index);
+ MPID_nem_ib_cm_cmd_ack1_t *cmd =
+ (MPID_nem_ib_cm_cmd_ack1_t *) req->ibcom->
+ icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
+ MPID_NEM_IB_CM_COMPOSE_ACK1(cmd, req, synack->responder_req);
+ dprintf
+ ("cm_poll,composing ack1,cmd->responder_req=%p,cmd->rmem=%lx,rkey=%08x,ringbuf_nslot=%d,remote_vc=%lx\n",
+ cmd->responder_req, cmd->rmem, cmd->rkey, cmd->ringbuf_nslot,
+ cmd->remote_vc);
+ MPID_nem_ib_cm_cmd_shadow_t *shadow = (MPID_nem_ib_cm_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow->type = req->state;
+ shadow->req = req;
+ dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
+ mpi_errno =
+ MPID_nem_ib_cm_cmd_core(req->responder_rank, shadow, (void *) cmd,
+ sizeof(MPID_nem_ib_cm_cmd_ack1_t), 0,
+ req->ringbuf_index);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
+ }
+ else {
+ MPID_NEM_IB_CM_COMPOSE_ACK1((MPID_nem_ib_cm_cmd_ack1_t *) &
+ (req->cmd), req, synack->responder_req);
+ MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
+ }
}
- }
*head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
/* Clear all possible tail-flag slots */
MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
-
//goto common_tail;
break;
case MPID_NEM_IB_CM_ALREADY_ESTABLISHED:
case MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING:
{
- volatile MPID_nem_ib_cm_cmd_synack_t *synack_tail_flag =
- (MPID_nem_ib_cm_cmd_synack_t *) slot;
- while (synack_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
- /* __asm__ __volatile__("pause;":::"memory"); */
- }
-
- MPID_nem_ib_cm_cmd_synack_t *synack = (MPID_nem_ib_cm_cmd_synack_t *) slot;
- MPID_nem_ib_cm_req_t* req = (MPID_nem_ib_cm_req_t *) synack->initiator_req;
-
- dprintf("cm_poll,synack detected!,responder_req=%p,responder_rank=%d,ringbuf_index=%d,tx=%d\n",
- req->ibcom->outstanding_connection_tx);
-
- /* These mean the end of CM-op, so decrement here. */
- req->ibcom->outstanding_connection_tx -= 1;
- req->ibcom->incoming_connection_tx -= 2;
-
- /* cm_release calls cm_progress, so we have to clear scratch_pad here. */
- *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
- /* Clear all possible tail-flag slots */
- MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
-
- /* The initiator release the slot for responder */
- MPID_nem_ib_cm_release(req->responder_ringbuf_index);
-
- /* Kick ask-send commands waiting for connection */
- MPID_nem_ib_ringbuf_progress();
-
- /* Kick send commands waiting for connection.
- This might be a dupe when running-ahead transaction kicked it when receiving ACK1. */
- dprintf("cm_poll,kick progress engine for %d\n", req->responder_rank);
- MPID_nem_ib_send_progress(MPID_nem_ib_conns[req->responder_rank].vc);
-
- /* Let the following connection request go */
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_guard) = 0;
-
- /* free memory : req->ref_count is 2, so call MPIU_Free() directly */
- //MPID_nem_ib_cm_request_release(req);
- MPIU_Free(req);
- }
- //goto common_tail;
- break;
- case MPID_NEM_IB_CM_ACK1: {
- volatile MPID_nem_ib_cm_cmd_ack1_t *ack1_tail_flag =
- (MPID_nem_ib_cm_cmd_ack1_t *) slot;
- while (ack1_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
- /* __asm__ __volatile__("pause;":::"memory"); */
- }
-
- MPID_nem_ib_cm_cmd_ack1_t *ack1 = (MPID_nem_ib_cm_cmd_ack1_t *) slot;
- MPID_nem_ib_cm_req_t* req = (MPID_nem_ib_cm_req_t *) ack1->responder_req;
+ volatile MPID_nem_ib_cm_cmd_synack_t *synack_tail_flag =
+ (MPID_nem_ib_cm_cmd_synack_t *) slot;
+ while (synack_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
+ /* __asm__ __volatile__("pause;":::"memory"); */
+ }
- dprintf("cm_poll,ack1 detected!,responder_req=%p,initiator_rank=%d,tx=%d\n",
- ack1->responder_req, req->initiator_rank,
+ MPID_nem_ib_cm_cmd_synack_t *synack = (MPID_nem_ib_cm_cmd_synack_t *) slot;
+ MPID_nem_ib_cm_req_t *req = (MPID_nem_ib_cm_req_t *) synack->initiator_req;
+ dprintf
+ ("cm_poll,synack detected!,responder_req=%p,responder_rank=%d,ringbuf_index=%d,tx=%d\n",
req->ibcom->outstanding_connection_tx);
-
- req->ibcom->incoming_connection_tx -= 1; /* ACK1 */
-
- /* Deduct it from the packet */
- VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, connection_state) |=
- (MPID_NEM_IB_CM_REMOTE_QP_RESET | MPID_NEM_IB_CM_REMOTE_QP_RTS);
-
- /* Skip QP createion on race condition */
- if(!(VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, connection_state) &
- MPID_NEM_IB_CM_LOCAL_QP_RTS)) {
- ibcom_errno =
- MPID_nem_ib_com_rts(MPID_nem_ib_conns[req->initiator_rank].fd,
- ack1->qpnum, ack1->lid, &(ack1->gid));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_rts");
-
- /* Connect ring buffer */
- ibcom_errno =
- MPID_nem_ib_com_connect_ringbuf(MPID_nem_ib_conns[req->initiator_rank].fd,
- ack1->ringbuf_type,
- ack1->rmem, ack1->rkey, ack1->ringbuf_nslot,
- ack1->remote_vc,
- 1);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_connect_ringbuf");
- dprintf("connect_ringbuf,%d-%d=%d\n",
- VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, ibcom->sseq_num),
- VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, ibcom->lsr_seq_num_tail),
- MPID_nem_ib_diff16(VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, ibcom->sseq_num),
- VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, ibcom->lsr_seq_num_tail))
- );
-
- MPID_nem_ib_vc_onconnect(MPID_nem_ib_conns[req->initiator_rank].vc);
-
- /* Record state transition for race condition detection */
- VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, connection_state) |=
- MPID_NEM_IB_CM_LOCAL_QP_RTS;
+ /* These mean the end of CM-op, so decrement here. */
+ req->ibcom->outstanding_connection_tx -= 1;
+ req->ibcom->incoming_connection_tx -= 2;
+ /* cm_release calls cm_progress, so we have to clear scratch_pad here. */
+ *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
+ /* Clear all possible tail-flag slots */
+ MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
+ /* The initiator release the slot for responder */
+ MPID_nem_ib_cm_release(req->responder_ringbuf_index);
+ /* Kick ask-send commands waiting for connection */
+ MPID_nem_ib_ringbuf_progress();
+ /* Kick send commands waiting for connection.
+ * This might be a dupe when running-ahead transaction kicked it when receiving ACK1. */
+ dprintf("cm_poll,kick progress engine for %d\n", req->responder_rank);
+ MPID_nem_ib_send_progress(MPID_nem_ib_conns[req->responder_rank].vc);
+ /* Let the following connection request go */
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_guard) = 0;
+ /* free memory : req->ref_count is 2, so call MPIU_Free() directly */
+ //MPID_nem_ib_cm_request_release(req);
+ MPIU_Free(req);
}
-
- req->state = MPID_NEM_IB_CM_ACK2;
- if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
- req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY) {
-
- dprintf("cm_poll,sending ack2,req=%p,ringbuf_index=%d,initiator_rank=%d,tx=%d\n",
- req, req->ringbuf_index, req->initiator_rank,
- req->ibcom->outstanding_connection_tx);
-
- MPID_nem_ib_cm_cmd_ack2_t *cmd = (MPID_nem_ib_cm_cmd_ack2_t *) req->ibcom->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
- MPID_NEM_IB_CM_COMPOSE_ACK2(cmd, ack1->initiator_req);
- MPID_nem_ib_cm_cmd_shadow_t * shadow =
- (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
- shadow->type = req->state;
- shadow->req = req;
- dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
- mpi_errno = MPID_nem_ib_cm_cmd_core(req->initiator_rank, shadow, (void *) cmd, sizeof(MPID_nem_ib_cm_cmd_ack2_t), 0, req->ringbuf_index);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_send_core");
- } else {
- MPID_NEM_IB_CM_COMPOSE_ACK2((MPID_nem_ib_cm_cmd_ack2_t *)&(req->cmd), ack1->initiator_req);
- MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
- }
-
- /* cm_release calls cm_progress, so we have to clear scratch_pad here. */
- *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
- /* Clear all possible tail-flag slots */
- MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
-
- /* The responder release the slot for initiator */
- MPID_nem_ib_cm_release(req->initiator_ringbuf_index);
-
- /* Kick ask-send commands waiting for connection */
- MPID_nem_ib_ringbuf_progress();
-
- /* Kick send commands waiting for connection.
- This might be a dupe when running-ahead transaction kicked it when receiving ACK2. */
- dprintf("cm_poll,kick progress engine for %d\n", req->initiator_rank);
- MPID_nem_ib_send_progress(MPID_nem_ib_conns[req->initiator_rank].vc);
- }
//goto common_tail;
break;
- case MPID_NEM_IB_CM_ACK2: {
- volatile MPID_nem_ib_cm_cmd_ack2_t *ack2_tail_flag =
- (MPID_nem_ib_cm_cmd_ack2_t *) slot;
- while (ack2_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
- /* __asm__ __volatile__("pause;":::"memory"); */
- }
- MPID_nem_ib_cm_cmd_ack2_t *ack2 = (MPID_nem_ib_cm_cmd_ack2_t *) slot;
- MPID_nem_ib_cm_req_t* req = (MPID_nem_ib_cm_req_t *) ack2->initiator_req;
-
- dprintf("cm_poll,ack2 detected!,req=%p,responder_rank=%d,tx=%d\n",
- req, req->responder_rank,
- req->ibcom->outstanding_connection_tx);
+ case MPID_NEM_IB_CM_ACK1:{
+ volatile MPID_nem_ib_cm_cmd_ack1_t *ack1_tail_flag =
+ (MPID_nem_ib_cm_cmd_ack1_t *) slot;
+ while (ack1_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
+ /* __asm__ __volatile__("pause;":::"memory"); */
+ }
- req->ibcom->incoming_connection_tx -= 1; /* ACK2 */
+ MPID_nem_ib_cm_cmd_ack1_t *ack1 = (MPID_nem_ib_cm_cmd_ack1_t *) slot;
+ MPID_nem_ib_cm_req_t *req = (MPID_nem_ib_cm_req_t *) ack1->responder_req;
+ dprintf("cm_poll,ack1 detected!,responder_req=%p,initiator_rank=%d,tx=%d\n",
+ ack1->responder_req, req->initiator_rank,
+ req->ibcom->outstanding_connection_tx);
+ req->ibcom->incoming_connection_tx -= 1; /* ACK1 */
+ /* Deduct it from the packet */
+ VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc,
+ connection_state) |=
+ (MPID_NEM_IB_CM_REMOTE_QP_RESET | MPID_NEM_IB_CM_REMOTE_QP_RTS);
+ /* Skip QP createion on race condition */
+ if (!
+ (VC_FIELD
+ (MPID_nem_ib_conns[req->initiator_rank].vc,
+ connection_state) & MPID_NEM_IB_CM_LOCAL_QP_RTS)) {
+ ibcom_errno =
+ MPID_nem_ib_com_rts(MPID_nem_ib_conns[req->initiator_rank].fd,
+ ack1->qpnum, ack1->lid, &(ack1->gid));
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_com_rts");
+ /* Connect ring buffer */
+ ibcom_errno =
+ MPID_nem_ib_com_connect_ringbuf(MPID_nem_ib_conns
+ [req->initiator_rank].fd,
+ ack1->ringbuf_type, ack1->rmem,
+ ack1->rkey, ack1->ringbuf_nslot,
+ ack1->remote_vc, 1);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_com_connect_ringbuf");
+ dprintf("connect_ringbuf,%d-%d=%d\n",
+ VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc,
+ ibcom->sseq_num),
+ VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc,
+ ibcom->lsr_seq_num_tail),
+ MPID_nem_ib_diff16(VC_FIELD
+ (MPID_nem_ib_conns[req->initiator_rank].vc,
+ ibcom->sseq_num),
+ VC_FIELD(MPID_nem_ib_conns
+ [req->initiator_rank].vc,
+ ibcom->lsr_seq_num_tail))
+);
+ MPID_nem_ib_vc_onconnect(MPID_nem_ib_conns[req->initiator_rank].vc);
+ /* Record state transition for race condition detection */
+ VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc,
+ connection_state) |= MPID_NEM_IB_CM_LOCAL_QP_RTS;
+ }
- /* Deduct it from the packet */
- if(!(VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_state) &
- MPID_NEM_IB_CM_REMOTE_QP_RTS)) {
- MPID_nem_ib_vc_onconnect(MPID_nem_ib_conns[req->responder_rank].vc);
+ req->state = MPID_NEM_IB_CM_ACK2;
+ if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
+ req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY) {
+
+ dprintf
+ ("cm_poll,sending ack2,req=%p,ringbuf_index=%d,initiator_rank=%d,tx=%d\n",
+ req, req->ringbuf_index, req->initiator_rank,
+ req->ibcom->outstanding_connection_tx);
+ MPID_nem_ib_cm_cmd_ack2_t *cmd =
+ (MPID_nem_ib_cm_cmd_ack2_t *) req->ibcom->
+ icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
+ MPID_NEM_IB_CM_COMPOSE_ACK2(cmd, ack1->initiator_req);
+ MPID_nem_ib_cm_cmd_shadow_t *shadow = (MPID_nem_ib_cm_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow->type = req->state;
+ shadow->req = req;
+ dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
+ mpi_errno =
+ MPID_nem_ib_cm_cmd_core(req->initiator_rank, shadow, (void *) cmd,
+ sizeof(MPID_nem_ib_cm_cmd_ack2_t), 0,
+ req->ringbuf_index);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
+ }
+ else {
+ MPID_NEM_IB_CM_COMPOSE_ACK2((MPID_nem_ib_cm_cmd_ack2_t *) &
+ (req->cmd), ack1->initiator_req);
+ MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
+ }
- /* Record state transition for race condition detection */
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_state) |=
- MPID_NEM_IB_CM_REMOTE_QP_RTS;
+ /* cm_release calls cm_progress, so we have to clear scratch_pad here. */
+ *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
+ /* Clear all possible tail-flag slots */
+ MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
+ /* The responder release the slot for initiator */
+ MPID_nem_ib_cm_release(req->initiator_ringbuf_index);
+ /* Kick ask-send commands waiting for connection */
+ MPID_nem_ib_ringbuf_progress();
+ /* Kick send commands waiting for connection.
+ * This might be a dupe when running-ahead transaction kicked it when receiving ACK2. */
+ dprintf("cm_poll,kick progress engine for %d\n", req->initiator_rank);
+ MPID_nem_ib_send_progress(MPID_nem_ib_conns[req->initiator_rank].vc);
}
+ //goto common_tail;
+ break;
+ case MPID_NEM_IB_CM_ACK2:{
+ volatile MPID_nem_ib_cm_cmd_ack2_t *ack2_tail_flag =
+ (MPID_nem_ib_cm_cmd_ack2_t *) slot;
+ while (ack2_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
+ /* __asm__ __volatile__("pause;":::"memory"); */
+ }
+ MPID_nem_ib_cm_cmd_ack2_t *ack2 = (MPID_nem_ib_cm_cmd_ack2_t *) slot;
+ MPID_nem_ib_cm_req_t *req = (MPID_nem_ib_cm_req_t *) ack2->initiator_req;
+ dprintf("cm_poll,ack2 detected!,req=%p,responder_rank=%d,tx=%d\n", req,
+ req->responder_rank, req->ibcom->outstanding_connection_tx);
+ req->ibcom->incoming_connection_tx -= 1; /* ACK2 */
+ /* Deduct it from the packet */
+ if (!
+ (VC_FIELD
+ (MPID_nem_ib_conns[req->responder_rank].vc,
+ connection_state) & MPID_NEM_IB_CM_REMOTE_QP_RTS)) {
+ MPID_nem_ib_vc_onconnect(MPID_nem_ib_conns[req->responder_rank].vc);
+ /* Record state transition for race condition detection */
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc,
+ connection_state) |= MPID_NEM_IB_CM_REMOTE_QP_RTS;
+ }
- /* cm_release calls cm_progress, so we have to clear scratch_pad here. */
- *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
- /* Clear all possible tail-flag slots */
- MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
+ /* cm_release calls cm_progress, so we have to clear scratch_pad here. */
+ *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
+ /* Clear all possible tail-flag slots */
+ MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
+ /* The initiator release the slot for responder */
+ MPID_nem_ib_cm_release(req->responder_ringbuf_index);
+ /* Acquire ring-buffer slot now that it's connected if requested so */
+ if (req->ask_on_connect &&
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc,
+ ibcom->local_ringbuf_type) == MPID_NEM_IB_RINGBUF_SHARED) {
+ dprintf("cm_poll,ack2,ask on connect\n");
+ mpi_errno =
+ MPID_nem_ib_ringbuf_ask_fetch(MPID_nem_ib_conns[req->responder_rank].vc);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_ringbuf_ask_fetch");
+ }
- /* The initiator release the slot for responder */
- MPID_nem_ib_cm_release(req->responder_ringbuf_index);
-
- /* Acquire ring-buffer slot now that it's connected if requested so */
- if(req->ask_on_connect &&
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc,
- ibcom->local_ringbuf_type) == MPID_NEM_IB_RINGBUF_SHARED) {
- dprintf("cm_poll,ack2,ask on connect\n");
- mpi_errno = MPID_nem_ib_ringbuf_ask_fetch(MPID_nem_ib_conns[req->responder_rank].vc);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_ask_fetch");
+ /* Kick ask-send commands waiting for connection */
+ MPID_nem_ib_ringbuf_progress();
+ /* Kick send commands waiting for connection.
+ * This might be a dupe when running-ahead transaction kicked it when receiving ACK1. */
+ dprintf("cm_poll,kick progress engine for %d\n", req->responder_rank);
+ MPID_nem_ib_send_progress(MPID_nem_ib_conns[req->responder_rank].vc);
+ /* Let the following connection request go */
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_guard) = 0;
+ /* Finalize protocol because there is no referer in cm_poll and sendq.
+ * Note that there might be one which sent ACK1 in cm_drain_scq. */
+ MPID_nem_ib_cm_request_release(req);
}
-
- /* Kick ask-send commands waiting for connection */
- MPID_nem_ib_ringbuf_progress();
-
- /* Kick send commands waiting for connection.
- This might be a dupe when running-ahead transaction kicked it when receiving ACK1. */
- dprintf("cm_poll,kick progress engine for %d\n", req->responder_rank);
- MPID_nem_ib_send_progress(MPID_nem_ib_conns[req->responder_rank].vc);
-
- /* Let the following connection request go */
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_guard) = 0;
-
- /* Finalize protocol because there is no referer in cm_poll and sendq.
- Note that there might be one which sent ACK1 in cm_drain_scq. */
- MPID_nem_ib_cm_request_release(req);
- }
//common_tail:
//*head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
///* Clear all possible tail-flag slots */
@@ -2927,10 +2894,10 @@ int MPID_nem_ib_cm_poll()
}
}
- fn_exit:
+ fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_POLL);
return mpi_errno;
- fn_fail:
+ fn_fail:
goto fn_exit;
}
@@ -2945,75 +2912,72 @@ int MPID_nem_ib_ringbuf_alloc(MPIDI_VC_t * vc)
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
int i;
-
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_ALLOC);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_ALLOC);
-
- if(!MPID_nem_ib_ringbuf) {
+ if (!MPID_nem_ib_ringbuf) {
MPID_nem_ib_ringbuf = MPIU_Calloc(1, sizeof(MPID_nem_ib_ringbuf_t) * MPID_NEM_IB_NRINGBUF);
- MPIU_ERR_CHKANDJUMP(!MPID_nem_ib_ringbuf, mpi_errno, MPI_ERR_OTHER,
- "**malloc");
+ MPIU_ERR_CHKANDJUMP(!MPID_nem_ib_ringbuf, mpi_errno, MPI_ERR_OTHER, "**malloc");
}
-
-#if 0 /* Degug, "#if 1" to make exclusive ring-buffers not available */
- //if(MPID_nem_ib_myrank == 0) {
- for(i = 0; i < MPID_NEM_IB_NRINGBUF - 1; i++) {
- MPID_nem_ib_ringbuf_acquired[i / 64] |= (1ULL << (i & 63));
- }
- //}
-#endif
+
+#if 0 /* Degug, "#if 1" to make exclusive ring-buffers not available */
+ //if (MPID_nem_ib_myrank == 0) {
+ for (i = 0; i < MPID_NEM_IB_NRINGBUF - 1; i++) {
+ MPID_nem_ib_ringbuf_acquired[i / 64] |= (1ULL << (i & 63));
+ }
+ //}
+#endif
int found = 0;
/* [MPID_NEM_IB_NRINGBUF-1] holds shared ring buffer */
- for(i = 0; i < MPID_NEM_IB_NRINGBUF - 1; i++) {
+ for (i = 0; i < MPID_NEM_IB_NRINGBUF - 1; i++) {
if (((MPID_nem_ib_ringbuf_acquired[i / 64] >> (i & 63)) & 1) == 0) {
found = 1;
break;
}
}
- if(found) {
+ if (found) {
MPID_nem_ib_ringbuf_acquired[i / 64] |= (1ULL << (i & 63));
-
- if(!MPID_nem_ib_ringbuf[i].start) {
+ if (!MPID_nem_ib_ringbuf[i].start) {
MPID_nem_ib_ringbuf[i].type = MPID_NEM_IB_RINGBUF_EXCLUSIVE;
MPID_nem_ib_ringbuf[i].start = MPID_nem_ib_rdmawr_to_alloc(MPID_NEM_IB_RINGBUF_NSLOT);
- MPIU_ERR_CHKANDJUMP(!MPID_nem_ib_ringbuf[i].start, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_rdma_to_alloc");
+ MPIU_ERR_CHKANDJUMP(!MPID_nem_ib_ringbuf[i].start, mpi_errno,
+ MPI_ERR_OTHER, "**MPID_nem_ib_rdma_to_alloc");
MPID_nem_ib_ringbuf[i].nslot = MPID_NEM_IB_RINGBUF_NSLOT;
- memset(MPID_nem_ib_ringbuf[i].remote_released, 0, (MPID_NEM_IB_RINGBUF_NSLOT + 63) / 64);
+ memset(MPID_nem_ib_ringbuf[i].remote_released, 0,
+ (MPID_NEM_IB_RINGBUF_NSLOT + 63) / 64);
MPID_nem_ib_ringbuf_allocated[i / 64] |= (1ULL << (i & 63));
}
VC_FIELD(vc, ibcom->remote_ringbuf) = &MPID_nem_ib_ringbuf[i];
dprintf("ringbuf_alloc,start=%p\n", MPID_nem_ib_ringbuf[i].start);
-
VC_FIELD(vc, ibcom->rsr_seq_num_poll) = 0;
VC_FIELD(vc, ibcom->rsr_seq_num_tail) = -1;
VC_FIELD(vc, ibcom->rsr_seq_num_tail_last_sent) = -1;
-
MPID_nem_ib_ringbuf[i].vc = vc;
- dprintf("ringbuf_alloc,i=%d,pg_rank=%d,ibcom=%p,ibcom->remote_ringbuf=%p\n",
- i, vc->pg_rank, VC_FIELD(vc, ibcom), VC_FIELD(vc, ibcom->remote_ringbuf));
- } else {
- if(!MPID_nem_ib_ringbuf[i].start) {
+ dprintf
+ ("ringbuf_alloc,i=%d,pg_rank=%d,ibcom=%p,ibcom->remote_ringbuf=%p\n",
+ i, vc->pg_rank, VC_FIELD(vc, ibcom), VC_FIELD(vc, ibcom->remote_ringbuf));
+ }
+ else {
+ if (!MPID_nem_ib_ringbuf[i].start) {
MPID_nem_ib_ringbuf[i].type = MPID_NEM_IB_RINGBUF_SHARED;
MPID_nem_ib_ringbuf[i].start = MPID_nem_ib_rdmawr_to_alloc(MPID_NEM_IB_RINGBUF_NSLOT);
- MPIU_ERR_CHKANDJUMP(!MPID_nem_ib_ringbuf[i].start, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_rdma_to_alloc");
+ MPIU_ERR_CHKANDJUMP(!MPID_nem_ib_ringbuf[i].start, mpi_errno,
+ MPI_ERR_OTHER, "**MPID_nem_ib_rdma_to_alloc");
MPID_nem_ib_ringbuf[i].nslot = MPID_NEM_IB_RINGBUF_NSLOT;
- memset(MPID_nem_ib_ringbuf[i].remote_released, 0, (MPID_NEM_IB_RINGBUF_NSLOT + 63) / 64);
+ memset(MPID_nem_ib_ringbuf[i].remote_released, 0,
+ (MPID_NEM_IB_RINGBUF_NSLOT + 63) / 64);
MPID_nem_ib_ringbuf_allocated[i / 64] |= (1ULL << (i & 63));
}
MPID_nem_ib_ringbuf[i].ref_count++;
VC_FIELD(vc, ibcom->remote_ringbuf) = &MPID_nem_ib_ringbuf[i];
-
dprintf("ringbuf_alloc,not found\n");
}
- fn_exit:
+ fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_ALLOC);
return mpi_errno;
- fn_fail:
+ fn_fail:
goto fn_exit;
}
@@ -3026,37 +2990,40 @@ int MPID_nem_ib_ringbuf_free(MPIDI_VC_t * vc)
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
int i;
-
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_FREE);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_FREE);
-
/* No ring-buffer is allocated */
- if(!VC_FIELD(vc, ibcom->remote_ringbuf)) {
+ if (!VC_FIELD(vc, ibcom->remote_ringbuf)) {
goto fn_exit;
}
- int index = ((uint8_t *)VC_FIELD(vc, ibcom->remote_ringbuf) - (uint8_t*)&MPID_nem_ib_ringbuf[0]) /
- sizeof(MPID_nem_ib_ringbuf_t);
+ int index =
+ ((uint8_t *) VC_FIELD(vc, ibcom->remote_ringbuf) -
+ (uint8_t *) & MPID_nem_ib_ringbuf[0]) / sizeof(MPID_nem_ib_ringbuf_t);
dprintf("ringbuf_free,index=%d\n", index);
-
- switch(VC_FIELD(vc, ibcom->remote_ringbuf)->type) {
+ switch (VC_FIELD(vc, ibcom->remote_ringbuf)->type) {
case MPID_NEM_IB_RINGBUF_EXCLUSIVE:
dprintf("ringbuf_free,start=%p\n", VC_FIELD(vc, ibcom->remote_ringbuf)->start);
- MPID_nem_ib_rdmawr_to_free(VC_FIELD(vc, ibcom->remote_ringbuf)->start, MPID_NEM_IB_RINGBUF_NSLOT);
- VC_FIELD(vc, ibcom->remote_ringbuf)->start = NULL; /* initialize for re-allocate */
+ MPID_nem_ib_rdmawr_to_free(VC_FIELD(vc, ibcom->remote_ringbuf)->start,
+ MPID_NEM_IB_RINGBUF_NSLOT);
+ VC_FIELD(vc, ibcom->remote_ringbuf)->start = NULL; /* initialize for re-allocate */
MPID_nem_ib_ringbuf_allocated[index / 64] &= ~(1ULL << (index & 63));
VC_FIELD(vc, ibcom->remote_ringbuf) = NULL;
MPID_nem_ib_ringbuf_acquired[index / 64] &= ~(1ULL << (index & 63));
- dprintf("ringbuf_free,exclucsive,allocated=%0lx\n", MPID_nem_ib_ringbuf_allocated[index / 64]);
+ dprintf("ringbuf_free,exclucsive,allocated=%0lx\n",
+ MPID_nem_ib_ringbuf_allocated[index / 64]);
break;
case MPID_NEM_IB_RINGBUF_SHARED:
- dprintf("ringbuf_free,shared,ref_count=%d\n", VC_FIELD(vc, ibcom->remote_ringbuf)->ref_count);
+ dprintf("ringbuf_free,shared,ref_count=%d\n",
+ VC_FIELD(vc, ibcom->remote_ringbuf)->ref_count);
MPIU_Assert(VC_FIELD(vc, ibcom->remote_ringbuf)->ref_count > 0);
- if(--VC_FIELD(vc, ibcom->remote_ringbuf)->ref_count == 0) {
- MPID_nem_ib_rdmawr_to_free(VC_FIELD(vc, ibcom->remote_ringbuf)->start, MPID_NEM_IB_RINGBUF_NSLOT);
- VC_FIELD(vc, ibcom->remote_ringbuf)->start = NULL; /* initialize for re-allocate */
+ if (--VC_FIELD(vc, ibcom->remote_ringbuf)->ref_count == 0) {
+ MPID_nem_ib_rdmawr_to_free(VC_FIELD(vc, ibcom->remote_ringbuf)->start,
+ MPID_NEM_IB_RINGBUF_NSLOT);
+ VC_FIELD(vc, ibcom->remote_ringbuf)->start = NULL; /* initialize for re-allocate */
MPID_nem_ib_ringbuf_allocated[index / 64] &= ~(1ULL << (index & 63));
- dprintf("ringbuf_free,shared,allocated=%0lx\n", MPID_nem_ib_ringbuf_allocated[index / 64]);
+ dprintf("ringbuf_free,shared,allocated=%0lx\n",
+ MPID_nem_ib_ringbuf_allocated[index / 64]);
}
VC_FIELD(vc, ibcom->remote_ringbuf) = NULL;
default:
@@ -3066,21 +3033,21 @@ int MPID_nem_ib_ringbuf_free(MPIDI_VC_t * vc)
}
int found = 0;
- for(i = 0; i < (MPID_NEM_IB_NRINGBUF + 63) / 64; i++) {
- if(MPID_nem_ib_ringbuf_allocated[i] != 0) {
+ for (i = 0; i < (MPID_NEM_IB_NRINGBUF + 63) / 64; i++) {
+ if (MPID_nem_ib_ringbuf_allocated[i] != 0) {
found = 1;
break;
}
}
- if(!found) {
+ if (!found) {
MPIU_Free(MPID_nem_ib_ringbuf);
MPID_nem_ib_ringbuf = NULL;
}
- fn_exit:
+ fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_FREE);
return mpi_errno;
- fn_fail:
+ fn_fail:
goto fn_exit;
}
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
index 709a417..61792c1 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
@@ -150,8 +150,8 @@ static inline void __lru_queue_display()
for (i = 0; i < MPID_NEM_IB_COM_REG_MR_NLINE; i++) {
dprintf("---- hash %d\n", i);
for (p =
- (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) MPID_nem_ib_com_reg_mr_cache[i].
- lru_next;
+ (struct MPID_nem_ib_com_reg_mr_cache_entry_t *)
+ MPID_nem_ib_com_reg_mr_cache[i].lru_next;
p != (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) &MPID_nem_ib_com_reg_mr_cache[i];
p = (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) p->lru_next) {
if (p && p->addr) {
@@ -165,7 +165,8 @@ static inline void __lru_queue_display()
}
}
-struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, int len, enum ibv_access_flags additional_flags)
+struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, int len,
+ enum ibv_access_flags additional_flags)
{
#if 0 /* debug */
struct ibv_mr *mr;
@@ -268,8 +269,8 @@ struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, int len, enum ibv_access
#if 0 /* disable for debug */
/* move to head of the list */
if (e !=
- (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) MPID_nem_ib_com_reg_mr_cache[key].
- lru_next) {
+ (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) MPID_nem_ib_com_reg_mr_cache[key].lru_next)
+ {
MPID_nem_ib_com_reg_mr_unlink((struct MPID_nem_ib_com_reg_mr_listnode_t *) e);
MPID_nem_ib_com_reg_mr_insert(&MPID_nem_ib_com_reg_mr_cache[key],
(struct MPID_nem_ib_com_reg_mr_listnode_t *) e);
@@ -307,21 +308,21 @@ int MPID_nem_ib_com_register_cache_init()
ref_count++;
dprintf("cache_init,ref_count=%d\n", ref_count);
-
- if(ref_count == 1) {
- /* Using the address to the start node to express the end of the list
- * instead of using NULL */
- for (i = 0; i < MPID_NEM_IB_COM_REG_MR_NLINE; i++) {
- MPID_nem_ib_com_reg_mr_cache[i].lru_next =
- (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_cache[i];
- MPID_nem_ib_com_reg_mr_cache[i].lru_prev =
- (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_cache[i];
- }
- dprintf("[MrCache] cache initializes %d entries\n", MPID_NEM_IB_COM_REG_MR_NLINE);
+ if (ref_count == 1) {
+ /* Using the address to the start node to express the end of the list
+ * instead of using NULL */
+ for (i = 0; i < MPID_NEM_IB_COM_REG_MR_NLINE; i++) {
+ MPID_nem_ib_com_reg_mr_cache[i].lru_next =
+ (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_cache[i];
+ MPID_nem_ib_com_reg_mr_cache[i].lru_prev =
+ (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_cache[i];
+ }
+
+ dprintf("[MrCache] cache initializes %d entries\n", MPID_NEM_IB_COM_REG_MR_NLINE);
}
- fn_exit:
+ fn_exit:
return ibcom_errno;
//fn_fail:
goto fn_exit;
@@ -337,16 +338,16 @@ int MPID_nem_ib_com_register_cache_release()
dprintf("cache_release,ref_count=%d\n", ref_count);
MPIU_Assert(ref_count > 0);
- if(--ref_count > 0) {
+ if (--ref_count > 0) {
goto fn_exit;
- }
+ }
for (i = 0; i < MPID_NEM_IB_COM_REG_MR_NLINE; i++) {
for (p =
- (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) MPID_nem_ib_com_reg_mr_cache[i].
- lru_next;
- p != (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) &MPID_nem_ib_com_reg_mr_cache[i];
- ) {
+ (struct MPID_nem_ib_com_reg_mr_cache_entry_t *)
+ MPID_nem_ib_com_reg_mr_cache[i].lru_next;
+ p !=
+ (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) &MPID_nem_ib_com_reg_mr_cache[i];) {
if (p && p->addr > 0) {
ib_errno = MPID_nem_ib_com_dereg_mr(p->mr);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, printf("MPID_nem_ib_com_dereg_mr"));
@@ -365,8 +366,8 @@ int MPID_nem_ib_com_register_cache_release()
//__lru_queue_display();
dprintf("[MrCache] cache destroyed %d entries\n", cnt);
- fn_exit:
+ fn_exit:
return ibcom_errno;
- fn_fail:
+ fn_fail:
goto fn_exit;
}
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
index e9f4e48..1932198 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
@@ -170,13 +170,14 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
/* send RDMA-write-to buffer occupancy information */
/* embed SR occupancy information and remember the last one sent */
MPIDI_CH3_Pkt_t *ch3_hdr = (MPIDI_CH3_Pkt_t *) hdr;
- if (MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent) > notify_rate) {
+ if (MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent)
+ > notify_rate) {
#if 0 /* debug, disabling piggy-back */
switch (ch3_hdr->type) {
case MPIDI_CH3_PKT_EAGER_SEND:
pkt_netmod.subtype = MPIDI_NEM_IB_PKT_EAGER_SEND;
goto common_tail;
-#if 0 /* modification of mpid_nem_lmt.c is required */
+#if 0 /* modification of mpid_nem_lmt.c is required */
case MPIDI_NEM_PKT_LMT_RTS:
pkt_netmod.subtype = MPIDI_NEM_IB_PKT_LMT_RTS;
goto common_tail;
@@ -244,7 +245,8 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
dprintf
("isendcontig_core,sreq=%p,prefix=%p,sz_prefix=%d,hdr=%p,sz_hdr=%ld,data=%p,sz_data=%d,remote_ringbuf->type=%d\n",
- sreq, prefix, sz_prefix, hdr, hdr_sz, data, (int) data_sz, vc_ib->ibcom->remote_ringbuf->type);
+ sreq, prefix, sz_prefix, hdr, hdr_sz, data, (int) data_sz,
+ vc_ib->ibcom->remote_ringbuf->type);
if (sizeof(MPIDI_CH3_Pkt_t) != hdr_sz) {
printf("type=%d,subtype=%d\n", ((MPID_nem_pkt_netmod_t *) hdr)->type,
@@ -253,7 +255,7 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
int copied;
ibcom_errno =
- MPID_nem_ib_com_isend(vc_ib->sc->fd,
+ MPID_nem_ib_com_isend(vc_ib->sc->fd,
(uint64_t) sreq,
prefix, sz_prefix,
hdr, hdr_sz,
@@ -396,30 +398,34 @@ int MPID_nem_ib_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
MPIDI_DBG_Print_packet((MPIDI_CH3_Pkt_t *) hdr);
if (vc_ib->connection_state == MPID_NEM_IB_CM_CLOSED) {
- if(vc_ib->connection_guard == 0) {
+ if (vc_ib->connection_guard == 0) {
vc_ib->connection_guard = 1;
/* connected=no,ringbuf-type=shared,slot-available=no,
- going-to-be-enqueued=yes case */
- MPID_nem_ib_cm_cas(vc, 0); /* Call ask_fetch just after it's connected */
+ * going-to-be-enqueued=yes case */
+ MPID_nem_ib_cm_cas(vc, 0); /* Call ask_fetch just after it's connected */
}
}
if (vc_ib->connection_state != MPID_NEM_IB_CM_ESTABLISHED) {
/* connected=closed/transit,ringbuf-type=shared,slot-available=no,
- going-to-be-enqueued=yes case */
- REQ_FIELD(sreq, ask) = 0; /* We can't ask because ring-buffer type is not determined yet. */
+ * going-to-be-enqueued=yes case */
+ REQ_FIELD(sreq, ask) = 0; /* We can't ask because ring-buffer type is not determined yet. */
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_ask_fetch");
- } else {
+ }
+ else {
/* connected=established,ringbuf-type=shared,slot-available=no,
- going-to-be-enqueued=yes case */
+ * going-to-be-enqueued=yes case */
if (vc_ib->ibcom->local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED &&
MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) >= vc_ib->ibcom->local_ringbuf_nslot) {
+ vc_ib->ibcom->lsr_seq_num_tail) >=
+ vc_ib->ibcom->local_ringbuf_nslot) {
dprintf("isendcontig,RINGBUF_SHARED and full,asking\n");
mpi_errno = MPID_nem_ib_ringbuf_ask_fetch(vc);
REQ_FIELD(sreq, ask) = 1;
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_ask_fetch");
- } else {
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_ringbuf_ask_fetch");
+ }
+ else {
REQ_FIELD(sreq, ask) = 0;
}
}
@@ -460,16 +466,15 @@ int MPID_nem_ib_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
}
#endif
- dprintf("isendcontig,%d->%d,req=%p,type=%d,subtype=%d,data_sz=%ld,ldiff=%d(%d-%d),rdiff=%d(%d-%d)\n",
- MPID_nem_ib_myrank, vc->pg_rank,
- sreq,
- ((MPIDI_CH3_Pkt_t *) hdr)->type,
- ((MPID_nem_pkt_netmod_t *) hdr)->subtype, data_sz,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
- vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail,
- MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
- vc_ib->ibcom->rsr_seq_num_tail_last_sent),
- vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent);
+ dprintf
+ ("isendcontig,%d->%d,req=%p,type=%d,subtype=%d,data_sz=%ld,ldiff=%d(%d-%d),rdiff=%d(%d-%d)\n",
+ MPID_nem_ib_myrank, vc->pg_rank, sreq, ((MPIDI_CH3_Pkt_t *) hdr)->type,
+ ((MPID_nem_pkt_netmod_t *) hdr)->subtype, data_sz,
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
+ vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail,
+ MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
+ vc_ib->ibcom->rsr_seq_num_tail_last_sent),
+ vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent);
dprintf("isendcontig,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n",
MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe,
MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
@@ -489,21 +494,19 @@ int MPID_nem_ib_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
/* make control packet bringing sequence number go ahead of
* queued packets to avoid dead-lock */
int goahead =
- (ch3_hdr->type == MPIDI_NEM_PKT_NETMOD &&
- prefix->subtype == MPIDI_NEM_IB_PKT_REQ_SEQ_NUM)
+ (ch3_hdr->type == MPIDI_NEM_PKT_NETMOD && prefix->subtype == MPIDI_NEM_IB_PKT_REQ_SEQ_NUM)
|| (ch3_hdr->type == MPIDI_NEM_PKT_NETMOD &&
prefix->subtype == MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM) ||
- (ch3_hdr->type == MPIDI_NEM_PKT_NETMOD &&
- prefix->subtype == MPIDI_NEM_IB_PKT_LMT_GET_DONE)
+ (ch3_hdr->type == MPIDI_NEM_PKT_NETMOD && prefix->subtype == MPIDI_NEM_IB_PKT_LMT_GET_DONE)
? 1 : 0;
dprintf("isendcontig,slack=%d,goahead=%d\n", slack, goahead);
- if (
- (goahead || MPID_nem_ib_sendq_empty(vc_ib->sendq)) &&
+ if ((goahead || MPID_nem_ib_sendq_empty(vc_ib->sendq)) &&
vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack &&
MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) < vc_ib->ibcom->local_ringbuf_nslot - slack) {
+ vc_ib->ibcom->lsr_seq_num_tail) <
+ vc_ib->ibcom->local_ringbuf_nslot - slack) {
mpi_errno = MPID_nem_ib_iSendContig_core(vc, sreq, hdr, hdr_sz, data, data_sz);
if (mpi_errno) {
@@ -513,15 +516,16 @@ int MPID_nem_ib_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
}
else {
/* enqueue command into send_queue */
- dprintf("isendcontig,enqueuing,goahead=%d,sendq=%d,ncom=%d,ncqe=%d,ldiff=%d(%d-%d),slack=%d\n",
- goahead, MPID_nem_ib_sendq_empty(vc_ib->sendq),
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack,
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
- vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail, slack);
+ dprintf
+ ("isendcontig,enqueuing,goahead=%d,sendq=%d,ncom=%d,ncqe=%d,ldiff=%d(%d-%d),slack=%d\n",
+ goahead, MPID_nem_ib_sendq_empty(vc_ib->sendq),
+ vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack,
+ MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack,
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
+ vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail, slack);
/* this location because the above message refers undefined */
- enqueue:
+ enqueue:
/* store required info. see MPIDI_CH3_iSendv in src/mpid/ch3/channels/nemesis/src/ch3_isendv.c */
sreq->dev.pending_pkt = *(MPIDI_CH3_Pkt_t *) hdr;
@@ -713,8 +717,9 @@ static int MPID_nem_ib_SendNoncontig_core(MPIDI_VC_t * vc, MPID_Request * sreq,
}
int copied;
- dprintf("sendnoncontig_core,isend,%d->%d,seq_num=%d,remote_ringbuf->type=%d\n", MPID_nem_ib_myrank, vc->pg_rank,
- vc_ib->ibcom->sseq_num, vc_ib->ibcom->remote_ringbuf->type);
+ dprintf("sendnoncontig_core,isend,%d->%d,seq_num=%d,remote_ringbuf->type=%d\n",
+ MPID_nem_ib_myrank, vc->pg_rank, vc_ib->ibcom->sseq_num,
+ vc_ib->ibcom->remote_ringbuf->type);
ibcom_errno =
MPID_nem_ib_com_isend(vc_ib->sc->fd,
@@ -825,33 +830,37 @@ int MPID_nem_ib_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "ib_SendNoncontig");
if (vc_ib->connection_state == MPID_NEM_IB_CM_CLOSED) {
- if(vc_ib->connection_guard == 0) {
+ if (vc_ib->connection_guard == 0) {
vc_ib->connection_guard = 1;
/* connected=closed,ringbuf-type=shared,slot-available=no,
- going-to-be-enqueued=yes case */
- MPID_nem_ib_cm_cas(vc, 0); /* Call ask_fetch just after it's connected */
+ * going-to-be-enqueued=yes case */
+ MPID_nem_ib_cm_cas(vc, 0); /* Call ask_fetch just after it's connected */
}
}
if (vc_ib->connection_state != MPID_NEM_IB_CM_ESTABLISHED) {
/* connected=closed/transit,ringbuf-type=shared,slot-available=no,
- going-to-be-enqueued=yes case */
+ * going-to-be-enqueued=yes case */
REQ_FIELD(sreq, ask) = 0;
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_ask_fetch");
- } else {
+ }
+ else {
/* connected=established,ringbuf-type=shared,slot-available=no,
- going-to-be-enqueued=yes case */
+ * going-to-be-enqueued=yes case */
if (vc_ib->ibcom->local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED &&
MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) >= vc_ib->ibcom->local_ringbuf_nslot) {
+ vc_ib->ibcom->lsr_seq_num_tail) >=
+ vc_ib->ibcom->local_ringbuf_nslot) {
dprintf("sendnoncontig,RINGBUF_SHARED and full,asking\n");
REQ_FIELD(sreq, ask) = 1;
mpi_errno = MPID_nem_ib_ringbuf_ask_fetch(vc);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_ask_fetch");
- } else {
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_ringbuf_ask_fetch");
+ }
+ else {
REQ_FIELD(sreq, ask) = 0;
}
}
-
+
dprintf("sendnoncontig,%d->%d,sendq_empty=%d,ncom=%d,rdmabuf_occ=%d\n", MPID_nem_ib_myrank,
vc->pg_rank, MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom,
MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
@@ -876,13 +885,14 @@ int MPID_nem_ib_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
if (
#ifdef MPID_NEM_IB_ONDEMAND
- vc_ib->connection_state == MPID_NEM_IB_CM_ESTABLISHED &&
+ vc_ib->connection_state == MPID_NEM_IB_CM_ESTABLISHED &&
#endif
- MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack &&
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) < vc_ib->ibcom->local_ringbuf_nslot - slack) {
+ MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
+ vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
+ MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack &&
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
+ vc_ib->ibcom->lsr_seq_num_tail) <
+ vc_ib->ibcom->local_ringbuf_nslot - slack) {
mpi_errno = MPID_nem_ib_SendNoncontig_core(vc, sreq, hdr, hdr_sz);
if (mpi_errno) {
@@ -936,7 +946,7 @@ int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
//dprintf("send_progress,enter\n");
#ifdef MPID_NEM_IB_ONDEMAND
- if(vc_ib->connection_state != MPID_NEM_IB_CM_ESTABLISHED) {
+ if (vc_ib->connection_state != MPID_NEM_IB_CM_ESTABLISHED) {
//dprintf("send_progress,connection_state=%08x\n", vc_ib->connection_state);
goto fn_exit;
}
@@ -984,70 +994,70 @@ int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
MPID_NEM_IB_COM_AMT_SLACK;
/* Temporary fix until removing slack */
- if(vc_ib->ibcom->local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED) {
+ if (vc_ib->ibcom->local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED) {
slack = 0;
}
/* Refill slots from queue
- We don't need refill code in sendcontig because
- there is an order where (1) send, (2) it's queued, (3) then ask obtains slots,
- (4) then we can refill them here. */
-
- if(vc_ib->ibcom->local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED &&
- (msg_type == MPIDI_REQUEST_EAGER_MSG &&
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) >= vc_ib->ibcom->local_ringbuf_nslot)) {
+ * We don't need refill code in sendcontig because
+ * there is an order where (1) send, (2) it's queued, (3) then ask obtains slots,
+ * (4) then we can refill them here. */
+
+ if (vc_ib->ibcom->local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED &&
+ (msg_type == MPIDI_REQUEST_EAGER_MSG &&
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
+ vc_ib->ibcom->lsr_seq_num_tail) >=
+ vc_ib->ibcom->local_ringbuf_nslot)) {
/* Prevent RDMA-read for rendezvous protocol from issuing ask */
- if(!REQ_FIELD(sreq, ask)) { /* First packet after connection hasn't asked slot */
- /* Transitioning from exclusive to shared and need to issue ask.
- This case is detected because exclusive entries in the queue are deleted
- and deprived of slots of exclusive and the last state is set to
- shared when deciding a transition from exclusive to shared
- and an issued or queued ask must be in the queue or ringbuf_sendq
- when staying shared. */
+ if (!REQ_FIELD(sreq, ask)) { /* First packet after connection hasn't asked slot */
+ /* Transitioning from exclusive to shared and need to issue ask.
+ * This case is detected because exclusive entries in the queue are deleted
+ * and deprived of slots of exclusive and the last state is set to
+ * shared when deciding a transition from exclusive to shared
+ * and an issued or queued ask must be in the queue or ringbuf_sendq
+ * when staying shared. */
dprintf("send_progress,call ask_fetch,%d->%d\n",
MPID_nem_ib_myrank, vc->pg_rank);
mpi_errno = MPID_nem_ib_ringbuf_ask_fetch(vc);
REQ_FIELD(sreq, ask) = 1;
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
"**MPID_nem_ib_ringbuf_ask_fetch");
- } else if(!MPID_nem_ib_ringbuf_sectorq_empty(vc_ib->ibcom->sectorq)) {
- /* Staying shared or transitioning from shared to exclusive.
- We need to consume acquires slots in the latter case.
- Transitioning from shared to exclusive is achieved by
- finding an exlusive entry. */
- MPID_nem_ib_ringbuf_sector_t* sector =
+ }
+ else if (!MPID_nem_ib_ringbuf_sectorq_empty(vc_ib->ibcom->sectorq)) {
+ /* Staying shared or transitioning from shared to exclusive.
+ * We need to consume acquires slots in the latter case.
+ * Transitioning from shared to exclusive is achieved by
+ * finding an exlusive entry. */
+ MPID_nem_ib_ringbuf_sector_t *sector =
MPID_nem_ib_ringbuf_sectorq_head(vc_ib->ibcom->sectorq);
-
+
vc_ib->ibcom->local_ringbuf_type = sector->type;
vc_ib->ibcom->local_ringbuf_start = sector->start;
vc_ib->ibcom->local_ringbuf_nslot = sector->nslot;
vc_ib->ibcom->sseq_num = sector->head;
vc_ib->ibcom->lsr_seq_num_tail = sector->tail;
-
+
MPID_nem_ib_ringbuf_sectorq_dequeue(&vc_ib->ibcom->sectorq, §or);
MPIU_Free(sector);
- dprintf("send_progress,refill,next type=%d,start=%p,local_head=%d,local_tail=%d\n",
- vc_ib->ibcom->local_ringbuf_type,
- vc_ib->ibcom->local_ringbuf_start,
- vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail
- );
+ dprintf
+ ("send_progress,refill,next type=%d,start=%p,local_head=%d,local_tail=%d\n",
+ vc_ib->ibcom->local_ringbuf_type, vc_ib->ibcom->local_ringbuf_start,
+ vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail);
}
}
if (vc_ib->ibcom->ncom >= MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack ||
MPID_nem_ib_ncqe >= MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack ||
- (msg_type == MPIDI_REQUEST_EAGER_MSG &&
+ (msg_type == MPIDI_REQUEST_EAGER_MSG &&
MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
vc_ib->ibcom->lsr_seq_num_tail) >=
vc_ib->ibcom->local_ringbuf_nslot - slack)) {
- /* Exit when full because this reduces the search cost.
- Note that RDMA-read for rendezvous protocol can be issued even
- when no ring-buffer slot is available. */
- goto fn_exit;
+ /* Exit when full because this reduces the search cost.
+ * Note that RDMA-read for rendezvous protocol can be issued even
+ * when no ring-buffer slot is available. */
+ goto fn_exit;
}
if (vc_ib != MPID_nem_ib_debug_current_vc_ib) {
@@ -1087,7 +1097,7 @@ int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
(MPID_nem_ib_lmt_cookie_t *) sreq->dev.iov[1].MPID_IOV_BUF;
dprintf("send_progress,MPIDI_NEM_PKT_LMT_RTS,rsr_seq_num_tail=%d\n",
vc_ib->ibcom->rsr_seq_num_tail);
-#if 0 /* moving to packet header */
+#if 0 /* moving to packet header */
/* embed RDMA-write-to buffer occupancy information */
s_cookie_buf->seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail;
/* remember the last one sent */
@@ -1102,7 +1112,7 @@ int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
(MPID_nem_ib_lmt_cookie_t *) sreq->dev.iov[1].MPID_IOV_BUF;
dprintf("send_progress,MPIDI_NEM_PKT_LMT_CTS,rsr_seq_num_tail=%d\n",
vc_ib->ibcom->rsr_seq_num_tail);
-#if 0 /* moving to packet header */
+#if 0 /* moving to packet header */
/* embed RDMA-write-to buffer occupancy information */
s_cookie_buf->seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail;
/* remember the last one sent */
@@ -1122,8 +1132,8 @@ int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
case MPIDI_NEM_IB_PKT_LMT_GET_DONE:{
#if 0
MPID_nem_ib_pkt_lmt_get_done_t *_done_pkt =
- (MPID_nem_ib_pkt_lmt_get_done_t *) sreq->dev.
- iov[0].MPID_IOV_BUF;
+ (MPID_nem_ib_pkt_lmt_get_done_t *) sreq->dev.iov[0].
+ MPID_IOV_BUF;
dprintf
("send_progress,MPIDI_NEM_IB_PKT_LMT_GET_DONE,rsr_seq_num_tail=%d\n",
vc_ib->ibcom->rsr_seq_num_tail);
@@ -1137,8 +1147,8 @@ int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
}
case MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM:{
MPID_nem_ib_pkt_reply_seq_num_t *_pkt =
- (MPID_nem_ib_pkt_reply_seq_num_t *) sreq->dev.
- iov[0].MPID_IOV_BUF;
+ (MPID_nem_ib_pkt_reply_seq_num_t *) sreq->dev.iov[0].
+ MPID_IOV_BUF;
dprintf
("send_progress,MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM,rsr_seq_num_tail=%d\n",
vc_ib->ibcom->rsr_seq_num_tail);
@@ -1191,7 +1201,8 @@ int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
else if (sreq->kind == MPID_REQUEST_SEND && msg_type == MPIDI_REQUEST_RNDV_MSG) {
}
else {
- dprintf("send_progress,unknown sreq=%p,sreq->kind=%d,msg_type=%d\n", sreq, sreq->kind, msg_type);
+ dprintf("send_progress,unknown sreq=%p,sreq->kind=%d,msg_type=%d\n", sreq,
+ sreq->kind, msg_type);
assert(0);
MPIU_ERR_INTERNALANDJUMP(mpi_errno, "send_progress,unknown type");
}
@@ -1216,7 +1227,7 @@ int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
prev_sreq = sreq;
sreq = MPID_nem_ib_sendq_next(sreq);
next_unlinked:;
- if(!sreq) {
+ if (!sreq) {
dprintf("send_progress,sendq has got empty!\n");
}
} while (sreq);
@@ -1242,7 +1253,7 @@ int MPID_nem_ib_cm_progress()
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
MPID_nem_ib_cm_req_t *sreq, *prev_sreq;
- MPID_nem_ib_cm_cmd_shadow_t* shadow;
+ MPID_nem_ib_cm_cmd_shadow_t *shadow;
int is_established = 0;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_PROGRESS);
@@ -1272,22 +1283,23 @@ int MPID_nem_ib_cm_progress()
if (MPID_nem_ib_diff63(MPID_nem_ib_progress_engine_vt, sreq->retry_decided) <
sreq->retry_backoff) {
#if 0
- dprintf("cm_progress,vt=%ld,retry_decided=%ld,diff=%ld,backoff=%ld\n",
+ dprintf("cm_progress,vt=%ld,retry_decided=%ld,diff=%ld,backoff=%ld\n",
MPID_nem_ib_progress_engine_vt, sreq->retry_decided,
MPID_nem_ib_diff63(MPID_nem_ib_progress_engine_vt, sreq->retry_decided),
sreq->retry_backoff);
#endif
goto next;
}
- dprintf("cm_progress,retry CAS,responder_rank=%d,req=%p,decided=%ld,vt=%ld,backoff=%ld\n",
- sreq->responder_rank, sreq, sreq->retry_decided,
- MPID_nem_ib_progress_engine_vt, sreq->retry_backoff);
+ dprintf
+ ("cm_progress,retry CAS,responder_rank=%d,req=%p,decided=%ld,vt=%ld,backoff=%ld\n",
+ sreq->responder_rank, sreq, sreq->retry_decided,
+ MPID_nem_ib_progress_engine_vt, sreq->retry_backoff);
shadow =
- (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ (MPID_nem_ib_cm_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
shadow->type = sreq->state;
shadow->req = sreq;
- mpi_errno =
- MPID_nem_ib_cm_cas_core(sreq->responder_rank, shadow);
+ mpi_errno = MPID_nem_ib_cm_cas_core(sreq->responder_rank, shadow);
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
"**MPID_nem_ib_cm_connect_cas_core");
break;
@@ -1297,14 +1309,15 @@ int MPID_nem_ib_cm_progress()
* So we replace SYN with CAS_RELEASE, and send. */
/* override req->type */
- ((MPID_nem_ib_cm_cmd_syn_t*)&sreq->cmd)->type = MPID_NEM_IB_CM_CAS_RELEASE;
- ((MPID_nem_ib_cm_cmd_syn_t*)&sreq->cmd)->initiator_rank = MPID_nem_ib_myrank;
+ ((MPID_nem_ib_cm_cmd_syn_t *) & sreq->cmd)->type = MPID_NEM_IB_CM_CAS_RELEASE;
+ ((MPID_nem_ib_cm_cmd_syn_t *) & sreq->cmd)->initiator_rank = MPID_nem_ib_myrank;
/* Initiator does not receive SYNACK and ACK2, so we decrement incoming counter here. */
sreq->ibcom->incoming_connection_tx -= 2;
shadow =
- (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ (MPID_nem_ib_cm_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
/* override req->state */
shadow->type = sreq->state = MPID_NEM_IB_CM_CAS_RELEASE;
@@ -1312,108 +1325,121 @@ int MPID_nem_ib_cm_progress()
dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
mpi_errno =
MPID_nem_ib_cm_cmd_core(sreq->responder_rank, shadow,
- (void *)(&sreq->cmd),
- sizeof(MPID_nem_ib_cm_cmd_synack_t), 1 /* syn:1 */, 0);
+ (void *) (&sreq->cmd),
+ sizeof(MPID_nem_ib_cm_cmd_synack_t), 1 /* syn:1 */ ,
+ 0);
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
"**MPID_nem_ib_cm_send_core");
break;
}
/* The initiator acqurire slot for the responder when sending syn */
- if(MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
- MPID_nem_ib_cm_ringbuf_tail) >= MPID_NEM_IB_CM_NSEG) {
+ if (MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
+ MPID_nem_ib_cm_ringbuf_tail) >= MPID_NEM_IB_CM_NSEG) {
goto next;
}
- ((MPID_nem_ib_cm_cmd_syn_t*)&sreq->cmd)->responder_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
+ ((MPID_nem_ib_cm_cmd_syn_t *) & sreq->cmd)->responder_ringbuf_index =
+ MPID_nem_ib_cm_ringbuf_head;
sreq->responder_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
- ((MPID_nem_ib_cm_cmd_syn_t*)&sreq->cmd)->initiator_rank = MPID_nem_ib_myrank;
+ ((MPID_nem_ib_cm_cmd_syn_t *) & sreq->cmd)->initiator_rank = MPID_nem_ib_myrank;
MPID_nem_ib_cm_ringbuf_head++;
shadow =
- (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ (MPID_nem_ib_cm_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
shadow->type = sreq->state;
shadow->req = sreq;
dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
- mpi_errno =
+ mpi_errno =
MPID_nem_ib_cm_cmd_core(sreq->responder_rank, shadow,
- (void *)(&sreq->cmd),
- sizeof(MPID_nem_ib_cm_cmd_synack_t), 1 /* syn:1 */, 0);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_send_core");
+ (void *) (&sreq->cmd),
+ sizeof(MPID_nem_ib_cm_cmd_synack_t), 1 /* syn:1 */ , 0);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
break;
case MPID_NEM_IB_CM_CAS_RELEASE:
- ((MPID_nem_ib_cm_cmd_syn_t*)&sreq->cmd)->initiator_rank = MPID_nem_ib_myrank;
+ ((MPID_nem_ib_cm_cmd_syn_t *) & sreq->cmd)->initiator_rank = MPID_nem_ib_myrank;
shadow =
- (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ (MPID_nem_ib_cm_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
shadow->type = sreq->state;
shadow->req = sreq;
dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
mpi_errno =
MPID_nem_ib_cm_cmd_core(sreq->responder_rank, shadow,
- (void *)(&sreq->cmd),
- sizeof(MPID_nem_ib_cm_cmd_synack_t), 1 /* syn:1 */, 0);
+ (void *) (&sreq->cmd),
+ sizeof(MPID_nem_ib_cm_cmd_synack_t), 1 /* syn:1 */ , 0);
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
"**MPID_nem_ib_cm_send_core");
break;
case MPID_NEM_IB_CM_SYNACK:
/* The responder acquire slot for the initiator when sending synack */
- if(MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
- MPID_nem_ib_cm_ringbuf_tail) >= MPID_NEM_IB_CM_NSEG) {
+ if (MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
+ MPID_nem_ib_cm_ringbuf_tail) >= MPID_NEM_IB_CM_NSEG) {
goto next;
}
- ((MPID_nem_ib_cm_cmd_synack_t*)&sreq->cmd)->initiator_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
+ ((MPID_nem_ib_cm_cmd_synack_t *) & sreq->cmd)->initiator_ringbuf_index =
+ MPID_nem_ib_cm_ringbuf_head;
sreq->initiator_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
MPID_nem_ib_cm_ringbuf_head++;
shadow =
- (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ (MPID_nem_ib_cm_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
shadow->type = sreq->state;
shadow->req = sreq;
dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
- mpi_errno =
+ mpi_errno =
MPID_nem_ib_cm_cmd_core(sreq->initiator_rank, shadow,
- (void *)(&sreq->cmd),
- sizeof(MPID_nem_ib_cm_cmd_synack_t), 0, sreq->ringbuf_index);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_send_core");
+ (void *) (&sreq->cmd),
+ sizeof(MPID_nem_ib_cm_cmd_synack_t), 0,
+ sreq->ringbuf_index);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
break;
case MPID_NEM_IB_CM_ACK1:
shadow =
- (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ (MPID_nem_ib_cm_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
shadow->type = sreq->state;
shadow->req = sreq;
dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
- mpi_errno =
+ mpi_errno =
MPID_nem_ib_cm_cmd_core(sreq->responder_rank, shadow,
- (void *)(&sreq->cmd),
- sizeof(MPID_nem_ib_cm_cmd_ack1_t), 0, sreq->ringbuf_index);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_send_core");
+ (void *) (&sreq->cmd),
+ sizeof(MPID_nem_ib_cm_cmd_ack1_t), 0,
+ sreq->ringbuf_index);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
break;
case MPID_NEM_IB_CM_ACK2:
shadow =
- (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ (MPID_nem_ib_cm_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
shadow->type = sreq->state;
shadow->req = sreq;
dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
- mpi_errno =
+ mpi_errno =
MPID_nem_ib_cm_cmd_core(sreq->initiator_rank, shadow,
- (void *)(&sreq->cmd),
- sizeof(MPID_nem_ib_cm_cmd_ack2_t), 0, sreq->ringbuf_index);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_send_core");
+ (void *) (&sreq->cmd),
+ sizeof(MPID_nem_ib_cm_cmd_ack2_t), 0,
+ sreq->ringbuf_index);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
break;
case MPID_NEM_IB_CM_ALREADY_ESTABLISHED:
case MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING:
shadow =
- (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ (MPID_nem_ib_cm_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
shadow->type = sreq->state;
shadow->req = sreq;
dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
mpi_errno =
MPID_nem_ib_cm_cmd_core(sreq->initiator_rank, shadow,
- (void *)(&sreq->cmd),
- sizeof(MPID_nem_ib_cm_cmd_synack_t), 0, sreq->ringbuf_index);
+ (void *) (&sreq->cmd),
+ sizeof(MPID_nem_ib_cm_cmd_synack_t), 0,
+ sreq->ringbuf_index);
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
"**MPID_nem_ib_cm_send_core");
break;
@@ -1455,7 +1481,7 @@ int MPID_nem_ib_cm_progress()
break;
}
goto next_unlinked;
- next:
+ next:
prev_sreq = sreq;
sreq = MPID_nem_ib_cm_sendq_next(sreq);
next_unlinked:;
@@ -1474,7 +1500,7 @@ int MPID_nem_ib_cm_progress()
#define FUNCNAME MPID_nem_ib_cm_cas_core
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_cm_cas_core(int rank, MPID_nem_ib_cm_cmd_shadow_t* shadow)
+int MPID_nem_ib_cm_cas_core(int rank, MPID_nem_ib_cm_cmd_shadow_t * shadow)
{
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
@@ -1496,9 +1522,8 @@ int MPID_nem_ib_cm_cas_core(int rank, MPID_nem_ib_cm_cmd_shadow_t* shadow)
MPID_nem_ib_ncqe_scratch_pad += 1;
/* Direct poll to drain CQ to check CAS result */
- MPID_nem_ib_ncqe_scratch_pad_to_drain += 1;
- dprintf("ringbuf_cm_cas_core,scratch_pad_to_drain=%d\n",
- MPID_nem_ib_ncqe_scratch_pad_to_drain);
+ MPID_nem_ib_ncqe_scratch_pad_to_drain += 1;
+ dprintf("ringbuf_cm_cas_core,scratch_pad_to_drain=%d\n", MPID_nem_ib_ncqe_scratch_pad_to_drain);
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_CAS_CORE);
@@ -1523,7 +1548,7 @@ int MPID_nem_ib_cm_cas(MPIDI_VC_t * vc, uint32_t ask_on_connect)
dprintf("cm_cas,enter\n");
/* Prepare request structure for enqueued case */
- MPID_nem_ib_cm_req_t* req = MPIU_Malloc(sizeof(MPID_nem_ib_cm_req_t));
+ MPID_nem_ib_cm_req_t *req = MPIU_Malloc(sizeof(MPID_nem_ib_cm_req_t));
MPIU_ERR_CHKANDJUMP(!req, mpi_errno, MPI_ERR_OTHER, "**malloc");
dprintf("req=%p\n", req);
req->state = MPID_NEM_IB_CM_CAS;
@@ -1533,29 +1558,29 @@ int MPID_nem_ib_cm_cas(MPIDI_VC_t * vc, uint32_t ask_on_connect)
req->responder_rank = vc->pg_rank;
req->ask_on_connect = ask_on_connect;
ibcom_errno =
- MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
- &req->ibcom);
+ MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[vc->pg_rank], &req->ibcom);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
dprintf("req->ibcom=%p\n", req->ibcom);
/* Increment transaction counter here because cm_cas is called only once
- (cm_cas_core might be called more than once when retrying) */
+ * (cm_cas_core might be called more than once when retrying) */
req->ibcom->outstanding_connection_tx += 1;
dprintf("cm_cas,tx=%d\n", req->ibcom->outstanding_connection_tx);
/* Acquire remote scratch pad */
if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
- MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head, MPID_nem_ib_cm_ringbuf_tail) < MPID_NEM_IB_CM_NSEG) {
- MPID_nem_ib_cm_cmd_shadow_t * shadow =
- (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
+ MPID_nem_ib_cm_ringbuf_tail) < MPID_NEM_IB_CM_NSEG) {
+ MPID_nem_ib_cm_cmd_shadow_t *shadow =
+ (MPID_nem_ib_cm_cmd_shadow_t *) MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
shadow->type = req->state;
shadow->req = req;
-
+
mpi_errno = MPID_nem_ib_cm_cas_core(req->responder_rank, shadow);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_cas");
- } else {
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_cas");
+ }
+ else {
dprintf("cm_cas,enqueue\n");
req->retry_decided = MPID_nem_ib_progress_engine_vt;
MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
@@ -1573,7 +1598,8 @@ int MPID_nem_ib_cm_cas(MPIDI_VC_t * vc, uint32_t ask_on_connect)
#define FUNCNAME MPID_nem_ib_cm_cmd_core
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_cm_cmd_core(int rank, MPID_nem_ib_cm_cmd_shadow_t* shadow, void* buf, MPIDI_msg_sz_t sz, uint32_t syn, uint16_t ringbuf_index)
+int MPID_nem_ib_cm_cmd_core(int rank, MPID_nem_ib_cm_cmd_shadow_t * shadow, void *buf,
+ MPIDI_msg_sz_t sz, uint32_t syn, uint16_t ringbuf_index)
{
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
@@ -1593,37 +1619,37 @@ int MPID_nem_ib_cm_cmd_core(int rank, MPID_nem_ib_cm_cmd_shadow_t* shadow, void*
syn ? MPID_NEM_IB_CM_OFF_SYN :
MPID_NEM_IB_CM_OFF_CMD +
sizeof(MPID_nem_ib_cm_cmd_t) *
- ((uint16_t)(ringbuf_index % MPID_NEM_IB_CM_NSEG)),
- sz,
- buf,
- &(shadow->buf_from), &(shadow->buf_from_sz));
+ ((uint16_t) (ringbuf_index % MPID_NEM_IB_CM_NSEG)),
+ sz, buf, &(shadow->buf_from), &(shadow->buf_from_sz));
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_put_scratch_pad");
MPID_nem_ib_ncqe_scratch_pad += 1;
- if(syn) {
+ if (syn) {
/* Skip QP createion on race condition */
- if(!(VC_FIELD(MPID_nem_ib_conns[rank].vc, connection_state) &
- MPID_NEM_IB_CM_LOCAL_QP_RESET)) {
+ if (!(VC_FIELD(MPID_nem_ib_conns[rank].vc, connection_state) &
+ MPID_NEM_IB_CM_LOCAL_QP_RESET)) {
/* Prepare QP (RESET). Attempting to overlap it with preparing QP (RESET) on the responder side */
- ibcom_errno = MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC, &MPID_nem_ib_conns[rank].fd);
+ ibcom_errno =
+ MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC, &MPID_nem_ib_conns[rank].fd);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
- VC_FIELD(MPID_nem_ib_conns[rank].vc, connection_state) |=
- MPID_NEM_IB_CM_LOCAL_QP_RESET;
-
+ VC_FIELD(MPID_nem_ib_conns[rank].vc, connection_state) |= MPID_NEM_IB_CM_LOCAL_QP_RESET;
+
/* Store pointer to MPID_nem_ib_com */
- ibcom_errno = MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[rank].fd,
- &VC_FIELD(MPID_nem_ib_conns[rank].vc, ibcom));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
-
+ ibcom_errno = MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[rank].fd,
+ &VC_FIELD(MPID_nem_ib_conns[rank].vc,
+ ibcom));
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_com_obtain_pointer");
+
/* Allocate RDMA-write-to ring-buf for remote */
mpi_errno = MPID_nem_ib_ringbuf_alloc(MPID_nem_ib_conns[rank].vc);
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ring_alloc");
}
}
-
+
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_CMD_CORE);
return mpi_errno;
@@ -1637,7 +1663,8 @@ int MPID_nem_ib_cm_cmd_core(int rank, MPID_nem_ib_cm_cmd_shadow_t* shadow, void*
#define FUNCNAME MPID_nem_ib_ringbuf_ask_fetch_core
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_ringbuf_ask_fetch_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t* shadow, MPIDI_msg_sz_t sz)
+int MPID_nem_ib_ringbuf_ask_fetch_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t * shadow,
+ MPIDI_msg_sz_t sz)
{
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
@@ -1651,14 +1678,13 @@ int MPID_nem_ib_ringbuf_ask_fetch_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_
MPID_nem_ib_com_get_scratch_pad(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
(uint64_t) shadow,
MPID_NEM_IB_RINGBUF_OFF_HEAD,
- sz,
- &shadow->buf_from, &shadow->buf_from_sz);
+ sz, &shadow->buf_from, &shadow->buf_from_sz);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_get_scratch_pad");
MPID_nem_ib_ncqe_scratch_pad += 1;
/* Direct poll to drain CQ to issue CAS */
- MPID_nem_ib_ncqe_scratch_pad_to_drain += 1;
+ MPID_nem_ib_ncqe_scratch_pad_to_drain += 1;
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_FETCH_CORE);
@@ -1683,45 +1709,44 @@ int MPID_nem_ib_ringbuf_ask_fetch(MPIDI_VC_t * vc)
dprintf("ringbuf_ask_fetch,enter\n");
/* Prepare state of ask-send */
- MPID_nem_ib_ringbuf_req_t* req = MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_req_t));
+ MPID_nem_ib_ringbuf_req_t *req = MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_req_t));
MPIU_ERR_CHKANDJUMP(!req, mpi_errno, MPI_ERR_OTHER, "**malloc");
dprintf("ask_fetch,req=%p\n", req);
req->state = MPID_NEM_IB_RINGBUF_ASK_FETCH;
req->retry_backoff = 0;
req->vc = vc;
ibcom_errno =
- MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
- &req->ibcom);
+ MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[vc->pg_rank], &req->ibcom);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
dprintf("ask_fetch,connection=%08x,ncqe=%d,ncom=%d,guard=%d\n",
VC_FIELD(vc, connection_state),
MPID_nem_ib_ncqe_scratch_pad,
- req->ibcom->ncom_scratch_pad,
- VC_FIELD(vc, ibcom->ask_guard)
- );
+ req->ibcom->ncom_scratch_pad, VC_FIELD(vc, ibcom->ask_guard)
+);
/* Acquire remote scratch pad */
if (VC_FIELD(vc, connection_state) == MPID_NEM_IB_CM_ESTABLISHED &&
MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
!VC_FIELD(vc, ibcom->ask_guard)) {
-
- /* Let the guard up here to prevent CAS conflicts between consecutive asks
- from the same process */
+
+ /* Let the guard up here to prevent CAS conflicts between consecutive asks
+ * from the same process */
VC_FIELD(vc, ibcom->ask_guard) = 1;
- MPID_nem_ib_ringbuf_cmd_shadow_t * shadow =
- (MPID_nem_ib_ringbuf_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_cmd_shadow_t));
+ MPID_nem_ib_ringbuf_cmd_shadow_t *shadow =
+ (MPID_nem_ib_ringbuf_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_cmd_shadow_t));
shadow->type = req->state;
shadow->req = req;
-
- mpi_errno =
+
+ mpi_errno =
MPID_nem_ib_ringbuf_ask_fetch_core(req->vc, shadow,
sizeof(MPID_nem_ib_ringbuf_headtail_t));
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_ringbuf_ask_fetch");
- } else {
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_ask_fetch");
+ }
+ else {
dprintf("ask_fetch,enqueue,req=%p\n", req);
MPID_nem_ib_ringbuf_sendq_enqueue(&MPID_nem_ib_ringbuf_sendq, req);
}
@@ -1737,7 +1762,8 @@ int MPID_nem_ib_ringbuf_ask_fetch(MPIDI_VC_t * vc)
#define FUNCNAME MPID_nem_ib_ringbuf_ask_cas_core
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_ringbuf_ask_cas_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t* shadow, uint64_t head)
+int MPID_nem_ib_ringbuf_ask_cas_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t * shadow,
+ uint64_t head)
{
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
@@ -1753,18 +1779,17 @@ int MPID_nem_ib_ringbuf_ask_cas_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_sh
MPID_nem_ib_com_cas_scratch_pad(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
(uint64_t) shadow,
MPID_NEM_IB_RINGBUF_OFF_HEAD,
- head, head + 1,
- &shadow->buf_from, &shadow->buf_from_sz);
+ head, head + 1, &shadow->buf_from, &shadow->buf_from_sz);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_cas_scratch_pad");
MPID_nem_ib_ncqe_scratch_pad += 1;
/* Direct poll to drain CQ to check CAS result */
- MPID_nem_ib_ncqe_scratch_pad_to_drain += 1;
+ MPID_nem_ib_ncqe_scratch_pad_to_drain += 1;
dprintf("ringbuf_ask_cas_core,scratch_pad_to_drain=%d\n",
MPID_nem_ib_ncqe_scratch_pad_to_drain);
- /* Let the guard down here to overlap CAS with a fetch of the following request
- when CAS fails, out-of-order acquire may happen, but it's OK */
+ /* Let the guard down here to overlap CAS with a fetch of the following request
+ * when CAS fails, out-of-order acquire may happen, but it's OK */
VC_FIELD(vc, ibcom->ask_guard) = 0;
fn_exit:
@@ -1778,7 +1803,7 @@ int MPID_nem_ib_ringbuf_ask_cas_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_sh
#define FUNCNAME MPID_nem_ib_ringbuf_ask_cas
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_ringbuf_ask_cas(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_req_t* req)
+int MPID_nem_ib_ringbuf_ask_cas(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_req_t * req)
{
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
@@ -1793,30 +1818,32 @@ int MPID_nem_ib_ringbuf_ask_cas(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_req_t* req)
req->fetched.head, req->fetched.tail,
MPID_nem_ib_diff16(req->fetched.head, req->fetched.tail),
VC_FIELD(vc, ibcom->local_ringbuf_nslot)
- );
+);
/* Acquire one slot of the shared ring buffer */
if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY) {
-
- if(MPID_nem_ib_diff16(req->fetched.head, req->fetched.tail) <
- VC_FIELD(vc, ibcom->local_ringbuf_nslot)) {
+
+ if (MPID_nem_ib_diff16(req->fetched.head, req->fetched.tail) <
+ VC_FIELD(vc, ibcom->local_ringbuf_nslot)) {
dprintf("ask_cas,core\n");
req->state = MPID_NEM_IB_RINGBUF_ASK_CAS;
- MPID_nem_ib_ringbuf_cmd_shadow_t * shadow =
- (MPID_nem_ib_ringbuf_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_cmd_shadow_t));
+ MPID_nem_ib_ringbuf_cmd_shadow_t *shadow =
+ (MPID_nem_ib_ringbuf_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_cmd_shadow_t));
shadow->type = req->state;
shadow->req = req;
- mpi_errno = MPID_nem_ib_ringbuf_ask_cas_core(vc, shadow, (uint64_t)req->fetched.head);
+ mpi_errno = MPID_nem_ib_ringbuf_ask_cas_core(vc, shadow, (uint64_t) req->fetched.head);
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
"**MPID_nem_ib_ringbuf_ask_cas");
- } else {
+ }
+ else {
dprintf("ask_cas,ringbuf full,enqueue\n");
/* Ring-buffer is full */
/* Let the guard down so that this ask-fetch can be issued in ringbuf_progress */
-#if 0 /*debug*/
+#if 0 /*debug */
VC_FIELD(vc, ibcom->ask_guard) = 0;
#endif
/* Retry from fetch */
@@ -1828,7 +1855,8 @@ int MPID_nem_ib_ringbuf_ask_cas(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_req_t* req)
/* Make the ask-fetch in order */
MPID_nem_ib_ringbuf_sendq_enqueue_at_head(&MPID_nem_ib_ringbuf_sendq, req);
}
- } else {
+ }
+ else {
dprintf("ask_cas,ncqe or ncom full,enqueue\n");
req->retry_decided = MPID_nem_ib_progress_engine_vt;
req->retry_backoff = 0;
@@ -1851,8 +1879,8 @@ int MPID_nem_ib_ringbuf_progress()
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
MPID_nem_ib_ringbuf_req_t *sreq, *prev_sreq;
- MPID_nem_ib_ringbuf_cmd_shadow_t* shadow;
-
+ MPID_nem_ib_ringbuf_cmd_shadow_t *shadow;
+
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_PROGRESS);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_PROGRESS);
@@ -1870,21 +1898,22 @@ int MPID_nem_ib_ringbuf_progress()
switch (sreq->state) {
case MPID_NEM_IB_RINGBUF_ASK_CAS:
- dprintf("ringbuf_progress,ask_cas,req=%p\n",
- sreq);
+ dprintf("ringbuf_progress,ask_cas,req=%p\n", sreq);
shadow =
- (MPID_nem_ib_ringbuf_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_cmd_shadow_t));
+ (MPID_nem_ib_ringbuf_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_cmd_shadow_t));
shadow->type = sreq->state;
shadow->req = sreq;
mpi_errno =
- MPID_nem_ib_ringbuf_ask_cas_core(sreq->vc, shadow, (uint64_t)sreq->fetched.head);
+ MPID_nem_ib_ringbuf_ask_cas_core(sreq->vc, shadow,
+ (uint64_t) sreq->fetched.head);
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
"**MPID_nem_ib_ringbuf_connect_cas_core");
break;
case MPID_NEM_IB_RINGBUF_ASK_FETCH:
if (MPID_nem_ib_diff63(MPID_nem_ib_progress_engine_vt, sreq->retry_decided) <
sreq->retry_backoff) {
- dprintf("ringbuf_progress,vt=%ld,retry_decided=%ld,diff=%ld,backoff=%ld\n",
+ dprintf("ringbuf_progress,vt=%ld,retry_decided=%ld,diff=%ld,backoff=%ld\n",
MPID_nem_ib_progress_engine_vt, sreq->retry_decided,
MPID_nem_ib_diff63(MPID_nem_ib_progress_engine_vt, sreq->retry_decided),
sreq->retry_backoff);
@@ -1898,18 +1927,18 @@ int MPID_nem_ib_ringbuf_progress()
if (VC_FIELD(sreq->vc, ibcom->ask_guard)) {
goto next;
}
- dprintf("ringbuf_progress,ask_fetch,req=%p\n",
- sreq);
+ dprintf("ringbuf_progress,ask_fetch,req=%p\n", sreq);
VC_FIELD(sreq->vc, ibcom->ask_guard) = 1;
shadow =
- (MPID_nem_ib_ringbuf_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_cmd_shadow_t));
+ (MPID_nem_ib_ringbuf_cmd_shadow_t *)
+ MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_cmd_shadow_t));
shadow->type = sreq->state;
shadow->req = sreq;
- mpi_errno =
+ mpi_errno =
MPID_nem_ib_ringbuf_ask_fetch_core(sreq->vc, shadow,
sizeof(MPID_nem_ib_ringbuf_headtail_t));
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_ringbuf_send_core");
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_ringbuf_send_core");
}
break;
default:
@@ -1923,7 +1952,8 @@ int MPID_nem_ib_ringbuf_progress()
MPID_nem_ib_ringbuf_sendq_next(prev_sreq) = MPID_nem_ib_ringbuf_sendq_next(sreq);
}
else {
- MPID_nem_ib_ringbuf_sendq_head(MPID_nem_ib_ringbuf_sendq) = MPID_nem_ib_ringbuf_sendq_next(sreq);
+ MPID_nem_ib_ringbuf_sendq_head(MPID_nem_ib_ringbuf_sendq) =
+ MPID_nem_ib_ringbuf_sendq_next(sreq);
}
if (MPID_nem_ib_ringbuf_sendq_next(sreq) == NULL) {
MPID_nem_ib_ringbuf_sendq.tail = prev_sreq;
@@ -1934,7 +1964,7 @@ int MPID_nem_ib_ringbuf_progress()
sreq = MPID_nem_ib_ringbuf_sendq_next(sreq);
goto next_unlinked;
- next:
+ next:
prev_sreq = sreq;
sreq = MPID_nem_ib_ringbuf_sendq_next(sreq);
next_unlinked:;
http://git.mpich.org/mpich.git/commitdiff/c3e1d60bdd4f5f94b357f46e2bba7ba64b2d1971
commit c3e1d60bdd4f5f94b357f46e2bba7ba64b2d1971
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date: Tue May 20 15:20:35 2014 +0900
Fix the management of RDMA-write ringbuffer
Signed-off-by: Pavan Balaji <balaji at anl.gov>
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
index abe7922..951768a 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
@@ -1262,6 +1262,8 @@ int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data)
headtail->tail, headtail->head);
}
vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64] &= ~(1ULL << (index_tail & 63));
+
+ index_tail = (uint16_t)(index_tail + 1) % vc_ib->ibcom->remote_ringbuf->nslot;
}
else {
break;
http://git.mpich.org/mpich.git/commitdiff/6caab150572e0ea65ade269096ec9a28525e68dc
commit 6caab150572e0ea65ade269096ec9a28525e68dc
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date: Tue May 20 14:44:23 2014 +0900
Improve on-demand connection management for IB
Add some types of connection protocol, which are used depending on a
connection situation.
Fix the management of ib_cm_ringbuf.
Signed-off-by: Pavan Balaji <balaji at anl.gov>
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
index 704dbf4..6f954a2 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
@@ -388,6 +388,8 @@ static int MPID_nem_ib_com_clean(MPID_nem_ib_com_t * conp)
dprintf("ibcom,destroy MPID_nem_ib_rc_shared_rcq\n");
ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_rcq);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq failed\n"));
+
+ MPID_nem_ib_rc_shared_rcq = NULL;
}
#if 0 /* It's not used */
retval = munmap(conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM], MPID_NEM_IB_COM_RDMABUF_SZ);
@@ -2028,7 +2030,8 @@ int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset
/* Use inline so that we don't need to worry about overwriting write-from buffer */
// assert(sz <= conp->max_inline_data);
- assert(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] == laddr);
+ /* When cm_progress calls this function, 'comp->icom_mem' and 'laddr' are not equal. */
+// assert(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] == laddr);
// memcpy(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM], laddr, sz);
/* Instead of using the pre-mmaped memory (comp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM]),
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
index 05fa8f3..4cea111 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
@@ -483,6 +483,7 @@ typedef struct MPID_nem_ib_com {
wait until all the onnection request transactions ends before
freeing scratch-pad QP.*/
int outstanding_connection_tx;
+ int incoming_connection_tx;
} MPID_nem_ib_com_t;
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
index 32f18e5..4f4d99f 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
@@ -105,6 +105,9 @@ typedef GENERIC_Q_DECL(struct MPID_Request) MPID_nem_ib_lmtq_t;
#define MPID_NEM_IB_CM_LOCAL_QP_RTS 8
#define MPID_NEM_IB_CM_ESTABLISHED 15
+#define is_conn_established(rank) \
+ (VC_FIELD(MPID_nem_ib_conns[rank].vc, connection_state) == MPID_NEM_IB_CM_ESTABLISHED)
+
typedef struct {
char *data;
int length;
@@ -120,7 +123,10 @@ enum MPID_nem_ib_cm_cmd_types {
MPID_NEM_IB_CM_ACK1,
MPID_NEM_IB_CM_ACK2,
MPID_NEM_IB_RINGBUF_ASK_FETCH,
- MPID_NEM_IB_RINGBUF_ASK_CAS
+ MPID_NEM_IB_RINGBUF_ASK_CAS,
+ MPID_NEM_IB_CM_CAS_RELEASE,
+ MPID_NEM_IB_CM_ALREADY_ESTABLISHED,
+ MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING
};
/* Packet types of connection protocol */
@@ -263,6 +269,12 @@ typedef GENERIC_Q_DECL(MPID_nem_ib_cm_req_t) MPID_nem_ib_cm_sendq_t;
(cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
}
+#define MPID_NEM_IB_CM_COMPOSE_CAS_RELEASE(cmd, req) { \
+ (cmd)->type = MPID_NEM_IB_CM_CAS_RELEASE; \
+ (cmd)->initiator_req = (req); \
+ (cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
+}
+
#define MPID_NEM_IB_CM_COMPOSE_SYNACK(cmd, req, _initiator_req) { \
(cmd)->type = MPID_NEM_IB_CM_SYNACK; \
MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO((cmd), (req)->initiator_rank); \
@@ -273,6 +285,13 @@ typedef GENERIC_Q_DECL(MPID_nem_ib_cm_req_t) MPID_nem_ib_cm_sendq_t;
(cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
}
+#define MPID_NEM_IB_CM_COMPOSE_END_CM(cmd, req, _initiator_req, _type) { \
+ (cmd)->type = _type; \
+ (cmd)->initiator_req = (_initiator_req); \
+ (cmd)->responder_req = (req); \
+ (cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
+}
+
#define MPID_NEM_IB_CM_COMPOSE_ACK1(cmd, req, _responder_req) { \
(cmd)->type = MPID_NEM_IB_CM_ACK1; \
MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO((cmd), (req)->responder_rank); \
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
index 6c6da16..8014b1c 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
@@ -590,6 +590,9 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
dprintf("init,fd[%d]=%d\n", i, MPID_nem_ib_conns[i].fd);
}
#endif
+#else /* define(MPID_NEM_IB_ONDEMAND) */
+ /* We need to communicate with all other ranks in close sequence. */
+ MPID_nem_ib_conns_ref_count = MPID_nem_ib_nranks - 1;
#endif
MPIU_Free(remote_rank_str);
@@ -1050,7 +1053,8 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
/* Empty sendq */
while (!MPID_nem_ib_sendq_empty(vc_ib->sendq) ||
VC_FIELD(vc, pending_sends) > 0 ||
- MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->outstanding_connection_tx > 0) {
+ MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->outstanding_connection_tx > 0 ||
+ MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->incoming_connection_tx > 0) {
/* mimic ib_poll because vc_terminate might be called from ib_poll_eager */
mpi_errno = MPID_nem_ib_send_progress(vc);
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_progress");
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
index 66be158..abe7922 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
@@ -2066,6 +2066,40 @@ int MPID_nem_ib_cm_drain_scq()
dprintf("cm_drain_scq,cm_cas,succeeded\n");
+ if (is_conn_established(shadow_cm->req->responder_rank)) {
+ /* Connection is already established.
+ * In this case, responder may already have performed vc_terminate.
+ * However, since initiator has to release responder's CAS word,
+ * initiator sends CM_CAS_RELEASE. */
+
+ shadow_cm->req->state = MPID_NEM_IB_CM_CAS_RELEASE;
+ if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
+ shadow_cm->req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY) {
+
+ MPID_nem_ib_cm_cmd_syn_t *cmd = (MPID_nem_ib_cm_cmd_syn_t *) shadow_cm->req->ibcom->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
+ MPID_NEM_IB_CM_COMPOSE_CAS_RELEASE(cmd, shadow_cm->req);
+ cmd->initiator_rank = MPID_nem_ib_myrank;
+
+ MPID_nem_ib_cm_cmd_shadow_t * shadow_syn =
+ (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow_syn->type = shadow_cm->req->state;
+ shadow_syn->req = shadow_cm->req;
+ dprintf("shadow_syn=%p,shadow_syn->req=%p\n", shadow_syn, shadow_syn->req);
+ mpi_errno =
+ MPID_nem_ib_cm_cmd_core(shadow_cm->req->responder_rank, shadow_syn,
+ (void *) cmd, sizeof(MPID_nem_ib_cm_cmd_syn_t),
+ 1 /* syn:1 */, 0);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
+ } else {
+ MPID_NEM_IB_CM_COMPOSE_CAS_RELEASE((MPID_nem_ib_cm_cmd_syn_t *)&(shadow_cm->req->cmd), shadow_cm->req);
+ MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, shadow_cm->req);
+ }
+ }
+ else {
+ /* Increment receiving transaction counter. Initiator receives SYNACK and ACK2 */
+ shadow_cm->req->ibcom->incoming_connection_tx += 2;
+
shadow_cm->req->state = MPID_NEM_IB_CM_SYN;
if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
shadow_cm->req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
@@ -2094,7 +2128,29 @@ int MPID_nem_ib_cm_drain_scq()
MPID_NEM_IB_CM_COMPOSE_SYN((MPID_nem_ib_cm_cmd_syn_t *)&(shadow_cm->req->cmd), shadow_cm->req);
MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, shadow_cm->req);
}
+ }
} else {
+ if (is_conn_established(shadow_cm->req->responder_rank)) {
+ /* CAS is failed, and connection is already established */
+
+ dprintf("cm_drain_scq,cm_cas,connection is already established\n");
+
+ MPID_nem_ib_ncqe_scratch_pad_to_drain -= 1;
+ shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
+ shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
+ MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
+
+ /* Let the guard down to let the following connection request go. */
+ VC_FIELD(MPID_nem_ib_conns[shadow_cm->req->responder_rank].vc, connection_guard) = 0;
+
+ /* free memory : req->ref_count is 2, so call MPIU_Free() directly */
+ //MPID_nem_ib_cm_request_release(shadow_cm->req);
+ MPIU_Free(shadow_cm->req);
+
+ MPIU_Free(shadow_cm);
+ break;
+ }
+
dprintf("cm_drain_scq,cm_cas,retval=%016lx,backoff=%ld\n",
*cas_retval, shadow_cm->req->retry_backoff);
shadow_cm->req->retry_backoff =
@@ -2123,6 +2179,23 @@ int MPID_nem_ib_cm_drain_scq()
MPIU_Free(shadow_cm);
break;
+ case MPID_NEM_IB_CM_CAS_RELEASE:
+ dprintf("cm_drain_scq,syn sent\n");
+ shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
+ shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
+ shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
+ dprintf("cm_drain_scq,tx=%d\n", shadow_cm->req->ibcom->outstanding_connection_tx);
+
+ dprintf("cm_drain_scq,syn,buf_from=%p,sz=%d\n",
+ shadow_cm->buf_from, shadow_cm->buf_from_sz);
+ MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
+
+ /* free memory : req->ref_count is 2, so call MPIU_Free() directly */
+ //MPID_nem_ib_cm_request_release(shadow_cm->req);
+ MPIU_Free(shadow_cm->req);
+
+ MPIU_Free(shadow_cm);
+ break;
case MPID_NEM_IB_CM_SYNACK:
shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
dprintf("cm_drain_scq,synack sent,req=%p,initiator_rank=%d\n",
@@ -2172,6 +2245,28 @@ int MPID_nem_ib_cm_drain_scq()
MPID_nem_ib_cm_request_release(shadow_cm->req);
MPIU_Free(shadow_cm);
break;
+ case MPID_NEM_IB_CM_ALREADY_ESTABLISHED:
+ case MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING:
+ /* These cases mean the end of CM-op, so we do the almost same operation as ack2 */
+ shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
+ dprintf("cm_drain_scq,established or connecting sent,req=%p,initiator_rank=%p=%d\n",
+ shadow_cm->req, &shadow_cm->req->initiator_rank, shadow_cm->req->initiator_rank);
+ shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
+ shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
+ dprintf("cm_drain_scq,tx=%d\n", shadow_cm->req->ibcom->outstanding_connection_tx);
+
+ shadow_cm->req->ibcom->incoming_connection_tx -= 1;
+
+ MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
+
+ /* Let the guard down to let the following connection request go. */
+ VC_FIELD(MPID_nem_ib_conns[shadow_cm->req->initiator_rank].vc, connection_guard) = 0;
+
+ /* Finalize protocol because there is no referer in cm_drain_scq, sendq
+ and cm_poll because cm_poll sent ACK2. */
+ MPID_nem_ib_cm_request_release(shadow_cm->req);
+ MPIU_Free(shadow_cm);
+ break;
case MPID_NEM_IB_RINGBUF_ASK_FETCH:
shadow_ringbuf = (MPID_nem_ib_ringbuf_cmd_shadow_t *) cqe[i].wr_id;
memcpy(&shadow_ringbuf->req->fetched,
@@ -2328,6 +2423,7 @@ int MPID_nem_ib_cm_poll_syn()
switch (*head_flag) {
case MPID_NEM_IB_CM_SYN: {
+ int is_synack = 0;
volatile MPID_nem_ib_cm_cmd_syn_t *syn_tail_flag =
(MPID_nem_ib_cm_cmd_syn_t *) slot;
while (syn_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
@@ -2339,30 +2435,9 @@ int MPID_nem_ib_cm_poll_syn()
dprintf("cm_poll_syn,syn detected!,initiator_rank=%d,ringbuf_index=%d\n",
syn->initiator_rank, syn->responder_ringbuf_index);
- /* Skip QP createion on race condition */
- if(!(VC_FIELD(MPID_nem_ib_conns[syn->initiator_rank].vc, connection_state) &
- MPID_NEM_IB_CM_LOCAL_QP_RESET)) {
- ibcom_errno = MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC, &MPID_nem_ib_conns[syn->initiator_rank].fd);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
- MPID_nem_ib_conns_ref_count++;
- /* store pointer to MPID_nem_ib_com */
- dprintf("cm_poll_syn,initiator fd=%d\n", MPID_nem_ib_conns[syn->initiator_rank].fd);
- ibcom_errno = MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[syn->initiator_rank].fd,
- &VC_FIELD(MPID_nem_ib_conns[syn->initiator_rank].vc, ibcom));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
-
- /* Allocate RDMA-write-to ring-buf for remote */
- mpi_errno = MPID_nem_ib_ringbuf_alloc(MPID_nem_ib_conns[syn->initiator_rank].vc);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_alloc");
- /* Record state transition for race condition detection */
- VC_FIELD(MPID_nem_ib_conns[syn->initiator_rank].vc, connection_state) |=
- MPID_NEM_IB_CM_LOCAL_QP_RESET;
- }
-
MPID_nem_ib_cm_req_t* req = MPIU_Malloc(sizeof(MPID_nem_ib_cm_req_t));
MPIU_ERR_CHKANDJUMP(!req, mpi_errno, MPI_ERR_OTHER, "**malloc");
- req->state = MPID_NEM_IB_CM_SYNACK;
req->ref_count = 1; /* Released when draining SCQ of ACK2 */
req->ringbuf_index = syn->responder_ringbuf_index;
req->initiator_rank = syn->initiator_rank;
@@ -2370,23 +2445,68 @@ int MPID_nem_ib_cm_poll_syn()
ibcom_errno =
MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[req->initiator_rank],
&req->ibcom);
+
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
+ if (is_conn_established(syn->initiator_rank)) {
+ req->state = MPID_NEM_IB_CM_ALREADY_ESTABLISHED;
+ }
+ else if ((MPID_nem_ib_myrank > syn->initiator_rank) && (req->ibcom->outstanding_connection_tx == 1)) {
+ req->state = MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING;
+ }
+ else {
+ /* Skip QP createion on race condition */
+ if(!(VC_FIELD(MPID_nem_ib_conns[syn->initiator_rank].vc, connection_state) &
+ MPID_NEM_IB_CM_LOCAL_QP_RESET)) {
+ ibcom_errno = MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC, &MPID_nem_ib_conns[syn->initiator_rank].fd);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
+
+ /* store pointer to MPID_nem_ib_com */
+ dprintf("cm_poll_syn,initiator fd=%d\n", MPID_nem_ib_conns[syn->initiator_rank].fd);
+ ibcom_errno = MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[syn->initiator_rank].fd,
+ &VC_FIELD(MPID_nem_ib_conns[syn->initiator_rank].vc, ibcom));
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
+
+ /* Allocate RDMA-write-to ring-buf for remote */
+ mpi_errno = MPID_nem_ib_ringbuf_alloc(MPID_nem_ib_conns[syn->initiator_rank].vc);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_alloc");
+
+ /* Record state transition for race condition detection */
+ VC_FIELD(MPID_nem_ib_conns[syn->initiator_rank].vc, connection_state) |=
+ MPID_NEM_IB_CM_LOCAL_QP_RESET;
+ }
+
+ req->state = MPID_NEM_IB_CM_SYNACK;
+ is_synack = 1;
+ }
+
/* Increment transaction counter here because this path is executed only once */
req->ibcom->outstanding_connection_tx += 1;
dprintf("cm_poll_syn,tx=%d\n", req->ibcom->outstanding_connection_tx);
+ /* Increment receiving transaction counter.
+ * In the case of SYNACK, Responder receives ack1
+ * In the case of ALREADY_ESTABLISHED or RESPONDER_IS_CONNECTING,
+ * decrement in cm_drain_scq.
+ */
+ req->ibcom->incoming_connection_tx += 1;
+
if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
MPID_nem_ib_cm_ringbuf_tail) < MPID_NEM_IB_CM_NSEG) {
MPID_nem_ib_cm_cmd_synack_t *cmd = (MPID_nem_ib_cm_cmd_synack_t *) req->ibcom->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
- MPID_NEM_IB_CM_COMPOSE_SYNACK(cmd, req, syn->initiator_req);
- dprintf("cm_poll_syn,composing synack,responder_req=%p,cmd->rmem=%lx,rkey=%08x,ringbuf_nslot=%d,remote_vc=%lx\n",
- cmd->responder_req, cmd->rmem, cmd->rkey, cmd->ringbuf_nslot, cmd->remote_vc);
- cmd->initiator_ringbuf_index = req->initiator_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
- dprintf("cm_poll_syn,giving ringbuf_index=%d\n", cmd->initiator_ringbuf_index);
- MPID_nem_ib_cm_ringbuf_head++;
+ if (is_synack) {
+ MPID_NEM_IB_CM_COMPOSE_SYNACK(cmd, req, syn->initiator_req);
+ dprintf("cm_poll_syn,composing synack,responder_req=%p,cmd->rmem=%lx,rkey=%08x,ringbuf_nslot=%d,remote_vc=%lx\n",
+ cmd->responder_req, cmd->rmem, cmd->rkey, cmd->ringbuf_nslot, cmd->remote_vc);
+ cmd->initiator_ringbuf_index = req->initiator_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
+ dprintf("cm_poll_syn,giving ringbuf_index=%d\n", cmd->initiator_ringbuf_index);
+ MPID_nem_ib_cm_ringbuf_head++;
+ }
+ else {
+ MPID_NEM_IB_CM_COMPOSE_END_CM(cmd, req, syn->initiator_req, req->state);
+ }
MPID_nem_ib_cm_cmd_shadow_t * shadow =
(MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
shadow->type = req->state;
@@ -2397,13 +2517,37 @@ int MPID_nem_ib_cm_poll_syn()
} else {
dprintf("cm_poll_syn,enqueue,ncqe=%d,ncom=%d,head=%d,tail=%d\n", MPID_nem_ib_ncqe_scratch_pad, req->ibcom->ncom_scratch_pad, MPID_nem_ib_cm_ringbuf_head, MPID_nem_ib_cm_ringbuf_tail);
- MPID_NEM_IB_CM_COMPOSE_SYNACK((MPID_nem_ib_cm_cmd_synack_t *)&(req->cmd), req, syn->initiator_req);
+ if (is_synack) {
+ MPID_NEM_IB_CM_COMPOSE_SYNACK((MPID_nem_ib_cm_cmd_synack_t *)&(req->cmd), req, syn->initiator_req);
+ }
+ else {
+ MPID_NEM_IB_CM_COMPOSE_END_CM((MPID_nem_ib_cm_cmd_synack_t *)&(req->cmd), req, syn->initiator_req, req->state);
+ }
MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
}
/* Release CAS word because there's no next write on this syn slot */
*cas_word = MPID_NEM_IB_CM_RELEASED;
}
- //common_tail:
+ goto common_tail;
+ break;
+ case MPID_NEM_IB_CM_CAS_RELEASE: {
+ /* Initiator requests to release CAS word.
+ * Because connection is already established.
+ * In this case, responder may already have performed vc_terminate. */
+
+ volatile MPID_nem_ib_cm_cmd_syn_t *syn_tail_flag =
+ (MPID_nem_ib_cm_cmd_syn_t *) slot;
+ while (syn_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
+ /* __asm__ __volatile__("pause;":::"memory"); */
+ }
+
+ volatile uint64_t *cas_word = (uint64_t *) (MPID_nem_ib_scratch_pad);
+
+ /* release */
+ *cas_word = MPID_NEM_IB_CM_RELEASED;
+ }
+
+ common_tail:
*head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
/* Clear all possible tail-flag slots */
@@ -2429,17 +2573,16 @@ int MPID_nem_ib_cm_poll_syn()
int MPID_nem_ib_cm_release(uint16_t index) {
int mpi_errno = MPI_SUCCESS;
int old_ringbuf_tail = MPID_nem_ib_cm_ringbuf_tail;
+ uint16_t index_slot = index % MPID_NEM_IB_CM_NSEG;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_RELEASE);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_RELEASE);
- /* mark that one buffer has been released */
- MPIU_Assert(0 <= index && index < MPID_NEM_IB_CM_NSEG);
//dprintf("user_data=%p,mem=%p,sub=%08lx,index=%d\n", user_data, vc_ib->ibcom->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO], (unsigned long)user_data - (unsigned long)vc_ib->ibcom->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO], index);
//dprintf("index=%d,released=%016lx\n", index, vc_ib->ibcom->remote_ringbuf->remote_released[index / 64]);
- MPID_nem_ib_cm_ringbuf_released[index / 64] |= (1ULL << (index & 63));
+ MPID_nem_ib_cm_ringbuf_released[index_slot / 64] |= (1ULL << (index_slot & 63));
//dprintf("released[index/64]=%016lx\n", vc_ib->ibcom->remote_ringbuf->remote_released[index / 64]);
- int index_tail = ((uint16_t)(MPID_nem_ib_cm_ringbuf_tail + 1) % MPID_NEM_IB_CM_NSEG);
+ uint16_t index_tail = ((uint16_t)(MPID_nem_ib_cm_ringbuf_tail + 1) % MPID_NEM_IB_CM_NSEG);
//dprintf("tail+1=%d,index_tail=%d\n", vc_ib->ibcom->rsr_seq_num_tail + 1, index_tail);
//dprintf("released=%016lx\n", vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64]);
while (1) {
@@ -2448,6 +2591,8 @@ int MPID_nem_ib_cm_release(uint16_t index) {
MPID_nem_ib_cm_ringbuf_tail++;
MPID_nem_ib_cm_ringbuf_released[index_tail / 64] &= ~(1ULL << (index_tail & 63));
dprintf("MPID_nem_ib_cm_ringbuf_tail,incremented to %d\n", MPID_nem_ib_cm_ringbuf_tail);
+
+ index_tail = (uint16_t)(index_tail + 1) % MPID_NEM_IB_CM_NSEG;
}
else {
break;
@@ -2515,6 +2660,8 @@ int MPID_nem_ib_cm_poll()
synack->responder_req, req->responder_rank, synack->initiator_ringbuf_index,
req->ibcom->outstanding_connection_tx);
+ req->ibcom->incoming_connection_tx -= 1; /* SYNACK */
+
/* Deduct it from the packet */
VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_state) |=
MPID_NEM_IB_CM_REMOTE_QP_RESET;
@@ -2568,7 +2715,56 @@ int MPID_nem_ib_cm_poll()
MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
}
}
- goto common_tail;
+
+ *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
+ /* Clear all possible tail-flag slots */
+ MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
+
+ //goto common_tail;
+ break;
+ case MPID_NEM_IB_CM_ALREADY_ESTABLISHED:
+ case MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING:
+ {
+ volatile MPID_nem_ib_cm_cmd_synack_t *synack_tail_flag =
+ (MPID_nem_ib_cm_cmd_synack_t *) slot;
+ while (synack_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
+ /* __asm__ __volatile__("pause;":::"memory"); */
+ }
+
+ MPID_nem_ib_cm_cmd_synack_t *synack = (MPID_nem_ib_cm_cmd_synack_t *) slot;
+ MPID_nem_ib_cm_req_t* req = (MPID_nem_ib_cm_req_t *) synack->initiator_req;
+
+ dprintf("cm_poll,synack detected!,responder_req=%p,responder_rank=%d,ringbuf_index=%d,tx=%d\n",
+ req->ibcom->outstanding_connection_tx);
+
+ /* These mean the end of CM-op, so decrement here. */
+ req->ibcom->outstanding_connection_tx -= 1;
+ req->ibcom->incoming_connection_tx -= 2;
+
+ /* cm_release calls cm_progress, so we have to clear scratch_pad here. */
+ *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
+ /* Clear all possible tail-flag slots */
+ MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
+
+ /* The initiator release the slot for responder */
+ MPID_nem_ib_cm_release(req->responder_ringbuf_index);
+
+ /* Kick ask-send commands waiting for connection */
+ MPID_nem_ib_ringbuf_progress();
+
+ /* Kick send commands waiting for connection.
+ This might be a dupe when running-ahead transaction kicked it when receiving ACK1. */
+ dprintf("cm_poll,kick progress engine for %d\n", req->responder_rank);
+ MPID_nem_ib_send_progress(MPID_nem_ib_conns[req->responder_rank].vc);
+
+ /* Let the following connection request go */
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_guard) = 0;
+
+ /* free memory : req->ref_count is 2, so call MPIU_Free() directly */
+ //MPID_nem_ib_cm_request_release(req);
+ MPIU_Free(req);
+ }
+ //goto common_tail;
break;
case MPID_NEM_IB_CM_ACK1: {
volatile MPID_nem_ib_cm_cmd_ack1_t *ack1_tail_flag =
@@ -2584,6 +2780,8 @@ int MPID_nem_ib_cm_poll()
ack1->responder_req, req->initiator_rank,
req->ibcom->outstanding_connection_tx);
+ req->ibcom->incoming_connection_tx -= 1; /* ACK1 */
+
/* Deduct it from the packet */
VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, connection_state) |=
(MPID_NEM_IB_CM_REMOTE_QP_RESET | MPID_NEM_IB_CM_REMOTE_QP_RTS);
@@ -2640,6 +2838,11 @@ int MPID_nem_ib_cm_poll()
MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
}
+ /* cm_release calls cm_progress, so we have to clear scratch_pad here. */
+ *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
+ /* Clear all possible tail-flag slots */
+ MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
+
/* The responder release the slot for initiator */
MPID_nem_ib_cm_release(req->initiator_ringbuf_index);
@@ -2651,7 +2854,7 @@ int MPID_nem_ib_cm_poll()
dprintf("cm_poll,kick progress engine for %d\n", req->initiator_rank);
MPID_nem_ib_send_progress(MPID_nem_ib_conns[req->initiator_rank].vc);
}
- goto common_tail;
+ //goto common_tail;
break;
case MPID_NEM_IB_CM_ACK2: {
volatile MPID_nem_ib_cm_cmd_ack2_t *ack2_tail_flag =
@@ -2666,6 +2869,8 @@ int MPID_nem_ib_cm_poll()
req, req->responder_rank,
req->ibcom->outstanding_connection_tx);
+ req->ibcom->incoming_connection_tx -= 1; /* ACK2 */
+
/* Deduct it from the packet */
if(!(VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_state) &
MPID_NEM_IB_CM_REMOTE_QP_RTS)) {
@@ -2676,6 +2881,11 @@ int MPID_nem_ib_cm_poll()
MPID_NEM_IB_CM_REMOTE_QP_RTS;
}
+ /* cm_release calls cm_progress, so we have to clear scratch_pad here. */
+ *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
+ /* Clear all possible tail-flag slots */
+ MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
+
/* The initiator release the slot for responder */
MPID_nem_ib_cm_release(req->responder_ringbuf_index);
@@ -2703,10 +2913,10 @@ int MPID_nem_ib_cm_poll()
Note that there might be one which sent ACK1 in cm_drain_scq. */
MPID_nem_ib_cm_request_release(req);
}
- common_tail:
- *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
- /* Clear all possible tail-flag slots */
- MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
+ //common_tail:
+ //*head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
+ ///* Clear all possible tail-flag slots */
+ //MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
break;
default:
printf("unknown connection command\n");
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
index 0bdce13..e9f4e48 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
@@ -1243,6 +1243,7 @@ int MPID_nem_ib_cm_progress()
int ibcom_errno;
MPID_nem_ib_cm_req_t *sreq, *prev_sreq;
MPID_nem_ib_cm_cmd_shadow_t* shadow;
+ int is_established = 0;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_PROGRESS);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_PROGRESS);
@@ -1261,6 +1262,12 @@ int MPID_nem_ib_cm_progress()
switch (sreq->state) {
case MPID_NEM_IB_CM_CAS:
+ if (is_conn_established(sreq->responder_rank)) {
+ dprintf("cm_progress,cm_cas,connection is already established\n");
+ is_established = 1;
+ break;
+ }
+
/* This comparison is OK if the diff is within 63-bit range */
if (MPID_nem_ib_diff63(MPID_nem_ib_progress_engine_vt, sreq->retry_decided) <
sreq->retry_backoff) {
@@ -1285,12 +1292,42 @@ int MPID_nem_ib_cm_progress()
"**MPID_nem_ib_cm_connect_cas_core");
break;
case MPID_NEM_IB_CM_SYN:
+ if (is_conn_established(sreq->responder_rank)) {
+ /* Connection was established while SYN command was enqueued.
+ * So we replace SYN with CAS_RELEASE, and send. */
+
+ /* override req->type */
+ ((MPID_nem_ib_cm_cmd_syn_t*)&sreq->cmd)->type = MPID_NEM_IB_CM_CAS_RELEASE;
+ ((MPID_nem_ib_cm_cmd_syn_t*)&sreq->cmd)->initiator_rank = MPID_nem_ib_myrank;
+
+ /* Initiator does not receive SYNACK and ACK2, so we decrement incoming counter here. */
+ sreq->ibcom->incoming_connection_tx -= 2;
+
+ shadow =
+ (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+
+ /* override req->state */
+ shadow->type = sreq->state = MPID_NEM_IB_CM_CAS_RELEASE;
+ shadow->req = sreq;
+ dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
+ mpi_errno =
+ MPID_nem_ib_cm_cmd_core(sreq->responder_rank, shadow,
+ (void *)(&sreq->cmd),
+ sizeof(MPID_nem_ib_cm_cmd_synack_t), 1 /* syn:1 */, 0);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
+ break;
+ }
+
+ /* The initiator acquires a slot for the responder when sending syn */
if(MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
MPID_nem_ib_cm_ringbuf_tail) >= MPID_NEM_IB_CM_NSEG) {
goto next;
}
((MPID_nem_ib_cm_cmd_syn_t*)&sreq->cmd)->responder_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
+ sreq->responder_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
+ ((MPID_nem_ib_cm_cmd_syn_t*)&sreq->cmd)->initiator_rank = MPID_nem_ib_myrank;
+
MPID_nem_ib_cm_ringbuf_head++;
shadow =
(MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
@@ -1304,6 +1341,21 @@ int MPID_nem_ib_cm_progress()
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
"**MPID_nem_ib_cm_send_core");
break;
+ case MPID_NEM_IB_CM_CAS_RELEASE:
+ ((MPID_nem_ib_cm_cmd_syn_t*)&sreq->cmd)->initiator_rank = MPID_nem_ib_myrank;
+
+ shadow =
+ (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow->type = sreq->state;
+ shadow->req = sreq;
+ dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
+ mpi_errno =
+ MPID_nem_ib_cm_cmd_core(sreq->responder_rank, shadow,
+ (void *)(&sreq->cmd),
+ sizeof(MPID_nem_ib_cm_cmd_synack_t), 1 /* syn:1 */, 0);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
+ break;
case MPID_NEM_IB_CM_SYNACK:
/* The responder acquires a slot for the initiator when sending synack */
if(MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
@@ -1311,6 +1363,7 @@ int MPID_nem_ib_cm_progress()
goto next;
}
((MPID_nem_ib_cm_cmd_synack_t*)&sreq->cmd)->initiator_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
+ sreq->initiator_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
MPID_nem_ib_cm_ringbuf_head++;
shadow =
(MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
@@ -1350,6 +1403,20 @@ int MPID_nem_ib_cm_progress()
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
"**MPID_nem_ib_cm_send_core");
break;
+ case MPID_NEM_IB_CM_ALREADY_ESTABLISHED:
+ case MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING:
+ shadow =
+ (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow->type = sreq->state;
+ shadow->req = sreq;
+ dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
+ mpi_errno =
+ MPID_nem_ib_cm_cmd_core(sreq->initiator_rank, shadow,
+ (void *)(&sreq->cmd),
+ sizeof(MPID_nem_ib_cm_cmd_synack_t), 0, sreq->ringbuf_index);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
+ break;
default:
dprintf("cm_progress,unknown state=%d\n", sreq->state);
assert(0);
@@ -1370,6 +1437,23 @@ int MPID_nem_ib_cm_progress()
/* save sreq->dev.next (and sreq) because decrementing reference-counter might free sreq */
MPID_nem_ib_cm_req_t *tmp_sreq = sreq;
sreq = MPID_nem_ib_cm_sendq_next(sreq);
+
+ if (is_established) {
+ dprintf("cm_progress,destroy connect-op\n");
+
+ /* don't connect */
+ tmp_sreq->ibcom->outstanding_connection_tx -= 1;
+
+ /* Let the guard down to let the following connection request go. */
+ VC_FIELD(MPID_nem_ib_conns[tmp_sreq->responder_rank].vc, connection_guard) = 0;
+
+ /* free memory : req->ref_count is 2, so call MPIU_Free() directly */
+// MPID_nem_ib_cm_request_release(tmp_sreq);
+ MPIU_Free(tmp_sreq);
+
+ is_established = 0;
+ break;
+ }
goto next_unlinked;
next:
prev_sreq = sreq;
@@ -1525,7 +1609,7 @@ int MPID_nem_ib_cm_cmd_core(int rank, MPID_nem_ib_cm_cmd_shadow_t* shadow, void*
/* Prepare QP (RESET). Attempting to overlap it with preparing QP (RESET) on the responder side */
ibcom_errno = MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC, &MPID_nem_ib_conns[rank].fd);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
- MPID_nem_ib_conns_ref_count++;
+
VC_FIELD(MPID_nem_ib_conns[rank].vc, connection_state) |=
MPID_NEM_IB_CM_LOCAL_QP_RESET;
http://git.mpich.org/mpich.git/commitdiff/85231ee6e0e1c8b2b69039500a8a864abe608b12
commit 85231ee6e0e1c8b2b69039500a8a864abe608b12
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date: Wed May 7 13:46:50 2014 +0900
No need to malloc scratch-pad when nranks is 1
Signed-off-by: Pavan Balaji <balaji at anl.gov>
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
index 9ebdb42..6c6da16 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
@@ -236,6 +236,11 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
memset(MPID_nem_ib_cm_ringbuf_released, 0, (MPID_NEM_IB_CM_NSEG + 63) / 64);
#endif
+ /* no need to malloc scratch-pad when the number of ranks is 1 */
+ if (pg_p->size == 1) {
+ goto fn_exit;
+ }
+
/* malloc scratch-pad fd */
MPIU_CHKPMEM_MALLOC(MPID_nem_ib_scratch_pad_fds, int *, MPID_nem_ib_nranks * sizeof(int),
mpi_errno, "connection table");
http://git.mpich.org/mpich.git/commitdiff/a069e97f945a199ee5737fc727819fd93384a69a
commit a069e97f945a199ee5737fc727819fd93384a69a
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date: Fri May 2 16:57:25 2014 +0900
Fix header size when transmitting by SendNoncontig
Signed-off-by: Pavan Balaji <balaji at anl.gov>
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
index ee81c66..0bdce13 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
@@ -699,6 +699,9 @@ static int MPID_nem_ib_SendNoncontig_core(MPIDI_VC_t * vc, MPID_Request * sreq,
MPIU_Assert(last == sreq->dev.segment_size);
}
+ /* packet handlers assume this */
+ hdr_sz = sizeof(MPIDI_CH3_Pkt_t);
+
/* increment cc because PktHandler_EagerSyncAck, ssend.c, drain_scq decrement it */
if (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_EAGER_SYNC_SEND) {
MPIR_Request_add_ref(sreq);
http://git.mpich.org/mpich.git/commitdiff/92c811d33ba9d12fbf32aacd032c53f1d2934256
commit 92c811d33ba9d12fbf32aacd032c53f1d2934256
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date: Fri May 2 08:56:35 2014 +0900
Replace malloc-free functions when using netmod-IB
We create malloc-free functions to reduce ibv_dereg_mr overhead.
We implement 'malloc' as follows. If there is a memory marked as usable,
we use it and mark as in-use. If not, we allocate memories and mark them
as usable. In the case of small size, we do mmap a large memory area and
divide it into some blocks. In the case of big size, we do mmap by the
size aligned power-of-two.
We implement 'free' as follows. We check the free size by the address.
In the case of small size, we count the number of 'free' called. If all
blocks are freed, we mark them as usable. In the case of big size, we
mark it as usable.
Signed-off-by: Pavan Balaji <balaji at anl.gov>
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/Makefile.mk b/src/mpid/ch3/channels/nemesis/netmod/ib/Makefile.mk
index a357dd1..89ac755 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/Makefile.mk
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/Makefile.mk
@@ -14,7 +14,8 @@ mpi_core_sources += \
src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c \
src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c \
src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c \
- src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+ src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c \
+ src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c
noinst_HEADERS += \
src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h \
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c
new file mode 100644
index 0000000..9458cb2
--- /dev/null
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c
@@ -0,0 +1,472 @@
+#define _GNU_SOURCE 1
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <pthread.h>
+
+//#define __DEBUG__
+
+#ifdef __DEBUG__
+#define dprintf printf
+
+#define NUM_USED(addr, align, size) \
+ (((size_t)addr & ~((size_t)align - 1)) == (size_t)addr) ? \
+ ((size_t)align / size) : \
+ (((size_t)addr & (align - 1)) / size)
+
+#else
+#define dprintf(...)
+#endif
+
+static void _local_malloc_initialize_hook(void);
+
+void (*__malloc_initialize_hook) (void) = _local_malloc_initialize_hook;
+
+static pthread_mutex_t mutex;
+static int __initialized_malloc = 0;
+static int __tunnel_munmap = 0;
+
+#define POOL_MIN_POW (5)
+#define POOL_MAX_POW (14)
+#define PAGE_SIZE (1UL << 12)
+
+#define MMAPED_OFFSET_POW (8)
+#define MMAPED_OFFSET (1UL << MMAPED_OFFSET_POW) // 256byte
+
+#define ARRAY_SIZE (64) // x86_64
+
+#define DEFAULT_POOL_SIZE (1UL << 17) // 128Kbyte
+#define POOL_ALIGN_SIZE (DEFAULT_POOL_SIZE)
+
+#define do_segfault (*(unsigned int*)0 = 0) // segmentation fault
+
+struct free_list {
+ struct free_list *next;
+ struct free_list *prev;
+};
+
+static inline void list_init(struct free_list *head)
+{
+ head->next = head;
+ head->prev = head;
+}
+
+static inline void __list_add(struct free_list *new, struct free_list *prev, struct free_list *next)
+{
+ next->prev = new;
+ new->next = next;
+ new->prev = prev;
+ prev->next = new;
+}
+
+static inline void __list_del(struct free_list *prev, struct free_list *next)
+{
+ next->prev = prev;
+ prev->next = next;
+}
+
+static inline void list_add_head(struct free_list *new, struct free_list *head)
+{
+ __list_add(new, head, head->next);
+}
+
+static inline void list_add_tail(struct free_list *new, struct free_list *head)
+{
+ __list_add(new, head->prev, head);
+}
+
+static inline void list_del(struct free_list *list)
+{
+ __list_del(list->prev, list->next);
+
+ list->prev = NULL;
+ list->next = NULL;
+}
+
+static inline int is_list_empty(struct free_list *list)
+{
+ return (list->next == list) ? 1 : 0;
+}
+
+static struct free_list arena_flist[ARRAY_SIZE];
+
+struct pool_info {
+ struct free_list list; /* 16byte (x86_64) */
+ char *next_pos; /* 8byte (x86_64) */
+ uint16_t size; /* 2byte */
+ uint16_t num; /* 2byte */
+ uint16_t free_num; /* 2byte */
+ uint16_t pow; /* 2byte */
+ uint16_t hole_num; /* 2byte */
+ uint16_t num_per_page; /* 2byte */
+ uint16_t count; /* 2byte */
+}; /* size of 'struct pool_info' must be smaller than MMAPED_OFFSET */
+
+#ifdef __x86_64__
+#define builtin_clz __builtin_clzl
+#define builtin_ctz __builtin_ctzl
+#else
+#define builtin_clz __builtin_clz
+#define builtin_ctz __builtin_ctz
+#endif
+
+/* Get a power of the argument */
+static int powoftwo(size_t val)
+{
+ if (val <= (size_t) (1UL << POOL_MIN_POW))
+ return POOL_MIN_POW;
+
+ int shift_max;
+#if defined(__x86_64__)
+ shift_max = 64;
+#else
+ shift_max = 32;
+#endif
+
+ /* If 'val' is power-of-two, we use 'ctz' */
+
+ return (val & (val - 1UL)) ? (shift_max - builtin_clz(val)) : builtin_ctz(val);
+}
+
+static void *__alloc_mmap(size_t size, size_t align)
+{
+ char *unaligned, *aligned;
+ size_t misaligned;
+ int ret;
+
+ unaligned = mmap(0, size + align, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (unaligned == MAP_FAILED)
+ return NULL;
+
+ misaligned = (size_t) unaligned & ((size_t) align - 1);
+ if (misaligned > 0) {
+ size_t offset = align - misaligned;
+ aligned = unaligned + offset;
+
+ /* munmap : head */
+ __tunnel_munmap = 1;
+ ret = munmap(unaligned, offset);
+ __tunnel_munmap = 0;
+ if (ret)
+ do_segfault;
+ }
+ else {
+ aligned = unaligned;
+ misaligned = align;
+ }
+
+ /* munmap : tail */
+ __tunnel_munmap = 1;
+ ret = munmap(aligned + size, misaligned);
+ __tunnel_munmap = 0;
+ if (ret)
+ do_segfault;
+
+ return (void *) aligned;
+}
+
+static void __init_pool_header_with_hole(struct pool_info *info, int i, int size)
+{
+ info->size = 1 << i;
+ info->hole_num = (MMAPED_OFFSET >> i) + 1;
+ info->num_per_page = PAGE_SIZE >> i;
+ info->num = size >> i;
+ info->free_num = info->hole_num;
+ info->count = info->hole_num;
+ info->pow = i;
+ info->next_pos = (char *) info + info->size * info->hole_num;
+}
+
+static void __init_pool_header(struct pool_info *info, int i, int size)
+{
+ info->size = 1 << i;
+ info->num = size >> i;
+ info->free_num = 1;
+ info->pow = i;
+ info->next_pos = (char *) info + info->size;
+}
+
+static void _local_malloc_initialize_hook(void)
+{
+ int i;
+ char *aligned;
+ size_t size;
+ int count;
+
+ pthread_mutex_init(&mutex, NULL);
+
+ pthread_mutex_lock(&mutex);
+ __initialized_malloc = 1;
+
+ for (i = 0; i < ARRAY_SIZE; i++) {
+ /* init list */
+ list_init(&arena_flist[i]);
+ }
+
+ /* Allocate initial mempool
+ *
+ * We do not use 2^0, ..., 2^(POOL_MIN_POW - 1) byte.
+ */
+
+ /* First, allocate an initial area by one-time mmap() and split it */
+ count = POOL_MAX_POW - POOL_MIN_POW + 1;
+ size = (size_t) DEFAULT_POOL_SIZE; // default pool size is 128k
+
+ aligned = (char *) __alloc_mmap(size * count, POOL_ALIGN_SIZE);
+
+ if (aligned == NULL) {
+ pthread_mutex_unlock(&mutex);
+ return;
+ }
+
+ /* split allocated area */
+ for (i = POOL_MIN_POW; i < POOL_MIN_POW + count; i++) {
+ struct pool_info *info;
+
+ info = (struct pool_info *) aligned;
+
+ if (i <= MMAPED_OFFSET_POW)
+ __init_pool_header_with_hole(info, i, size);
+ else
+ __init_pool_header(info, i, size);
+
+ /* add list tail */
+ list_add_tail(&(info->list), &arena_flist[i]);
+
+ aligned += size;
+ }
+
+ pthread_mutex_unlock(&mutex);
+}
+
+void *malloc(size_t size)
+{
+ int pow;
+ char *ptr = NULL;
+
+ if (!__initialized_malloc && __malloc_initialize_hook)
+ __malloc_initialize_hook();
+
+ pthread_mutex_lock(&mutex);
+
+ pow = powoftwo(size);
+
+ if (pow < 0 || pow >= ARRAY_SIZE)
+ return NULL;
+
+ if (is_list_empty(&arena_flist[pow])) {
+ char *tmp;
+
+ if (pow > POOL_MAX_POW) {
+ /* create memory area by mmap */
+
+ tmp = (char *) __alloc_mmap(((size_t) 1 << pow) + PAGE_SIZE, PAGE_SIZE);
+
+ if (tmp == NULL) {
+ pthread_mutex_unlock(&mutex);
+ return NULL;
+ }
+
+ *(int *) tmp = pow; //store 'power' for free()
+
+ ptr = (char *) tmp + MMAPED_OFFSET;
+
+ dprintf("malloc(%lu) [2^%d] ==> CREATE mmaped %p\n", size, pow, ptr);
+ }
+ else {
+ /* create new pool */
+ struct pool_info *info;
+ size_t alloc_sz = DEFAULT_POOL_SIZE;
+
+ tmp = (char *) __alloc_mmap(alloc_sz, POOL_ALIGN_SIZE);
+
+ if (tmp == NULL) {
+ pthread_mutex_unlock(&mutex);
+ return NULL;
+ }
+
+ info = (struct pool_info *) tmp;
+
+ if (pow <= MMAPED_OFFSET_POW)
+ __init_pool_header_with_hole(info, pow, alloc_sz);
+ else
+ __init_pool_header(info, pow, alloc_sz);
+
+ list_add_tail(&(info->list), &arena_flist[pow]);
+
+ ptr = info->next_pos;
+ info->next_pos += info->size;
+
+ if (pow <= MMAPED_OFFSET_POW)
+ info->count++;
+
+ dprintf("malloc(%lu) [2^%d] ==> CREATE pool %p use = %lu\n", size, pow, ptr,
+ NUM_USED(info->next_pos, POOL_ALIGN_SIZE, info->size));
+ }
+ }
+ else {
+ if (pow > POOL_MAX_POW) {
+ char *head = (char *) arena_flist[pow].next;
+
+ list_del((struct free_list *) head);
+
+ *(int *) head = pow; //store 'power' for free()
+ ptr = (char *) head + MMAPED_OFFSET;
+
+ dprintf("malloc(%lu) [2^%d] ==> USE mmaped %p\n", size, pow, ptr);
+ }
+ else {
+ struct pool_info *info = (struct pool_info *) (arena_flist[pow].next);
+
+ ptr = info->next_pos;
+ info->next_pos += info->size;
+
+ dprintf("malloc(%lu) [2^%d] ==> USE pool %p use = %lu\n", size, pow, ptr,
+ NUM_USED(info->next_pos, POOL_ALIGN_SIZE, info->size));
+
+ /* if 'info->next_pos' is aligned, all blocks are used */
+ if (((size_t) info->next_pos & ~(POOL_ALIGN_SIZE - 1)) == (size_t) info->next_pos) {
+ list_del(&(info->list));
+ }
+ else if (info->pow <= MMAPED_OFFSET_POW) {
+ info->count++;
+
+ if (info->count == info->num_per_page) {
+ info->next_pos += (info->size * info->hole_num);
+ info->count = info->hole_num;
+ info->free_num += info->hole_num;
+ }
+ }
+ }
+ }
+
+ pthread_mutex_unlock(&mutex);
+
+ return ptr;
+}
+
+static inline void free_core(void *addr)
+{
+ pthread_mutex_lock(&mutex);
+
+ if (((size_t) addr & ((size_t) PAGE_SIZE - 1)) == MMAPED_OFFSET) {
+ char *head = (char *) addr - MMAPED_OFFSET;
+ int power = (int) *(int *) head;
+
+ dprintf("free(%p) --> free MMAPED [2^%d]\n", addr, power);
+ list_add_tail((struct free_list *) head, &arena_flist[power]);
+ }
+ else {
+ struct pool_info *info =
+ (struct pool_info *) ((size_t) addr & ~((size_t) POOL_ALIGN_SIZE - 1));
+
+ dprintf("free(%p) --> free POOL [2^%d] %lu / %u / %u (use / free / max)\n",
+ addr, info->pow,
+ NUM_USED(info->next_pos, POOL_ALIGN_SIZE, info->size),
+ info->free_num + 1, info->num);
+
+ info->free_num++;
+ if (info->free_num == info->num) {
+ /* initialize for reuse */
+ if (info->pow <= MMAPED_OFFSET_POW) {
+ info->count = info->hole_num;
+ info->free_num = info->hole_num;
+ info->next_pos = (char *) info + (info->size * info->hole_num);
+ }
+ else {
+ info->free_num = 1;
+ info->next_pos = (char *) info + info->size;
+ }
+
+ list_add_tail(&(info->list), &arena_flist[info->pow]);
+
+ dprintf(" POOL [2^%d] ALL FREED -> add list [%p]\n", info->pow,
+ &arena_flist[info->pow]);
+ }
+ }
+
+ pthread_mutex_unlock(&mutex);
+}
+
+void free(void *addr)
+{
+ if (addr) {
+ free_core(addr);
+ addr = NULL;
+ }
+}
+
+void *realloc(void *addr, size_t size)
+{
+ void *tmp;
+
+ dprintf("realloc(%p, %lu)\n", addr, size);
+
+ tmp = malloc(size);
+
+ if (addr != NULL) {
+ int old_pow, new_pow, power;
+
+ new_pow = powoftwo(size);
+
+ /* get power of 'addr' area */
+ if (((size_t) addr & ((size_t) PAGE_SIZE - 1)) == MMAPED_OFFSET) {
+ char *head = (char *) addr - MMAPED_OFFSET;
+ old_pow = (int) *(int *) head;
+ }
+ else {
+ struct pool_info *info =
+ (struct pool_info *) ((size_t) addr & ~((size_t) POOL_ALIGN_SIZE - 1));
+ old_pow = info->pow;
+ }
+
+ if (old_pow < new_pow)
+ power = old_pow; /* expand */
+ else
+ power = new_pow; /* shrink */
+
+ memcpy((char *) tmp, (char *) addr, (size_t) 1 << power);
+
+ free_core(addr);
+ }
+
+ addr = tmp;
+
+ return tmp;
+}
+
+void *calloc(size_t nmemb, size_t size)
+{
+ size_t total_sz;
+ char *ptr;
+
+ if (!nmemb || !size)
+ return NULL;
+
+ total_sz = nmemb * size;
+ ptr = malloc(total_sz);
+ if (ptr == NULL)
+ return NULL;
+
+ memset(ptr, 0, total_sz);
+
+ return ptr;
+}
+
+int munmap(void *addr, size_t length)
+{
+ if (__tunnel_munmap) {
+ dprintf("munmap(%p, 0x%lx)\n", addr, length);
+
+ return syscall(__NR_munmap, addr, length);
+ }
+ else {
+ /* do nothing */
+ }
+
+ return 0;
+}
http://git.mpich.org/mpich.git/commitdiff/0e7e956869725cf55227711d155b7680c8b783b8
commit 0e7e956869725cf55227711d155b7680c8b783b8
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date: Mon Apr 14 13:18:02 2014 +0900
Initialize the ringbuf memory for re-use
The pre-allocated ringbuf memory may be reused, so we need to initialize
the ringbuf memory when we release it.
Signed-off-by: Pavan Balaji <balaji at anl.gov>
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
index 0ade098..32f18e5 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
@@ -50,6 +50,7 @@ typedef struct {
/* Number of outstanding connection sequence started to eliminate
duplicated connection requests */
uint8_t connection_guard;
+ void *vc_terminate_buf; /* address of ringbuffer which calls vc_terminate */
} MPID_nem_ib_vc_area;
/* macro for secret area in vc */
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
index 2410ae9..9ebdb42 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
@@ -835,6 +835,7 @@ int MPID_nem_ib_vc_init(MPIDI_VC_t * vc)
VC_FIELD(vc, connection_state) = MPID_NEM_IB_CM_CLOSED;
VC_FIELD(vc, connection_guard) = 0;
#endif
+ VC_FIELD(vc, vc_terminate_buf) = NULL;
/* rank is sent as wr_id and used to obtain vc in poll */
MPID_nem_ib_conns[vc->pg_rank].vc = vc;
@@ -995,6 +996,30 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
* and control transactions always proceed after receiving a reply */
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
+ /* store address of ringbuffer to clear in poll_eager */
+ uint16_t remote_poll = 0;
+ MPID_nem_ib_ringbuf_t *ringbuf;
+ ringbuf = VC_FIELD(vc, ibcom->remote_ringbuf);
+
+ switch (ringbuf->type) {
+ case MPID_NEM_IB_RINGBUF_EXCLUSIVE:
+ remote_poll = VC_FIELD(vc, ibcom->rsr_seq_num_poll);
+ break;
+ case MPID_NEM_IB_RINGBUF_SHARED:
+ remote_poll = MPID_nem_ib_remote_poll_shared;
+ break;
+ default: /* FIXME */
+ printf("unknown ringbuf->type\n");
+ break;
+ }
+
+ /* Decrement because we increment this value in eager_poll. */
+ remote_poll--;
+
+ VC_FIELD(vc, vc_terminate_buf) =
+ (uint8_t *) ringbuf->start +
+ MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t)(remote_poll % ringbuf->nslot));
+
dprintf("vc_terminate,before,%d->%d,diff-rsr=%d,l diff-lsr=%d,sendq_empty=%d,ncqe=%d,pending_sends=%d\n",
MPID_nem_ib_myrank, vc->pg_rank,
MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
@@ -1102,6 +1127,7 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
MPIU_ERR_POP(mpi_errno);
}
+#if 0 /* We move this code to the end of poll_eager. */
/* Destroy VC QP */
/* Destroy ring-buffer */
@@ -1151,6 +1177,7 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
MPIU_Free(MPID_nem_ib_scratch_pad_fds);
MPIU_Free(MPID_nem_ib_scratch_pad_ibcoms);
}
+#endif
dprintf("vc_terminate,exit\n");
fn_exit:
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
index f36c3ca..66be158 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
@@ -649,7 +649,8 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
* because MPID_nem_handle_pkt releases RDMA-wr-to buf by copying data out */
/* responder releases resource and then embed largest sequence number into MPI message bound to initiator */
#if 1
- if(vc->state != MPIDI_VC_STATE_INACTIVE)
+ if((vc->state != MPIDI_VC_STATE_INACTIVE) ||
+ (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
dprintf
("handle_pkt,after,%d<-%d,id=%d,pkt->type=%d,eagershort=%d,close=%d,rts=%d,piggy-backed-eagersend=%d\n",
MPID_nem_ib_myrank, vc->pg_rank, *remote_poll, pkt->type,
@@ -657,7 +658,8 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
MPIDI_NEM_IB_PKT_EAGER_SEND);
int notify_rate;
- if(vc->state != MPIDI_VC_STATE_INACTIVE) {
+ if((vc->state != MPIDI_VC_STATE_INACTIVE) ||
+ (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf)) {
ibcom_errno =
MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(MPID_nem_ib_conns[vc->pg_rank].fd,
¬ify_rate);
@@ -687,10 +689,12 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
* because there is no way to trace the RDMA-write-to buffer addr
* because rreq->dev.tmpbuf is set to zero in ch3_eager.c
*/
- if(vc->state != MPIDI_VC_STATE_INACTIVE)
+ if((vc->state != MPIDI_VC_STATE_INACTIVE) ||
+ (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
dprintf("poll_eager,released,type=%d,MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM=%d\n", pkt->type,
MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM);
- if(vc->state != MPIDI_VC_STATE_INACTIVE)
+ if((vc->state != MPIDI_VC_STATE_INACTIVE) ||
+ (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
MPID_nem_ib_recv_buf_released(vc,
(void *) ((uint8_t *) buf +
sz_pkt +
@@ -713,10 +717,12 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
}
#endif
- if(vc->state != MPIDI_VC_STATE_INACTIVE)
+ if((vc->state != MPIDI_VC_STATE_INACTIVE) ||
+ (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
dprintf("ib_poll,hdr_ringbuf_type=%d\n", MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf));
- if(vc->state != MPIDI_VC_STATE_INACTIVE)
+ if((vc->state != MPIDI_VC_STATE_INACTIVE) ||
+ (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
if(MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf) & MPID_NEM_IB_RINGBUF_RELINDEX) {
vc_ib->ibcom->lsr_seq_num_tail = MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf);
dprintf("ib_poll,local_tail is updated to %d\n",
@@ -724,9 +730,64 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
}
/* Clear flag */
- if(vc->state != MPIDI_VC_STATE_INACTIVE)
+ if((vc->state != MPIDI_VC_STATE_INACTIVE) ||
+ (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
MPID_NEM_IB_NETMOD_HDR_HEAD_FLAG_SET(buf, 0);
+#if 1 /* We move this code from the end of vc_terminate. */
+ if (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf) {
+ /* clear stored data */
+ vc_ib->vc_terminate_buf = NULL;
+
+ /* Destroy ring-buffer */
+ ibcom_errno = MPID_nem_ib_ringbuf_free(vc);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_ringbuf_free");
+
+ /* Check connection status stored in VC when on-demand connection is used */
+ dprintf("vc_terminate,%d->%d,close\n", MPID_nem_ib_myrank, vc->pg_rank);
+ ibcom_errno = MPID_nem_ib_com_close(vc_ib->sc->fd);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_com_close");
+
+ /* Destroy array of scratch-pad QPs */
+ MPIU_Assert(MPID_nem_ib_conns_ref_count > 0);
+ if(--MPID_nem_ib_conns_ref_count == 0) {
+ MPIU_Free(MPID_nem_ib_conns);
+ }
+
+ /* TODO don't create them for shared memory vc */
+
+ /* Destroy scratch-pad */
+ ibcom_errno =
+ MPID_nem_ib_com_free(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
+#ifdef MPID_NEM_IB_ONDEMAND
+ MPID_NEM_IB_CM_OFF_CMD +
+ MPID_NEM_IB_CM_NSEG * sizeof(MPID_nem_ib_cm_cmd_t) +
+ sizeof(MPID_nem_ib_ringbuf_headtail_t)
+#else
+ MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_qp_state_t)
+#endif
+ );
+
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_com_free");
+
+ /* Destroy scratch-pad QP */
+ ibcom_errno =
+ MPID_nem_ib_com_close(MPID_nem_ib_scratch_pad_fds[vc->pg_rank]);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_com_close");
+
+ /* Destroy array of scratch-pad QPs */
+ MPIU_Assert(MPID_nem_ib_scratch_pad_fds_ref_count > 0);
+ if(--MPID_nem_ib_scratch_pad_fds_ref_count == 0) {
+ MPIU_Free(MPID_nem_ib_scratch_pad_fds);
+ MPIU_Free(MPID_nem_ib_scratch_pad_ibcoms);
+ }
+ }
+#endif
+
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_POLL_EAGER);
return mpi_errno;
@@ -910,7 +971,7 @@ int MPID_nem_ib_poll(int in_blocking_poll)
/* [MPID_NEM_IB_NRINGBUF-1] stores shared ring buffer */
for (i = 0; i < MPID_NEM_IB_NRINGBUF; i++) {
- if (((MPID_nem_ib_ringbuf_allocated[i / 64] >> (i & 63)) & 1) == 0) {
+ if ((((MPID_nem_ib_ringbuf_allocated[i / 64] >> (i & 63)) & 1) == 0) || !MPID_nem_ib_ringbuf) {
//dprintf("poll,cont\n");
continue;
}
@@ -919,6 +980,12 @@ int MPID_nem_ib_poll(int in_blocking_poll)
mpi_errno = MPID_nem_ib_poll_eager(&MPID_nem_ib_ringbuf[i]);
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_poll_eager");
+ /* MPID_nem_ib_ringbuf may be freed in poll_eager, when we received CLOSE-packet. */
+ if (!MPID_nem_ib_ringbuf) {
+ dprintf("MPID_nem_ib_ringbuf is freed\n");
+ continue;
+ }
+
/* without this, command in sendq doesn't have a chance
* to perform send_progress
* when send and progress_send call drain_scq asking it
@@ -2671,7 +2738,7 @@ int MPID_nem_ib_ringbuf_alloc(MPIDI_VC_t * vc)
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_ALLOC);
if(!MPID_nem_ib_ringbuf) {
- MPID_nem_ib_ringbuf = MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_t) * MPID_NEM_IB_NRINGBUF);
+ MPID_nem_ib_ringbuf = MPIU_Calloc(1, sizeof(MPID_nem_ib_ringbuf_t) * MPID_NEM_IB_NRINGBUF);
MPIU_ERR_CHKANDJUMP(!MPID_nem_ib_ringbuf, mpi_errno, MPI_ERR_OTHER,
"**malloc");
}
@@ -2764,6 +2831,7 @@ int MPID_nem_ib_ringbuf_free(MPIDI_VC_t * vc)
case MPID_NEM_IB_RINGBUF_EXCLUSIVE:
dprintf("ringbuf_free,start=%p\n", VC_FIELD(vc, ibcom->remote_ringbuf)->start);
MPID_nem_ib_rdmawr_to_free(VC_FIELD(vc, ibcom->remote_ringbuf)->start, MPID_NEM_IB_RINGBUF_NSLOT);
+ VC_FIELD(vc, ibcom->remote_ringbuf)->start = NULL; /* initialize for re-allocate */
MPID_nem_ib_ringbuf_allocated[index / 64] &= ~(1ULL << (index & 63));
VC_FIELD(vc, ibcom->remote_ringbuf) = NULL;
MPID_nem_ib_ringbuf_acquired[index / 64] &= ~(1ULL << (index & 63));
@@ -2774,6 +2842,7 @@ int MPID_nem_ib_ringbuf_free(MPIDI_VC_t * vc)
MPIU_Assert(VC_FIELD(vc, ibcom->remote_ringbuf)->ref_count > 0);
if(--VC_FIELD(vc, ibcom->remote_ringbuf)->ref_count == 0) {
MPID_nem_ib_rdmawr_to_free(VC_FIELD(vc, ibcom->remote_ringbuf)->start, MPID_NEM_IB_RINGBUF_NSLOT);
+ VC_FIELD(vc, ibcom->remote_ringbuf)->start = NULL; /* initialize for re-allocate */
MPID_nem_ib_ringbuf_allocated[index / 64] &= ~(1ULL << (index & 63));
dprintf("ringbuf_free,shared,allocated=%0lx\n", MPID_nem_ib_ringbuf_allocated[index / 64]);
}
@@ -2794,6 +2863,7 @@ int MPID_nem_ib_ringbuf_free(MPIDI_VC_t * vc)
if(!found) {
MPIU_Free(MPID_nem_ib_ringbuf);
+ MPID_nem_ib_ringbuf = NULL;
}
fn_exit:
http://git.mpich.org/mpich.git/commitdiff/bb280027d7271cf501df7fddaca66c2b3cc18d7a
commit bb280027d7271cf501df7fddaca66c2b3cc18d7a
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date: Tue Apr 8 13:41:40 2014 +0900
Change of the memory area used in put_scratch_pad
We allocate a memory in MPID_nem_ib_com_put_scratch_pad to avoid
destroying the data area which will be used by posted ibv_post_send.
Signed-off-by: Pavan Balaji <balaji at anl.gov>
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
index 60e0074..704dbf4 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
@@ -2006,7 +2006,7 @@ int MPID_nem_ib_com_put_lmt(int condesc, uint64_t wr_id, void *raddr, int sz_dat
}
int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset, int sz,
- void *laddr)
+ void *laddr, void **buf_from_out, uint32_t* buf_from_sz_out)
{
MPID_nem_ib_com_t *conp;
int ibcom_errno = 0;
@@ -2026,25 +2026,34 @@ int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset
MPID_NEM_IB_COM_ERR_CHKANDJUMP(!sz, -1, dprintf("MPID_nem_ib_com_put_scratch_pad,sz==0\n"));
/* Use inline so that we don't need to worry about overwriting write-from buffer */
- assert(sz <= conp->max_inline_data);
+// assert(sz <= conp->max_inline_data);
assert(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] == laddr);
- memcpy(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM], laddr, sz);
+// memcpy(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM], laddr, sz);
- void *from =
- (uint8_t *) conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
+ /* Instead of using the pre-mmaped memory (comp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM]),
+ we allocate a memory. */
+ void *buf_from = MPID_nem_ib_rdmawr_from_alloc(sz);
+ memcpy(buf_from, laddr, sz);
+ dprintf("put_scratch_pad,rdmawr_from_alloc=%p,sz=%d\n", buf_from, sz);
+ struct ibv_mr *mr_rdmawr_from = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR(buf_from);
+
+ *buf_from_out = buf_from;
+ *buf_from_sz_out = sz;
+
+ void *from = (uint8_t *) buf_from;
#ifdef HAVE_LIBDCFA
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].mic_addr = (uint64_t) from;
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].addr =
- conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_FROM]->host_addr +
+ mr_rdmawr_from->host_addr +
((uint64_t) from - (uint64_t) from);
#else
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].addr = (uint64_t) from;
#endif
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].length = sz;
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].lkey =
- conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_FROM]->lkey;
+ mr_rdmawr_from->lkey;
/* num_sge is defined in MPID_nem_ib_com_open */
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].wr_id = wr_id;
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
index 13a60be..05fa8f3 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
@@ -514,7 +514,7 @@ extern int MPID_nem_ib_com_isend(int condesc,
extern int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_hdr,
void *data, int sz_data);
extern int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset, int sz,
- void *laddr);
+ void *laddr, void **buf_from_out, uint32_t* buf_from_sz_out);
extern int MPID_nem_ib_com_get_scratch_pad(int condesc,
uint64_t wr_id,
uint64_t offset, int sz,
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
index 4aa6c43..f36c3ca 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
@@ -2049,6 +2049,11 @@ int MPID_nem_ib_cm_drain_scq()
shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
dprintf("cm_drain_scq,tx=%d\n", shadow_cm->req->ibcom->outstanding_connection_tx);
+
+ dprintf("cm_drain_scq,syn,buf_from=%p,sz=%d\n",
+ shadow_cm->buf_from, shadow_cm->buf_from_sz);
+ MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
+
MPIU_Free(shadow_cm);
break;
case MPID_NEM_IB_CM_SYNACK:
@@ -2057,6 +2062,11 @@ int MPID_nem_ib_cm_drain_scq()
shadow_cm->req, shadow_cm->req->initiator_rank);
shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
dprintf("cm_drain_scq,tx=%d\n", shadow_cm->req->ibcom->outstanding_connection_tx);
+
+ dprintf("cm_drain_scq,synack,buf_from=%p,sz=%d\n",
+ shadow_cm->buf_from, shadow_cm->buf_from_sz);
+ MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
+
MPIU_Free(shadow_cm);
break;
case MPID_NEM_IB_CM_ACK1:
@@ -2066,6 +2076,10 @@ int MPID_nem_ib_cm_drain_scq()
shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
dprintf("cm_drain_scq,tx=%d\n", shadow_cm->req->ibcom->outstanding_connection_tx);
+ dprintf("cm_drain_scq,ack1,buf_from=%p,sz=%d\n",
+ shadow_cm->buf_from, shadow_cm->buf_from_sz);
+ MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
+
/* Finalize protocol because there is no referer in cm_drain_scq and sendq.
Note that there might be one in cm_poll.*/
MPID_nem_ib_cm_request_release(shadow_cm->req);
@@ -2079,6 +2093,10 @@ int MPID_nem_ib_cm_drain_scq()
shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
dprintf("cm_drain_scq,tx=%d\n", shadow_cm->req->ibcom->outstanding_connection_tx);
+ dprintf("cm_drain_scq,ack2,buf_from=%p,sz=%d\n",
+ shadow_cm->buf_from, shadow_cm->buf_from_sz);
+ MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
+
/* Let the guard down to let the following connection request go. */
VC_FIELD(MPID_nem_ib_conns[shadow_cm->req->initiator_rank].vc, connection_guard) = 0;
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
index d7453bd..ee81c66 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
@@ -1508,7 +1508,8 @@ int MPID_nem_ib_cm_cmd_core(int rank, MPID_nem_ib_cm_cmd_shadow_t* shadow, void*
sizeof(MPID_nem_ib_cm_cmd_t) *
((uint16_t)(ringbuf_index % MPID_NEM_IB_CM_NSEG)),
sz,
- buf);
+ buf,
+ &(shadow->buf_from), &(shadow->buf_from_sz));
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_put_scratch_pad");
MPID_nem_ib_ncqe_scratch_pad += 1;
http://git.mpich.org/mpich.git/commitdiff/b5c0c7ef288e9d4fb66cb834bbfe6a1250f5bca5
commit b5c0c7ef288e9d4fb66cb834bbfe6a1250f5bca5
Author: Norio Yamaguchi <norio.yamaguchi at riken.jp>
Date: Tue Apr 8 10:25:46 2014 +0900
Add IB device name of qib<n>
IntelIB-Basic returns a new name, i.e. qib<n>, when asked
with ibv_get_device_name.
Signed-off-by: Pavan Balaji <balaji at anl.gov>
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
index 3bd34ca..60e0074 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
@@ -300,7 +300,8 @@ static int MPID_nem_ib_com_device_init()
#else
for (i = 0; i < dev_num; i++) {
if (!strcmp(ibv_get_device_name(ib_devlist[i]), "mlx4_0") ||
- !strcmp(ibv_get_device_name(ib_devlist[i]), "mlx5_0")) {
+ !strcmp(ibv_get_device_name(ib_devlist[i]), "mlx5_0") ||
+ !strcmp(ibv_get_device_name(ib_devlist[i]), "qib0")) {
goto dev_found;
}
}
http://git.mpich.org/mpich.git/commitdiff/8aaede8f58bd07ea790ef112131633a96528ec9d
commit 8aaede8f58bd07ea790ef112131633a96528ec9d
Author: Masamichi Takagi <masamichi.takagi at gmail.com>
Date: Sun Mar 23 15:26:58 2014 +0900
Add IB device name of mlx5_<n>
Connect-IB and MLNX_OFED_LINUX-2.* return a new name, i.e. mlx5_<n>, when
asked with ibv_get_device_name.
Signed-off-by: Pavan Balaji <balaji at anl.gov>
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
index eb50f5b..3bd34ca 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
@@ -299,7 +299,8 @@ static int MPID_nem_ib_com_device_init()
}
#else
for (i = 0; i < dev_num; i++) {
- if (!strcmp(ibv_get_device_name(ib_devlist[i]), "mlx4_0")) {
+ if (!strcmp(ibv_get_device_name(ib_devlist[i]), "mlx4_0") ||
+ !strcmp(ibv_get_device_name(ib_devlist[i]), "mlx5_0")) {
goto dev_found;
}
}
http://git.mpich.org/mpich.git/commitdiff/0361551c718eee9d3bd1514f0e18a4e270e75bea
commit 0361551c718eee9d3bd1514f0e18a4e270e75bea
Author: Masamichi Takagi <masamichi.takagi at gmail.com>
Date: Fri Feb 7 11:18:05 2014 +0900
Free, munmap, destroy on terminating VC
Signed-off-by: Pavan Balaji <balaji at anl.gov>
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt b/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
index 4de7a41..0bfa8ce 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
@@ -38,12 +38,14 @@
**MPID_nem_ib_lmt_send_GET_DONE:MPID_nem_ib_lmt_send_GET_DONE failed
**MPID_nem_ib_npollingset:MPID_nem_ib_npollingset failed
**MPID_nem_ib_poll:MPID_nem_ib_poll failed
+**MPID_nem_ib_poll_eager:MPID_nem_ib_poll_eager failed
**MPID_nem_ib_rdma_to_alloc:MPID_nem_ib_rdma_to_alloc failed
**MPID_nem_ib_ring_alloc:MPID_nem_ib_ring_alloc failed
**MPID_nem_ib_ringbuf_alloc:MPID_nem_ib_ringbuf_alloc failed
**MPID_nem_ib_ringbuf_ask_cas:MPID_nem_ib_ringbuf_ask_cas failed
**MPID_nem_ib_ringbuf_ask_fetch:MPID_nem_ib_ringbuf_ask_fetch failed
**MPID_nem_ib_ringbuf_connect_cas_core:MPID_nem_ib_ringbuf_connect_cas_core failed
+**MPID_nem_ib_ringbuf_free:MPID_nem_ib_ringbuf_free failed
**MPID_nem_ib_ringbuf_progress:MPID_nem_ib_ringbuf_progress failed
**MPID_nem_ib_ringbuf_send_core:MPID_nem_ib_ringbuf_send_core failed
**MPID_nem_ib_send_change_rdmabuf_occupancy_notify_state:MPID_nem_ib_send_change_rdmabuf_occupancy_notify_state failed
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
index 6aa2c2b..eb50f5b 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
@@ -38,14 +38,21 @@ static struct ibv_device **ib_devlist;
static struct ibv_context *ib_ctx;
struct ibv_context *MPID_nem_ib_ctx_export; /* for SC13 demo connector */
static struct ibv_pd *ib_pd;
-struct ibv_pd *MPID_nem_ib_pd_export; /* for SC13 demo connector */
-struct ibv_cq *MPID_nem_ib_rc_shared_scq;
-struct ibv_cq *MPID_nem_ib_rc_shared_scq_scratch_pad;
+struct ibv_pd *MPID_nem_ib_pd_export; /* for SC13 demo connector */
+struct ibv_cq *MPID_nem_ib_rc_shared_scq;
+static int MPID_nem_ib_rc_shared_scq_ref_count;
+struct ibv_cq *MPID_nem_ib_rc_shared_scq_scratch_pad;
+static int MPID_nem_ib_rc_shared_scq_scratch_pad_ref_count;
+static struct ibv_cq *MPID_nem_ib_ud_shared_scq;
+static int MPID_nem_ib_ud_shared_scq_ref_count;
static struct ibv_cq *MPID_nem_ib_rc_shared_rcq;
+static int MPID_nem_ib_rc_shared_rcq_ref_count;
static struct ibv_cq *MPID_nem_ib_rc_shared_rcq_scratch_pad;
-static struct ibv_cq *MPID_nem_ib_ud_shared_scq;
-struct ibv_cq *MPID_nem_ib_ud_shared_rcq;
+static int MPID_nem_ib_rc_shared_rcq_scratch_pad_ref_count;
+struct ibv_cq *MPID_nem_ib_ud_shared_rcq;
+static int MPID_nem_ib_ud_shared_rcq_ref_count;
uint8_t *MPID_nem_ib_scratch_pad = 0;
+int MPID_nem_ib_scratch_pad_ref_count;
char *MPID_nem_ib_rdmawr_from_alloc_free_list_front[MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID] = { 0 };
char *MPID_nem_ib_rdmawr_from_alloc_arena_free_list[MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID] = { 0 };
struct ibv_mr* MPID_nem_ib_rdmawr_to_alloc_mr;
@@ -62,7 +69,9 @@ uint8_t *MPID_nem_ib_rdmawr_to_alloc_free_list;
#define MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp) \
{ \
if (condesc < 0 || condesc >= MPID_NEM_IB_COM_SIZE) { \
- return -1; \
+ dprintf("condesc=%d\n", condesc);\
+MPID_nem_ib_segv; \
+ return -1; \
} \
conp = &contab[condesc]; \
MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_used != 1, -1, dprintf("MPID_NEM_IB_RANGE_CHECK_WITH_ERROR,conp->icom_used=%d\n", conp->icom_used)); \
@@ -101,10 +110,10 @@ static int MPID_nem_ib_rdmawr_to_init(uint64_t sz)
MPID_nem_ib_rdmawr_to_alloc_start = start;
MPID_nem_ib_rdmawr_to_alloc_free_list = start;
for (cur = start;
- cur < start + sz - MPID_NEM_IB_COM_RDMABUF_SZSEG;
- cur += MPID_NEM_IB_COM_RDMABUF_SZSEG) {
+ cur < (uint8_t *)start + sz - MPID_NEM_IB_COM_RDMABUF_SZSEG;
+ cur = (uint8_t *)cur + MPID_NEM_IB_COM_RDMABUF_SZSEG) {
//dprintf("rdmawr_to_init,cur=%p\n", cur);
- ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) cur)->next = cur + MPID_NEM_IB_COM_RDMABUF_SZSEG;
+ ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) cur)->next = (uint8_t*)cur + MPID_NEM_IB_COM_RDMABUF_SZSEG;
}
((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) cur)->next = 0;
@@ -142,10 +151,10 @@ void MPID_nem_ib_rdmawr_to_free(void *p, int nslots)
((MPID_nem_ib_rdmawr_to_alloc_hdr_t *)
((uint8_t*)p + MPID_NEM_IB_COM_RDMABUF_SZSEG * (nslots-1)))->next =
MPID_nem_ib_rdmawr_to_alloc_free_list;
- for (q = p + MPID_NEM_IB_COM_RDMABUF_SZSEG * (nslots-2);
+ for (q = (uint8_t *)p + MPID_NEM_IB_COM_RDMABUF_SZSEG * (nslots-2);
q >= p;
- q -= MPID_NEM_IB_COM_RDMABUF_SZSEG) {
- ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) q)->next = q + MPID_NEM_IB_COM_RDMABUF_SZSEG;
+ q = (uint8_t *)q - MPID_NEM_IB_COM_RDMABUF_SZSEG) {
+ ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) q)->next = (uint8_t *)q + MPID_NEM_IB_COM_RDMABUF_SZSEG;
}
MPID_nem_ib_rdmawr_to_alloc_free_list = p;
}
@@ -316,6 +325,7 @@ static int MPID_nem_ib_com_device_init()
#else
dev_name = MPIU_Strdup(ibv_get_device_name(ib_devlist[i]));
dprintf("MPID_nem_ib_com_device_init,dev_name=%s\n", dev_name);
+ MPIU_Free(dev_name);
#endif
/* Create a PD */
if (MPID_nem_ib_pd_export) {
@@ -351,6 +361,7 @@ static int MPID_nem_ib_com_clean(MPID_nem_ib_com_t * conp)
{
int i;
int ibcom_errno = 0;
+ int ib_errno;
int retval;
if (conp->icom_qp) {
@@ -360,21 +371,30 @@ static int MPID_nem_ib_com_clean(MPID_nem_ib_com_t * conp)
if (conp->icom_mrlist && conp->icom_mrlen > 0) {
switch (conp->open_flag) {
case MPID_NEM_IB_COM_OPEN_RC:
- if(MPID_nem_ib_rc_shared_scq) {
+ MPIU_Assert(MPID_nem_ib_rc_shared_scq_ref_count > 0);
+ if(--MPID_nem_ib_rc_shared_scq_ref_count == 0) {
+ dprintf("ibcom,destroy MPID_nem_ib_rc_shared_scq\n");
ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_scq);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq"));
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq failed\n"));
+
+ /* Tell drain_scq that CQ is destroyed because
+ drain_scq is called after poll_eager calls vc_terminate */
MPID_nem_ib_rc_shared_scq = NULL;
}
- if(MPID_nem_ib_rc_shared_rcq) {
+ MPIU_Assert(MPID_nem_ib_rc_shared_rcq_ref_count > 0);
+ if(--MPID_nem_ib_rc_shared_rcq_ref_count == 0) {
+ dprintf("ibcom,destroy MPID_nem_ib_rc_shared_rcq\n");
ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_rcq);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq"));
- MPID_nem_ib_rc_shared_rcq = NULL;
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq failed\n"));
}
+#if 0 /* It's not used */
retval = munmap(conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM], MPID_NEM_IB_COM_RDMABUF_SZ);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, dprintf("munmap"));
+#endif
+#if 0 /* Don't free it because it's managed through VC_FIELD(vc, ibcom->remote_ringbuf) */
retval = munmap(conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO], MPID_NEM_IB_COM_RDMABUF_SZ);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, dprintf("munmap"));
-
+#endif
MPIU_Free(conp->icom_mrlist);
MPIU_Free(conp->icom_mem);
MPIU_Free(conp->icom_msize);
@@ -394,16 +414,23 @@ static int MPID_nem_ib_com_clean(MPID_nem_ib_com_t * conp)
MPIU_Free(conp->icom_rr);
break;
case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
- if(MPID_nem_ib_rc_shared_scq_scratch_pad) {
+ MPIU_Assert(MPID_nem_ib_rc_shared_scq_scratch_pad_ref_count > 0);
+ if(--MPID_nem_ib_rc_shared_scq_scratch_pad_ref_count == 0) {
ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_scq_scratch_pad);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq"));
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq failed\n"));
+ /* Tell drain_scq that CQ is destroyed because
+ drain_scq is called after poll_eager calls vc_terminate */
MPID_nem_ib_rc_shared_scq_scratch_pad = NULL;
}
- if(MPID_nem_ib_rc_shared_scq_scratch_pad) {
- ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_scq_scratch_pad);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq"));
- MPID_nem_ib_rc_shared_scq_scratch_pad = NULL;
+ MPIU_Assert(MPID_nem_ib_rc_shared_rcq_scratch_pad_ref_count > 0);
+ if(--MPID_nem_ib_rc_shared_rcq_scratch_pad_ref_count == 0) {
+ ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_rcq_scratch_pad);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq failed\n"));
}
+ retval = munmap(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM],
+ MPID_NEM_IB_COM_SCRATCH_PAD_FROM_SZ);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, dprintf("munmap"));
+
MPIU_Free(conp->icom_mrlist);
MPIU_Free(conp->icom_mem);
MPIU_Free(conp->icom_msize);
@@ -414,19 +441,24 @@ static int MPID_nem_ib_com_clean(MPID_nem_ib_com_t * conp)
#ifndef HAVE_LIBDCFA
MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list);
+ MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list);
+ MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list);
#endif
MPIU_Free(conp->icom_sr);
break;
case MPID_NEM_IB_COM_OPEN_UD:
- if(MPID_nem_ib_ud_shared_scq) {
+ MPIU_Assert(MPID_nem_ib_ud_shared_scq_ref_count > 0);
+ if(--MPID_nem_ib_ud_shared_scq_ref_count == 0) {
ib_errno = ibv_destroy_cq(MPID_nem_ib_ud_shared_scq);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq"));
+ /* Tell drain_scq that CQ is destroyed because
+ drain_scq is called after poll_eager calls vc_terminate */
MPID_nem_ib_ud_shared_scq = NULL;
}
- if(MPID_nem_ib_ud_shared_rcq) {
+ MPIU_Assert(MPID_nem_ib_ud_shared_rcq_ref_count > 0);
+ if(--MPID_nem_ib_ud_shared_rcq_ref_count == 0) {
ib_errno = ibv_destroy_cq(MPID_nem_ib_ud_shared_rcq);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq"));
- MPID_nem_ib_ud_shared_rcq = NULL;
}
retval = munmap(conp->icom_mem[MPID_NEM_IB_COM_UDWR_FROM], MPID_NEM_IB_COM_UDBUF_SZ);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, dprintf("munmap"));
@@ -452,7 +484,7 @@ static int MPID_nem_ib_com_clean(MPID_nem_ib_com_t * conp)
fn_exit:
return ibcom_errno;
- //fn_fail:
+ fn_fail:
goto fn_exit;
}
@@ -533,6 +565,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
/* Create send/recv CQ */
switch (open_flag) {
case MPID_NEM_IB_COM_OPEN_RC:
+ MPID_nem_ib_rc_shared_scq_ref_count++;
if (!MPID_nem_ib_rc_shared_scq) {
#ifdef HAVE_LIBDCFA
MPID_nem_ib_rc_shared_scq = ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
@@ -545,6 +578,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
}
conp->icom_scq = MPID_nem_ib_rc_shared_scq;
+ MPID_nem_ib_rc_shared_rcq_ref_count++;
if (!MPID_nem_ib_rc_shared_rcq) {
#ifdef HAVE_LIBDCFA
MPID_nem_ib_rc_shared_rcq = ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
@@ -558,6 +592,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
conp->icom_rcq = MPID_nem_ib_rc_shared_rcq;
break;
case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
+ MPID_nem_ib_rc_shared_scq_scratch_pad_ref_count++;
if (!MPID_nem_ib_rc_shared_scq_scratch_pad) {
#ifdef HAVE_LIBDCFA
MPID_nem_ib_rc_shared_scq_scratch_pad =
@@ -571,6 +606,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
}
conp->icom_scq = MPID_nem_ib_rc_shared_scq_scratch_pad;
+ MPID_nem_ib_rc_shared_rcq_scratch_pad_ref_count++;
if (!MPID_nem_ib_rc_shared_rcq_scratch_pad) {
#ifdef HAVE_LIBDCFA
MPID_nem_ib_rc_shared_rcq_scratch_pad =
@@ -585,6 +621,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
conp->icom_rcq = MPID_nem_ib_rc_shared_rcq_scratch_pad;
break;
case MPID_NEM_IB_COM_OPEN_UD:
+ MPID_nem_ib_ud_shared_scq_ref_count++;
if (!MPID_nem_ib_ud_shared_scq) {
#ifdef HAVE_LIBDCFA
MPID_nem_ib_ud_shared_scq = ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
@@ -597,6 +634,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
}
conp->icom_scq = MPID_nem_ib_ud_shared_scq;
+ MPID_nem_ib_ud_shared_rcq_ref_count++;
if (!MPID_nem_ib_ud_shared_rcq) {
#ifdef HAVE_LIBDCFA
MPID_nem_ib_ud_shared_rcq = ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
@@ -1114,6 +1152,7 @@ int MPID_nem_ib_com_alloc(int condesc, int sz)
case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
/* RDMA-write-to local memory area */
+ MPID_nem_ib_scratch_pad_ref_count++;
if (!MPID_nem_ib_scratch_pad) {
MPID_nem_ib_scratch_pad = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
dprintf("MPID_nem_ib_com_alloc,mmap=%p,len=%d\n", MPID_nem_ib_scratch_pad, sz);
@@ -1155,7 +1194,7 @@ int MPID_nem_ib_com_alloc(int condesc, int sz)
goto fn_exit;
}
-void MPID_nem_ib_com_free(int condesc, int sz) {
+int MPID_nem_ib_com_free(int condesc, int sz) {
MPID_nem_ib_com_t *conp;
int ibcom_errno = 0;
int retval;
@@ -1165,10 +1204,12 @@ void MPID_nem_ib_com_free(int condesc, int sz) {
switch (conp->open_flag) {
case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
- MPIU_Assert(scratch_pad_ref_count > 0);
- if(--scratch_pad_ref_count == 0) {
- retval = munmap(scratch_pad, sz);
+ MPIU_Assert(MPID_nem_ib_scratch_pad_ref_count > 0);
+ if(--MPID_nem_ib_scratch_pad_ref_count == 0) {
+ retval = munmap(MPID_nem_ib_scratch_pad, sz);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, dprintf("munmap"));
+ MPID_nem_ib_scratch_pad = NULL;
+ dprintf("ib_com_free,MPID_nem_ib_scratch_pad is freed\n");
}
break;
default:
@@ -1231,7 +1272,6 @@ int MPID_nem_ib_com_rts(int condesc, int remote_qpnum, uint16_t remote_lid,
}
goto common_tail;
case MPID_NEM_IB_COM_OPEN_RC:
- case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
/* Init QP */
ib_errno = modify_qp_to_init(conp->icom_qp, conp->icom_port, 0);
if (ib_errno) {
@@ -1404,7 +1444,7 @@ int MPID_nem_ib_com_isend(int condesc,
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr =
mr_rdmawr_from->host_addr +
((uint64_t) buf_from + MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr -
- (uint64_t) MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(REQ_FIELD(sreq, buf_from)));
+ (uint64_t) MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(buf_from));
#else
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr =
(uint64_t) buf_from + MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr;
@@ -1986,6 +2026,7 @@ int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset
/* Use inline so that we don't need to worry about overwriting write-from buffer */
assert(sz <= conp->max_inline_data);
+ assert(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] == laddr);
memcpy(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM], laddr, sz);
void *from =
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
index e24b6c6..13a60be 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
@@ -176,6 +176,7 @@ extern struct ibv_cq *MPID_nem_ib_rc_shared_scq;
extern struct ibv_cq *MPID_nem_ib_rc_shared_scq_scratch_pad;
extern struct ibv_cq *MPID_nem_ib_ud_shared_rcq;
extern uint8_t *MPID_nem_ib_scratch_pad;
+extern int MPID_nem_ib_scratch_pad_ref_count;
extern char *MPID_nem_ib_rdmawr_from_alloc_free_list_front[MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID];
extern char *MPID_nem_ib_rdmawr_from_alloc_arena_free_list[MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID];
extern struct ibv_mr* MPID_nem_ib_rdmawr_to_alloc_mr;
@@ -368,6 +369,7 @@ typedef struct {
int nslot;
MPIDI_VC_t * vc;
uint64_t remote_released[(MPID_NEM_IB_COM_RDMABUF_NSEG + 63) / 64];
+ int ref_count; /* number of VCs sharing the ring-buffer */
} MPID_nem_ib_ringbuf_t;
/* Represent a ring-buffer is exclusively acquired */
@@ -473,6 +475,15 @@ typedef struct MPID_nem_ib_com {
/* Ring buffer sectors obtained through ask-send protocol */
MPID_nem_ib_ringbuf_sectorq_t sectorq;
+
+ /* Two transactions from the both ends for a connection
+ can be outstanding at the same time when they were initiated
+ at the same time. This makes one end try to send ACK2 after
+ freeing scratch-pad QP for the connection. So we must monitor and
+ wait until all the connection request transactions end before
+ freeing scratch-pad QP.*/
+ int outstanding_connection_tx;
+
} MPID_nem_ib_com_t;
extern void *MPID_nem_ib_rdmawr_to_alloc(int nslots);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
index 42c854f..0ade098 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
@@ -325,7 +325,7 @@ typedef struct {
/* State store for connection protocol */
typedef struct MPID_nem_ib_ringbuf_req {
MPID_nem_ib_ringbuf_cmd_type_t state;
- MPIDI_VC_t * vc;
+ MPIDI_VC_t * vc; /* You can eliminate this. */
MPID_nem_ib_com_t *ibcom; /* ibcom of scratch pad, referenced in drain_scq */
/* fetch the head and compare-and-swap head and head + 1
@@ -526,11 +526,18 @@ int MPID_nem_ib_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
int MPID_nem_ib_iStartContigMsg(MPIDI_VC_t * vc, void *hdr, MPIDI_msg_sz_t hdr_sz, void *data,
MPIDI_msg_sz_t data_sz, MPID_Request ** sreq_ptr);
+int MPID_nem_ib_cm_cas_core(int rank, MPID_nem_ib_cm_cmd_shadow_t* shadow);
+int MPID_nem_ib_cm_cas(MPIDI_VC_t * vc, uint32_t ask_on_connect);
int MPID_nem_ib_cm_cmd_core(int rank, MPID_nem_ib_cm_cmd_shadow_t* shadow, void* buf, MPIDI_msg_sz_t sz, uint32_t syn, uint16_t ringbuf_index);
-int MPID_nem_ib_ringbuf_alloc(MPIDI_VC_t * vc);
+int MPID_nem_ib_ringbuf_ask_cas(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_req_t* req);
+int MPID_nem_ib_ringbuf_ask_fetch_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t* shadow, MPIDI_msg_sz_t sz);
+int MPID_nem_ib_ringbuf_ask_fetch(MPIDI_VC_t * vc);
int MPID_nem_ib_ringbuf_ask_cas_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t* shadow, uint64_t head);
int MPID_nem_ib_ringbuf_progress(void);
+int MPID_nem_ib_ringbuf_alloc(MPIDI_VC_t * vc);
+int MPID_nem_ib_ringbuf_free(MPIDI_VC_t * vc);
+
/* used by ib_poll.c */
int MPID_nem_ib_send_progress(MPIDI_VC_t * vc);
@@ -566,8 +573,10 @@ extern int MPID_nem_ib_conn_ud_fd;
extern MPID_nem_ib_com_t *MPID_nem_ib_conn_ud_ibcom;
extern MPID_nem_ib_conn_ud_t *MPID_nem_ib_conn_ud;
extern MPID_nem_ib_conn_t *MPID_nem_ib_conns;
+extern int MPID_nem_ib_conns_ref_count;
//extern MPIDI_VC_t **MPID_nem_ib_pollingset;
extern int *MPID_nem_ib_scratch_pad_fds; /* TODO: create structure including fds and ibcoms */
+extern int MPID_nem_ib_scratch_pad_fds_ref_count;
extern MPID_nem_ib_com_t **MPID_nem_ib_scratch_pad_ibcoms;
//extern int MPID_nem_ib_npollingset;
extern void *MPID_nem_ib_fl[18];
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
index 622b8fd..2410ae9 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
@@ -68,11 +68,13 @@ void *MPID_nem_ib_fl[18];
int MPID_nem_ib_nranks;
MPID_nem_ib_conn_ud_t *MPID_nem_ib_conn_ud;
MPID_nem_ib_conn_t *MPID_nem_ib_conns;
+int MPID_nem_ib_conns_ref_count;
//MPIDI_VC_t **MPID_nem_ib_pollingset;
int MPID_nem_ib_conn_ud_fd;
MPID_nem_ib_com_t *MPID_nem_ib_conn_ud_MPID_nem_ib_com;
//int MPID_nem_ib_npollingset;
int *MPID_nem_ib_scratch_pad_fds;
+int MPID_nem_ib_scratch_pad_fds_ref_count;
MPID_nem_ib_com_t **MPID_nem_ib_scratch_pad_ibcoms;
//char *MPID_nem_ib_recv_buf;
int MPID_nem_ib_myrank;
@@ -136,7 +138,7 @@ static int MPID_nem_ib_kvs_put_binary(int from, const char *postfix, const uint8
mpi_errno = MPIDI_PG_GetConnKVSname(&kvs_name);
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPIDI_PG_GetConnKVSname");
- dprintf("kvs_put_binary,kvs_name=%s\n", kvs_name);
+ //dprintf("kvs_put_binary,kvs_name=%s\n", kvs_name);
sprintf(key, "bc/%d/%s", from, postfix);
val[0] = 0;
@@ -144,8 +146,8 @@ static int MPID_nem_ib_kvs_put_binary(int from, const char *postfix, const uint8
sprintf(str, "%02x", buf[j]);
strcat(val, str);
}
- dprintf("kvs_put_binary,rank=%d,from=%d,PMI_KVS_Put(%s, %s, %s)\n", MPID_nem_ib_myrank, from,
- kvs_name, key, val);
+ //dprintf("kvs_put_binary,rank=%d,from=%d,PMI_KVS_Put(%s, %s, %s)\n", MPID_nem_ib_myrank, from,
+ //kvs_name, key, val);
pmi_errno = PMI_KVS_Put(kvs_name, key, val);
MPIU_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**PMI_KVS_Put");
fn_exit:
@@ -171,13 +173,13 @@ static int MPID_nem_ib_kvs_get_binary(int from, const char *postfix, char *buf,
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_KVS_GET_BINARY);
mpi_errno = MPIDI_PG_GetConnKVSname(&kvs_name);
- dprintf("kvs_get_binary,kvs_name=%s\n", kvs_name);
+ //dprintf("kvs_get_binary,kvs_name=%s\n", kvs_name);
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPIDI_PG_GetConnKVSname");
sprintf(key, "bc/%d/%s", from, postfix);
pmi_errno = PMI_KVS_Get(kvs_name, key, val, 256);
- dprintf("kvs_put_binary,rank=%d,from=%d,PMI_KVS_Get(%s, %s, %s)\n", MPID_nem_ib_myrank, from,
- kvs_name, key, val);
+ //dprintf("kvs_put_binary,rank=%d,from=%d,PMI_KVS_Get(%s, %s, %s)\n", MPID_nem_ib_myrank, from,
+ //kvs_name, key, val);
MPIU_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**PMS_KVS_Get");
dprintf("rank=%d,obtained val=%s\n", MPID_nem_ib_myrank, val);
@@ -222,7 +224,6 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
MPID_nem_ib_tsc_poll = MPID_nem_ib_rdtsc();
MPID_nem_ib_ncqe = 0;
MPID_nem_ib_ncqe_to_drain = 0;
- MPID_nem_ib_ncqe_lmt_put = 0;
MPID_nem_ib_ncqe_nces = 0;
MPID_nem_ib_ncqe_scratch_pad = 0;
MPID_nem_ib_ncqe_scratch_pad_to_drain = 0;
@@ -247,11 +248,15 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
/* prepare scrath-pad QP and malloc scratch-pad */
for (i = 0; i < MPID_nem_ib_nranks; i++) {
- MPID_nem_ib_scratch_pad_fds_ref_count++;
+ if(i == MPID_nem_ib_myrank) {
+ continue;
+ }
+ dprintf("init,MPID_nem_ib_myrank=%d,i=%d\n", MPID_nem_ib_myrank, i);
ibcom_errno =
MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_SCRATCH_PAD,
&MPID_nem_ib_scratch_pad_fds[i]);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
+ MPID_nem_ib_scratch_pad_fds_ref_count++;
ibcom_errno =
MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[i],
@@ -304,7 +309,7 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
for (i = 0, nranks = MPID_nem_ib_nranks; nranks > 0; nranks /= 10, i++) {
}
MPIU_CHKPMEM_MALLOC(remote_rank_str, char *, 1 + i + 1, mpi_errno, "connection table");
- MPIU_CHKPMEM_MALLOC(key_str, char *, strlen("sp/qpn") + 1 + i + 1, mpi_errno,
+ MPIU_CHKPMEM_MALLOC(key_str, char *, strlen("sp/rmem") + 1 + i + 1, mpi_errno,
"connection table");
for (i = 0; i < MPID_nem_ib_nranks; i++) {
@@ -489,6 +494,8 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
ibcom_errno =
MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC, &MPID_nem_ib_conns[i].fd);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
+ MPID_nem_ib_conns_ref_count++;
+ dprintf("init,fd=%d\n", MPID_nem_ib_conns[i].fd);
}
/* put bc/me/{gid,lid}, put bc/me/{qpn,rmem,rkey}/you */
@@ -580,6 +587,9 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
#endif
#endif
+ MPIU_Free(remote_rank_str);
+ MPIU_Free(key_str);
+
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_INIT);
return mpi_errno;
@@ -776,11 +786,6 @@ int MPID_nem_ib_vc_onconnect(MPIDI_VC_t * vc)
&VC_FIELD(vc, ibcom));
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
- ibcom_errno =
- MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[vc->pg_rank].fd_lmt_put,
- &VC_FIELD(vc, ibcom_lmt_put));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
-
#if 0
/* Insert into polling set */
MPIU_ERR_CHKANDJUMP(MPID_nem_ib_npollingset + 1 > MPID_NEM_IB_MAX_POLLINGSET, mpi_errno,
@@ -1013,13 +1018,15 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
#endif
/* Empty sendq */
- while (!MPID_nem_ib_sendq_empty(vc_ib->sendq)) {
+ while (!MPID_nem_ib_sendq_empty(vc_ib->sendq) ||
+ VC_FIELD(vc, pending_sends) > 0 ||
+ MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->outstanding_connection_tx > 0) {
/* mimic ib_poll because vc_terminate might be called from ib_poll_eager */
mpi_errno = MPID_nem_ib_send_progress(vc);
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_progress");
ibcom_errno = MPID_nem_ib_drain_scq(0);
-#ifdef MPID_NEM_IB_ONDEMAND
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_drain_scq");
+#ifdef MPID_NEM_IB_ONDEMAND
ibcom_errno = MPID_nem_ib_cm_poll_syn();
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_poll_syn");
ibcom_errno = MPID_nem_ib_cm_poll();
@@ -1042,10 +1049,13 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
MPID_nem_ib_sendq_empty(vc_ib->sendq), MPID_nem_ib_ncqe, VC_FIELD(vc, pending_sends));
+#if 0
if (MPID_nem_ib_ncqe > 0 || VC_FIELD(vc, pending_sends) > 0) {
usleep(1000);
MPID_nem_ib_drain_scq(0);
}
+#endif
+
dprintf("init,middle2,%d->%d,r rdmaocc=%d,l rdmaocc=%d,sendq=%d,ncqe=%d,pending_sends=%d\n",
MPID_nem_ib_myrank, vc->pg_rank,
MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
@@ -1093,21 +1103,45 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
}
/* Destroy VC QP */
+
+ /* Destroy ring-buffer */
+ ibcom_errno = MPID_nem_ib_ringbuf_free(vc);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_ringbuf_free");
+
/* Check connection status stored in VC when on-demand connection is used */
- ibcom_errno = MPID_nem_ib_com_close(vc_ib->sc.fd);
+ dprintf("vc_terminate,%d->%d,close\n", MPID_nem_ib_myrank, vc->pg_rank);
+ ibcom_errno = MPID_nem_ib_com_close(vc_ib->sc->fd);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
"**MPID_nem_ib_com_close");
- /* Release scratch-pad */
+ /* Destroy array of scratch-pad QPs */
+ MPIU_Assert(MPID_nem_ib_conns_ref_count > 0);
+ if(--MPID_nem_ib_conns_ref_count == 0) {
+ MPIU_Free(MPID_nem_ib_conns);
+ }
+
+ /* TODO don't create them for shared memory vc */
+
+ /* Destroy scratch-pad */
ibcom_errno =
- MPID_nem_ib_com_free(scratch_pad_fds[vc->pg_rank],
- MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_qp_state_t));
+ MPID_nem_ib_com_free(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
+#ifdef MPID_NEM_IB_ONDEMAND
+ MPID_NEM_IB_CM_OFF_CMD +
+ MPID_NEM_IB_CM_NSEG * sizeof(MPID_nem_ib_cm_cmd_t) +
+ sizeof(MPID_nem_ib_ringbuf_headtail_t)
+#else
+ MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_qp_state_t)
+
+#endif
+ );
+
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
"**MPID_nem_ib_com_free");
/* Destroy scratch-pad QP */
ibcom_errno =
- MPID_nem_ib_com_close(scratch_pad_fds[vc->pg_rank]);
+ MPID_nem_ib_com_close(MPID_nem_ib_scratch_pad_fds[vc->pg_rank]);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
"**MPID_nem_ib_com_close");
@@ -1115,6 +1149,7 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
MPIU_Assert(MPID_nem_ib_scratch_pad_fds_ref_count > 0);
if(--MPID_nem_ib_scratch_pad_fds_ref_count == 0) {
MPIU_Free(MPID_nem_ib_scratch_pad_fds);
+ MPIU_Free(MPID_nem_ib_scratch_pad_ibcoms);
}
dprintf("vc_terminate,exit\n");
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
index 95297d4..a777293 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
@@ -43,6 +43,7 @@ int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_p
MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr,
dt_true_lb);
+ /* FIXME: who frees s_cookie_buf? */
/* malloc memory area for cookie. auto variable is NG because isend does not copy payload */
MPID_nem_ib_lmt_cookie_t *s_cookie_buf =
(MPID_nem_ib_lmt_cookie_t *) MPIU_Malloc(sizeof(MPID_nem_ib_lmt_cookie_t));
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
index 15a5596..4aa6c43 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
@@ -28,7 +28,7 @@ static int entered_drain_scq = 0;
if (((MPID_nem_ib_ringbuf_allocated[n / 64] >> (n & 63)) & 1) == 0) { \
continue; \
} \
- mpi_errno = MPID_nem_ib_poll_eager(&MPID_nem_ib_ringbuf[n]); \
+ mpi_errno = MPID_nem_ib_poll_eager(&MPID_nem_ib_ringbuf[n]); /*FIXME: perform send_progress for all sendqs */\
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_poll_eager"); \
} \
}
@@ -66,13 +66,14 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
/* prevent a call path drain_scq -> send_progress -> drain_scq */
if (entered_drain_scq) {
- printf("drain_scq,re-enter\n");
+ dprintf("drain_scq,re-enter\n");
goto fn_exit;
}
entered_drain_scq = 1;
#ifdef MPID_NEM_IB_ONDEMAND
- /* nobody created QP */
+ /* drain_scq is called after poll_eager calls vc_terminate
+ or nobody created QP */
if(!MPID_nem_ib_rc_shared_scq) {
dprintf("drain_scq,CQ is null\n");
goto fn_exit;
@@ -437,6 +438,12 @@ int MPID_nem_ib_drain_scq_scratch_pad()
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_DRAIN_SCQ_SCRATCH_PAD);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_DRAIN_SCQ_SCRATCH_PAD);
+ /* drain_scq_scratch_pad is called after poll_eager calls vc_terminate */
+ if(!MPID_nem_ib_rc_shared_scq_scratch_pad) {
+ dprintf("drain_scq_scratch_pad,CQ is null\n");
+ goto fn_exit;
+ }
+
#if 0 /*def HAVE_LIBDCFA */
result = ibv_poll_cq(MPID_nem_ib_rc_shared_scq_scratch_pad, 1, &cqe[0]);
#else
@@ -595,6 +602,10 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
}
#endif
+ /* Increment here because handle_pkt of CLOSE calls poll_eager recursively */
+ (*remote_poll) += 1;
+ dprintf("ib_poll,inc,remote_poll=%d\n", *remote_poll);
+
/* VC is stored in the packet for shared ring buffer */
switch(ringbuf->type) {
case MPID_NEM_IB_RINGBUF_EXCLUSIVE:
@@ -608,12 +619,14 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
}
vc_ib = VC_IB(vc);
dprintf("poll_eager,vc=%p\n", vc);
-
+
+ /* Save it because handle_pkt frees buf when the packet is MPIDI_CH3_PKT_CLOSE */
+ ssize_t sz_pkt = MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf);
MPIDI_CH3_Pkt_eager_send_t *pkt =
- (MPIDI_CH3_Pkt_eager_send_t *) ((uint8_t *) buf + MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf));
- dprintf("pkt=%p,sizeof=%ld\n", pkt, MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf));
+ (MPIDI_CH3_Pkt_eager_send_t *) ((uint8_t *) buf + sz_pkt);
+ dprintf("pkt=%p,sizeof=%ld\n", pkt, sz_pkt);
MPIU_Assert(MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) >=
- MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf) + sizeof(MPIDI_CH3_Pkt_t) +
+ sz_pkt + sizeof(MPIDI_CH3_Pkt_t) +
sizeof(MPID_nem_ib_netmod_trailer_t));
dprintf
("handle_pkt,before,%d<-%d,id=%d,pkt->type=%d,pcc=%d,MPIDI_NEM_PKT_END=%d,pkt=%p,subtype=%d\n",
@@ -622,9 +635,9 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
((MPID_nem_pkt_netmod_t *) pkt)->subtype);
/* see MPIDI_CH3_PktHandler_EagerSend (in src/mpid/ch3/src/ch3u_eager.c) */
mpi_errno =
- MPID_nem_handle_pkt(vc, (char *) ((uint8_t *) buf + MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf)),
+ MPID_nem_handle_pkt(vc, (char *) ((uint8_t *) buf + sz_pkt),
(MPIDI_msg_sz_t) (MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) -
- MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf) -
+ sz_pkt -
sizeof(MPID_nem_ib_netmod_trailer_t)));
if (mpi_errno) {
MPIU_ERR_POP(mpi_errno);
@@ -636,6 +649,7 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
* because MPID_nem_handle_pkt releases RDMA-wr-to buf by copying data out */
/* responder releases resource and then embed largest sequence number into MPI message bound to initiator */
#if 1
+ if(vc->state != MPIDI_VC_STATE_INACTIVE)
dprintf
("handle_pkt,after,%d<-%d,id=%d,pkt->type=%d,eagershort=%d,close=%d,rts=%d,piggy-backed-eagersend=%d\n",
MPID_nem_ib_myrank, vc->pg_rank, *remote_poll, pkt->type,
@@ -643,6 +657,7 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
MPIDI_NEM_IB_PKT_EAGER_SEND);
int notify_rate;
+ if(vc->state != MPIDI_VC_STATE_INACTIVE) {
ibcom_errno =
MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(MPID_nem_ib_conns[vc->pg_rank].fd,
¬ify_rate);
@@ -653,6 +668,7 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail,
notify_rate);
+ }
if(ringbuf->type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
dprintf("poll_eager,rdiff=%d(%d-%d)\n",
@@ -671,16 +687,18 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
* because there is no way to trace the RDMA-write-to buffer addr
* because rreq->dev.tmpbuf is set to zero in ch3_eager.c
*/
+ if(vc->state != MPIDI_VC_STATE_INACTIVE)
dprintf("poll_eager,released,type=%d,MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM=%d\n", pkt->type,
MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM);
+ if(vc->state != MPIDI_VC_STATE_INACTIVE)
MPID_nem_ib_recv_buf_released(vc,
(void *) ((uint8_t *) buf +
- MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf) +
+ sz_pkt +
sizeof(MPIDI_CH3_Pkt_t)));
}
else {
if (MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) ==
- MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf) + sizeof(MPIDI_CH3_Pkt_t) +
+ sz_pkt + sizeof(MPIDI_CH3_Pkt_t) +
sizeof(MPID_nem_ib_netmod_trailer_t)) {
if (pkt->type == MPIDI_CH3_PKT_EAGERSHORT_SEND
//|| pkt->type == MPIDI_CH3_PKT_GET
@@ -695,7 +713,10 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
}
#endif
+ if(vc->state != MPIDI_VC_STATE_INACTIVE)
dprintf("ib_poll,hdr_ringbuf_type=%d\n", MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf));
+
+ if(vc->state != MPIDI_VC_STATE_INACTIVE)
if(MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf) & MPID_NEM_IB_RINGBUF_RELINDEX) {
vc_ib->ibcom->lsr_seq_num_tail = MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf);
dprintf("ib_poll,local_tail is updated to %d\n",
@@ -703,12 +724,9 @@ int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
}
/* Clear flag */
+ if(vc->state != MPIDI_VC_STATE_INACTIVE)
MPID_NEM_IB_NETMOD_HDR_HEAD_FLAG_SET(buf, 0);
- (*remote_poll) += 1;
- dprintf("ib_poll,inc,remote_poll=%d\n", *remote_poll);
- dprintf("ib_poll_eager,3,MPIR_Process.comm_self->vcrt->ref_count=%d\n", MPIR_Process.comm_self->vcrt->ref_count);
-
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_POLL_EAGER);
return mpi_errno;
@@ -893,14 +911,13 @@ int MPID_nem_ib_poll(int in_blocking_poll)
/* [MPID_NEM_IB_NRINGBUF-1] stores shared ring buffer */
for (i = 0; i < MPID_NEM_IB_NRINGBUF; i++) {
if (((MPID_nem_ib_ringbuf_allocated[i / 64] >> (i & 63)) & 1) == 0) {
+ //dprintf("poll,cont\n");
continue;
}
//tscs = MPID_nem_ib_rdtsc();
//dprintf("poll,kicking progress engine for %d\n", i);
mpi_errno = MPID_nem_ib_poll_eager(&MPID_nem_ib_ringbuf[i]);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_poll_eager");
/* without this, command in sendq doesn't have a chance
* to perform send_progress
@@ -1034,12 +1051,14 @@ int MPID_nem_ib_recv_posted(struct MPIDI_VC *vc, struct MPID_Request *req)
if (data_sz + sizeof(MPIDI_CH3_Pkt_eager_send_t) > vc->eager_max_msg_sz) {
//if (MPID_nem_ib_tsc_poll - MPID_nem_ib_rdtsc() > MPID_NEM_IB_POLL_PERIOD_RECV_POSTED) {
#if 1
- mpi_errno = MPID_nem_ib_poll_eager(VC_FIELD(vc, ibcom->remote_ringbuf));
+ if(VC_FIELD(vc, ibcom->remote_ringbuf)) {
+ mpi_errno = MPID_nem_ib_poll_eager(VC_FIELD(vc, ibcom->remote_ringbuf));
#else
- mpi_errno = MPID_nem_ib_poll(0);
+ mpi_errno = MPID_nem_ib_poll(0);
#endif
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
+ if (mpi_errno) {
+ MPIU_ERR_POP(mpi_errno);
+ }
}
//}
}
@@ -1047,6 +1066,7 @@ int MPID_nem_ib_recv_posted(struct MPIDI_VC *vc, struct MPID_Request *req)
#if 1
/* anticipating received message finds maching request in the posted-queue */
//if (MPID_nem_ib_tsc_poll - MPID_nem_ib_rdtsc() > MPID_NEM_IB_POLL_PERIOD_RECV_POSTED) {
+ if(VC_FIELD(vc, ibcom->remote_ringbuf)) {
#if 1
mpi_errno = MPID_nem_ib_poll_eager(VC_FIELD(vc, ibcom->remote_ringbuf));
#else
@@ -1055,6 +1075,7 @@ int MPID_nem_ib_recv_posted(struct MPIDI_VC *vc, struct MPID_Request *req)
if (mpi_errno) {
MPIU_ERR_POP(mpi_errno);
}
+ }
//}
#endif
}
@@ -1927,7 +1948,14 @@ int MPID_nem_ib_cm_drain_scq()
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_DRAIN_SCQ);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_DRAIN_SCQ);
- dprintf("cm_drain_scq,enter\n");
+ //dprintf("cm_drain_scq,enter\n");
+
+
+ /* cm_drain_scq is called after poll_eager calls vc_terminate */
+ if(!MPID_nem_ib_rc_shared_scq_scratch_pad) {
+ dprintf("cm_drain_scq,CQ is null\n");
+ goto fn_exit;
+ }
result =
ibv_poll_cq(MPID_nem_ib_rc_shared_scq_scratch_pad, MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN,
@@ -2020,6 +2048,7 @@ int MPID_nem_ib_cm_drain_scq()
dprintf("cm_drain_scq,syn sent\n");
shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
+ dprintf("cm_drain_scq,tx=%d\n", shadow_cm->req->ibcom->outstanding_connection_tx);
MPIU_Free(shadow_cm);
break;
case MPID_NEM_IB_CM_SYNACK:
@@ -2027,12 +2056,15 @@ int MPID_nem_ib_cm_drain_scq()
dprintf("cm_drain_scq,synack sent,req=%p,initiator_rank=%d\n",
shadow_cm->req, shadow_cm->req->initiator_rank);
shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
+ dprintf("cm_drain_scq,tx=%d\n", shadow_cm->req->ibcom->outstanding_connection_tx);
MPIU_Free(shadow_cm);
break;
case MPID_NEM_IB_CM_ACK1:
dprintf("cm_drain_scq,ack1 sent\n");
shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
+ shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
+ dprintf("cm_drain_scq,tx=%d\n", shadow_cm->req->ibcom->outstanding_connection_tx);
/* Finalize protocol because there is no referer in cm_drain_scq and sendq.
Note that there might be one in cm_poll.*/
@@ -2044,6 +2076,8 @@ int MPID_nem_ib_cm_drain_scq()
dprintf("cm_drain_scq,ack2 sent,req=%p,initiator_rank=%p=%d\n",
shadow_cm->req, &shadow_cm->req->initiator_rank, shadow_cm->req->initiator_rank);
shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
+ shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
+ dprintf("cm_drain_scq,tx=%d\n", shadow_cm->req->ibcom->outstanding_connection_tx);
/* Let the guard down to let the following connection request go. */
VC_FIELD(MPID_nem_ib_conns[shadow_cm->req->initiator_rank].vc, connection_guard) = 0;
@@ -2108,13 +2142,13 @@ int MPID_nem_ib_cm_drain_scq()
the first queued send is issued. */
MPID_nem_ib_ringbuf_sector_t * sector =
(MPID_nem_ib_ringbuf_sector_t *) MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_sector_t));
+ MPIU_ERR_CHKANDJUMP(!sector, mpi_errno, MPI_ERR_OTHER, "**malloc");
sector->type = MPID_NEM_IB_RINGBUF_SHARED;
sector->start = VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_start);
sector->nslot = VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot);
sector->head = (uint16_t)shadow_ringbuf->req->fetched.head;
sector->tail = sector->head -
VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot) + 1;
- MPIU_ERR_CHKANDJUMP(!sector, mpi_errno, MPI_ERR_OTHER, "**malloc");
MPID_nem_ib_ringbuf_sectorq_enqueue(&VC_FIELD(shadow_ringbuf->req->vc, ibcom->sectorq),
sector);
}
@@ -2192,6 +2226,12 @@ int MPID_nem_ib_cm_poll_syn()
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_POLL_SYN);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_POLL_SYN);
+ /* scratch pad is freed after receiving CLOSE */
+ if(!MPID_nem_ib_scratch_pad) {
+ dprintf("cm_poll_syn,MPID_nem_ib_scratch_pad is zero\n");
+ goto fn_exit;
+ }
+
/* Memory layout is (CAS-word:SYN#0:SYN#1:...:SYN#N:CMD#0:CMD#1:...CMD#M) */
void* slot = (MPID_nem_ib_scratch_pad + MPID_NEM_IB_CM_OFF_SYN +
sizeof(MPID_nem_ib_cm_cmd_t) * (0 % MPID_NEM_IB_CM_NSEG));
@@ -2201,7 +2241,6 @@ int MPID_nem_ib_cm_poll_syn()
goto fn_exit;
} /* Incoming message hasn't arrived */
- MPID_nem_ib_cm_cmd_t *cmd;
switch (*head_flag) {
case MPID_NEM_IB_CM_SYN: {
volatile MPID_nem_ib_cm_cmd_syn_t *syn_tail_flag =
@@ -2220,6 +2259,7 @@ int MPID_nem_ib_cm_poll_syn()
MPID_NEM_IB_CM_LOCAL_QP_RESET)) {
ibcom_errno = MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC, &MPID_nem_ib_conns[syn->initiator_rank].fd);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
+ MPID_nem_ib_conns_ref_count++;
/* store pointer to MPID_nem_ib_com */
dprintf("cm_poll_syn,initiator fd=%d\n", MPID_nem_ib_conns[syn->initiator_rank].fd);
ibcom_errno = MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[syn->initiator_rank].fd,
@@ -2246,6 +2286,9 @@ int MPID_nem_ib_cm_poll_syn()
MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[req->initiator_rank],
&req->ibcom);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
+ /* Increment transaction counter here because this path is executed only once */
+ req->ibcom->outstanding_connection_tx += 1;
+ dprintf("cm_poll_syn,tx=%d\n", req->ibcom->outstanding_connection_tx);
if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
@@ -2352,6 +2395,12 @@ int MPID_nem_ib_cm_poll()
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_POLL);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_POLL);
+ /* scratch pad is freed after receiving CLOSE */
+ if(!MPID_nem_ib_scratch_pad) {
+ dprintf("cm_poll,MPID_nem_ib_scratch_pad is zero\n");
+ goto fn_exit;
+ }
+
/* Wrap-around tolerant by using "!=" */
for(i = MPID_nem_ib_cm_ringbuf_tail + 1; i != MPID_nem_ib_cm_ringbuf_head; i++) {
@@ -2377,8 +2426,9 @@ int MPID_nem_ib_cm_poll()
MPID_nem_ib_cm_req_t* req = (MPID_nem_ib_cm_req_t *) synack->initiator_req;
req->ringbuf_index = synack->initiator_ringbuf_index;
- dprintf("cm_poll,synack detected!,responder_req=%p,responder_rank=%d,ringbuf_index=%d\n",
- synack->responder_req, req->responder_rank, synack->initiator_ringbuf_index);
+ dprintf("cm_poll,synack detected!,responder_req=%p,responder_rank=%d,ringbuf_index=%d,tx=%d\n",
+ synack->responder_req, req->responder_rank, synack->initiator_ringbuf_index,
+ req->ibcom->outstanding_connection_tx);
/* Deduct it from the packet */
VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_state) |=
@@ -2445,8 +2495,9 @@ int MPID_nem_ib_cm_poll()
MPID_nem_ib_cm_cmd_ack1_t *ack1 = (MPID_nem_ib_cm_cmd_ack1_t *) slot;
MPID_nem_ib_cm_req_t* req = (MPID_nem_ib_cm_req_t *) ack1->responder_req;
- dprintf("cm_poll,ack1 detected!,responder_req=%p,initiator_rank=%d\n",
- ack1->responder_req, req->initiator_rank);
+ dprintf("cm_poll,ack1 detected!,responder_req=%p,initiator_rank=%d,tx=%d\n",
+ ack1->responder_req, req->initiator_rank,
+ req->ibcom->outstanding_connection_tx);
/* Deduct it from the packet */
VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, connection_state) |=
@@ -2486,8 +2537,9 @@ int MPID_nem_ib_cm_poll()
if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY) {
- dprintf("cm_poll,sending ack2,req=%p,ringbuf_index=%d,initiator_rank=%d\n",
- req, req->ringbuf_index, req->initiator_rank);
+ dprintf("cm_poll,sending ack2,req=%p,ringbuf_index=%d,initiator_rank=%d,tx=%d\n",
+ req, req->ringbuf_index, req->initiator_rank,
+ req->ibcom->outstanding_connection_tx);
MPID_nem_ib_cm_cmd_ack2_t *cmd = (MPID_nem_ib_cm_cmd_ack2_t *) req->ibcom->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
MPID_NEM_IB_CM_COMPOSE_ACK2(cmd, ack1->initiator_req);
@@ -2525,8 +2577,9 @@ int MPID_nem_ib_cm_poll()
MPID_nem_ib_cm_cmd_ack2_t *ack2 = (MPID_nem_ib_cm_cmd_ack2_t *) slot;
MPID_nem_ib_cm_req_t* req = (MPID_nem_ib_cm_req_t *) ack2->initiator_req;
- dprintf("cm_poll,ack2 detected!,req=%p,responder_rank=%d\n",
- req, req->responder_rank);
+ dprintf("cm_poll,ack2 detected!,req=%p,responder_rank=%d,tx=%d\n",
+ req, req->responder_rank,
+ req->ibcom->outstanding_connection_tx);
/* Deduct it from the packet */
if(!(VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_state) &
@@ -2654,6 +2707,7 @@ int MPID_nem_ib_ringbuf_alloc(MPIDI_VC_t * vc)
memset(MPID_nem_ib_ringbuf[i].remote_released, 0, (MPID_NEM_IB_RINGBUF_NSLOT + 63) / 64);
MPID_nem_ib_ringbuf_allocated[i / 64] |= (1ULL << (i & 63));
}
+ MPID_nem_ib_ringbuf[i].ref_count++;
VC_FIELD(vc, ibcom->remote_ringbuf) = &MPID_nem_ib_ringbuf[i];
dprintf("ringbuf_alloc,not found\n");
@@ -2665,3 +2719,68 @@ int MPID_nem_ib_ringbuf_alloc(MPIDI_VC_t * vc)
fn_fail:
goto fn_exit;
}
+
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_ringbuf_free
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_ringbuf_free(MPIDI_VC_t * vc)
+{
+ int mpi_errno = MPI_SUCCESS;
+ int ibcom_errno;
+ int i;
+
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_FREE);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_FREE);
+
+ /* No ring-buffer is allocated */
+ if(!VC_FIELD(vc, ibcom->remote_ringbuf)) {
+ goto fn_exit;
+ }
+
+ int index = ((uint8_t *)VC_FIELD(vc, ibcom->remote_ringbuf) - (uint8_t*)&MPID_nem_ib_ringbuf[0]) /
+ sizeof(MPID_nem_ib_ringbuf_t);
+ dprintf("ringbuf_free,index=%d\n", index);
+
+ switch(VC_FIELD(vc, ibcom->remote_ringbuf)->type) {
+ case MPID_NEM_IB_RINGBUF_EXCLUSIVE:
+ dprintf("ringbuf_free,start=%p\n", VC_FIELD(vc, ibcom->remote_ringbuf)->start);
+ MPID_nem_ib_rdmawr_to_free(VC_FIELD(vc, ibcom->remote_ringbuf)->start, MPID_NEM_IB_RINGBUF_NSLOT);
+ MPID_nem_ib_ringbuf_allocated[index / 64] &= ~(1ULL << (index & 63));
+ VC_FIELD(vc, ibcom->remote_ringbuf) = NULL;
+ MPID_nem_ib_ringbuf_acquired[index / 64] &= ~(1ULL << (index & 63));
+ dprintf("ringbuf_free,exclucsive,allocated=%0lx\n", MPID_nem_ib_ringbuf_allocated[index / 64]);
+ break;
+ case MPID_NEM_IB_RINGBUF_SHARED:
+ dprintf("ringbuf_free,shared,ref_count=%d\n", VC_FIELD(vc, ibcom->remote_ringbuf)->ref_count);
+ MPIU_Assert(VC_FIELD(vc, ibcom->remote_ringbuf)->ref_count > 0);
+ if(--VC_FIELD(vc, ibcom->remote_ringbuf)->ref_count == 0) {
+ MPID_nem_ib_rdmawr_to_free(VC_FIELD(vc, ibcom->remote_ringbuf)->start, MPID_NEM_IB_RINGBUF_NSLOT);
+ MPID_nem_ib_ringbuf_allocated[index / 64] &= ~(1ULL << (index & 63));
+ dprintf("ringbuf_free,shared,allocated=%0lx\n", MPID_nem_ib_ringbuf_allocated[index / 64]);
+ }
+ VC_FIELD(vc, ibcom->remote_ringbuf) = NULL;
+ default:
+ printf("unknown ring-buffer type\n");
+ MPIU_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_free");
+ break;
+ }
+
+ int found = 0;
+ for(i = 0; i < (MPID_NEM_IB_NRINGBUF + 63) / 64; i++) {
+ if(MPID_nem_ib_ringbuf_allocated[i] != 0) {
+ found = 1;
+ break;
+ }
+ }
+
+ if(!found) {
+ MPIU_Free(MPID_nem_ib_ringbuf);
+ }
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_FREE);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+}
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
index 6704b95..709a417 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
@@ -25,6 +25,8 @@
#define MPID_NEM_IB_COM_REG_MR_SZPAGE 4096
#define MPID_NEM_IB_COM_REG_MR_LOGSZPAGE 12
+static int ref_count;
+
/* Allocator using reference count at the head of
aligned memory area */
@@ -303,8 +305,10 @@ int MPID_nem_ib_com_register_cache_init()
int ibcom_errno = 0;
int i;
- ref_cout++;
-
+ ref_count++;
+ dprintf("cache_init,ref_count=%d\n", ref_count);
+
+ if(ref_count == 1) {
/* Using the address to the start node to express the end of the list
* instead of using NULL */
for (i = 0; i < MPID_NEM_IB_COM_REG_MR_NLINE; i++) {
@@ -315,6 +319,7 @@ int MPID_nem_ib_com_register_cache_init()
}
dprintf("[MrCache] cache initializes %d entries\n", MPID_NEM_IB_COM_REG_MR_NLINE);
+ }
fn_exit:
return ibcom_errno;
@@ -329,7 +334,9 @@ int MPID_nem_ib_com_register_cache_release()
struct MPID_nem_ib_com_reg_mr_cache_entry_t *p;
int i = 0, cnt = 0;
- MPIU_Assert(ref_count > 0) {
+ dprintf("cache_release,ref_count=%d\n", ref_count);
+
+ MPIU_Assert(ref_count > 0);
if(--ref_count > 0) {
goto fn_exit;
}
@@ -339,14 +346,20 @@ int MPID_nem_ib_com_register_cache_release()
(struct MPID_nem_ib_com_reg_mr_cache_entry_t *) MPID_nem_ib_com_reg_mr_cache[i].
lru_next;
p != (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) &MPID_nem_ib_com_reg_mr_cache[i];
- p = (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) p->lru_next) {
+ ) {
if (p && p->addr > 0) {
ib_errno = MPID_nem_ib_com_dereg_mr(p->mr);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, printf("MPID_nem_ib_com_dereg_mr"));
- afree(p, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
+ struct MPID_nem_ib_com_reg_mr_cache_entry_t *p_old = p;
+ p = (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) p->lru_next;
+ afree(p_old, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
cnt++;
}
}
+ MPID_nem_ib_com_reg_mr_cache[i].lru_next =
+ (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_cache[i];
+ MPID_nem_ib_com_reg_mr_cache[i].lru_prev =
+ (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_cache[i];
}
//__lru_queue_display();
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
index 303964f..d7453bd 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
@@ -20,6 +20,7 @@
static int entered_send_progress = 0;
#ifdef MPID_NEM_IB_ONDEMAND
+#if 0
// tree format is
// one or more <left_pointer(int), right_pointer(int), value(int), length(int), string(char[])>
#define MPID_NEM_IB_MAP_LPTR(ptr) *(int*)((ptr) + sizeof(int)*0)
@@ -138,6 +139,7 @@ int MPID_nem_ib_cm_map_get(MPID_nem_ib_cm_map_t * map, char *key, int key_length
goto fn_exit;
}
#endif
+#endif
#undef FUNCNAME
#define FUNCNAME MPID_nem_ib_iSendContig_core
@@ -405,7 +407,7 @@ int MPID_nem_ib_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
if (vc_ib->connection_state != MPID_NEM_IB_CM_ESTABLISHED) {
/* connected=closed/transit,ringbuf-type=shared,slot-available=no,
going-to-be-enqueued=yes case */
- REQ_FIELD(sreq, ask) = 0;
+ REQ_FIELD(sreq, ask) = 0; /* We can't ask because ring-buffer type is not determined yet. */
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_ask_fetch");
} else {
/* connected=established,ringbuf-type=shared,slot-available=no,
@@ -994,7 +996,7 @@ int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
vc_ib->ibcom->lsr_seq_num_tail) >= vc_ib->ibcom->local_ringbuf_nslot)) {
/* Prevent RDMA-read for rendezvous protocol from issuing ask */
- if(!REQ_FIELD(sreq, ask)) {
+ if(!REQ_FIELD(sreq, ask)) { /* First packet after connection hasn't asked slot */
/* Transitioning from exclusive to shared and need to issue ask.
This case is detected because exclusive entries in the queue are deleted
and deprived of slots of exclusive and the last state is set to
@@ -1449,6 +1451,11 @@ int MPID_nem_ib_cm_cas(MPIDI_VC_t * vc, uint32_t ask_on_connect)
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
dprintf("req->ibcom=%p\n", req->ibcom);
+ /* Increment transaction counter here because cm_cas is called only once
+ (cm_cas_core might be called more than once when retrying) */
+ req->ibcom->outstanding_connection_tx += 1;
+ dprintf("cm_cas,tx=%d\n", req->ibcom->outstanding_connection_tx);
+
/* Acquire remote scratch pad */
if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
@@ -1514,6 +1521,7 @@ int MPID_nem_ib_cm_cmd_core(int rank, MPID_nem_ib_cm_cmd_shadow_t* shadow, void*
/* Prepare QP (RESET). Attempting to overlap it with preparing QP (RESET) on the responder side */
ibcom_errno = MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC, &MPID_nem_ib_conns[rank].fd);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
+ MPID_nem_ib_conns_ref_count++;
VC_FIELD(MPID_nem_ib_conns[rank].vc, connection_state) |=
MPID_NEM_IB_CM_LOCAL_QP_RESET;
http://git.mpich.org/mpich.git/commitdiff/d384cbab8bd27a1c46a3f13891b11f8152d92a14
commit d384cbab8bd27a1c46a3f13891b11f8152d92a14
Author: Masamichi Takagi <masamichi.takagi at gmail.com>
Date: Tue Jan 21 03:34:51 2014 +0900
Add on-demand connection management for IB
A pair of QPs connecting two MPI processes is created only when a
communication between them is requested.
Allocation of eager-send ring-buffer is implemented as well. First N
connections are given ring-buffers used only for them and the following
connections are given a shared ring-buffer, similar to send/recv, of
which a sender needs to reserve a buffer slot before initiating
RDMA-write.
Signed-off-by: Pavan Balaji <balaji at anl.gov>
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt b/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
index e71e98e..4de7a41 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
@@ -1,38 +1,25 @@
**MPIDI_PG_GetConnKVSname:MPIDI_PG_GetConnKVSname failed
-**MPID_nem_ib_cm_accept:MPID_nem_ib_cm_accept failed
+**MPID_nem_ib_cm_cas:MPID_nem_ib_cm_cas failed
+**MPID_nem_ib_cm_connect_cas_core:MPID_nem_ib_cm_connect_cas_core failed
**MPID_nem_ib_cm_drain_scq:MPID_nem_ib_cm_drain_scq failed
**MPID_nem_ib_cm_poll:MPID_nem_ib_cm_poll failed
-**MPID_nem_ib_cm_put:MPID_nem_ib_cm_put failed
+**MPID_nem_ib_cm_poll_syn:MPID_nem_ib_cm_poll_syn failed
+**MPID_nem_ib_cm_progress:MPID_nem_ib_cm_progress failed
**MPID_nem_ib_cm_send_core:MPID_nem_ib_cm_send_core failed
-**MPID_nem_ib_drain_scq:MPID_nem_ib_drain_scq failed
-**MPID_nem_ib_drain_scq_lmt_put:MPID_nem_ib_drain_scq_lmt_put failed
-**MPID_nem_ib_drain_scq_scratch_pad:MPID_nem_ib_drain_scq_scratch_pad failed
-**MPID_nem_ib_kvs_put_binary:MPID_nem_ib_kvs_put_binary failed
-**MPID_nem_ib_lmt_done_recv:MPID_nem_ib_lmt_done_recv failed
-**MPID_nem_ib_lmt_done_send:MPID_nem_ib_lmt_done_send failed
-**MPID_nem_ib_npollingset:MPID_nem_ib_npollingset failed
-**MPID_nem_ib_poll:MPID_nem_ib_poll failed
-**MPID_nem_ib_send_change_rdmabuf_occupancy_notify_state:MPID_nem_ib_send_change_rdmabuf_occupancy_notify_state failed
-**MPID_nem_ib_send_reply_seq_num:MPID_nem_ib_send_reply_seq_num failed
-**MPID_nem_ib_send_req_seq_num:MPID_nem_ib_send_req_seq_num failed
-**PMI_Barrier:PMI_Barrier failed
-**PMI_KVS_Get:PMI_KVS_Get failed
-**PMI_KVS_Put:PMI_KVS_Put failed
-**PMS_KVS_Get:PMS_KVS_Get failed
-**MPID_nem_ib_lmt_send_GET_DONE:MPID_nem_ib_lmt_send_GET_DONE failed
-**MPID_nem_ib_com_open:MPID_nem_ib_com_open failed
**MPID_nem_ib_com_alloc:MPID_nem_ib_com_alloc failed
+**MPID_nem_ib_com_cas_scratch_pad:MPID_nem_ib_com_cas_scratch_pad failed
**MPID_nem_ib_com_close:MPID_nem_ib_com_close failed
+**MPID_nem_ib_com_connect_ringbuf:MPID_nem_ib_com_connect_ringbuf failed
+**MPID_nem_ib_com_free:MPID_nem_ib_com_free failed
**MPID_nem_ib_com_get_info_conn:MPID_nem_ib_com_get_info_conn failed
**MPID_nem_ib_com_get_info_mr:MPID_nem_ib_com_get_info_mr failed
+**MPID_nem_ib_com_get_scratch_pad:MPID_nem_ib_com_get_scratch_pad failed
**MPID_nem_ib_com_irecv:MPID_nem_ib_com_irecv failed
**MPID_nem_ib_com_isend:MPID_nem_ib_com_isend failed
**MPID_nem_ib_com_lrecv:MPID_nem_ib_com_lrecv failed
-**MPID_nem_ib_com_lsr_seq_num_tail_get:MPID_nem_ib_com_lsr_seq_num_tail_get failed
-**MPID_nem_ib_com_mem_udwr_from:MPID_nem_ib_com_mem_udwr_from failed
-**MPID_nem_ib_com_mem_udwr_to:MPID_nem_ib_com_mem_udwr_to failed
**MPID_nem_ib_com_obtain_pointer:MPID_nem_ib_com_obtain_pointer failed
+**MPID_nem_ib_com_open:MPID_nem_ib_com_open failed
**MPID_nem_ib_com_poll_cq %s:MPID_nem_ib_com_poll_cq failed with error %s
**MPID_nem_ib_com_poll_cq:MPID_nem_ib_com_poll_cq failed
**MPID_nem_ib_com_put_scratch_pad:MPID_nem_ib_com_put_scratch_pad failed
@@ -41,12 +28,32 @@
**MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get:MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get failed
**MPID_nem_ib_com_reg_mr_connect:MPID_nem_ib_com_reg_mr_connect failed
**MPID_nem_ib_com_reg_mr_fetch:MPID_nem_ib_com_reg_mr_fetch failed
-**MPID_nem_ib_com_rsr_seq_num_tail_get:MPID_nem_ib_com_rsr_seq_num_tail_get failed
-**MPID_nem_ib_com_rsr_seq_num_tail_last_sent_get:MPID_nem_ib_com_rsr_seq_num_tail_last_sent_get failed
**MPID_nem_ib_com_rts:MPID_nem_ib_com_rts failed
**MPID_nem_ib_com_sq_occupancy_notify_rate_get:MPID_nem_ib_com_sq_occupancy_notify_rate_get failed
-**MPID_nem_ib_com_sseq_num_get:MPID_nem_ib_com_sseq_num_get failed
-**MPID_nem_ib_com_udrecv:MPID_nem_ib_com_udrecv failed
+**MPID_nem_ib_drain_scq:MPID_nem_ib_drain_scq failed
+**MPID_nem_ib_drain_scq_scratch_pad:MPID_nem_ib_drain_scq_scratch_pad failed
+**MPID_nem_ib_kvs_put_binary:MPID_nem_ib_kvs_put_binary failed
+**MPID_nem_ib_lmt_done_recv:MPID_nem_ib_lmt_done_recv failed
+**MPID_nem_ib_lmt_done_send:MPID_nem_ib_lmt_done_send failed
+**MPID_nem_ib_lmt_send_GET_DONE:MPID_nem_ib_lmt_send_GET_DONE failed
+**MPID_nem_ib_npollingset:MPID_nem_ib_npollingset failed
+**MPID_nem_ib_poll:MPID_nem_ib_poll failed
+**MPID_nem_ib_rdma_to_alloc:MPID_nem_ib_rdma_to_alloc failed
+**MPID_nem_ib_ring_alloc:MPID_nem_ib_ring_alloc failed
+**MPID_nem_ib_ringbuf_alloc:MPID_nem_ib_ringbuf_alloc failed
+**MPID_nem_ib_ringbuf_ask_cas:MPID_nem_ib_ringbuf_ask_cas failed
+**MPID_nem_ib_ringbuf_ask_fetch:MPID_nem_ib_ringbuf_ask_fetch failed
+**MPID_nem_ib_ringbuf_connect_cas_core:MPID_nem_ib_ringbuf_connect_cas_core failed
+**MPID_nem_ib_ringbuf_progress:MPID_nem_ib_ringbuf_progress failed
+**MPID_nem_ib_ringbuf_send_core:MPID_nem_ib_ringbuf_send_core failed
+**MPID_nem_ib_send_change_rdmabuf_occupancy_notify_state:MPID_nem_ib_send_change_rdmabuf_occupancy_notify_state failed
+**MPID_nem_ib_send_progress:MPID_nem_ib_send_progress failed
+**MPID_nem_ib_send_reply_seq_num:MPID_nem_ib_send_reply_seq_num failed
+**MPID_nem_ib_send_req_seq_num:MPID_nem_ib_send_req_seq_num failed
+**PMI_Barrier:PMI_Barrier failed
+**PMI_KVS_Put:PMI_KVS_Put failed
+**PMS_KVS_Get:PMS_KVS_Get failed
+**malloc:malloc failed
**netmod,ib,ibv_poll_cq:netmod,ib,ibv_poll_cq failed
**notimplemented:notimplemented failed
**outofmemory:outofmemory failed
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
index 2c2cf5c..6aa2c2b 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
@@ -40,14 +40,17 @@ struct ibv_context *MPID_nem_ib_ctx_export; /* for SC13 demo connector */
static struct ibv_pd *ib_pd;
struct ibv_pd *MPID_nem_ib_pd_export; /* for SC13 demo connector */
struct ibv_cq *MPID_nem_ib_rc_shared_scq;
-struct ibv_cq *MPID_nem_ib_rc_shared_scq_lmt_put;
struct ibv_cq *MPID_nem_ib_rc_shared_scq_scratch_pad;
static struct ibv_cq *MPID_nem_ib_rc_shared_rcq;
-static struct ibv_cq *MPID_nem_ib_rc_shared_rcq_lmt_put;
static struct ibv_cq *MPID_nem_ib_rc_shared_rcq_scratch_pad;
static struct ibv_cq *MPID_nem_ib_ud_shared_scq;
struct ibv_cq *MPID_nem_ib_ud_shared_rcq;
-static uint8_t *scratch_pad = 0;
+uint8_t *MPID_nem_ib_scratch_pad = 0;
+char *MPID_nem_ib_rdmawr_from_alloc_free_list_front[MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID] = { 0 };
+char *MPID_nem_ib_rdmawr_from_alloc_arena_free_list[MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID] = { 0 };
+struct ibv_mr* MPID_nem_ib_rdmawr_to_alloc_mr;
+uint8_t *MPID_nem_ib_rdmawr_to_alloc_start;
+uint8_t *MPID_nem_ib_rdmawr_to_alloc_free_list;
#define MPID_NEM_IB_RANGE_CHECK(condesc, conp) \
{ \
@@ -65,7 +68,101 @@ static uint8_t *scratch_pad = 0;
MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_used != 1, -1, dprintf("MPID_NEM_IB_RANGE_CHECK_WITH_ERROR,conp->icom_used=%d\n", conp->icom_used)); \
}
-static int modify_qp_to_init(struct ibv_qp *qp, int ib_port)
+/* Allocator for RDMA write to buffer
+ - Allocate performs dequeue
+ - Slow to "malloc" (two load and one store instructions)
+ - Free performs enqueue
+ - Slow to "free" (one load and two store instructions)
+ - No fragmentation occurs
+ - munmap unit is small (4KB)
+ - Less header when compared to reference count
+ - Refill never happens because IB-registers whole pool at the beginning
+ - Fast when first-time allocs occur
+ - Free list is a linked list
+ - Fast to find an empty slot (one load instruction)
+ */
+static int MPID_nem_ib_rdmawr_to_init(uint64_t sz)
+{
+ int ibcom_errno = 0;
+ void *start;
+ void *cur;
+ start = (void *) mmap(0, sz, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE,
+ -1, 0);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(start == (void *) -1, -1, printf("mmap failed\n"));
+ dprintf("rdmawr_to_init,sz=%ld,start=%p\n", sz, start);
+
+ memset(start, 0, sz);
+
+ MPID_nem_ib_rdmawr_to_alloc_mr = MPID_nem_ib_com_reg_mr_fetch(start, sz, 0);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(!MPID_nem_ib_rdmawr_to_alloc_mr, -1,
+ printf("MPID_nem_ib_com_reg_mr_fetchibv_reg_mr failed\n"));
+ dprintf("rdmawr_to_init,rkey=%08x\n", MPID_nem_ib_rdmawr_to_alloc_mr->rkey);
+
+ MPID_nem_ib_rdmawr_to_alloc_start = start;
+ MPID_nem_ib_rdmawr_to_alloc_free_list = start;
+ for (cur = start;
+ cur < start + sz - MPID_NEM_IB_COM_RDMABUF_SZSEG;
+ cur += MPID_NEM_IB_COM_RDMABUF_SZSEG) {
+ //dprintf("rdmawr_to_init,cur=%p\n", cur);
+ ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) cur)->next = cur + MPID_NEM_IB_COM_RDMABUF_SZSEG;
+ }
+ ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) cur)->next = 0;
+
+ fn_exit:
+ return ibcom_errno;
+ fn_fail:
+ goto fn_exit;
+}
+
+void *MPID_nem_ib_rdmawr_to_alloc(int nslots)
+{
+ dprintf("rdmawr_to_alloc,nslots=%d\n", nslots);
+ void *start;
+ int i;
+ for(i = 0; i < nslots; i++) {
+ //dprintf("MPID_nem_ib_rdmawr_to_alloc,free_list=%p\n", MPID_nem_ib_rdmawr_to_alloc_free_list);
+ if (MPID_nem_ib_rdmawr_to_alloc_free_list) {
+ if(i == 0) {
+ start = MPID_nem_ib_rdmawr_to_alloc_free_list;
+ }
+ MPID_nem_ib_rdmawr_to_alloc_free_list =
+ ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) MPID_nem_ib_rdmawr_to_alloc_free_list)->next;
+ }
+ else {
+ printf("out of rdmawr_to bufer\n");
+ return 0;
+ }
+ }
+ return start;
+}
+
+void MPID_nem_ib_rdmawr_to_free(void *p, int nslots)
+{
+ void *q;
+ ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *)
+ ((uint8_t*)p + MPID_NEM_IB_COM_RDMABUF_SZSEG * (nslots-1)))->next =
+ MPID_nem_ib_rdmawr_to_alloc_free_list;
+ for (q = p + MPID_NEM_IB_COM_RDMABUF_SZSEG * (nslots-2);
+ q >= p;
+ q -= MPID_NEM_IB_COM_RDMABUF_SZSEG) {
+ ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) q)->next = q + MPID_NEM_IB_COM_RDMABUF_SZSEG;
+ }
+ MPID_nem_ib_rdmawr_to_alloc_free_list = p;
+}
+
+int MPID_nem_ib_rdmawr_to_munmap(void *p, int nslots)
+{
+ int retval;
+ int ibcom_errno = 0;
+ retval = munmap(p, MPID_NEM_IB_COM_RDMABUF_SZSEG * nslots);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, printf("munmap failed\n"));
+ fn_exit:
+ return ibcom_errno;
+ fn_fail:
+ goto fn_exit;
+}
+
+static int modify_qp_to_init(struct ibv_qp *qp, int ib_port, int additional_flags)
{
struct ibv_qp_attr attr;
int flags;
@@ -76,7 +173,8 @@ static int modify_qp_to_init(struct ibv_qp *qp, int ib_port)
attr.port_num = ib_port;
attr.pkey_index = 0;
attr.qp_access_flags =
- IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE;
+ IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ |
+ IBV_ACCESS_REMOTE_WRITE | additional_flags;
flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS;
rc = ibv_modify_qp(qp, &attr, flags);
if (rc) {
@@ -177,9 +275,6 @@ static int MPID_nem_ib_com_device_init()
if (ib_initialized == -1)
return -1;
- /* initialize ibv_reg_mr cache */
- MPID_nem_ib_com_register_cache_init();
-
/* Get the device list */
ib_devlist = ibv_get_device_list(&dev_num);
if (!ib_devlist || !dev_num) {
@@ -252,61 +347,113 @@ static int MPID_nem_ib_com_device_init()
goto fn_exit;
}
-static void MPID_nem_ib_com_clean(MPID_nem_ib_com_t * conp)
+static int MPID_nem_ib_com_clean(MPID_nem_ib_com_t * conp)
{
int i;
+ int ibcom_errno = 0;
+ int retval;
- if (conp->icom_qp)
+ if (conp->icom_qp) {
ibv_destroy_qp(conp->icom_qp);
+ conp->icom_qp = NULL;
+ }
if (conp->icom_mrlist && conp->icom_mrlen > 0) {
switch (conp->open_flag) {
case MPID_NEM_IB_COM_OPEN_RC:
- for (i = 0; i < MPID_NEM_IB_COM_NBUF_RDMA; i++) {
- if (conp->icom_mrlist[i]) {
- ibv_dereg_mr(conp->icom_mrlist[i]);
- }
+ if(MPID_nem_ib_rc_shared_scq) {
+ ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_scq);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq"));
+ MPID_nem_ib_rc_shared_scq = NULL;
}
+ if(MPID_nem_ib_rc_shared_rcq) {
+ ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_rcq);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq"));
+ MPID_nem_ib_rc_shared_rcq = NULL;
+ }
+ retval = munmap(conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM], MPID_NEM_IB_COM_RDMABUF_SZ);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, dprintf("munmap"));
+ retval = munmap(conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO], MPID_NEM_IB_COM_RDMABUF_SZ);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, dprintf("munmap"));
+
+ MPIU_Free(conp->icom_mrlist);
+ MPIU_Free(conp->icom_mem);
+ MPIU_Free(conp->icom_msize);
+
+ MPIU_Free(conp->icom_rmem);
+ MPIU_Free(conp->icom_rsize);
+ MPIU_Free(conp->icom_rkey);
+ for (i = 0; i < MPID_NEM_IB_COM_SMT_INLINE_NCHAIN; i++) {
+#ifndef HAVE_LIBDCFA
+ MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].sg_list);
+#endif
+ }
+ MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list);
+ MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list);
+ MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].sg_list);
+ MPIU_Free(conp->icom_sr);
+ MPIU_Free(conp->icom_rr);
break;
case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
- for (i = 0; i < MPID_NEM_IB_COM_NBUF_SCRATCH_PAD; i++) {
- if (conp->icom_mrlist[i]) {
- ibv_dereg_mr(conp->icom_mrlist[i]);
- }
+ if(MPID_nem_ib_rc_shared_scq_scratch_pad) {
+ ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_scq_scratch_pad);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq"));
+ MPID_nem_ib_rc_shared_scq_scratch_pad = NULL;
+ }
+ if(MPID_nem_ib_rc_shared_scq_scratch_pad) {
+ ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_scq_scratch_pad);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq"));
+ MPID_nem_ib_rc_shared_scq_scratch_pad = NULL;
}
+ MPIU_Free(conp->icom_mrlist);
+ MPIU_Free(conp->icom_mem);
+ MPIU_Free(conp->icom_msize);
+
+ MPIU_Free(conp->icom_rmem);
+ MPIU_Free(conp->icom_rsize);
+ MPIU_Free(conp->icom_rkey);
+
+#ifndef HAVE_LIBDCFA
+ MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list);
+#endif
+ MPIU_Free(conp->icom_sr);
break;
case MPID_NEM_IB_COM_OPEN_UD:
- for (i = 0; i < MPID_NEM_IB_COM_NBUF_UD; i++) {
- if (conp->icom_mrlist[i]) {
- ibv_dereg_mr(conp->icom_mrlist[i]);
- }
+ if(MPID_nem_ib_ud_shared_scq) {
+ ib_errno = ibv_destroy_cq(MPID_nem_ib_ud_shared_scq);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq"));
+ MPID_nem_ib_ud_shared_scq = NULL;
}
+ if(MPID_nem_ib_ud_shared_rcq) {
+ ib_errno = ibv_destroy_cq(MPID_nem_ib_ud_shared_rcq);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq"));
+ MPID_nem_ib_ud_shared_rcq = NULL;
+ }
+ retval = munmap(conp->icom_mem[MPID_NEM_IB_COM_UDWR_FROM], MPID_NEM_IB_COM_UDBUF_SZ);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, dprintf("munmap"));
+ retval = munmap(conp->icom_mem[MPID_NEM_IB_COM_UDWR_TO], MPID_NEM_IB_COM_UDBUF_SZ);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, dprintf("munmap"));
+
+ MPIU_Free(conp->icom_mrlist);
+ MPIU_Free(conp->icom_mem);
+ MPIU_Free(conp->icom_msize);
+
+ MPIU_Free(conp->icom_ah_attr);
+#ifndef HAVE_LIBDCFA
+ MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_UD_INITIATOR].sg_list);
+#endif
+ MPIU_Free(conp->icom_sr);
+
+ MPIU_Free(conp->icom_rr[MPID_NEM_IB_COM_UD_RESPONDER].sg_list);
+ MPIU_Free(conp->icom_rr);
break;
}
- MPIU_Free(conp->icom_mrlist);
- }
- if (conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM]) {
- munmap(conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM], MPID_NEM_IB_COM_RDMABUF_SZ);
- }
- if (conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO]) {
- munmap(conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO], MPID_NEM_IB_COM_RDMABUF_SZ);
- }
- if (conp->icom_scq) {
- ibv_destroy_cq(conp->icom_scq);
- }
- if (conp->icom_rcq) {
- ibv_destroy_cq(conp->icom_rcq);
- }
- if (conp->icom_rmem) {
- MPIU_Free(conp->icom_rmem);
- }
- if (conp->icom_rsize) {
- MPIU_Free(conp->icom_rsize);
- }
- if (conp->icom_rkey) {
- MPIU_Free(conp->icom_rkey);
}
memset(conp, 0, sizeof(MPID_nem_ib_com_t));
- // TODO: free ah, sge, command template, ...
+
+ fn_exit:
+ return ibcom_errno;
+ //fn_fail:
+ goto fn_exit;
}
int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
@@ -324,7 +471,6 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
int open_flag_conn = open_flag;
if (open_flag_conn != MPID_NEM_IB_COM_OPEN_RC &&
- open_flag_conn != MPID_NEM_IB_COM_OPEN_RC_LMT_PUT &&
open_flag_conn != MPID_NEM_IB_COM_OPEN_UD &&
open_flag_conn != MPID_NEM_IB_COM_OPEN_SCRATCH_PAD) {
dprintf("MPID_nem_ib_com_open,bad flag\n");
@@ -332,6 +478,11 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
goto fn_fail;
}
+ /* Increment reference counter of ibv_reg_mr cache */
+ ibcom_errno = MPID_nem_ib_com_register_cache_init();
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ibcom_errno, -1,
+ dprintf("MPID_nem_ib_com_register_cache_init"));
+
/* device open error */
if (MPID_nem_ib_com_device_init() < 0) {
ibcom_errno = -1;
@@ -365,10 +516,10 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
conp->rsr_seq_num_poll = 0; /* it means slot 0 is polled */
conp->rsr_seq_num_tail = -1; /* it means slot 0 is not released */
conp->rsr_seq_num_tail_last_sent = -1;
- conp->lsr_seq_num_tail = -1;
conp->lsr_seq_num_tail_last_requested = -2;
conp->rdmabuf_occupancy_notify_rstate = MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_LW;
conp->rdmabuf_occupancy_notify_lstate = MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_LW;
+ conp->ask_guard = 0;
//dprintf("MPID_nem_ib_com_open,ptr=%p,rsr_seq_num_poll=%d\n", conp, conp->rsr_seq_num_poll);
#ifdef HAVE_LIBDCFA
@@ -433,33 +584,6 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
}
conp->icom_rcq = MPID_nem_ib_rc_shared_rcq_scratch_pad;
break;
- case MPID_NEM_IB_COM_OPEN_RC_LMT_PUT:
- if (!MPID_nem_ib_rc_shared_scq_lmt_put) {
-#ifdef HAVE_LIBDCFA
- MPID_nem_ib_rc_shared_scq_lmt_put =
- ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
-#else
- MPID_nem_ib_rc_shared_scq_lmt_put =
- ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY, NULL, NULL, 0);
-#endif
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!MPID_nem_ib_rc_shared_scq_lmt_put, -1,
- dprintf("MPID_nem_ib_rc_shared_scq"));
- }
- conp->icom_scq = MPID_nem_ib_rc_shared_scq_lmt_put;
-
- if (!MPID_nem_ib_rc_shared_rcq_lmt_put) {
-#ifdef HAVE_LIBDCFA
- MPID_nem_ib_rc_shared_rcq_lmt_put =
- ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
-#else
- MPID_nem_ib_rc_shared_rcq_lmt_put =
- ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY, NULL, NULL, 0);
-#endif
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!MPID_nem_ib_rc_shared_rcq_lmt_put, -1,
- dprintf("MPID_nem_ib_rc_shared_rcq"));
- }
- conp->icom_rcq = MPID_nem_ib_rc_shared_rcq_lmt_put;
- break;
case MPID_NEM_IB_COM_OPEN_UD:
if (!MPID_nem_ib_ud_shared_scq) {
#ifdef HAVE_LIBDCFA
@@ -498,7 +622,6 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
qp_init_attr.cap.max_inline_data = MPID_NEM_IB_COM_INLINE_DATA;
switch (open_flag) {
case MPID_NEM_IB_COM_OPEN_RC:
- case MPID_NEM_IB_COM_OPEN_RC_LMT_PUT:
case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
qp_init_attr.qp_type = IBV_QPT_RC;
break;
@@ -558,28 +681,6 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
memset(conp->icom_msize, 0, sizeof(int *) * MPID_NEM_IB_COM_NBUF_RDMA);
mr_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE;
- /* RDMA-write-from local memory area */
- conp->icom_msize[MPID_NEM_IB_COM_RDMAWR_FROM] = MPID_NEM_IB_COM_RDMABUF_SZ;
- conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM] =
- mmap(0, MPID_NEM_IB_COM_RDMABUF_SZ, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE,
- -1, 0);
- dprintf("MPID_nem_ib_com_open,mmap=%p,len=%d\n",
- conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM], MPID_NEM_IB_COM_RDMABUF_SZ);
- if (conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM] == (void *) -1) {
- fprintf(stderr, "failed to allocate buffer\n");
- goto err_exit;
- }
- memset(conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM], 0,
- conp->icom_msize[MPID_NEM_IB_COM_RDMAWR_FROM]);
-
- conp->icom_mrlist[MPID_NEM_IB_COM_RDMAWR_FROM] =
- MPID_nem_ib_com_reg_mr_fetch(conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM],
- conp->icom_msize[MPID_NEM_IB_COM_RDMAWR_FROM]);
- if (!conp->icom_mrlist[MPID_NEM_IB_COM_RDMAWR_FROM]) {
- fprintf(stderr, "ibv_reg_mr failed with mr_flags=0x%x\n", mr_flags);
- goto err_exit;
- }
-
/* RDMA-write-to local memory area */
conp->icom_msize[MPID_NEM_IB_COM_RDMAWR_TO] = MPID_NEM_IB_COM_RDMABUF_SZ;
#if 0
@@ -593,35 +694,32 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
goto fn_fail;
}
#else
+ /* ibv_reg_mr all memory area for all ring buffers
+ including shared and exclusive ones */
+ if(!MPID_nem_ib_rdmawr_to_alloc_start) {
+ ibcom_errno =
+ MPID_nem_ib_rdmawr_to_init(MPID_NEM_IB_COM_RDMABUF_SZ * MPID_NEM_IB_NRINGBUF);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ibcom_errno, -1, printf("MPID_nem_ib_rdmawr_to_init"));
+ dprintf("ib_com_open,MPID_nem_ib_rdmawr_to_alloc_free_list=%p\n", MPID_nem_ib_rdmawr_to_alloc_free_list);
+ }
+
conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO] =
- mmap(0, MPID_NEM_IB_COM_RDMABUF_SZ, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE,
- -1, 0);
+ MPID_nem_ib_rdmawr_to_alloc_start;
+ //mmap(0, MPID_NEM_IB_COM_RDMABUF_SZ, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE,
+ //-1, 0);
dprintf("MPID_nem_ib_com_open,mmap=%p,len=%d\n", conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO],
MPID_NEM_IB_COM_RDMABUF_SZ);
#endif
- if (conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO] == (void *) -1) {
- fprintf(stderr, "failed to allocate buffer\n");
- goto err_exit;
- }
- memset(conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO], 0,
- conp->icom_msize[MPID_NEM_IB_COM_RDMAWR_TO]);
-
- conp->icom_mrlist[MPID_NEM_IB_COM_RDMAWR_TO] =
- ibv_reg_mr(ib_pd, conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO],
- conp->icom_msize[MPID_NEM_IB_COM_RDMAWR_TO], mr_flags);
- if (!conp->icom_mrlist[MPID_NEM_IB_COM_RDMAWR_TO]) {
- fprintf(stderr, "ibv_reg_mr failed with mr_flags=0x%x\n", mr_flags);
- goto err_exit;
- }
+
#ifdef HAVE_LIBDCFA
dprintf("MPID_nem_ib_com_open,fd=%d,rmem=%p\n", *condesc,
- conp->icom_mrlist[MPID_NEM_IB_COM_RDMAWR_TO]->buf);
+ MPID_nem_ib_rdmawr_to_alloc_mr->buf);
#else
dprintf("MPID_nem_ib_com_open,fd=%d,rmem=%p\n", *condesc,
- conp->icom_mrlist[MPID_NEM_IB_COM_RDMAWR_TO]->addr);
+ MPID_nem_ib_rdmawr_to_alloc_mr->addr);
#endif
dprintf("MPID_nem_ib_com_open,fd=%d,rkey=%08x\n", *condesc,
- conp->icom_mrlist[MPID_NEM_IB_COM_RDMAWR_TO]->rkey);
+ MPID_nem_ib_rdmawr_to_alloc_mr->rkey);
/* RDMA-write-to remote memory area */
conp->icom_rmem = (void **) MPIU_Malloc(sizeof(void **) * MPID_NEM_IB_COM_NBUF_RDMA);
@@ -641,23 +739,37 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
break;
case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
/* RDMA-write-from and -to local memory area */
- conp->icom_mrlist = MPIU_Malloc(sizeof(struct ibv_mr *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
+ conp->icom_mrlist = (struct ibv_mr **) MPIU_Malloc(sizeof(struct ibv_mr *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
memset(conp->icom_mrlist, 0, sizeof(struct ibv_mr *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
conp->icom_mrlen = MPID_NEM_IB_COM_NBUF_SCRATCH_PAD;
- conp->icom_mem = (void **) MPIU_Malloc(sizeof(void **) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
- memset(conp->icom_mem, 0, sizeof(void **) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
- conp->icom_msize = (int *) MPIU_Malloc(sizeof(int *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
- memset(conp->icom_msize, 0, sizeof(int *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
+ conp->icom_mem = (void **) MPIU_Malloc(sizeof(void *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
+ memset(conp->icom_mem, 0, sizeof(void *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
+ conp->icom_msize = (int *) MPIU_Malloc(sizeof(int) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
+ memset(conp->icom_msize, 0, sizeof(int) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
mr_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE;
+ /* RDMA-write-from local memory area */
+ conp->icom_msize[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] = MPID_NEM_IB_COM_SCRATCH_PAD_FROM_SZ;
+ conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] =
+ mmap(0, MPID_NEM_IB_COM_SCRATCH_PAD_FROM_SZ, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE,
+ -1, 0);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] == (void*)-1, -1,
+ printf("mmap failed\n"));
+
+ conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] =
+ MPID_nem_ib_com_reg_mr_fetch(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM],
+ conp->icom_msize[MPID_NEM_IB_COM_SCRATCH_PAD_FROM], 0);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_FROM], -1,
+ printf("ibv_reg_mr failed\n"));
+
/* RDMA-write-to remote memory area */
- conp->icom_rmem = (void **) MPIU_Malloc(sizeof(void **) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
+ conp->icom_rmem = (void **) MPIU_Malloc(sizeof(void *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_rmem == 0, -1, dprintf("malloc failed\n"));
- memset(conp->icom_rmem, 0, sizeof(void **) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
+ memset(conp->icom_rmem, 0, sizeof(void *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
- conp->icom_rsize = (size_t *) MPIU_Malloc(sizeof(void **) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
+ conp->icom_rsize = (size_t *) MPIU_Malloc(sizeof(size_t) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_rsize == 0, -1, dprintf("malloc failed\n"));
- memset(conp->icom_rsize, 0, sizeof(void **) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
+ memset(conp->icom_rsize, 0, sizeof(size_t) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
conp->icom_rkey = (int *) MPIU_Malloc(sizeof(int) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_rkey == 0, -1, dprintf("malloc failed\n"));
@@ -691,7 +803,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
conp->icom_mrlist[MPID_NEM_IB_COM_UDWR_FROM] =
MPID_nem_ib_com_reg_mr_fetch(conp->icom_mem[MPID_NEM_IB_COM_UDWR_FROM],
- conp->icom_msize[MPID_NEM_IB_COM_UDWR_FROM]);
+ conp->icom_msize[MPID_NEM_IB_COM_UDWR_FROM], 0);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_mrlist[MPID_NEM_IB_COM_UDWR_FROM], -1,
dprintf("ibv_reg_mr failed with mr_flags=0x%x\n", mr_flags));
@@ -712,7 +824,7 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
conp->icom_mrlist[MPID_NEM_IB_COM_UDWR_TO] =
MPID_nem_ib_com_reg_mr_fetch(conp->icom_mem[MPID_NEM_IB_COM_UDWR_TO],
- conp->icom_msize[MPID_NEM_IB_COM_UDWR_TO]);
+ conp->icom_msize[MPID_NEM_IB_COM_UDWR_TO], 0);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_mrlist[MPID_NEM_IB_COM_UDWR_TO], -1,
dprintf("ibv_reg_mr failed with mr_flags=0x%x\n", mr_flags));
@@ -862,33 +974,46 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
#endif
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].num_sge = 1;
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].opcode = IBV_WR_RDMA_WRITE;
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].send_flags = IBV_SEND_SIGNALED;
- break;
- }
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].send_flags =
+ IBV_SEND_SIGNALED | IBV_SEND_INLINE;
+
- case MPID_NEM_IB_COM_OPEN_RC_LMT_PUT:
- /* SR (send request) template */
- conp->icom_sr =
- (struct ibv_send_wr *) MPIU_Malloc(sizeof(struct ibv_send_wr) *
- MPID_NEM_IB_COM_RC_SR_LMT_PUT_NTEMPLATE);
- memset(conp->icom_sr, 0,
- sizeof(struct ibv_send_wr) * MPID_NEM_IB_COM_RC_SR_LMT_PUT_NTEMPLATE);
- /* SR (send request) template for MPID_NEM_IB_COM_LMT_PUT */
#ifdef HAVE_LIBDCFA
- memset(&(conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].sg_list[0]), 0,
- sizeof(struct ibv_sge) * WR_SG_NUM);
+ memset(&(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0]),
+ 0, sizeof(struct ibv_sge) * WR_SG_NUM);
#else
- sge = (struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge) * MPID_NEM_IB_COM_LMT_PUT_NSGE);
- memset(sge, 0, sizeof(struct ibv_sge) * MPID_NEM_IB_COM_LMT_PUT_NSGE);
+ sge =
+ (struct ibv_sge*) MPIU_Malloc(sizeof(struct ibv_sge) * MPID_NEM_IB_COM_SCRATCH_PAD_GET_NSGE);
+ memset(sge, 0, sizeof(struct ibv_sge) * MPID_NEM_IB_COM_SCRATCH_PAD_GET_NSGE);
#endif
- conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].next = NULL;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].next = NULL;
#ifdef HAVE_LIBDCFA
#else
- conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].sg_list = sge;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list = sge;
#endif
- conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].opcode = IBV_WR_RDMA_WRITE;
- conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].send_flags = IBV_SEND_SIGNALED;
- break;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].num_sge = 1;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].opcode = IBV_WR_RDMA_READ;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].send_flags = IBV_SEND_SIGNALED;
+
+
+#ifdef HAVE_LIBDCFA
+ memset(&(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0]),
+ 0, sizeof(struct ibv_sge) * WR_SG_NUM);
+#else
+ sge =
+ (struct ibv_sge*) MPIU_Malloc(sizeof(struct ibv_sge) * MPID_NEM_IB_COM_SCRATCH_PAD_CAS_NSGE);
+ memset(sge, 0, sizeof(struct ibv_sge) * MPID_NEM_IB_COM_SCRATCH_PAD_CAS_NSGE);
+#endif
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].next = NULL;
+#ifdef HAVE_LIBDCFA
+#else
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list = sge;
+#endif
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].num_sge = 1;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].opcode = IBV_WR_ATOMIC_CMP_AND_SWP;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].send_flags = IBV_SEND_SIGNALED;
+ break;
+ }
case MPID_NEM_IB_COM_OPEN_UD:
/* SGE (RDMA-send-from memory) template for MPID_NEM_IB_COM_UD_INITIATOR */
@@ -969,10 +1094,8 @@ int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
fn_exit:
return ibcom_errno;
err_exit:
- MPID_nem_ib_com_clean(conp);
return -1;
fn_fail:
- MPID_nem_ib_com_clean(conp);
goto fn_exit;
}
@@ -991,20 +1114,21 @@ int MPID_nem_ib_com_alloc(int condesc, int sz)
case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
/* RDMA-write-to local memory area */
- if (!scratch_pad) {
- scratch_pad = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
- dprintf("MPID_nem_ib_com_alloc,mmap=%p,len=%d\n", scratch_pad, sz);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(scratch_pad == (void *) -1, -1,
- dprintf("failed to allocate buffer\n"));
- dprintf("MPID_nem_ib_com_alloc,scratch_pad=%p\n", scratch_pad);
- memset(scratch_pad, 0, sz);
+ if (!MPID_nem_ib_scratch_pad) {
+ MPID_nem_ib_scratch_pad = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ dprintf("MPID_nem_ib_com_alloc,mmap=%p,len=%d\n", MPID_nem_ib_scratch_pad, sz);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(MPID_nem_ib_scratch_pad == (void *) -1, -1,
+ dprintf("failed to allocate buffer\n"));
+ dprintf("MPID_nem_ib_com_alloc,MPID_nem_ib_scratch_pad=%p\n", MPID_nem_ib_scratch_pad);
+ memset(MPID_nem_ib_scratch_pad, 0, sz);
}
- conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_TO] = scratch_pad;
+ conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_TO] = MPID_nem_ib_scratch_pad;
conp->icom_msize[MPID_NEM_IB_COM_SCRATCH_PAD_TO] = sz;
conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_TO] =
MPID_nem_ib_com_reg_mr_fetch(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_TO],
- conp->icom_msize[MPID_NEM_IB_COM_SCRATCH_PAD_TO]);
+ conp->icom_msize[MPID_NEM_IB_COM_SCRATCH_PAD_TO],
+ IBV_ACCESS_REMOTE_ATOMIC);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_TO], -1,
dprintf("ibv_reg_mr failed with mr_flags=0x%x\n", mr_flags));
@@ -1031,6 +1155,35 @@ int MPID_nem_ib_com_alloc(int condesc, int sz)
goto fn_exit;
}
+void MPID_nem_ib_com_free(int condesc, int sz) {
+ MPID_nem_ib_com_t *conp;
+ int ibcom_errno = 0;
+ int retval;
+
+ MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
+
+ switch (conp->open_flag) {
+
+ case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
+ MPIU_Assert(scratch_pad_ref_count > 0);
+ if(--scratch_pad_ref_count == 0) {
+ retval = munmap(scratch_pad, sz);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, dprintf("munmap"));
+ }
+ break;
+ default:
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(1, -1,
+ dprintf("MPID_nem_ib_com_free, invalid open_flag=%d\n",
+ conp->open_flag));
+ break;
+ }
+
+ fn_exit:
+ return ibcom_errno;
+ fn_fail:
+ goto fn_exit;
+}
+
int MPID_nem_ib_com_close(int condesc)
{
MPID_nem_ib_com_t *conp;
@@ -1039,9 +1192,12 @@ int MPID_nem_ib_com_close(int condesc)
dprintf("MPID_nem_ib_com_close,condesc=%d\n", condesc);
MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
+ ibcom_errno = MPID_nem_ib_com_register_cache_release();
MPID_nem_ib_com_clean(conp);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ibcom_errno, -1,
+ printf("MPID_nem_ib_com_register_cache_release"));
--maxcon;
-
+
fn_exit:
return ibcom_errno;
fn_fail:
@@ -1065,16 +1221,25 @@ int MPID_nem_ib_com_rts(int condesc, int remote_qpnum, uint16_t remote_lid,
int flags;
switch (conp->open_flag) {
+ case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
+ /* Init QP */
+ ib_errno = modify_qp_to_init(conp->icom_qp, conp->icom_port, IBV_ACCESS_REMOTE_ATOMIC);
+ if(ib_errno) {
+ fprintf(stderr, "change QP state to INIT failed\n");
+ ibcom_errno = ib_errno;
+ goto fn_fail;
+ }
+ goto common_tail;
case MPID_NEM_IB_COM_OPEN_RC:
- case MPID_NEM_IB_COM_OPEN_RC_LMT_PUT:
case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
/* Init QP */
- ib_errno = modify_qp_to_init(conp->icom_qp, conp->icom_port);
+ ib_errno = modify_qp_to_init(conp->icom_qp, conp->icom_port, 0);
if (ib_errno) {
fprintf(stderr, "change QP state to INIT failed\n");
ibcom_errno = ib_errno;
goto fn_fail;
}
+ common_tail:
/* Modify QP TO RTR status */
ib_errno =
modify_qp_to_rtr(conp->icom_qp, remote_qpnum, remote_lid, remote_gid, conp->icom_port,
@@ -1129,8 +1294,15 @@ int MPID_nem_ib_com_rts(int condesc, int remote_qpnum, uint16_t remote_lid,
}
#define MPID_NEM_IB_ENABLE_INLINE
-int MPID_nem_ib_com_isend(int condesc, uint64_t wr_id, void *prefix, int sz_prefix, void *hdr,
- int sz_hdr, void *data, int sz_data, int *copied)
+/* <buf_from_out, buf_from_sz_out>: Free the slot in drain_scq */
+int MPID_nem_ib_com_isend(int condesc,
+ uint64_t wr_id,
+ void *prefix, int sz_prefix,
+ void *hdr, int sz_hdr,
+ void *data, int sz_data,
+ int *copied,
+ uint32_t local_ringbuf_type, uint32_t remote_ringbuf_type,
+ void** buf_from_out, uint32_t* buf_from_sz_out)
{
MPID_nem_ib_com_t *conp;
int ibcom_errno = 0;
@@ -1138,56 +1310,76 @@ int MPID_nem_ib_com_isend(int condesc, uint64_t wr_id, void *prefix, int sz_pref
int ib_errno;
int num_sge;
- dprintf("MPID_nem_ib_com_isend,prefix=%p,sz_prefix=%d,hdr=%p,sz_hdr=%d,data=%p,sz_data=%d\n",
- prefix, sz_prefix, hdr, sz_hdr, data, sz_data);
+ dprintf("MPID_nem_ib_com_isend,prefix=%p,sz_prefix=%d,hdr=%p,sz_hdr=%d,data=%p,sz_data=%d,local_ringbuf_type=%d,remote_ringbuf_type=%d\n",
+ prefix, sz_prefix, hdr, sz_hdr, data, sz_data, local_ringbuf_type, remote_ringbuf_type);
MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
if (conp->icom_connected == 0) {
return -1;
}
- int sz_data_pow2;
- MPID_NEM_IB_SZ_DATA_POW2(sizeof(MPID_nem_ib_sz_hdrmagic_t) + sz_prefix + sz_hdr + sz_data);
- uint32_t sumsz = sz_data_pow2 + sizeof(MPID_nem_ib_tailmagic_t);
+
+
+ int off_pow2_aligned;
+ MPID_NEM_IB_OFF_POW2_ALIGNED(MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr + sz_data);
+ uint32_t sumsz = off_pow2_aligned + sizeof(MPID_nem_ib_netmod_trailer_t);
+ int sz_pad = off_pow2_aligned - (MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr + sz_data);
+
+ uint32_t buf_from_sz = MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr +
+ sz_pad + sizeof(MPID_nem_ib_netmod_trailer_t);
+ *buf_from_sz_out = buf_from_sz;
+ void *buf_from = MPID_nem_ib_rdmawr_from_alloc(buf_from_sz);
+ dprintf("isend,rdmawr_from_alloc=%p,sz=%d\n", buf_from, buf_from_sz);
+ *buf_from_out = buf_from;
+ struct ibv_mr *mr_rdmawr_from = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR(buf_from);
+
if (sz_data > 16000) {
- //dprintf("MPID_nem_ib_com_isend,sz_data=%d,sz_data_pow2=%d,sz_max=%ld\n", sz_data, sz_data_pow2, MPID_NEM_IB_MAX_DATA_POW2);
+ //dprintf("MPID_nem_ib_com_isend,sz_data=%d,off_pow2_aligned=%d,sz_max=%ld\n", sz_data, off_pow2_aligned, MPID_NEM_IB_MAX_DATA_POW2);
}
num_sge = 0;
-
- void *buf_from =
- (uint8_t *) conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM] +
- MPID_NEM_IB_COM_RDMABUF_SZSEG * (conp->sseq_num % MPID_NEM_IB_COM_RDMABUF_NSEG);
-
- MPID_nem_ib_sz_hdrmagic_t *sz_hdrmagic = (MPID_nem_ib_sz_hdrmagic_t *) buf_from;
- sz_hdrmagic->sz =
- sizeof(MPID_nem_ib_sz_hdrmagic_t) + sz_prefix + sz_hdr + sz_data +
- sizeof(MPID_nem_ib_tailmagic_t);
- sz_hdrmagic->magic = MPID_NEM_IB_COM_MAGIC;
+ uint32_t hdr_ringbuf_type = local_ringbuf_type;
+ MPID_NEM_IB_NETMOD_HDR_SZ_SET(buf_from,
+ MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) +
+ sz_prefix + sz_hdr + sz_data +
+ sizeof(MPID_nem_ib_netmod_trailer_t));
+ if(remote_ringbuf_type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
+ hdr_ringbuf_type |= MPID_NEM_IB_RINGBUF_RELINDEX;
+ MPID_NEM_IB_NETMOD_HDR_RELINDEX_SET(buf_from, conp->rsr_seq_num_tail);
+ conp->rsr_seq_num_tail_last_sent = conp->rsr_seq_num_tail;
+ dprintf("isend,rsr_seq_num_tail=%d\n", MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf_from));
+ }
+ if(local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED) {
+ MPID_NEM_IB_NETMOD_HDR_VC_SET(buf_from, conp->remote_vc);
+ dprintf("isend,remote_vc=%p\n", MPID_NEM_IB_NETMOD_HDR_VC_GET(buf_from));
+ }
+ MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_SET(buf_from, hdr_ringbuf_type);
+ dprintf("isend,hdr_ringbuf_type=%08x\n",
+ MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf_from));
/* memcpy hdr is needed because hdr resides in stack when sending close-VC command */
/* memcpy is performed onto MPID_NEM_IB_COM_RDMAWR_FROM buffer */
- void *hdr_copy = (uint8_t *) buf_from + sizeof(MPID_nem_ib_sz_hdrmagic_t);
+ void *hdr_copy = (uint8_t *) buf_from + MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type);
memcpy(hdr_copy, prefix, sz_prefix);
memcpy((uint8_t *) hdr_copy + sz_prefix, hdr, sz_hdr);
#ifdef HAVE_LIBDCFA
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].mic_addr = (uint64_t) sz_hdrmagic;
+ conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].mic_addr = (uint64_t) buf_from;
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr =
- conp->icom_mrlist[MPID_NEM_IB_COM_RDMAWR_FROM]->host_addr + ((uint64_t) sz_hdrmagic -
- (uint64_t)
- conp->icom_mem
- [MPID_NEM_IB_COM_RDMAWR_FROM]);
+ mr_rdmawr_from->host_addr +
+ ((uint64_t) buf_from -
+ (uint64_t) MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(buf_from));
+
#else
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr = (uint64_t) sz_hdrmagic;
+ conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr = (uint64_t) buf_from;
#endif
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].length =
- sizeof(MPID_nem_ib_sz_hdrmagic_t) + sz_prefix + sz_hdr;
+ MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr;
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].lkey =
- conp->icom_mrlist[MPID_NEM_IB_COM_RDMAWR_FROM]->lkey;
+ mr_rdmawr_from->lkey;
num_sge += 1;
if (sz_data) {
//dprintf("MPID_nem_ib_com_isend,data=%p,sz_data=%d\n", data, sz_data);
- struct ibv_mr *mr_data = MPID_nem_ib_com_reg_mr_fetch(data, sz_data);
+ struct ibv_mr *mr_data = MPID_nem_ib_com_reg_mr_fetch(data, sz_data, 0);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_data, -1,
printf("MPID_nem_ib_com_isend,ibv_reg_mr_fetch failed\n"));
#ifdef HAVE_LIBDCFA
@@ -1202,40 +1394,46 @@ int MPID_nem_ib_com_isend(int condesc, uint64_t wr_id, void *prefix, int sz_pref
num_sge += 1;
}
- int sz_pad = sz_data_pow2 - (sizeof(MPID_nem_ib_sz_hdrmagic_t) + sz_prefix + sz_hdr + sz_data);
- MPID_nem_ib_tailmagic_t *tailmagic =
- (MPID_nem_ib_tailmagic_t *) ((uint8_t *) buf_from + sizeof(MPID_nem_ib_sz_hdrmagic_t) +
+ MPID_nem_ib_netmod_trailer_t *netmod_trailer =
+ (MPID_nem_ib_netmod_trailer_t *) ((uint8_t *) buf_from + MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) +
sz_prefix + sz_hdr + sz_pad);
- tailmagic->magic = MPID_NEM_IB_COM_MAGIC;
+ netmod_trailer->tail_flag = MPID_NEM_IB_COM_MAGIC;
#ifdef HAVE_LIBDCFA
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].mic_addr =
- (uint64_t) buf_from + sizeof(MPID_nem_ib_sz_hdrmagic_t) + sz_prefix + sz_hdr;
+ (uint64_t) buf_from + MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr;
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr =
- conp->icom_mrlist[MPID_NEM_IB_COM_RDMAWR_FROM]->host_addr + ((uint64_t) buf_from +
- sizeof
- (MPID_nem_ib_sz_hdrmagic_t) +
- sz_prefix + sz_hdr -
- (uint64_t)
- conp->icom_mem
- [MPID_NEM_IB_COM_RDMAWR_FROM]);
+ mr_rdmawr_from->host_addr +
+ ((uint64_t) buf_from + MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr -
+ (uint64_t) MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(REQ_FIELD(sreq, buf_from)));
#else
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr =
- (uint64_t) buf_from + sizeof(MPID_nem_ib_sz_hdrmagic_t) + sz_prefix + sz_hdr;
+ (uint64_t) buf_from + MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr;
#endif
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].length =
- sz_pad + sizeof(MPID_nem_ib_tailmagic_t);
+ sz_pad + sizeof(MPID_nem_ib_netmod_trailer_t);
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].lkey =
- conp->icom_mrlist[MPID_NEM_IB_COM_RDMAWR_FROM]->lkey;
+ mr_rdmawr_from->lkey;
num_sge += 1;
dprintf("MPID_nem_ib_com_isend,sz_data=%d,pow2=%d,sz_pad=%d,num_sge=%d\n", sz_data,
- sz_data_pow2, sz_pad, num_sge);
+ off_pow2_aligned, sz_pad, num_sge);
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].num_sge = num_sge;
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr_id = wr_id;
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr =
- (uint64_t) conp->icom_rmem[MPID_NEM_IB_COM_RDMAWR_TO] +
- MPID_NEM_IB_COM_RDMABUF_SZSEG * (conp->sseq_num % MPID_NEM_IB_COM_RDMABUF_NSEG);
- /* rkey is defined in MPID_nem_ib_com_reg_mr_connect */
+ (uint64_t) conp->local_ringbuf_start +
+ MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t)(conp->sseq_num % conp->local_ringbuf_nslot));
+ dprintf("isend,ringbuf_start=%p,local_head=%04ux,nslot=%d,rkey=%08x,remote_addr=%lx\n",
+ conp->local_ringbuf_start, conp->sseq_num, conp->local_ringbuf_nslot,
+ conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.rkey,
+ conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr
+ );
+ if(conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr <
+ (uint64_t) conp->local_ringbuf_start ||
+ conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr >=
+ (uint64_t) conp->local_ringbuf_start + MPID_NEM_IB_COM_RDMABUF_SZSEG * conp->local_ringbuf_nslot) {
+ MPID_nem_ib_segv;
+ }
+ /* rkey is defined in MPID_nem_ib_com_connect_ringbuf */
//dprintf("MPID_nem_ib_com_isend,condesc=%d,num_sge=%d,opcode=%08x,imm_data=%08x,wr_id=%016lx, raddr=%p, rkey=%08x\n", condesc, conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].num_sge, conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].opcode, conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].imm_data, conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr_id, conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr, conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.rkey);
@@ -1280,7 +1478,6 @@ int MPID_nem_ib_com_isend(int condesc, uint64_t wr_id, void *prefix, int sz_pref
#endif
conp->sseq_num += 1;
- assert(conp->sseq_num > 0);
conp->ncom += 1;
fn_exit:
return ibcom_errno;
@@ -1288,6 +1485,7 @@ int MPID_nem_ib_com_isend(int condesc, uint64_t wr_id, void *prefix, int sz_pref
goto fn_exit;
}
+#if 0
int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_hdr, void *data,
int sz_data)
{
@@ -1299,7 +1497,7 @@ int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_h
int i;
struct ibv_mr *mr_data;
uint32_t sumsz =
- sizeof(MPID_nem_ib_sz_hdrmagic_t) + sz_hdr + sz_data + sizeof(MPID_nem_ib_tailmagic_t);
+ sizeof(MPID_nem_ib_netmod_hdr_t) + sz_hdr + sz_data + sizeof(MPID_nem_ib_netmod_trailer_t);
unsigned long tscs, tsce;
dprintf("MPID_nem_ib_com_isend_chain,enter\n");
@@ -1309,20 +1507,20 @@ int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_h
void *buf_from =
(uint8_t *) conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM] +
- MPID_NEM_IB_COM_RDMABUF_SZSEG * (conp->sseq_num % MPID_NEM_IB_COM_RDMABUF_NSEG);
+ MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t)(conp->sseq_num % MPID_NEM_IB_COM_RDMABUF_NSEG));
/* make a tail-magic position is in a fixed set */
- int sz_data_pow2;
- MPID_NEM_IB_SZ_DATA_POW2(sizeof(MPID_nem_ib_sz_hdrmagic_t) + sz_hdr + sz_data);
+ int off_pow2_aligned;
+ MPID_NEM_IB_OFF_POW2_ALIGNED(sizeof(MPID_nem_ib_netmod_hdr_t) + sz_hdr + sz_data);
/* let the last command icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAIN-1] which has IBV_WR_RDMA_WRITE_WITH_IMM */
int s =
- MPID_NEM_IB_COM_SMT_INLINE_NCHAIN - (sizeof(MPID_nem_ib_sz_hdrmagic_t) + sz_hdr +
- sz_data_pow2 + sizeof(MPID_nem_ib_tailmagic_t) +
+ MPID_NEM_IB_COM_SMT_INLINE_NCHAIN - (sizeof(MPID_nem_ib_netmod_hdr_t) + sz_hdr +
+ off_pow2_aligned + sizeof(MPID_nem_ib_netmod_trailer_t) +
MPID_NEM_IB_COM_INLINE_DATA -
1) / MPID_NEM_IB_COM_INLINE_DATA;
- MPID_NEM_IB_COM_ERR_CHKANDJUMP((sizeof(MPID_nem_ib_sz_hdrmagic_t) + sz_hdr +
- sz_data_pow2) % 4 != 0, -1,
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP((sizeof(MPID_nem_ib_netmod_hdr_t) + sz_hdr +
+ off_pow2_aligned) % 4 != 0, -1,
printf
("MPID_nem_ib_com_isend_chain,tail-magic gets over packet-boundary\n"));
MPID_NEM_IB_COM_ERR_CHKANDJUMP(s < 0 ||
@@ -1336,10 +1534,9 @@ int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_h
int sz_used = 0; /* how much of the payload of a IB packet is used? */
int num_sge = 0;
if (i == s) {
- MPID_nem_ib_sz_hdrmagic_t *sz_hdrmagic = (MPID_nem_ib_sz_hdrmagic_t *) buf_from;
- sz_hdrmagic->sz = sumsz;
- sz_hdrmagic->magic = MPID_NEM_IB_COM_MAGIC;
- memcpy((uint8_t *) buf_from + sizeof(MPID_nem_ib_sz_hdrmagic_t), hdr, sz_hdr);
+ MPID_nem_ib_netmod_hdr_t *netmod_hdr = (MPID_nem_ib_netmod_hdr_t *) buf_from;
+ MPID_NEM_IB_NETMOD_HDR_SZ_SET(netmod_hdr, sumsz);
+ memcpy((uint8_t *) buf_from + sizeof(MPID_nem_ib_netmod_hdr_t), hdr, sz_hdr);
#ifdef HAVE_LIBDCFA
conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].sg_list[num_sge].mic_addr =
(uint64_t) buf_from;
@@ -1352,10 +1549,10 @@ int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_h
conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].sg_list[num_sge].addr =
(uint64_t) buf_from;
#endif
- buf_from = (uint8_t *) buf_from + sizeof(MPID_nem_ib_sz_hdrmagic_t) + sz_hdr;
+ buf_from = (uint8_t *) buf_from + sizeof(MPID_nem_ib_netmod_hdr_t) + sz_hdr;
conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].sg_list[num_sge].length =
- sizeof(MPID_nem_ib_sz_hdrmagic_t) + sz_hdr;
- sz_used += sizeof(MPID_nem_ib_sz_hdrmagic_t) + sz_hdr;
+ sizeof(MPID_nem_ib_netmod_hdr_t) + sz_hdr;
+ sz_used += sizeof(MPID_nem_ib_netmod_hdr_t) + sz_hdr;
conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].sg_list[num_sge].lkey =
conp->icom_mrlist[MPID_NEM_IB_COM_RDMAWR_FROM]->lkey;
num_sge += 1;
@@ -1371,7 +1568,7 @@ int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_h
(uint64_t) data + sz_data - sz_data_rem;
#endif
int sz_data_red =
- sz_used + sz_data_rem + sizeof(MPID_nem_ib_tailmagic_t) <=
+ sz_used + sz_data_rem + sizeof(MPID_nem_ib_netmod_trailer_t) <=
MPID_NEM_IB_COM_INLINE_DATA ? sz_data_rem : sz_data_rem <=
MPID_NEM_IB_COM_INLINE_DATA - sz_used ? sz_data_rem : MPID_NEM_IB_COM_INLINE_DATA -
sz_used;
@@ -1385,7 +1582,7 @@ int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_h
if (i == s) {
MPID_NEM_IB_COM_ERR_CHKANDJUMP(!sz_data, -1,
printf("MPID_nem_ib_com_isend_chain,sz_data==0\n"));
- mr_data = MPID_nem_ib_com_reg_mr_fetch(data, sz_data);
+ mr_data = MPID_nem_ib_com_reg_mr_fetch(data, sz_data, 0);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_data, -1,
printf
("MPID_nem_ib_com_isend,ibv_reg_mr_fetch failed\n"));
@@ -1402,17 +1599,17 @@ int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_h
dprintf("MPID_nem_ib_com_isend_chain,i=%d,sz_used=%d,sz_data_rem=%d\n", i, sz_used,
sz_data_rem);
}
- else { /* tailmagic only packet is being generated */
+ else { /* netmod_trailer only packet is being generated */
}
//tsce = MPID_nem_ib_rdtsc(); printf("1,%ld\n", tsce-tscs);
//tscs = MPID_nem_ib_rdtsc();
- if (i == MPID_NEM_IB_COM_SMT_INLINE_NCHAIN - 1) { /* append tailmagic */
- int sz_pad = sz_data_pow2 - sz_data;
- MPID_nem_ib_tailmagic_t *tailmagic =
- (MPID_nem_ib_tailmagic_t *) ((uint8_t *) buf_from + sz_pad);
- tailmagic->magic = MPID_NEM_IB_COM_MAGIC;
+ if (i == MPID_NEM_IB_COM_SMT_INLINE_NCHAIN - 1) { /* append netmod_trailer */
+ int sz_pad = off_pow2_aligned - sz_data;
+ MPID_nem_ib_netmod_trailer_t *netmod_trailer =
+ (MPID_nem_ib_netmod_trailer_t *) ((uint8_t *) buf_from + sz_pad);
+ netmod_trailer->tail_flag = MPID_NEM_IB_COM_MAGIC;
#ifdef HAVE_LIBDCFA
conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].sg_list[num_sge].mic_addr =
(uint64_t) buf_from;
@@ -1426,8 +1623,8 @@ int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_h
(uint64_t) buf_from;
#endif
conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].sg_list[num_sge].length =
- sz_pad + sizeof(MPID_nem_ib_tailmagic_t);
- sz_used += sz_pad + sizeof(MPID_nem_ib_tailmagic_t);
+ sz_pad + sizeof(MPID_nem_ib_netmod_trailer_t);
+ sz_used += sz_pad + sizeof(MPID_nem_ib_netmod_trailer_t);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(sz_data_rem != 0, -1,
printf("MPID_nem_ib_com_isend_chain, sz_data_rem\n"));
conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].sg_list[num_sge].lkey =
@@ -1475,7 +1672,7 @@ int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_h
conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].wr_id = wr_id;
conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].wr.rdma.remote_addr =
(uint64_t) conp->icom_rmem[MPID_NEM_IB_COM_RDMAWR_TO] +
- MPID_NEM_IB_COM_RDMABUF_SZSEG * (conp->sseq_num % MPID_NEM_IB_COM_RDMABUF_NSEG) +
+ MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t)(conp->sseq_num % MPID_NEM_IB_COM_RDMABUF_NSEG)) +
MPID_NEM_IB_COM_INLINE_DATA * (i - s);
}
#if 0
@@ -1514,6 +1711,7 @@ int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_h
fn_fail:
goto fn_exit;
}
+#endif
int MPID_nem_ib_com_irecv(int condesc, uint64_t wr_id)
{
@@ -1655,12 +1853,12 @@ int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, int sz_data,
num_sge = 0;
/* register memory area containing data */
- struct ibv_mr *mr_data = MPID_nem_ib_com_reg_mr_fetch(laddr, sz_data);
+ struct ibv_mr *mr_data = MPID_nem_ib_com_reg_mr_fetch(laddr, sz_data, 0);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_data, -1,
dprintf("MPID_nem_ib_com_lrecv,ibv_reg_mr_fetch failed\n"));
/* Erase magic, super bug!! */
- //((MPID_nem_ib_tailmagic_t*)(laddr + sz_data - sizeof(MPID_nem_ib_tailmagic_t)))->magic = 0;
+ //((MPID_nem_ib_netmod_trailer_t*)(laddr + sz_data - sizeof(MPID_nem_ib_netmod_trailer_t)))->magic = 0;
#ifdef HAVE_LIBDCFA
conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].mic_addr = (uint64_t) laddr;
conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].addr =
@@ -1723,7 +1921,7 @@ int MPID_nem_ib_com_put_lmt(int condesc, uint64_t wr_id, void *raddr, int sz_dat
num_sge = 0;
/* register memory area containing data */
- struct ibv_mr *mr_data = MPID_nem_ib_com_reg_mr_fetch(laddr, sz_data);
+ struct ibv_mr *mr_data = MPID_nem_ib_com_reg_mr_fetch(laddr, sz_data, 0);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_data, -1,
dprintf("MPID_nem_ib_com_put_lmt,ibv_reg_mr_fetch failed\n"));
@@ -1785,22 +1983,25 @@ int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset
dprintf("MPID_nem_ib_com_put_scratch_pad,not connected\n"));
MPID_NEM_IB_COM_ERR_CHKANDJUMP(!sz, -1, dprintf("MPID_nem_ib_com_put_scratch_pad,sz==0\n"));
- /* register memory area containing data */
- struct ibv_mr *mr_data = MPID_nem_ib_com_reg_mr_fetch(laddr, sz);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_data, -1,
- dprintf
- ("MPID_nem_ib_com_put_scratch_pad,ibv_reg_mr_fetch failed\n"));
- dprintf("MPID_nem_ib_com_put_scratch_pad,");
+ /* Use inline so that we don't need to worry about overwriting write-from buffer */
+ assert(sz <= conp->max_inline_data);
+
+ memcpy(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM], laddr, sz);
+
+ void *from =
+ (uint8_t *) conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
#ifdef HAVE_LIBDCFA
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].mic_addr = (uint64_t) laddr;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].mic_addr = (uint64_t) from;
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].addr =
- mr_data->host_addr + ((uint64_t) laddr - (uint64_t) laddr);
+ conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_FROM]->host_addr +
+ ((uint64_t) from - (uint64_t) from);
#else
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].addr = (uint64_t) laddr;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].addr = (uint64_t) from;
#endif
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].length = sz;
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].lkey = mr_data->lkey;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].lkey =
+ conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_FROM]->lkey;
/* num_sge is defined in MPID_nem_ib_com_open */
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].wr_id = wr_id;
@@ -1836,65 +2037,59 @@ int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset
goto fn_exit;
}
-#ifdef MPID_NEM_IB_ONDEMAND
-int MPID_nem_ib_com_cas_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset, uint64_t compare,
- uint64_t swap)
+int MPID_nem_ib_com_get_scratch_pad(int condesc,
+ uint64_t wr_id,
+ uint64_t offset, int sz,
+ void** buf_from_out, uint32_t * buf_from_sz_out)
{
MPID_nem_ib_com_t *conp;
int ibcom_errno = 0;
struct ibv_send_wr *bad_wr;
int ib_errno;
- dprintf("MPID_nem_ib_com_put_scratch_pad,enter,wr_id=%llx,offset=%llx,sz=%d,laddr=%p\n",
- (unsigned long long) wr_id, (unsigned long long) offset, sz, laddr);
- dprintf("MPID_nem_ib_com_put_scratch_pad,data=%08x\n", *((uint32_t *) laddr));
+ dprintf("MPID_nem_ib_com_get_scratch_pad,enter,wr_id=%llx,offset=%llx,sz=%d\n",
+ (unsigned long long) wr_id, (unsigned long long) offset, sz);
MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->open_flag != MPID_NEM_IB_COM_OPEN_SCRATCH_PAD, -1,
- dprintf("MPID_nem_ib_com_put_scratch_pad,invalid open_flag=%d\n",
- conp->open_flag));
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_connected, -1,
- dprintf("MPID_nem_ib_com_put_scratch_pad,not connected\n"));
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!sz, -1, dprintf("MPID_nem_ib_com_put_scratch_pad,sz==0\n"));
- /* register memory area containing data */
- struct ibv_mr *mr_data = MPID_nem_ib_com_reg_mr_fetch(laddr, sz);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_data, -1,
- dprintf
- ("MPID_nem_ib_com_put_scratch_pad,ibv_reg_mr_fetch failed\n"));
- dprintf("MPID_nem_ib_com_put_scratch_pad,");
+ *buf_from_sz_out = sz;
+ void *buf_from = MPID_nem_ib_rdmawr_from_alloc(sz);
+ dprintf("get_scratch_pad,rdmawr_from_alloc=%p,sz=%d\n", buf_from, sz);
+ *buf_from_out = buf_from;
+ struct ibv_mr *mr_rdmawr_from = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR(buf_from);
#ifdef HAVE_LIBDCFA
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].mic_addr = (uint64_t) laddr;
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].addr =
- mr_data->host_addr + ((uint64_t) laddr - (uint64_t) laddr);
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0].mic_addr = (uint64_t) buf_from;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0].addr =
+ mr_rdmawr_from->host_addr +
+ ((uint64_t) buf_from -
+ (uint64_t) buf_from);
#else
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].addr = (uint64_t) laddr;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0].addr = (uint64_t) buf_from;
#endif
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].length = sz;
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].lkey = mr_data->lkey;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0].length = sz;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0].lkey =
+ mr_rdmawr_from->lkey;
/* num_sge is defined in MPID_nem_ib_com_open */
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].wr_id = wr_id;
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].wr.atomic.remote_addr =
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].wr_id = wr_id;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].wr.rdma.remote_addr =
(uint64_t) conp->icom_rmem[MPID_NEM_IB_COM_SCRATCH_PAD_TO] + offset;
/* rkey is defined in MPID_nem_ib_com_reg_mr_connect */
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].wr.atomic.compare_add = compare;
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].wr.atomic.swap = swap;
- dprintf("MPID_nem_ib_com_put_scratch_pad,wr.rdma.remote_addr=%llx\n",
- (unsigned long long) conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].wr.rdma.
+ dprintf("MPID_nem_ib_com_get_scratch_pad,wr.rdma.remote_addr=%llx\n",
+ (unsigned long long) conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].wr.rdma.
remote_addr);
#ifdef HAVE_LIBDCFA
- ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR]);
+ ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET]);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
dprintf
("MPID_nem_ib_com_put_scratch_pad, ibv_post_send, rc=%d\n",
ib_errno));
#else
ib_errno =
- ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR],
+ ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET],
&bad_wr);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
dprintf
@@ -1909,7 +2104,76 @@ int MPID_nem_ib_com_cas_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset
fn_fail:
goto fn_exit;
}
+
+int MPID_nem_ib_com_cas_scratch_pad(int condesc,
+ uint64_t wr_id, uint64_t offset,
+ uint64_t compare, uint64_t swap,
+ void** buf_from_out, uint32_t * buf_from_sz_out)
+{
+ MPID_nem_ib_com_t *conp;
+ int ibcom_errno = 0;
+ struct ibv_send_wr *bad_wr;
+ int ib_errno;
+ uint32_t sz = sizeof(uint64_t);
+
+ dprintf("MPID_nem_ib_com_cas_scratch_pad,enter,wr_id=%llx,offset=%llx\n",
+ (unsigned long long) wr_id, (unsigned long long) offset);
+
+ MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
+
+ *buf_from_sz_out = sz;
+ void *buf_from = MPID_nem_ib_rdmawr_from_alloc(sz);
+ dprintf("cas_scratch_pad,rdmawr_from_alloc=%p,sz=%d\n", buf_from, sz);
+ *buf_from_out = buf_from;
+ struct ibv_mr *mr_rdmawr_from = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR(buf_from);
+
+#ifdef HAVE_LIBDCFA
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0].mic_addr = (uint64_t) buf_from;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0].addr =
+ mr_rdmawr_from->host_addr +
+ ((uint64_t) buf_from - (uint64_t) buf_from);
+#else
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0].addr = (uint64_t) buf_from;
#endif
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0].length = sizeof(uint64_t);
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0].lkey =
+ mr_rdmawr_from->lkey;
+
+ /* num_sge is defined in MPID_nem_ib_com_open */
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].wr_id = wr_id;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].wr.atomic.remote_addr =
+ (uint64_t) conp->icom_rmem[MPID_NEM_IB_COM_SCRATCH_PAD_TO] + offset;
+ /* atomic.rkey is defined in MPID_nem_ib_com_reg_mr_connect */
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].wr.atomic.compare_add = compare;
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].wr.atomic.swap = swap;
+
+ dprintf("MPID_nem_ib_com_cas_scratch_pad,wr.rdma.remote_addr=%llx\n",
+ (unsigned long long) conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].wr.rdma.
+ remote_addr);
+
+#ifdef HAVE_LIBDCFA
+ ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS]);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
+ dprintf
+ ("MPID_nem_ib_com_cas_scratch_pad, ibv_post_send, rc=%d\n",
+ ib_errno));
+#else
+ ib_errno =
+ ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS],
+ &bad_wr);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
+ dprintf
+ ("MPID_nem_ib_com_cas_scratch_pad, ibv_post_send, rc=%d, bad_wr=%p\n",
+ ib_errno, bad_wr));
+#endif
+
+ conp->ncom_scratch_pad += 1;
+
+ fn_exit:
+ return ibcom_errno;
+ fn_fail:
+ goto fn_exit;
+}
/* poll completion queue */
int MPID_nem_ib_com_poll_cq(int which_cq, struct ibv_wc *wc, int *result)
@@ -1954,7 +2218,6 @@ int MPID_nem_ib_com_reg_mr_connect(int condesc, void *rmem, int rkey)
MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
switch (conp->open_flag) {
case MPID_NEM_IB_COM_OPEN_RC:
- case MPID_NEM_IB_COM_OPEN_RC_LMT_PUT:
conp->icom_rmem[MPID_NEM_IB_COM_RDMAWR_TO] = rmem;
conp->icom_rkey[MPID_NEM_IB_COM_RDMAWR_TO] = rkey;
conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.rkey =
@@ -1970,6 +2233,10 @@ int MPID_nem_ib_com_reg_mr_connect(int condesc, void *rmem, int rkey)
conp->icom_rkey[MPID_NEM_IB_COM_SCRATCH_PAD_TO] = rkey;
conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].wr.rdma.rkey =
conp->icom_rkey[MPID_NEM_IB_COM_SCRATCH_PAD_TO];
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].wr.rdma.rkey =
+ conp->icom_rkey[MPID_NEM_IB_COM_SCRATCH_PAD_TO];
+ conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].wr.atomic.rkey =
+ conp->icom_rkey[MPID_NEM_IB_COM_SCRATCH_PAD_TO];
break;
default:
@@ -1983,6 +2250,61 @@ int MPID_nem_ib_com_reg_mr_connect(int condesc, void *rmem, int rkey)
goto fn_exit;
}
+/* alloc_new_mr
+ 0: The new ring buffer is located in the same IB Memory Region as
+ the previous ring buffer is located in.
+ This happens when making the connection switch to smaller ring buffer.
+ 1: The new ring buffer is located in the new IB Memory Region
+ This happens when memory area shrunk then has grown. */
+int MPID_nem_ib_com_connect_ringbuf(int condesc,
+ uint32_t ringbuf_type,
+ void *start, int rkey, int nslot,
+ MPIDI_VC_t *remote_vc,
+ uint32_t alloc_new_mr)
+{
+ int ibcom_errno = 0;
+ MPID_nem_ib_com_t *conp;
+ int i;
+
+ MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
+
+ conp->local_ringbuf_type = ringbuf_type;
+
+
+ /* Address and size */
+ conp->local_ringbuf_start = start;
+ conp->local_ringbuf_nslot = nslot;
+ switch(conp->local_ringbuf_type) {
+ case MPID_NEM_IB_RINGBUF_EXCLUSIVE:
+ /* Head and tail pointers */
+ conp->sseq_num = 0;
+ conp->lsr_seq_num_tail = -1;
+ break;
+ case MPID_NEM_IB_RINGBUF_SHARED:
+ /* Mark as full to make the sender ask */
+ conp->lsr_seq_num_tail = conp->sseq_num - conp->local_ringbuf_nslot;
+ conp->remote_vc = remote_vc;
+ break;
+ default:
+ printf("unknown ringbuf type");
+ break;
+ }
+ if(alloc_new_mr) {
+ conp->local_ringbuf_rkey = rkey;
+ conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.rkey = rkey;
+ }
+ dprintf("connect_ringbuf,ringbuf_type=%d,rkey=%08x,start=%p,nslot=%d,sseq_num=%d,lsr_seq_num_tail=%d,remote_vc=%lx,alloc_new_mr=%d\n",
+ conp->local_ringbuf_type, conp->local_ringbuf_rkey, conp->local_ringbuf_start,
+ conp->local_ringbuf_nslot, conp->sseq_num,
+ conp->lsr_seq_num_tail, conp->remote_vc,
+ alloc_new_mr);
+
+ fn_exit:
+ return ibcom_errno;
+ fn_fail:
+ goto fn_exit;
+}
+
int MPID_nem_ib_com_get_info_conn(int condesc, int key, void *out, uint32_t out_len)
{
int ibcom_errno = 0;
@@ -2078,7 +2400,7 @@ int MPID_nem_ib_com_mem_rdmawr_from(int condesc, void **out)
MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
*out =
(uint8_t *) conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM] +
- MPID_NEM_IB_COM_RDMABUF_SZSEG * (conp->sseq_num % MPID_NEM_IB_COM_RDMABUF_NSEG);
+ MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t)(conp->sseq_num % MPID_NEM_IB_COM_RDMABUF_NSEG));
fn_exit:
return ibcom_errno;
@@ -2086,6 +2408,7 @@ int MPID_nem_ib_com_mem_rdmawr_from(int condesc, void **out)
goto fn_exit;
}
+#if 0
int MPID_nem_ib_com_mem_rdmawr_to(int condesc, int seq_num, void **out)
{
MPID_nem_ib_com_t *conp;
@@ -2101,6 +2424,7 @@ int MPID_nem_ib_com_mem_rdmawr_to(int condesc, int seq_num, void **out)
fn_fail:
goto fn_exit;
}
+#endif
int MPID_nem_ib_com_mem_udwr_from(int condesc, void **out)
{
@@ -2130,62 +2454,6 @@ int MPID_nem_ib_com_mem_udwr_to(int condesc, void **out)
goto fn_exit;
}
-int MPID_nem_ib_com_sseq_num_get(int condesc, int *seq_num)
-{
- MPID_nem_ib_com_t *conp;
- int ibcom_errno = 0;
-
- MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
- *seq_num = conp->sseq_num;
-
- fn_exit:
- return ibcom_errno;
- fn_fail:
- goto fn_exit;
-}
-
-int MPID_nem_ib_com_lsr_seq_num_tail_get(int condesc, int **seq_num)
-{
- MPID_nem_ib_com_t *conp;
- int ibcom_errno = 0;
-
- MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
- *seq_num = &(conp->lsr_seq_num_tail);
-
- fn_exit:
- return ibcom_errno;
- fn_fail:
- goto fn_exit;
-}
-
-int MPID_nem_ib_com_rsr_seq_num_tail_get(int condesc, int **seq_num)
-{
- MPID_nem_ib_com_t *conp;
- int ibcom_errno = 0;
-
- MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
- *seq_num = &(conp->rsr_seq_num_tail);
-
- fn_exit:
- return ibcom_errno;
- fn_fail:
- goto fn_exit;
-}
-
-int MPID_nem_ib_com_rsr_seq_num_tail_last_sent_get(int condesc, int **seq_num)
-{
- MPID_nem_ib_com_t *conp;
- int ibcom_errno = 0;
-
- MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
- *seq_num = &(conp->rsr_seq_num_tail_last_sent);
-
- fn_exit:
- return ibcom_errno;
- fn_fail:
- goto fn_exit;
-}
-
int MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(int condesc, int *notify_rate)
{
MPID_nem_ib_com_t *conp;
@@ -2299,14 +2567,15 @@ char *MPID_nem_ib_com_strerror(int errno)
goto fn_exit;
}
-int MPID_nem_ib_com_reg_mr(void *addr, int len, struct ibv_mr **mr)
+int MPID_nem_ib_com_reg_mr(void *addr, int len, struct ibv_mr **mr, enum ibv_access_flags additional_flags)
{
int ibcom_errno = 0;
dprintf("MPID_nem_ib_com_reg_mr,addr=%p,len=%d,mr=%p\n", addr, len, mr);
*mr =
ibv_reg_mr(ib_pd, addr, len,
- IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
+ IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
+ IBV_ACCESS_REMOTE_READ | additional_flags);
MPID_NEM_IB_COM_ERR_CHKANDJUMP(*mr == 0, -1,
dprintf("MPID_nem_ib_com_reg_mr,cannot register memory\n"));
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
index 930fbf1..e24b6c6 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
@@ -12,6 +12,7 @@
#include <unistd.h>
#include <stdint.h>
#include <sys/types.h>
+#include <linux/mman.h> /* make it define MAP_ANONYMOUS */
#include "mpid_nem_impl.h"
#ifdef HAVE_LIBDCFA
@@ -169,12 +170,19 @@ static inline unsigned long long MPID_nem_ib_rdtsc_cpuid(void)
return (unsigned long long) hi << 32 | lo;
}
+#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID 32
+
extern struct ibv_cq *MPID_nem_ib_rc_shared_scq;
-extern struct ibv_cq *MPID_nem_ib_rc_shared_scq_lmt_put;
extern struct ibv_cq *MPID_nem_ib_rc_shared_scq_scratch_pad;
extern struct ibv_cq *MPID_nem_ib_ud_shared_rcq;
-
-#define MPID_NEM_IB_COM_SIZE 2048 /* one process uses 2-4 fds */
+extern uint8_t *MPID_nem_ib_scratch_pad;
+extern char *MPID_nem_ib_rdmawr_from_alloc_free_list_front[MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID];
+extern char *MPID_nem_ib_rdmawr_from_alloc_arena_free_list[MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID];
+extern struct ibv_mr* MPID_nem_ib_rdmawr_to_alloc_mr;
+extern uint8_t *MPID_nem_ib_rdmawr_to_alloc_start;
+extern uint8_t *MPID_nem_ib_rdmawr_to_alloc_free_list;
+
+#define MPID_NEM_IB_COM_SIZE (65536*2) /* Maximum number of QPs. One process uses 2 QPs. */
#define MPID_NEM_IB_COM_INLINE_DATA (512-64) /* experimented max is 884 */ /* this is lower bound and more than this value is set. the more this value is, the more the actual value set is. you need to check it */
#define MPID_NEM_IB_COM_MAX_SQ_CAPACITY (256/1)
@@ -209,6 +217,11 @@ extern struct ibv_cq *MPID_nem_ib_ud_shared_rcq;
#define MPID_NEM_IB_COM_RDMABUF_SZSEG (16384/4) //(16384+8+40+1) /* this size minus magics and headers must be 2^n because data might grow to the next 2^m boundary, see ib_impl.h, ib_com.c, src/mpid/ch3/src/mpid_isend.c */
#define MPID_NEM_IB_COM_RDMABUF_SZ ((MPID_NEM_IB_COM_RDMABUF_SZSEG) * 16) /* (32768 * 256) */
#define MPID_NEM_IB_COM_RDMABUF_NSEG ((MPID_NEM_IB_COM_RDMABUF_SZ) / (MPID_NEM_IB_COM_RDMABUF_SZSEG))
+
+#define MPID_NEM_IB_RINGBUF_SHARED_SZSEG (16384/4)
+#define MPID_NEM_IB_RINGBUF_SHARED_SZ ((MPID_NEM_IB_RINGBUF_SHARED_SZSEG) * 16)
+#define MPID_NEM_IB_RINGBUF_SHARED_NSEG ((MPID_NEM_IB_RINGBUF_SHARED_SZ) / (MPID_NEM_IB_RINGBUF_SHARED_SZSEG))
+
#define MPID_NEM_IB_COM_SMT_INLINE_NCHAIN 8 /* maximum number of chained inline-send commands */
#define MPID_NEM_IB_COM_RDMABUF_HIGH_WATER_MARK (((MPID_NEM_IB_COM_RDMABUF_NSEG)>>1)+((MPID_NEM_IB_COM_RDMABUF_NSEG)>>2))
#define MPID_NEM_IB_COM_RDMABUF_LOW_WATER_MARK (((MPID_NEM_IB_COM_RDMABUF_NSEG)>>2))
@@ -225,8 +238,10 @@ extern struct ibv_cq *MPID_nem_ib_ud_shared_rcq;
#define MPID_NEM_IB_COM_UDBUF_SZSEG (128)
#define MPID_NEM_IB_COM_UDBUF_NSEG (MPID_NEM_IB_COM_UDBUF_SZ / MPID_NEM_IB_COM_UDBUF_SZSEG)
-#define MPID_NEM_IB_COM_NBUF_SCRATCH_PAD 1 /* number of <addr, sz, lkey, rkey> */
-#define MPID_NEM_IB_COM_SCRATCH_PAD_TO 0 /* index to RDMA-write-to buffer */
+#define MPID_NEM_IB_COM_NBUF_SCRATCH_PAD 2 /* number of <addr, sz, lkey, rkey> */
+#define MPID_NEM_IB_COM_SCRATCH_PAD_FROM_SZ 4096
+#define MPID_NEM_IB_COM_SCRATCH_PAD_FROM 0
+#define MPID_NEM_IB_COM_SCRATCH_PAD_TO 1 /* index to RDMA-write-to buffer */
/* send command templates */
#define MPID_NEM_IB_COM_RC_SR_NTEMPLATE (8+1+2) /* number of request templates, 8 for inline-chained-smt, 1 for smt, 1 for lmt */
@@ -234,8 +249,6 @@ extern struct ibv_cq *MPID_nem_ib_ud_shared_rcq;
#define MPID_NEM_IB_COM_SMT_INLINE_CHAINED7 7
#define MPID_NEM_IB_COM_SMT_NOINLINE 8
#define MPID_NEM_IB_COM_LMT_INITIATOR 9 /* FIXME: bad naming */
-
-#define MPID_NEM_IB_COM_RC_SR_LMT_PUT_NTEMPLATE MPID_NEM_IB_COM_RC_SR_NTEMPLATE /* FIXME: TEMPLATE named MPID_NEM_IB_COM_RC_SR shares MPID_NEM_IB_COM_LMT_PUT */
#define MPID_NEM_IB_COM_LMT_PUT 10
/* recv command templates */
@@ -248,19 +261,153 @@ extern struct ibv_cq *MPID_nem_ib_ud_shared_rcq;
#define MPID_NEM_IB_COM_LMT_INITIATOR_NSGE 1 /* data x1 */
#define MPID_NEM_IB_COM_LMT_PUT_NSGE 1 /* data x1 */
#define MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR_NSGE 1 /* QP state */
+#define MPID_NEM_IB_COM_SCRATCH_PAD_CAS_NSGE 1 /* QP state */
+#define MPID_NEM_IB_COM_SCRATCH_PAD_GET_NSGE 1
#define MPID_NEM_IB_COM_UD_SR_NTEMPLATE 1
#define MPID_NEM_IB_COM_UD_RR_NTEMPLATE 1
#define MPID_NEM_IB_COM_UD_INITIATOR 0 /* index to send request template */
#define MPID_NEM_IB_COM_UD_RESPONDER 0 /* index to recv request template */
-#define MPID_NEM_IB_COM_SCRATCH_PAD_SR_NTEMPLATE 2
+#define MPID_NEM_IB_COM_SCRATCH_PAD_SR_NTEMPLATE 3
#define MPID_NEM_IB_COM_SCRATCH_PAD_RR_NTEMPLATE 1
#define MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR 0 /* index to send request template */
#define MPID_NEM_IB_COM_SCRATCH_PAD_CAS 1
+#define MPID_NEM_IB_COM_SCRATCH_PAD_GET 2
#define MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER 0 /* index to recv request template */
-
+/* Header prepended to the MPI packet */
+#define MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf) ((uint32_t)(((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first >> 61))
+#define MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_SET(buf, val) ((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first = (((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first & ~(7ULL<<61)) | ((uint64_t)(val) << 61)
+
+#define MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf) ((int16_t)((((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first >> 32) & 65535))
+#define MPID_NEM_IB_NETMOD_HDR_RELINDEX_SET(buf, val) ((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first = (((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first & ~(65535ULL<<32)) | ((uint64_t)((val&65535)) << 32)
+
+/* Note that the result is put into [63:32] */
+#define MPID_NEM_IB_NETMOD_HDR_ACQADDRH_GET(buf) ((uint64_t)((((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first << 12) & (((1ULL<<32)-1)<<32)))
+/* Note that the value to put is located in [63:32] */
+#define MPID_NEM_IB_NETMOD_HDR_ACQADDRH_SET(buf, val) ((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first = (((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first & ~(((1ULL<<32)-1)<<20)) | (((val) & (((1ULL<<32)-1)<<32)) >> 12)
+
+#define MPID_NEM_IB_NETMOD_HDR_ACQADDR_GET(buf) (MPID_NEM_IB_NETMOD_HDR_ACQADDRH_GET(buf)|((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->second)
+#define MPID_NEM_IB_NETMOD_HDR_ACQADDR_SET(buf, val) MPID_NEM_IB_NETMOD_HDR_ACQADDRH_SET((buf), (val)); ((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->second = ((val) & ((1ULL<<32)-1))
+
+#define MPID_NEM_IB_NETMOD_HDR_ACQAMTLOG_GET(buf) ((uint32_t)((((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first >> 16) & 15))
+#define MPID_NEM_IB_NETMOD_HDR_ACQAMTLOG_SET(buf, val) ((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first = (((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first & ~(15ULL<<16)) | ((uint64_t)(val) << 16)
+
+#define MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) ((uint32_t)(((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first & 65535))
+#define MPID_NEM_IB_NETMOD_HDR_SZ_SET(buf, val) ((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first = (((MPID_nem_ib_netmod_hdr_exclusive_t *)buf)->first & ~65535ULL) | (val)
+
+#define MPID_NEM_IB_NETMOD_HDR_VC_GET(buf) ((struct MPIDI_VC *)(((uint64_t)((MPID_nem_ib_netmod_hdr_shared_t *)(buf))->third << 32) | (uint64_t)((MPID_nem_ib_netmod_hdr_shared_t *)(buf))->forth))
+#define MPID_NEM_IB_NETMOD_HDR_VC_SET(buf, val) ((MPID_nem_ib_netmod_hdr_shared_t *)(buf))->third = (uint64_t)(val) >> 32; ((MPID_nem_ib_netmod_hdr_shared_t *)(buf))->forth = (uint64_t)(val) & ((1ULL << 32) - 1);
+
+#define MPID_NEM_IB_NETMOD_HDR_SIZEOF(type) (((type) == MPID_NEM_IB_RINGBUF_EXCLUSIVE) ? sizeof(MPID_nem_ib_netmod_hdr_exclusive_t) : sizeof(MPID_nem_ib_netmod_hdr_shared_t))
+#define MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf) ((MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf) & MPID_NEM_IB_RINGBUF_EXCLUSIVE) ? sizeof(MPID_nem_ib_netmod_hdr_exclusive_t) : sizeof(MPID_nem_ib_netmod_hdr_shared_t))
+
+#define MPID_NEM_IB_NETMOD_HDR_HEAD_FLAG_PTR(buf) (&((MPID_nem_ib_netmod_hdr_shared_t *)(buf))->first)
+#define MPID_NEM_IB_NETMOD_HDR_HEAD_FLAG_SET(buf, val) ((MPID_nem_ib_netmod_hdr_shared_t *)(buf))->first = (val);
+
+typedef struct MPID_nem_ib_netmod_hdr_exclusive {
+ /*
+ [63:61] ring buffer type
+ remote is exclusive:
+ [47:32] largest index of contiguous released slots 16-bit
+ reply to slot request:
+ [51:20] Start address of acquired slots, MSB part
+ [19:16] Log_2 of amount of acquired slots
+ [15:0] Packet size without padding
+ */
+ uint64_t first;
+ /* jump case:
+ [31:0] Start address of acquired slots, LSB part
+ */
+ uint32_t second;
+
+} MPID_nem_ib_netmod_hdr_exclusive_t;
+
+typedef struct MPID_nem_ib_netmod_hdr_shared {
+ uint64_t first;
+ uint32_t second;
+
+ /* remote is one slot:
+ [31:0] VC pointer in remote node, MSB part */
+ uint32_t third;
+
+ /* remote is one slot:
+ [31:0] VC pointer in remote node, LSB part */
+ uint32_t forth;
+} MPID_nem_ib_netmod_hdr_shared_t;
+
+typedef struct MPID_nem_ib_netmod_trailer {
+ uint8_t tail_flag;
+ //uint32_t traits; /* for debug */
+} MPID_nem_ib_netmod_trailer_t;
+
+/* Allocator for RDMA write to buffer */
+typedef struct {
+ /* Avoid polluting netmod_hdr and trailer */
+ uint8_t padding[sizeof(MPID_nem_ib_netmod_hdr_shared_t)];
+ uint8_t *next;
+} MPID_nem_ib_rdmawr_to_alloc_hdr_t;
+
+/* Ring-buffer to which a remote node RDMA-writes */
+#define MPID_NEM_IB_NRINGBUF 64
+#define MPID_NEM_IB_RINGBUF_NSLOT 16
+
+/* Ring-buffer type. It is set by ringbuf_alloc on the receiver side
+ and sent in SYNACK or ACK1 to the sender side and referenced by isend
+ on the sender side and by poll on the receiver side */
+/* Exclusive ring buffer has been allocated */
+#define MPID_NEM_IB_RINGBUF_EXCLUSIVE 1
+/* Shared ring buffer has been allocated */
+#define MPID_NEM_IB_RINGBUF_SHARED 2
+#define MPID_NEM_IB_RINGBUF_RELINDEX 4
+
+typedef struct {
+ uint32_t type; /* acquiring contiguous slots or a single slot */
+ void* start;
+ int nslot;
+ MPIDI_VC_t * vc;
+ uint64_t remote_released[(MPID_NEM_IB_COM_RDMABUF_NSEG + 63) / 64];
+} MPID_nem_ib_ringbuf_t;
+
+/* Represent a ring-buffer is exclusively acquired */
+extern uint64_t MPID_nem_ib_ringbuf_acquired[(MPID_NEM_IB_NRINGBUF + 63) / 64];
+
+/* Represent a ring-buffer is ready to poll */
+extern uint64_t MPID_nem_ib_ringbuf_allocated[(MPID_NEM_IB_NRINGBUF + 63) / 64];
+
+extern MPID_nem_ib_ringbuf_t *MPID_nem_ib_ringbuf;
+
+
+/* Next ring-buffer type and slots
+ Exclusive slots are sticky.
+ Shared slot is consumed.
+ Use the type described here because we need to
+ use up acquired slots of shared ring-buffer when
+ transitioning from share to exclusive.
+ The next type is absent means we're transitioning
+ from exclusive to shared. */
+typedef struct MPID_nem_ib_ringbuf_sector {
+ uint32_t type;
+ void* start;
+ int nslot;
+ uint16_t head;
+ uint16_t tail;
+
+ struct MPID_nem_ib_ringbuf_sector * sectorq_next;
+} MPID_nem_ib_ringbuf_sector_t;
+
+typedef GENERIC_Q_DECL(MPID_nem_ib_ringbuf_sector_t) MPID_nem_ib_ringbuf_sectorq_t;
+
+#define MPID_nem_ib_ringbuf_sectorq_empty(q) GENERICM_Q_EMPTY (q)
+#define MPID_nem_ib_ringbuf_sectorq_head(q) GENERICM_Q_HEAD (q)
+#define MPID_nem_ib_ringbuf_sectorq_next_field(ep, next_field) ((ep)->next_field)
+#define MPID_nem_ib_ringbuf_sectorq_next(ep) ((ep)->sectorq_next)
+#define MPID_nem_ib_ringbuf_sectorq_enqueue(qp, ep) GENERICM_Q_ENQUEUE (qp, ep, MPID_nem_ib_ringbuf_sectorq_next_field, sectorq_next);
+#define MPID_nem_ib_ringbuf_sectorq_dequeue(qp, epp) GENERICM_Q_DEQUEUE (qp, epp, MPID_nem_ib_ringbuf_sectorq_next_field, sectorq_next);
+
+
+/* IB connection */
typedef struct MPID_nem_ib_com {
short icom_used;
short icom_connected;
@@ -283,14 +430,14 @@ typedef struct MPID_nem_ib_com {
void **icom_rmem;
int *icom_rkey;
size_t *icom_rsize;
- int sseq_num;
- int rsr_seq_num_poll;
- int rsr_seq_num_tail; /* occupation status of remote Send Request (SR) queue (it covers occupation status of local RDMA-wr-to buffer) */
- int rsr_seq_num_tail_last_sent; /* latest one sent to remote rank */
- int lsr_seq_num_tail; /* occupation status of local Send Request (SR) queue */
+ uint16_t sseq_num;
+ uint16_t rsr_seq_num_poll;
+ uint16_t rsr_seq_num_tail; /* occupation status of remote Send Request (SR) queue (it covers occupation status of local RDMA-wr-to buffer) */
+ uint16_t rsr_seq_num_tail_last_sent; /* latest one sent to remote rank */
+ uint16_t lsr_seq_num_tail; /* occupation status of local Send Request (SR) queue */
int lsr_seq_num_tail_last_requested; /* value when lmt_start_send issued req_seq_num */
int rdmabuf_occupancy_notify_rstate, rdmabuf_occupancy_notify_lstate;
- int ncom, ncom_lmt_put, ncom_scratch_pad; /* number of entries in the command queue */
+ int ncom, ncom_scratch_pad; /* number of entries in the command queue */
uint32_t max_inline_data; /* actual value obtained after ibv_create_qp */
uint32_t max_send_wr;
@@ -303,23 +450,69 @@ typedef struct MPID_nem_ib_com {
/* see the "Ordering and the Fence Indicator" section in "InfiniBand Architecture" by William T. Futral */
uint16_t after_rdma_rd;
- uint64_t rsr_seq_num_released[(MPID_NEM_IB_COM_RDMABUF_NSEG + 63) / 64];
+ /* Ring-buffer information on the receiver side.
+ It's allocated on the receiver side. */
+ MPID_nem_ib_ringbuf_t* remote_ringbuf;
+
+ /* Ring buffer information on the sender side.
+ The information is passed from the receiver side on connection. */
+ uint32_t local_ringbuf_type;
+ void* local_ringbuf_start;
+ int local_ringbuf_rkey;
+ uint16_t local_ringbuf_nslot;
+
+ /* VC of remote node. It's embedded in a packet going to the
+ shared ring buffer because no VC information is available on
+ the receiver side in the shared case. c.f. They are stored in
+ the individual exclusive ring-buffers in the exclusive case. */
+ MPIDI_VC_t *remote_vc;
+
+ /* Delay the fetch of the second ask until the first issues CAS */
+ uint8_t ask_guard;
+
+ /* Ring buffer sectors obtained through ask-send protocol */
+ MPID_nem_ib_ringbuf_sectorq_t sectorq;
} MPID_nem_ib_com_t;
+extern void *MPID_nem_ib_rdmawr_to_alloc(int nslots);
+extern void MPID_nem_ib_rdmawr_to_free(void *p, int nslots);
+extern int MPID_nem_ib_rdmawr_to_munmap(void *p, int nslots);
extern int MPID_nem_ib_com_open(int ib_port, int MPID_nem_ib_com_open_flag, int *condesc);
-extern int MPID_nem_ib_com_alloc(int condesc, int sz);
extern int MPID_nem_ib_com_close(int);
+extern int MPID_nem_ib_com_alloc(int condesc, int sz);
+extern int MPID_nem_ib_com_free(int condesc, int sz);
extern int MPID_nem_ib_com_rts(int condesc, int remote_qpnum, uint16_t remote_lid,
union ibv_gid *remote_gid);
extern int MPID_nem_ib_com_reg_mr_connect(int condesc, void *rmem, int rkey);
-extern int MPID_nem_ib_com_isend(int condesc, uint64_t wr_id, void *prefix, int sz_prefix,
- void *hdr, int sz_hdr, void *data, int sz_data, int *copied);
+extern int MPID_nem_ib_com_connect_ringbuf(int condesc,
+ uint32_t ringbuf_type,
+ void *start, int rkey, int nslot,
+ MPIDI_VC_t * remote_vc,
+ uint32_t alloc_new_mr);
+
+extern int MPID_nem_ib_com_isend(int condesc,
+ uint64_t wr_id,
+ void *prefix, int sz_prefix,
+ void *hdr, int sz_hdr,
+ void *data, int sz_data,
+ int *copied,
+ uint32_t local_ringbuf_type, uint32_t remote_ringbuf_type,
+ void** buf_from_out, uint32_t* buf_from_sz_out);
extern int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_hdr,
void *data, int sz_data);
extern int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset, int sz,
void *laddr);
+extern int MPID_nem_ib_com_get_scratch_pad(int condesc,
+ uint64_t wr_id,
+ uint64_t offset, int sz,
+ void** buf_from_out, uint32_t * buf_from_sz_out);
+extern int MPID_nem_ib_com_cas_scratch_pad(int condesc,
+ uint64_t wr_id, uint64_t offset,
+ uint64_t compare, uint64_t swap,
+ void** buf_from_out, uint32_t * buf_from_sz_out);
+
//extern int MPID_nem_ib_com_isend(int condesc, uint64_t wr_id, void* hdr, int sz_hdr, void* data, int sz_data);
extern int MPID_nem_ib_com_irecv(int condesc, uint64_t wr_id);
extern int MPID_nem_ib_com_udsend(int condesc, union ibv_gid *remote_gid, uint16_t remote_lid,
@@ -334,16 +527,13 @@ extern int MPID_nem_ib_com_poll_cq(int which_cq, struct ibv_wc *wc, int *result)
extern int MPID_nem_ib_com_obtain_pointer(int condesc, MPID_nem_ib_com_t ** MPID_nem_ib_com);
/* for ib_reg_mr.c */
-extern int MPID_nem_ib_com_reg_mr(void *addr, int len, struct ibv_mr **mr);
+extern int MPID_nem_ib_com_reg_mr(void *addr, int len, struct ibv_mr **mr,
+ enum ibv_access_flags additional_flags);
extern int MPID_nem_ib_com_dereg_mr(struct ibv_mr *mr);
extern int MPID_nem_ib_com_get_info_conn(int condesc, int key, void *out, uint32_t out_len);
extern int MPID_nem_ib_com_get_info_mr(int condesc, int memid, int key, void *out, int out_len);
-extern int MPID_nem_ib_com_sseq_num_get(int condesc, int *seq_num);
-extern int MPID_nem_ib_com_lsr_seq_num_tail_get(int condesc, int **seq_num);
-extern int MPID_nem_ib_com_rsr_seq_num_tail_get(int condesc, int **seq_num);
-extern int MPID_nem_ib_com_rsr_seq_num_tail_last_sent_get(int condesc, int **seq_num);
extern int MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(int condesc, int *notify_rate);
extern int MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get(int condesc, int **rstate);
extern int MPID_nem_ib_com_rdmabuf_occupancy_notify_lstate_get(int condesc, int **lstate);
@@ -351,14 +541,14 @@ extern int MPID_nem_ib_com_rdmabuf_occupancy_notify_lstate_get(int condesc, int
extern char *MPID_nem_ib_com_strerror(int errno);
extern int MPID_nem_ib_com_mem_rdmawr_from(int condesc, void **out);
-extern int MPID_nem_ib_com_mem_rdmawr_to(int condesc, int seq_num, void **out);
+//extern int MPID_nem_ib_com_mem_rdmawr_to(int condesc, int seq_num, void **out);
extern int MPID_nem_ib_com_mem_udwr_from(int condesc, void **out);
extern int MPID_nem_ib_com_mem_udwr_to(int condesc, void **out);
/* ib_reg_mr.c */
-extern void MPID_nem_ib_com_register_cache_init(void);
-extern void MPID_nem_ib_com_register_cache_destroy(void);
-extern struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, int len);
+extern int MPID_nem_ib_com_register_cache_init(void);
+extern int MPID_nem_ib_com_register_cache_release(void);
+extern struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, int len, enum ibv_access_flags additional_flags);
extern int MPID_nem_ib_com_udbuf_init(void *q);
@@ -366,7 +556,6 @@ extern int MPID_nem_ib_com_udbuf_init(void *q);
#define MPID_NEM_IB_COM_RC_SHARED_SCQ 1
#define MPID_NEM_IB_COM_UD_SHARED_RCQ 2
#define MPID_NEM_IB_COM_UD_SHARED_SCQ 3
-#define MPID_NEM_IB_COM_RC_SHARED_SCQ_LMT_PUT 4
/* flag for open */
#define MPID_NEM_IB_COM_OPEN_RC 0x01
@@ -377,35 +566,23 @@ extern int MPID_nem_ib_com_udbuf_init(void *q);
/* obsolete, to wait for you-to-me QP to become RTR state
so via UD-send/recv */
-#define MPID_NEM_IB_COM_OPEN_RC_LMT_PUT 0x03
-/* obsolete, tried to use different CQ for LMT-PUT protocol for speed */
-
#define MPID_NEM_IB_COM_OPEN_SCRATCH_PAD 0x04
/* obsolete, to wait for you-to-me QP to become RTR state
so via RDMA-write */
+#define MPID_nem_ib_segv printf("%d\n", *(int32_t*)0);
#define MPID_NEM_IB_COM_ERR_SETANDJUMP(errno, stmt) { stmt; ibcom_errno = errno; goto fn_fail; }
#define MPID_NEM_IB_COM_ERR_CHKANDJUMP(cond, errno, stmt) if (cond) { stmt; ibcom_errno = errno; goto fn_fail; }
+#define MPID_NEM_IB_ERR_FATAL(cond, var, val, tag) if (cond) { var = val; printf("%s\n", tag); MPID_nem_ib_segv; }
#define MPID_NEM_IB_COM_QKEY 0x1234
#define MPID_NEM_IB_COM_MAGIC 0x55
-typedef struct MPID_nem_ib_sz_hdrmagic_t {
- uint32_t sz;
- uint32_t magic;
-} MPID_nem_ib_sz_hdrmagic_t;
-
+#define MPID_NEM_IB_OFF_POW2_ALIGNED(sz) \
+ for(off_pow2_aligned = 15; off_pow2_aligned < (sz); off_pow2_aligned = ((((off_pow2_aligned + 1) << 1) - 1) > MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_netmod_trailer_t)) ? MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_netmod_trailer_t) : (((off_pow2_aligned + 1) << 1) - 1)) { } \
+ if (off_pow2_aligned > MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_netmod_trailer_t)) { printf("assertion failed\n"); }; \
-typedef struct MPID_nem_ib_tailmagic_t {
- uint8_t magic;
- //uint32_t traits; /* for debug */
-} MPID_nem_ib_tailmagic_t;
-
-#define MPID_NEM_IB_SZ_DATA_POW2(sz) \
- for(sz_data_pow2 = 15; sz_data_pow2 < (sz); sz_data_pow2 = ((((sz_data_pow2 + 1) << 1) - 1) > MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_tailmagic_t)) ? MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_tailmagic_t) : (((sz_data_pow2 + 1) << 1) - 1)) { } \
- if (sz_data_pow2 > MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_tailmagic_t)) { printf("assertion failed\n"); }; \
-
-#define MPID_NEM_IB_MAX_DATA_POW2 (MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_tailmagic_t))
+#define MPID_NEM_IB_MAX_OFF_POW2_ALIGNED (MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_netmod_trailer_t))
typedef struct MPID_nem_ib_com_qp_state_t {
uint32_t state;
@@ -414,3 +591,122 @@ typedef struct MPID_nem_ib_com_qp_state_t {
#define MPID_NEM_IB_COM_QP_STATE_RTR 0x12345678
#define MPID_NEM_IB_COM_SZ_MPI_HEADER 48
#define MPID_NEM_IB_COM_AMT_SLACK (MPID_NEM_IB_COM_RDMABUF_NSEG > 128 ? 1 : 1)
+
+#define MPID_NEM_IB_MAX(a, b) ((a) > (b) ? (a) : (b))
+
+/* Allocator for RDMA write from buffer
+ - Allocate performs overflow checks and increments pointer
+ - Fast to "malloc" (one load and one store instructions)
+ - Free decrements counter at the head of
+ aligned memory area. The area is freed when the counter is zero.
+ - Fast to "free" (one load and one store instructions)
+ - Easy to shrink
+ - Refill allocates multiple slots and IB-registers them
+ - Fast when first-time allocs occur
+ - Free list is pointers for 2^n sizes.
+ - Fast to find a empty slot
+ */
+typedef struct {
+ union {
+ uint32_t ref_count;
+ char *next;
+ } first;
+ struct ibv_mr* mr;
+} MPID_nem_ib_rdmawr_from_alloc_hdr_t;
+#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA 65536
+#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_ROUNDUP64(addr, align) ((addr + align - 1) & ~((unsigned long)align - 1))
+#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_ROUNDUP64_ADDR(addr, align) ((char*)(((uint64_t)addr + align - 1) & ~((uint64_t)align - 1)))
+#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_NCLUST_SLAB 1
+#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(p) ((void *) ((uint64_t) (p) & ~(MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA - 1)))
+#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR(p) (((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) ((uint64_t) (p) & ~(MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA - 1)))->mr)
+#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_PREPROCESS_SZ \
+ if(_sz < 256) { \
+ clz = 23; \
+ sz = 256; \
+ } else { \
+ clz = __builtin_clz(_sz); \
+ int ctz = __builtin_ctz(_sz); \
+ sz = (clz + ctz == 31) ? _sz : (1ULL << (32 - clz)); \
+ }
+
+static inline void *MPID_nem_ib_rdmawr_from_alloc(uint32_t _sz)
+{
+ int retval;
+ int clz;
+ uint32_t sz;
+ assert(_sz <= (1ULL<<31));
+ MPID_NEM_IB_RDMAWR_FROM_ALLOC_PREPROCESS_SZ;
+ char *p = MPID_nem_ib_rdmawr_from_alloc_free_list_front[clz];
+ if ((unsigned long) p & (MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA - 1)) {
+ MPID_nem_ib_rdmawr_from_alloc_free_list_front[clz] += sz;
+ return p;
+ }
+ else {
+ char *q, r;
+ if (MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz]) {
+ q = MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz];
+ MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz] = ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz])->first.next;
+ }
+ else {
+ unsigned long sz_clust = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ROUNDUP64(MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA * MPID_NEM_IB_RDMAWR_FROM_ALLOC_NCLUST_SLAB, 4096);
+ char* unaligned = mmap(NULL,
+ sz_clust + MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA,
+ PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (unaligned == (void *)-1) {
+ printf("mmap failed\n");
+ MPID_nem_ib_segv;
+ }
+
+ q = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ROUNDUP64_ADDR(unaligned, MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA);
+ retval = munmap(unaligned, q - unaligned);
+ if (q - unaligned != 0 && retval) {
+ printf("munmap failed\n");
+ MPID_nem_ib_segv;
+ }
+ retval = munmap(q + sz_clust, MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA - (q - unaligned));
+ if (retval) {
+ printf("munmap failed\n");
+ MPID_nem_ib_segv;
+ }
+
+ ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->mr = MPID_nem_ib_com_reg_mr_fetch(q, MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA, 0);
+ if (!((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->mr) {
+ printf("ibv_reg_mr failed\n");
+ MPID_nem_ib_segv;
+ }
+
+#if MPID_NEM_IB_RDMAWR_FROM_ALLOC_NCLUST_SLAB > 1
+ MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz] = q + MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA;
+ for (p = q + MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA; p < q + (MPID_NEM_IB_RDMAWR_FROM_ALLOC_NCLUST_SLAB - 1) * MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA;
+ p += MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA) {
+ ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) p)->mr = MPID_nem_ib_com_reg_mr_fetch(q, MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA, 0);
+ if (!((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) p)->mr) {
+ printf("ibv_reg_mr failed\n");
+ MPID_nem_ib_segv;
+ }
+
+ ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) p)->first.next = p + MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA;
+ }
+ ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) p)->first.next = 0;
+#endif
+ }
+ ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->first.ref_count = MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA / sz - 1;
+ q += sz + (MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA % sz);
+ MPID_nem_ib_rdmawr_from_alloc_free_list_front[clz] = q + sz;
+ return q;
+ }
+}
+
+static inline void MPID_nem_ib_rdmawr_from_free(const void *p, uint32_t _sz)
+{
+ int clz;
+ uint32_t sz;
+ assert(_sz <= (1ULL<<31));
+ MPID_NEM_IB_RDMAWR_FROM_ALLOC_PREPROCESS_SZ;
+ void *q = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(p);
+ if (!(--(((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->first.ref_count))) {
+ ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->first.next = MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz];
+ MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz] = (char *) q;
+ }
+}
+
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
index f176e6f..42c854f 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
@@ -21,7 +21,7 @@
(1) receiver sends cts to sender (2) sender RDMA-write to receiver
(3) sender fetch CQE (4) receiver polls on end-flag
*/
-//#define MPID_NEM_IB_ONDEMAND
+#define MPID_NEM_IB_ONDEMAND
typedef struct {
union ibv_gid gid;
@@ -30,7 +30,7 @@ typedef struct {
} MPID_nem_ib_conn_ud_t;
typedef struct {
- int fd, fd_lmt_put;
+ int fd;
MPIDI_VC_t *vc;
} MPID_nem_ib_conn_t;
@@ -43,10 +43,13 @@ typedef GENERIC_Q_DECL(struct MPID_Request) MPID_nem_ib_sendq_t;
typedef struct {
MPID_nem_ib_conn_t *sc;
int pending_sends; /* number of send in flight */
- MPID_nem_ib_com_t *ibcom, *ibcom_lmt_put;
+ MPID_nem_ib_com_t *ibcom;
MPID_nem_ib_sendq_t sendq; /* overflow queue for IB commands */
- MPID_nem_ib_sendq_t sendq_lmt_put;
- int is_connected; /* dynamic connection, checked in iSendContig, protocol processed there and in progress engine */
+ int connection_state; /* dynamic connection, checked in iSendContig, protocol processed there and in progress engine */
+
+ /* Number of outstanding connection sequence started to eliminate
+ duplicated connection requests */
+ uint8_t connection_guard;
} MPID_nem_ib_vc_area;
/* macro for secret area in vc */
@@ -68,6 +71,9 @@ typedef struct {
MPI_Aint lmt_dt_true_lb; /* to locate the last byte of receive buffer */
void *lmt_write_to_buf; /* user buffer or temporary buffer for pack and remember it for lmt_orderq */
void *lmt_pack_buf; /* to pack non-contiguous data */
+ void *buf_from; /* address of RDMA write from buffer */
+ uint32_t buf_from_sz; /* size of RDMA write from buffer. It's set on sending, referenced on freeing */
+ uint8_t ask; /* Issued ask or not on send */
} MPID_nem_ib_req_area;
/* macro for secret area in req */
@@ -88,59 +94,271 @@ typedef struct {
/* see src/mpid/ch3/channels/nemesis/include/mpid_nem_generic_queue.h */
typedef GENERIC_Q_DECL(struct MPID_Request) MPID_nem_ib_lmtq_t;
-/* connection manager */
-typedef struct {
- int remote_rank;
- uint32_t type; /* SYN */
- uint32_t qpn; /* QPN for eager-send channel */
- uint32_t rkey; /* key for RDMA-write-to buffer of eager-send channel */
- void *rmem; /* address of RDMA-write-to buffer of eager-send channel */
-} MPID_nem_ib_cm_pkt_syn_t;
-
-typedef struct {
- int remote_rank;
- uint32_t type; /* SYNACK */
- uint32_t qpn; /* QPN for eager-send channel */
- uint32_t rkey; /* key for RDMA-write-to buffer of eager-send channel */
- void *rmem; /* address of RDMA-write-to buffer of eager-send channel */
-} MPID_nem_ib_cm_pkt_synack_t;
-
-typedef union {
- MPID_nem_ib_cm_pkt_syn_t syn;
- MPID_nem_ib_cm_pkt_synack_t synack;
-} MPID_nem_ib_cm_pkt_t;
+#ifdef MPID_NEM_IB_ONDEMAND
-typedef struct MPID_nem_ib_cm_sendq_entry {
- MPID_nem_ib_cm_pkt_t pending_pkt;
- struct MPID_nem_ib_cm_sendq_entry *sendq_next; /* for software command queue */
-} MPID_nem_ib_cm_sendq_entry_t;
+/* States in connection protocol */
+#define MPID_NEM_IB_CM_CLOSED 0
+#define MPID_NEM_IB_CM_LOCAL_QP_RESET 1
+#define MPID_NEM_IB_CM_REMOTE_QP_RESET 2
+#define MPID_NEM_IB_CM_REMOTE_QP_RTS 4
+#define MPID_NEM_IB_CM_LOCAL_QP_RTS 8
+#define MPID_NEM_IB_CM_ESTABLISHED 15
-#ifdef MPID_NEM_IB_ONDEMAND
typedef struct {
char *data;
int length;
int max_length;
} MPID_nem_ib_cm_map_t;
+/* Types of connection protocol packets */
+enum MPID_nem_ib_cm_cmd_types {
+ MPID_NEM_IB_CM_HEAD_FLAG_ZERO = 0,
+ MPID_NEM_IB_CM_CAS,
+ MPID_NEM_IB_CM_SYN,
+ MPID_NEM_IB_CM_SYNACK,
+ MPID_NEM_IB_CM_ACK1,
+ MPID_NEM_IB_CM_ACK2,
+ MPID_NEM_IB_RINGBUF_ASK_FETCH,
+ MPID_NEM_IB_RINGBUF_ASK_CAS
+};
+
+/* Packet types of connection protocol */
+struct MPID_nem_ib_cm_req;
+
+/* They should have the same type because
+ cm commands and ring buffer commands share one CQ */
+typedef uint8_t MPID_nem_ib_cm_ringbuf_cmd_type_t;
+typedef MPID_nem_ib_cm_ringbuf_cmd_type_t MPID_nem_ib_ringbuf_cmd_type_t;
+typedef MPID_nem_ib_cm_ringbuf_cmd_type_t MPID_nem_ib_cm_cmd_type_t;
+
+typedef struct {
+ MPID_nem_ib_cm_cmd_type_t type;
+ struct MPID_nem_ib_cm_req *initiator_req;
+ uint16_t responder_ringbuf_index;
+ int initiator_rank;
+ MPID_nem_ib_netmod_trailer_t tail_flag;
+} MPID_nem_ib_cm_cmd_syn_t;
+
+typedef struct {
+ MPID_nem_ib_cm_cmd_type_t type; /* this is used as head flag as well */
+ uint32_t qpnum;
+ uint16_t lid;
+ union ibv_gid gid;
+ void *rmem;
+ uint32_t rkey;
+ int ringbuf_nslot;
+ uint32_t ringbuf_type; /* Ring buffer information sent from receiver side to sender side */
+ struct MPID_nem_ib_cm_req *initiator_req;
+ struct MPID_nem_ib_cm_req *responder_req;
+ uint16_t initiator_ringbuf_index; /* index to connection protocol ring buffer */
+ MPIDI_VC_t * remote_vc;
+ MPID_nem_ib_netmod_trailer_t tail_flag;
+} MPID_nem_ib_cm_cmd_synack_t;
+
typedef struct {
- uint32_t type;
+ MPID_nem_ib_cm_cmd_type_t type;
uint32_t qpnum;
uint16_t lid;
union ibv_gid gid;
void *rmem;
uint32_t rkey;
+ int ringbuf_nslot;
+ uint32_t ringbuf_type; /* Ring buffer information sent from sender side to receiver side */
+ struct MPID_nem_ib_cm_req *initiator_req;
+ struct MPID_nem_ib_cm_req *responder_req;
+ MPIDI_VC_t * remote_vc;
+ MPID_nem_ib_netmod_trailer_t tail_flag;
+} MPID_nem_ib_cm_cmd_ack1_t;
+
+typedef struct {
+ MPID_nem_ib_cm_cmd_type_t type;
+ struct MPID_nem_ib_cm_req *initiator_req;
+ MPID_nem_ib_netmod_trailer_t tail_flag;
+} MPID_nem_ib_cm_cmd_ack2_t;
+
+/* Base class for branching on type
+ and used to measure maximum size */
+typedef union {
+ MPID_nem_ib_cm_cmd_type_t type;
+ MPID_nem_ib_cm_cmd_syn_t syn;
+ MPID_nem_ib_cm_cmd_synack_t synack;
+ MPID_nem_ib_cm_cmd_ack1_t ack1;
+ MPID_nem_ib_cm_cmd_ack2_t ack2;
} MPID_nem_ib_cm_cmd_t;
-#endif
-typedef GENERIC_Q_DECL(struct MPID_Request) MPID_nem_ib_cm_sendq_t;
+/* State store for connection protocol */
+typedef struct MPID_nem_ib_cm_req {
+ MPID_nem_ib_cm_cmd_type_t state;
+ MPID_nem_ib_com_t *ibcom; /* Referenced in drain_scq */
+ uint64_t retry_decided; /* Virtual time when CAS retry is decided */
+ uint64_t retry_backoff; /* Back-off duration of retry */
+ uint16_t ringbuf_index; /* index of slot where responder writes responds */
+ int initiator_rank;
+ int responder_rank;
+ uint16_t initiator_ringbuf_index; /* responder stores it when acquiring it */
+ uint16_t responder_ringbuf_index; /* initiator stores it when acquiring it */
+ struct MPID_nem_ib_cm_req *sendq_next;
+ MPID_nem_ib_cm_cmd_t cmd; /* buf used only when enqueued */
+ uint32_t ask_on_connect; /* Ask ring-buffer slot when connected */
+
+ /* We need to track reference count because the last reference of state
+ is non-deterministic. i.e. it happens either on receiving packet and draining SCQ */
+ uint32_t ref_count;
+} MPID_nem_ib_cm_req_t;
+
+/* Track identity of a packet */
+typedef struct {
+ MPID_nem_ib_cm_cmd_type_t type; /* Type referenced in drain_scq */
+ MPID_nem_ib_cm_req_t *req;
+ void* buf_from;
+ uint32_t buf_from_sz;
+} MPID_nem_ib_cm_cmd_shadow_t;
+
+#define MPID_NEM_IB_CM_RELEASED ((uint64_t)(-1))
+#define MPID_NEM_IB_CM_OFF_SYN (256) /* Align for 256-byte-write PCI command */
+#define MPID_NEM_IB_CM_OFF_CMD (256*2) /* Align for 256-byte-write PCI command */
+#define MPID_NEM_IB_CM_NSEG 16 /* number of slots to which responder writes its response */
+
+typedef GENERIC_Q_DECL(MPID_nem_ib_cm_req_t) MPID_nem_ib_cm_sendq_t;
#define MPID_nem_ib_cm_sendq_empty(q) GENERICM_Q_EMPTY (q)
#define MPID_nem_ib_cm_sendq_head(q) GENERICM_Q_HEAD (q)
#define MPID_nem_ib_cm_sendq_next_field(ep, next_field) ((ep)->next_field)
#define MPID_nem_ib_cm_sendq_next(ep) ((ep)->sendq_next)
#define MPID_nem_ib_cm_sendq_enqueue(qp, ep) GENERICM_Q_ENQUEUE (qp, ep, MPID_nem_ib_cm_sendq_next_field, sendq_next);
-#define MPID_nem_ib_cm_sendq_enqueue_at_head(qp, ep) GENERICM_Q_ENQUEUE_AT_HEAD(qp, ep, MPID_nem_ib_cm_sendq_next_field, sendq_next);
-#define MPID_nem_ib_cm_sendq_dequeue(qp, ep) GENERICM_Q_DEQUEUE (qp, ep, MPID_nem_ib_cm_sendq_next_field, sendq_next);
+
+#ifdef HAVE_LIBDCFA
+#define MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO_MR_ADDR host_adddr
+#else
+#define MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO_MR_ADDR addr
+#endif
+
+#define MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO(cmd, rank) { \
+ ibcom_errno = \
+ MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conns[(rank)].fd, MPID_NEM_IB_COM_INFOKEY_PORT_LID, &((cmd)->lid), \
+ sizeof(uint16_t)); \
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_get_info_conn"); \
+\
+ ibcom_errno = \
+ MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conns[(rank)].fd, MPID_NEM_IB_COM_INFOKEY_PORT_GID, &((cmd)->gid), \
+ sizeof(union ibv_gid)); \
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_get_info_conn"); \
+ \
+ ibcom_errno = \
+ MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conns[(rank)].fd, MPID_NEM_IB_COM_INFOKEY_QP_QPN, &((cmd)->qpnum), \
+ sizeof(uint32_t)); \
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_get_info_conn"); \
+ \
+ (cmd)->rmem = (uint8_t*)MPID_nem_ib_rdmawr_to_alloc_mr->MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO_MR_ADDR + \
+ ((uint8_t*)VC_FIELD(MPID_nem_ib_conns[(rank)].vc, ibcom->remote_ringbuf->start) - \
+ (uint8_t*)MPID_nem_ib_rdmawr_to_alloc_start) ; \
+ (cmd)->rkey = MPID_nem_ib_rdmawr_to_alloc_mr->rkey; \
+ (cmd)->ringbuf_nslot = VC_FIELD(MPID_nem_ib_conns[(rank)].vc, ibcom->remote_ringbuf->nslot); \
+ }
+
+#define MPID_NEM_IB_CM_COMPOSE_SYN(cmd, req) { \
+ (cmd)->type = MPID_NEM_IB_CM_SYN; \
+ (cmd)->initiator_req = (req); \
+ (cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
+}
+
+#define MPID_NEM_IB_CM_COMPOSE_SYNACK(cmd, req, _initiator_req) { \
+ (cmd)->type = MPID_NEM_IB_CM_SYNACK; \
+ MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO((cmd), (req)->initiator_rank); \
+ (cmd)->ringbuf_type = VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, ibcom->remote_ringbuf->type); \
+ (cmd)->initiator_req = (_initiator_req); \
+ (cmd)->responder_req = (req); \
+ (cmd)->remote_vc = MPID_nem_ib_conns[req->initiator_rank].vc; \
+ (cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
+}
+
+#define MPID_NEM_IB_CM_COMPOSE_ACK1(cmd, req, _responder_req) { \
+ (cmd)->type = MPID_NEM_IB_CM_ACK1; \
+ MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO((cmd), (req)->responder_rank); \
+ (cmd)->ringbuf_type = VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, ibcom->remote_ringbuf->type); \
+ (cmd)->initiator_req = (req); \
+ (cmd)->responder_req = (_responder_req); \
+ (cmd)->remote_vc = MPID_nem_ib_conns[req->responder_rank].vc; \
+ (cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
+}
+
+#define MPID_NEM_IB_CM_COMPOSE_ACK2(cmd, _initiator_req) { \
+ (cmd)->type = MPID_NEM_IB_CM_ACK2; \
+ (cmd)->initiator_req = (_initiator_req); \
+ (cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
+}
+
+#define MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(buf) { \
+ ((MPID_nem_ib_cm_cmd_synack_t *)(buf))->tail_flag.tail_flag = 0; \
+ ((MPID_nem_ib_cm_cmd_ack1_t *)(buf))->tail_flag.tail_flag = 0; \
+ ((MPID_nem_ib_cm_cmd_ack2_t *)(buf))->tail_flag.tail_flag = 0; \
+}
+
+static inline void MPID_nem_ib_cm_request_release(MPID_nem_ib_cm_req_t * req) {
+ if(req->ref_count == 0) {
+ MPID_nem_ib_segv;
+ }
+ if(--req->ref_count == 0) {
+ MPIU_Free(req);
+ }
+}
+
+int MPID_nem_ib_cm_progress(void);
+int MPID_nem_ib_cm_release(uint16_t index);
+#endif
+
+/* Ring buffer protocol
+ including Ask-Send protocol */
+
+uint32_t MPID_nem_ib_ringbuf_local_shared_nseg;
+
+/* It's on the scratch pad, RDMA-read by a process which performs ask-send */
+
+typedef struct {
+ uint64_t head; /* CAS size is 64-bit */
+ uint16_t tail;
+} MPID_nem_ib_ringbuf_headtail_t;
+
+/* Types of ring buffer protocol packets are included in
+   MPID_nem_ib_cm_cmd_types */
+
+/* State store for the ring buffer protocol */
+typedef struct MPID_nem_ib_ringbuf_req {
+ MPID_nem_ib_ringbuf_cmd_type_t state;
+ MPIDI_VC_t * vc;
+ MPID_nem_ib_com_t *ibcom; /* ibcom of scratch pad, referenced in drain_scq */
+
+    /* Fetch the head, then compare-and-swap head and head + 1
+       to prevent the case where 2^32-1 contiguous fetches, made while
+       assuming the ring buffer isn't full, corrupt the head pointer */
+ MPID_nem_ib_ringbuf_headtail_t fetched;
+
+ uint64_t retry_decided; /* Virtual time when CAS retry is decided */
+ uint64_t retry_backoff; /* Back-off duration of retry */
+ struct MPID_nem_ib_ringbuf_req *sendq_next;
+} MPID_nem_ib_ringbuf_req_t;
+
+/* Track identity of a packet */
+typedef struct {
+ MPID_nem_ib_ringbuf_cmd_type_t type; /* Type referenced in drain_scq */
+ MPID_nem_ib_ringbuf_req_t *req;
+ void* buf_from;
+ uint32_t buf_from_sz;
+} MPID_nem_ib_ringbuf_cmd_shadow_t;
+
+/* Location of head of the shared ring buffer */
+#define MPID_NEM_IB_RINGBUF_OFF_HEAD (MPID_NEM_IB_CM_OFF_CMD + sizeof(MPID_nem_ib_cm_cmd_t) * MPID_NEM_IB_CM_NSEG)
+#define MPID_NEM_IB_RINGBUF_UPDATE_BACKOFF(backoff) (backoff) = (backoff) ? ((backoff) << 1) : 1;
+
+typedef GENERIC_Q_DECL(MPID_nem_ib_ringbuf_req_t) MPID_nem_ib_ringbuf_sendq_t;
+
+#define MPID_nem_ib_ringbuf_sendq_empty(q) GENERICM_Q_EMPTY (q)
+#define MPID_nem_ib_ringbuf_sendq_head(q) GENERICM_Q_HEAD (q)
+#define MPID_nem_ib_ringbuf_sendq_next_field(ep, next_field) ((ep)->next_field)
+#define MPID_nem_ib_ringbuf_sendq_next(ep) ((ep)->sendq_next)
+#define MPID_nem_ib_ringbuf_sendq_enqueue(qp, ep) GENERICM_Q_ENQUEUE (qp, ep, MPID_nem_ib_ringbuf_sendq_next_field, sendq_next);
+#define MPID_nem_ib_ringbuf_sendq_enqueue_at_head(qp, ep) GENERICM_Q_ENQUEUE_AT_HEAD(qp, ep, MPID_nem_ib_ringbuf_sendq_next_field, sendq_next);
+
/* see src/mpid/ch3/channels/nemesis/include/mpidi_ch3_impl.h */
/* TODO: rreq for rendezvous is dequeued from posted-queue nor unexpected-queue when do_cts is called,
@@ -150,10 +368,10 @@ typedef GENERIC_Q_DECL(struct MPID_Request) MPID_nem_ib_cm_sendq_t;
#define MPID_nem_ib_lmtq_next_field(ep, next_field) REQ_FIELD(ep, next_field)
#define MPID_nem_ib_lmtq_next(ep) REQ_FIELD(ep, lmt_next)
#define MPID_nem_ib_lmtq_enqueue(qp, ep) GENERICM_Q_ENQUEUE(qp, ep, MPID_nem_ib_lmtq_next_field, lmt_next);
-
-#define MPID_nem_ib_diff32(a, b) ((uint32_t)((a + (1ULL<<32) - b) & ((1ULL<<32)-1)))
-#define MPID_nem_ib_sendq_ready_to_send_head(vc_ib) (vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY && MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY && MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail) < MPID_NEM_IB_COM_RDMABUF_NSEG)
-#define MPID_nem_ib_sendq_ready_to_send_head_lmt_put(vc_ib) (vc_ib->ibcom->ncom_lmt_put < MPID_NEM_IB_COM_MAX_SQ_CAPACITY && MPID_nem_ib_ncqe_lmt_put < MPID_NEM_IB_COM_MAX_CQ_CAPACITY)
+#define MPID_nem_ib_diff63(a, b) ((uint64_t)(((a) + (1ULL<<63) - (b)) & ((1ULL<<63)-1)))
+#define MPID_nem_ib_diff16(a, b) ((uint16_t)(((a) + (1ULL<<16) - (b)) & ((1ULL<<16)-1)))
+#define MPID_nem_ib_diff32(a, b) ((uint32_t)(((a) + (1ULL<<32) - (b)) & ((1ULL<<32)-1)))
+#define MPID_nem_ib_sendq_ready_to_send_head(vc_ib) (vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY && MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY && MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail) < MPID_NEM_IB_COM_RDMABUF_NSEG)
/* counting bloom filter to detect multiple lmt-sends in one send-wait period to
avoid overwriting the last byte in the receive buffer */
@@ -282,13 +500,18 @@ uint64_t MPID_nem_ib_rdtsc(void);
int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_max_sz_p);
int MPID_nem_ib_finalize(void);
int MPID_nem_ib_drain_scq(int dont_call_progress);
-int MPID_nem_ib_drain_scq_lmt_put(void);
int MPID_nem_ib_drain_scq_scratch_pad(void);
int MPID_nem_ib_poll(int in_blocking_poll);
-int MPID_nem_ib_poll_eager(MPIDI_VC_t * vc);
+int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf);
+int MPID_nem_ib_ring_alloc(MPIDI_VC_t * vc);
+
+int MPID_nem_ib_cm_drain_scq(void);
+int MPID_nem_ib_cm_poll_syn(void);
+int MPID_nem_ib_cm_poll(void);
int MPID_nem_ib_get_business_card(int my_rank, char **bc_val_p, int *val_max_sz_p);
int MPID_nem_ib_connect_to_root(const char *business_card, MPIDI_VC_t * new_vc);
+int MPID_nem_ib_vc_onconnect(MPIDI_VC_t * vc);
int MPID_nem_ib_vc_init(MPIDI_VC_t * vc);
int MPID_nem_ib_vc_destroy(MPIDI_VC_t * vc);
int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc);
@@ -303,8 +526,13 @@ int MPID_nem_ib_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
int MPID_nem_ib_iStartContigMsg(MPIDI_VC_t * vc, void *hdr, MPIDI_msg_sz_t hdr_sz, void *data,
MPIDI_msg_sz_t data_sz, MPID_Request ** sreq_ptr);
+int MPID_nem_ib_cm_cmd_core(int rank, MPID_nem_ib_cm_cmd_shadow_t* shadow, void* buf, MPIDI_msg_sz_t sz, uint32_t syn, uint16_t ringbuf_index);
+int MPID_nem_ib_ringbuf_alloc(MPIDI_VC_t * vc);
+int MPID_nem_ib_ringbuf_ask_cas_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t* shadow, uint64_t head);
+int MPID_nem_ib_ringbuf_progress(void);
+
/* used by ib_poll.c */
-int MPID_nem_ib_send_progress(MPID_nem_ib_vc_area * vc_ib);
+int MPID_nem_ib_send_progress(MPIDI_VC_t * vc);
/* CH3--lmt send/recv functions */
int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_pkt,
@@ -338,21 +566,31 @@ extern int MPID_nem_ib_conn_ud_fd;
extern MPID_nem_ib_com_t *MPID_nem_ib_conn_ud_ibcom;
extern MPID_nem_ib_conn_ud_t *MPID_nem_ib_conn_ud;
extern MPID_nem_ib_conn_t *MPID_nem_ib_conns;
-extern MPIDI_VC_t **MPID_nem_ib_pollingset;
-extern int *MPID_nem_ib_scratch_pad_fds;
-extern int MPID_nem_ib_npollingset;
+//extern MPIDI_VC_t **MPID_nem_ib_pollingset;
+extern int *MPID_nem_ib_scratch_pad_fds; /* TODO: create structure including fds and ibcoms */
+extern MPID_nem_ib_com_t **MPID_nem_ib_scratch_pad_ibcoms;
+//extern int MPID_nem_ib_npollingset;
extern void *MPID_nem_ib_fl[18];
extern int MPID_nem_ib_nranks;
//extern char *MPID_nem_ib_recv_buf;
extern int MPID_nem_ib_myrank;
extern uint64_t MPID_nem_ib_tsc_poll; /* to throttle ib_poll in recv_posted (in ib_poll.c) */
extern int MPID_nem_ib_ncqe; /* for lazy poll scq */
-extern int MPID_nem_ib_ncqe_lmt_put; /* lmt-put uses another QP, SQ, CQ to speed-up fetching CQE */
+extern uint64_t MPID_nem_ib_progress_engine_vt; /* virtual time stamp counter */
+extern uint16_t MPID_nem_ib_remote_poll_shared; /* index to poll for shared ring buffer */
#ifdef MPID_NEM_IB_ONDEMAND
-extern MPID_nem_ib_cm_map_t MPID_nem_ib_cm_state;
-extern int MPID_nem_ib_ncqe_connect; /* couting outstanding connection requests */
+extern uint16_t MPID_nem_ib_cm_ringbuf_head; /* head is incremented after assigned */
+extern uint16_t MPID_nem_ib_cm_ringbuf_tail;
+extern uint64_t MPID_nem_ib_cm_ringbuf_released[(MPID_NEM_IB_CM_NSEG + 63) / 64];
+
+/* overflow queue when no more slots for responder to write on are available */
+extern MPID_nem_ib_cm_sendq_t MPID_nem_ib_cm_sendq;
+
+extern MPID_nem_ib_ringbuf_sendq_t MPID_nem_ib_ringbuf_sendq;
+
#endif
extern int MPID_nem_ib_ncqe_scratch_pad;
+extern int MPID_nem_ib_ncqe_scratch_pad_to_drain;
extern int MPID_nem_ib_ncqe_to_drain; /* count put in lmt-put-done protocol */
extern int MPID_nem_ib_ncqe_nces; /* counting non-copied eager-send */
extern MPID_nem_ib_lmtq_t MPID_nem_ib_lmtq; /* poll queue for lmt */
@@ -364,7 +602,8 @@ extern MPID_nem_ib_vc_area *MPID_nem_ib_debug_current_vc_ib;
extern uint8_t MPID_nem_ib_lmt_tail_addr_cbf[MPID_nem_ib_cbf_nslot *
MPID_nem_ib_cbf_bitsperslot / 8];
-#define MPID_NEM_IB_MAX_POLLINGSET 65536
+
+//#define MPID_NEM_IB_MAX_POLLINGSET 65536
/* xfer.c manages memory region using memid */
#define MPID_NEM_IB_MEMID_RDMA 0
@@ -374,19 +613,24 @@ extern uint8_t MPID_nem_ib_lmt_tail_addr_cbf[MPID_nem_ib_cbf_nslot *
#define MPID_NEM_IB_SYNC_SYNACK 1
#define MPID_NEM_IB_SYNC_NACK 2
-#define MPID_NEM_IB_EAGER_MAX_MSG_SZ (MPID_NEM_IB_COM_RDMABUF_SZSEG/*1024*/-sizeof(MPIDI_CH3_Pkt_t)+sizeof(MPIDI_CH3_Pkt_eager_send_t)-sizeof(MPID_nem_ib_sz_hdrmagic_t)-sizeof(MPID_nem_ib_pkt_prefix_t)-sizeof(MPID_nem_ib_tailmagic_t)) /* when > this size, lmt is used. see src/mpid/ch3/src/mpid_isend.c */
+#define MPID_NEM_IB_EAGER_MAX_MSG_SZ (MPID_NEM_IB_COM_RDMABUF_SZSEG/*1024*/-sizeof(MPIDI_CH3_Pkt_t)+sizeof(MPIDI_CH3_Pkt_eager_send_t)-sizeof(MPID_nem_ib_netmod_hdr_shared_t)-sizeof(MPID_nem_ib_pkt_prefix_t)-sizeof(MPID_nem_ib_netmod_trailer_t)) /* when > this size, lmt is used. see src/mpid/ch3/src/mpid_isend.c */
#define MPID_NEM_IB_POLL_PERIOD_RECV_POSTED 2000 /* minimum period from previous ib_poll to ib_poll in recv_posted */
#define MPID_NEM_IB_POLL_PERIOD_SEND_POSTED 2000
typedef struct {
void *addr;
uint32_t rkey;
+#if 0 /* moving to packet header */
int seq_num_tail; /* notify RDMA-write-to buffer occupation */
+#endif
uint8_t tail; /* last word of payload */
} MPID_nem_ib_lmt_cookie_t;
typedef enum MPID_nem_ib_pkt_subtype {
MPIDI_NEM_IB_PKT_EAGER_SEND,
+#if 0 /* modification of mpid_nem_lmt.c is required */
+ MPIDI_NEM_IB_PKT_LMT_RTS,
+#endif
MPIDI_NEM_IB_PKT_PUT,
MPIDI_NEM_IB_PKT_ACCUMULATE,
MPIDI_NEM_IB_PKT_GET,
@@ -403,7 +647,7 @@ typedef struct MPID_nem_ib_pkt_prefix {
MPID_nem_pkt_type_t type;
unsigned subtype;
/* additional field */
- int seq_num_tail;
+ int16_t seq_num_tail;
} MPID_nem_ib_pkt_prefix_t;
/* derived from MPID_nem_pkt_netmod_t and MPID_nem_pkt_lmt_done_t */
@@ -412,7 +656,7 @@ typedef struct MPID_nem_ib_pkt_lmt_get_done {
unsigned subtype;
/* additional field */
MPI_Request req_id;
- int seq_num_tail;
+ int16_t seq_num_tail;
} MPID_nem_ib_pkt_lmt_get_done_t;
/* derived from MPID_nem_pkt_netmod_t */
@@ -420,7 +664,7 @@ typedef struct MPID_nem_ib_pkt_req_seq_num_t {
MPID_nem_pkt_type_t type;
unsigned subtype;
/* additional field */
- int seq_num_tail;
+ int16_t seq_num_tail;
} MPID_nem_ib_pkt_req_seq_num_t;
/* derived from MPID_nem_pkt_netmod_t */
@@ -428,7 +672,7 @@ typedef struct MPID_nem_ib_pkt_reply_seq_num_t {
MPID_nem_pkt_type_t type;
unsigned subtype;
/* additional field */
- int seq_num_tail;
+ int16_t seq_num_tail;
} MPID_nem_ib_pkt_reply_seq_num_t;
/* derived from MPID_nem_pkt_netmod_t */
@@ -442,6 +686,11 @@ typedef struct MPID_nem_ib_pkt_change_rdmabuf_occupancy_notify_state_t {
int MPID_nem_ib_PktHandler_EagerSend(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
MPIDI_msg_sz_t * buflen /* out */ ,
MPID_Request ** rreqp /* out */);
+#if 0 /* modification of mpid_nem_lmt.c is required */
+int MPID_nem_ib_pkt_RTS_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
+ MPIDI_msg_sz_t * buflen /* out */ ,
+ MPID_Request ** rreqp /* out */);
+#endif
int MPID_nem_ib_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
MPIDI_msg_sz_t * buflen /* out */ ,
MPID_Request ** rreqp /* out */);
@@ -490,6 +739,7 @@ int pkt_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * bu
if (_req != NULL) { \
MPIU_ERR_CHKANDJUMP(_req->status.MPI_ERROR, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_req_seq_num"); \
MPID_Request_release(_req); \
+ dprintf("send_req_seq_num,release,req=%p\n", _req); \
} \
} while (0)
@@ -500,22 +750,15 @@ int pkt_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * bu
MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending reply_seq_num packet"); \
MPIDI_Pkt_init(_pkt, MPIDI_NEM_PKT_NETMOD); \
_pkt->subtype = MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM; \
- \
- int *rsr_seq_num_tail; \
- ibcom_errno = MPID_nem_ib_com_rsr_seq_num_tail_get(VC_FIELD(vc, sc->fd), &rsr_seq_num_tail); \
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_rsr_seq_num_tail_get"); \
- _pkt->seq_num_tail = *rsr_seq_num_tail; \
- \
- int *rsr_seq_num_tail_last_sent; \
- ibcom_errno = MPID_nem_ib_com_rsr_seq_num_tail_last_sent_get(VC_FIELD(vc, sc->fd), &rsr_seq_num_tail_last_sent); \
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_rsr_seq_num_tail_last_sent_get"); \
- *rsr_seq_num_tail_last_sent = *rsr_seq_num_tail; \
+ _pkt->seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail; \
+ vc_ib->ibcom->rsr_seq_num_tail_last_sent = vc_ib->ibcom->rsr_seq_num_tail; \
\
mpi_errno = MPIDI_CH3_iStartMsg((vc), _pkt, sizeof(*_pkt), &_req); \
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_reply_seq_num"); \
if (_req != NULL) { \
MPIU_ERR_CHKANDJUMP(_req->status.MPI_ERROR, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_reply_seq_num"); \
MPID_Request_release(_req); \
+ dprintf("send_reply_seq_num,release,req=%p\n", _req); \
} \
} while (0)
@@ -533,24 +776,20 @@ int pkt_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * bu
if (_req != NULL) { \
MPIU_ERR_CHKANDJUMP(_req->status.MPI_ERROR, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_change_rdmabuf_occupancy_notify_state"); \
MPID_Request_release(_req); \
+ dprintf("send_change_...,release,req=%p\n", _req); \
} \
} while (0)
#define MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, lsr_seq_num_tail) \
do { \
- int lsr_seq_num_head; \
- /* sequence number of (largest) in-flight send command */ \
- ibcom_errno = MPID_nem_ib_com_sseq_num_get(vc_ib->sc->fd, &lsr_seq_num_head); \
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_sseq_num_get"); \
- \
int *rdmabuf_occupancy_notify_rstate; \
ibcom_errno = MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get(vc_ib->sc->fd, &rdmabuf_occupancy_notify_rstate); \
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get"); \
\
- /*dprintf("notify_policy_lw,head=%d,tail=%d,lw=%d\n", lsr_seq_num_head, *lsr_seq_num_tail, MPID_NEM_IB_COM_RDMABUF_LOW_WATER_MARK);*/ \
+ /*dprintf("notify_policy_lw,head=%d,tail=%d,lw=%d\n", vc_ib->ibcom->sseq_num, *lsr_seq_num_tail, MPID_NEM_IB_COM_RDMABUF_LOW_WATER_MARK);*/ \
/* if the number of occupied slot of RDMA-write-to buffer have got below the low water-mark */ \
if (*rdmabuf_occupancy_notify_rstate == MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_HW && \
- MPID_nem_ib_diff32(lsr_seq_num_head, *lsr_seq_num_tail) < MPID_NEM_IB_COM_RDMABUF_LOW_WATER_MARK) { \
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, *lsr_seq_num_tail) < MPID_NEM_IB_COM_RDMABUF_LOW_WATER_MARK) { \
dprintf("changing notify_rstate\n"); \
/* remember remote notifying policy so that local can know when to change remote policy back to HW */ \
*rdmabuf_occupancy_notify_rstate = MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_LW; \
@@ -579,11 +818,22 @@ int pkt_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * bu
{ \
MPIU_ERR_CHKANDJUMP(_done_req->status.MPI_ERROR, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_lmt_send_GET_DONE"); \
MPID_Request_release(_done_req); \
+ dprintf("send_get_done,release,req=%p\n", _done_req); \
} \
} while (0)
-#define MPID_NEM_IB_MAX(a, b) ((a) > (b) ? (a) : (b))
-
+/* Allocator for packing buffer for non-contiguous data
+ - Allocate performs dequeue
+ - Slow to "malloc" (two load and one store instructions)
+   - Free performs enqueue
+ - Slow to "free" (one load and two store instructions)
+ - Refill allocates a single slot
+ - Slow when first-time allocs occur
+ - Free list is linked lists and prepared for 2^n sizes.
+   - Fast to find an empty slot (one load instruction)
+ - Use mmap and munmap for requests of larger than or
+ equal to 4KB buffers
+ - No unused slots for large requests */
static inline void *MPID_nem_ib_stmalloc(size_t _sz)
{
size_t sz = _sz;
@@ -639,4 +889,5 @@ static inline void MPID_nem_ib_stfree(void *ptr, size_t sz)
MPID_nem_ib_fl[ndx] = ptr;
fn_exit:;
}
+
#endif /* IB_IMPL_H_INCLUDED */
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
index 5c340f4..622b8fd 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
@@ -68,20 +68,27 @@ void *MPID_nem_ib_fl[18];
int MPID_nem_ib_nranks;
MPID_nem_ib_conn_ud_t *MPID_nem_ib_conn_ud;
MPID_nem_ib_conn_t *MPID_nem_ib_conns;
-MPIDI_VC_t **MPID_nem_ib_pollingset;
+//MPIDI_VC_t **MPID_nem_ib_pollingset;
int MPID_nem_ib_conn_ud_fd;
MPID_nem_ib_com_t *MPID_nem_ib_conn_ud_MPID_nem_ib_com;
-int MPID_nem_ib_npollingset;
+//int MPID_nem_ib_npollingset;
int *MPID_nem_ib_scratch_pad_fds;
+MPID_nem_ib_com_t **MPID_nem_ib_scratch_pad_ibcoms;
//char *MPID_nem_ib_recv_buf;
int MPID_nem_ib_myrank;
uint64_t MPID_nem_ib_tsc_poll;
int MPID_nem_ib_ncqe;
-int MPID_nem_ib_ncqe_lmt_put;
+uint64_t MPID_nem_ib_progress_engine_vt;
+uint16_t MPID_nem_ib_remote_poll_shared;
#ifdef MPID_NEM_IB_ONDEMAND
-MPID_nem_ib_cm_map_t MPID_nem_ib_cm_state;
-int MPID_nem_ib_ncqe_connect;
+uint16_t MPID_nem_ib_cm_ringbuf_head;
+uint16_t MPID_nem_ib_cm_ringbuf_tail;
+uint64_t MPID_nem_ib_cm_ringbuf_released[(MPID_NEM_IB_CM_NSEG + 63) / 64];
+MPID_nem_ib_cm_sendq_t MPID_nem_ib_cm_sendq = { NULL, NULL };
+int MPID_nem_ib_ncqe_scratch_pad_to_drain;
#endif
+MPID_nem_ib_ringbuf_sendq_t MPID_nem_ib_ringbuf_sendq = { NULL, NULL };
+
int MPID_nem_ib_ncqe_scratch_pad;
int MPID_nem_ib_ncqe_to_drain;
int MPID_nem_ib_ncqe_nces;
@@ -93,6 +100,9 @@ static uint32_t MPID_nem_ib_rand_next = 1;
MPID_nem_ib_vc_area *MPID_nem_ib_debug_current_vc_ib;
static int listen_fd;
static int listen_port;
+uint64_t MPID_nem_ib_ringbuf_acquired[(MPID_NEM_IB_NRINGBUF + 63) / 64];
+uint64_t MPID_nem_ib_ringbuf_allocated[(MPID_NEM_IB_NRINGBUF + 63) / 64];
+MPID_nem_ib_ringbuf_t *MPID_nem_ib_ringbuf;
uint8_t MPID_nem_ib_rand()
{
@@ -200,7 +210,7 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
int i, j, k;
int ib_port = 1;
- MPIU_CHKPMEM_DECL(7);
+ MPIU_CHKPMEM_DECL(6);
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_INIT);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_INIT);
@@ -211,102 +221,18 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
MPID_nem_ib_myrank = pg_rank;
MPID_nem_ib_tsc_poll = MPID_nem_ib_rdtsc();
MPID_nem_ib_ncqe = 0;
- MPID_nem_ib_ncqe_lmt_put = 0;
-#ifdef MPID_NEM_IB_ONDEMAND
- MPID_nem_ib_ncqe_connect = 0;
-#endif
- MPID_nem_ib_ncqe_scratch_pad = 0;
MPID_nem_ib_ncqe_to_drain = 0;
+ MPID_nem_ib_ncqe_lmt_put = 0;
MPID_nem_ib_ncqe_nces = 0;
- MPID_nem_ib_npollingset = 0;
-
+ MPID_nem_ib_ncqe_scratch_pad = 0;
+ MPID_nem_ib_ncqe_scratch_pad_to_drain = 0;
+ // MPID_nem_ib_npollingset = 0;
+ MPID_nem_ib_progress_engine_vt = 0;
+ MPID_nem_ib_remote_poll_shared = 0;
#ifdef MPID_NEM_IB_ONDEMAND
- /* prepare UD QPN for dynamic connection */
- ibcom_errno = MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_UD, &MPID_nem_ib_conn_ud_fd);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
- ibcom_errno =
- MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conn_ud_fd,
- &MPID_nem_ib_conn_ud_MPID_nem_ib_com);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
- ibcom_errno = MPID_nem_ib_com_rts(MPID_nem_ib_conn_ud_fd, 0, 0, 0);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_rts");
-
- for (i = 0; i < MPID_NEM_IB_COM_MAX_RQ_CAPACITY; i++) {
- ibcom_errno = MPID_nem_ib_com_udrecv(MPID_nem_ib_conn_ud_fd);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_udrecv");
- }
-
- /* obtain gid, lid, qpn using KVS */
- MPIU_CHKPMEM_MALLOC(MPID_nem_ib_conn_ud, MPID_nem_ib_conn_ud_t *,
- MPID_nem_ib_nranks * sizeof(MPID_nem_ib_conn_ud_t), mpi_errno,
- "ud connection table");
- memset(MPID_nem_ib_conn_ud, 0, MPID_nem_ib_nranks * sizeof(MPID_nem_ib_conn_ud_t));
-
- /* put bc/<my rank>/dcs/gid:lid:qpn */
- uint32_t my_qpnum;
- uint16_t my_lid;
- union ibv_gid my_gid;
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conn_ud_fd, MPID_NEM_IB_COM_INFOKEY_QP_QPN, &my_qpnum,
- sizeof(uint32_t));
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conn_ud_fd, MPID_NEM_IB_COM_INFOKEY_PORT_LID, &my_lid,
- sizeof(uint16_t));
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conn_ud_fd, MPID_NEM_IB_COM_INFOKEY_PORT_GID, &my_gid,
- sizeof(union ibv_gid));
-
- char *kvs_name;
- mpi_errno = MPIDI_PG_GetConnKVSname(&kvs_name);
- char *key_dcs, val[2 * sizeof(union ibv_gid) + 1 + 4 + 1 + 8 + 1], str[9];
-
- /* count maximum length of the string representation of remote_rank */
- for (i = 0, nranks = MPID_nem_ib_nranks; nranks > 0; nranks /= 10, i++) {
- }
- MPIU_CHKPMEM_MALLOC(key_dcs, char *, strlen("bc/") + i + strlen("/dcs/gid_lid_qpn") + 1,
- mpi_errno, "connection table");
-
- sprintf(key, "bc/%d/dcs/gid_lid_qpn", MPID_nem_ib_myrank);
- val[0] = 0;
- for (j = 0; j < sizeof(union ibv_gid); j++) {
- sprintf(str, "%02x", my_gid.raw[j]);
- strcat(val, str);
- }
- sprintf(str, ":");
- strcat(val, str);
- sprintf(str, "%04x:", my_lid);
- strcat(val, str);
- sprintf(str, "%08x", my_qpnum);
- strcat(val, str);
- dprintf("rank=%d,PMI_KVS_Put(%s, %s, %s)\n", MPID_nem_ib_myrank, kvs_name, key_dcs, val);
- pmi_errno = PMI_KVS_Put(kvs_name, key_dcs, val);
- MPIU_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**PMI_KVS_Put");
-
- /* wait for key-value to propagate among all ranks */
- pmi_errno = PMI_Barrier();
- MPIU_ERR_CHKANDJUMP(pmi_errno != PMI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**PMI_Barrier");
-
- /* obtain GID, LID, QP number for remote UD QP for dynamic connection */
- for (i = 0; i < MPID_nem_ib_nranks; i++) {
- if (i != MPID_nem_ib_myrank) {
- sprintf(key_dcs, "bc/%d/dcs/gid_lid_qpn", i);
- pmi_errno = PMI_KVS_Get(kvs_name, key_dcs, val, 256);
- dprintf("pmi_errno=%d\n", pmi_errno);
- MPIU_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**PMI_KVS_Get");
- dprintf("rank=%d,obtained val=%s\n", MPID_nem_ib_myrank, val);
- char *strp = val;
- for (j = 0; j < sizeof(union ibv_gid); j++) {
- memcpy(str, strp, 2);
- str[2] = 0;
- MPID_nem_ib_conn_ud[i].gid.raw[j] = strtol(str, NULL, 16);
- strp += 2;
- }
- sscanf(strp, ":%04x:%08x", &MPID_nem_ib_conn_ud[i].lid, &MPID_nem_ib_conn_ud[i].qpn);
-
- dprintf("remote rank=%d,gid=", i);
- for (j = 0; j < sizeof(union ibv_gid); j++) {
- dprintf("%02x", MPID_nem_ib_conn_ud[i].gid.raw[j]);
- }
- dprintf(",lid=%04x,qpn=%08x\n", MPID_nem_ib_conn_ud[i].lid, MPID_nem_ib_conn_ud[i].qpn);
- }
- }
+ MPID_nem_ib_cm_ringbuf_head = 0;
+ MPID_nem_ib_cm_ringbuf_tail = -1; /* it means slot 0 is not acquired */
+ memset(MPID_nem_ib_cm_ringbuf_released, 0, (MPID_NEM_IB_CM_NSEG + 63) / 64);
#endif
/* malloc scratch-pad fd */
@@ -314,27 +240,54 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
mpi_errno, "connection table");
memset(MPID_nem_ib_scratch_pad_fds, 0, MPID_nem_ib_nranks * sizeof(int));
+ MPIU_CHKPMEM_MALLOC(MPID_nem_ib_scratch_pad_ibcoms, MPID_nem_ib_com_t **,
+ MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_t *),
+ mpi_errno, "connection table");
+ memset(MPID_nem_ib_scratch_pad_ibcoms, 0, MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_t*));
+
/* prepare scrath-pad QP and malloc scratch-pad */
for (i = 0; i < MPID_nem_ib_nranks; i++) {
+ MPID_nem_ib_scratch_pad_fds_ref_count++;
ibcom_errno =
MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_SCRATCH_PAD,
&MPID_nem_ib_scratch_pad_fds[i]);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
ibcom_errno =
+ MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[i],
+ &MPID_nem_ib_scratch_pad_ibcoms[i]);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_com_obtain_pointer");
+
+
+ ibcom_errno =
MPID_nem_ib_com_alloc(MPID_nem_ib_scratch_pad_fds[i],
- MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_qp_state_t));
+#ifdef MPID_NEM_IB_ONDEMAND
+ MPID_NEM_IB_CM_OFF_CMD +
+ MPID_NEM_IB_CM_NSEG * sizeof(MPID_nem_ib_cm_cmd_t) +
+ sizeof(MPID_nem_ib_ringbuf_headtail_t)
+#else
+ MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_qp_state_t)
+#endif
+);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_alloc");
}
+#ifdef MPID_NEM_IB_ONDEMAND
+ /* Release CAS word */
+ *((uint64_t *) MPID_nem_ib_scratch_pad) = MPID_NEM_IB_CM_RELEASED;
+#endif
+ /* Initialize head and tail pointer of shared ring buffer */
+ MPID_nem_ib_ringbuf_headtail_t * headtail =
+ (MPID_nem_ib_ringbuf_headtail_t *) ((uint8_t*)MPID_nem_ib_scratch_pad + MPID_NEM_IB_RINGBUF_OFF_HEAD);
+ headtail->head = 0;
+ headtail->tail = -1;
/* put bc/me/sp/{gid,lid} put bc/me/sp/{qpn,rmem,rkey}/you */
int nranks;
-#ifndef MPID_NEM_IB_ONDEMAND
uint32_t my_qpnum;
uint16_t my_lid;
union ibv_gid my_gid;
-#endif
void *my_rmem;
int my_rkey;
@@ -524,29 +477,19 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
MPID_nem_ib_nranks * sizeof(MPID_nem_ib_conn_t), mpi_errno,
"connection table");
memset(MPID_nem_ib_conns, 0, MPID_nem_ib_nranks * sizeof(MPID_nem_ib_conn_t));
-
+#if 0
MPIU_CHKPMEM_MALLOC(MPID_nem_ib_pollingset, MPIDI_VC_t **,
MPID_NEM_IB_MAX_POLLINGSET * sizeof(MPIDI_VC_t *), mpi_errno,
"connection table");
memset(MPID_nem_ib_pollingset, 0, MPID_NEM_IB_MAX_POLLINGSET * sizeof(MPIDI_VC_t *));
-
+#endif
+#ifndef MPID_NEM_IB_ONDEMAND
/* prepare eager-send QP */
for (i = 0; i < MPID_nem_ib_nranks; i++) {
ibcom_errno =
MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC, &MPID_nem_ib_conns[i].fd);
- dprintf("init,fd=%d\n", MPID_nem_ib_conns[i].fd);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
- }
-
-#if 0
- for (i = 0; i < MPID_nem_ib_nranks; i++) {
- ibcom_errno =
- MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC_LMT_PUT,
- &MPID_nem_ib_conns[i].fd_lmt_put);
- dprintf("init,fd_lmt_put=%d\n", MPID_nem_ib_conns[i].fd_lmt_put);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
}
-#endif
/* put bc/me/{gid,lid}, put bc/me/{qpn,rmem,rkey}/you */
mpi_errno = MPID_nem_ib_announce_network_addr(pg_rank, bc_val_p, val_max_sz_p);
@@ -609,17 +552,10 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_rts");
/* report me-to-you eager-send QP becomes RTR */
- MPID_nem_ib_com_t *MPID_nem_ib_com_scratch_pad;
- ibcom_errno =
- MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[i],
- &MPID_nem_ib_com_scratch_pad);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_obtain_pointer");
-
MPID_nem_ib_com_qp_state_t state = {.state = MPID_NEM_IB_COM_QP_STATE_RTR };
ibcom_errno =
MPID_nem_ib_com_put_scratch_pad(MPID_nem_ib_scratch_pad_fds[i],
- (uint64_t) MPID_nem_ib_com_scratch_pad,
+ (uint64_t) MPID_nem_ib_scratch_pad_ibcoms[i],
sizeof(MPID_nem_ib_com_qp_state_t) *
MPID_nem_ib_myrank,
sizeof(MPID_nem_ib_com_qp_state_t),
@@ -634,23 +570,6 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
"**MPID_nem_ib_com_reg_mr_connect");
dprintf("ib_init,after mr_connect for me-to-you eager-send QP\n");
-#if 0
- /* CQ, SQ, SCQ for lmt-put */
- strcpy(key_str, MPID_NEM_IB_QPN_KEY);
- strcat(key_str, "lmt-put"); /* "" or "lmt-put" */
- sprintf(remote_rank_str, "/%x", MPID_nem_ib_myrank);
- strcat(key_str, remote_rank_str);
- mpi_errno =
- MPID_nem_ib_kvs_get_binary(i, key_str, (char *) &remote_qpnum, sizeof(uint32_t));
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
-
- ibcom_errno =
- MPID_nem_ib_com_rts(MPID_nem_ib_conns[i].fd_lmt_put, remote_qpnum, remote_lid,
- &remote_gid);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_rts");
-#endif
}
}
@@ -659,7 +578,7 @@ int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_m
dprintf("init,fd[%d]=%d\n", i, MPID_nem_ib_conns[i].fd);
}
#endif
-
+#endif
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_INIT);
@@ -783,24 +702,6 @@ static int MPID_nem_ib_announce_network_addr(int my_rank, char **bc_val_p, int *
sizeof(uint32_t));
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_kvs_put_binary");
-#if 0
- /* lmt-put */
- strcpy(key_str, MPID_NEM_IB_QPN_KEY);
- strcat(key_str, "lmt-put");
- sprintf(remote_rank_str, "/%x", i);
- strcat(key_str, remote_rank_str);
- ibcom_errno =
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conns[i].fd_lmt_put,
- MPID_NEM_IB_COM_INFOKEY_QP_QPN, &my_qpnum,
- sizeof(uint32_t));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_get_info_conn");
-
- mpi_errno =
- MPID_nem_ib_kvs_put_binary(MPID_nem_ib_myrank, key_str, (uint8_t *) & my_qpnum,
- sizeof(uint32_t));
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_kvs_put_binary");
-#endif
strcpy(key_str, MPID_NEM_IB_RMEM_KEY);
sprintf(remote_rank_str, "/%x", i);
@@ -832,6 +733,7 @@ static int MPID_nem_ib_announce_network_addr(int my_rank, char **bc_val_p, int *
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_kvs_put_binary");
}
+ MPIU_CHKLMEM_FREEALL();
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_ANNOUNCE_NETWORK_ADDR);
return mpi_errno;
@@ -858,6 +760,42 @@ int MPID_nem_ib_connect_to_root(const char *business_card, MPIDI_VC_t * new_vc)
}
#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_vc_onconnect
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_vc_onconnect(MPIDI_VC_t * vc)
+{
+ int mpi_errno = MPI_SUCCESS;
+ int ibcom_errno;
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_VC_ONCONNECT);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_VC_ONCONNECT);
+
+ /* store pointer to MPID_nem_ib_com */
+ ibcom_errno =
+ MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[vc->pg_rank].fd,
+ &VC_FIELD(vc, ibcom));
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
+
+ ibcom_errno =
+ MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[vc->pg_rank].fd_lmt_put,
+ &VC_FIELD(vc, ibcom_lmt_put));
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
+
+#if 0
+ /* Insert into polling set */
+ MPIU_ERR_CHKANDJUMP(MPID_nem_ib_npollingset + 1 > MPID_NEM_IB_MAX_POLLINGSET, mpi_errno,
+ MPI_ERR_OTHER, "**MPID_nem_ib_npollingset");
+ MPID_nem_ib_pollingset[MPID_nem_ib_npollingset++] = vc;
+ //printf("vc_init,%d->%d,vc=%p,npollingset=%d\n", MPID_nem_ib_myrank, vc->pg_rank, vc, MPID_nem_ib_npollingset);
+#endif
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_VC_ONCONNECT);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+}
+
+#undef FUNCNAME
#define FUNCNAME MPID_nem_ib_vc_init
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
@@ -885,29 +823,19 @@ int MPID_nem_ib_vc_init(MPIDI_VC_t * vc)
vc_ib->sc = &MPID_nem_ib_conns[vc->pg_rank];
- /* store pointer to MPID_nem_ib_com */
- ibcom_errno = MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[vc->pg_rank].fd, &vc_ib->ibcom);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
-
- ibcom_errno =
- MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[vc->pg_rank].fd_lmt_put,
- &vc_ib->ibcom_lmt_put);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
-
- //dprintf("vc_init,open,fd=%d,ptr=%p,rsr_seq_num_poll=%d\n", MPID_nem_ib_conns[vc->pg_rank].fd, vc_ib->MPID_nem_ib_com, vc_ib->ibcom->rsr_seq_num_poll);
-
/* initialize sendq */
vc_ib->sendq.head = NULL;
vc_ib->sendq.tail = NULL;
- vc_ib->sendq_lmt_put.head = NULL;
- vc_ib->sendq_lmt_put.tail = NULL;
+#ifdef MPID_NEM_IB_ONDEMAND
+ VC_FIELD(vc, connection_state) = MPID_NEM_IB_CM_CLOSED;
+ VC_FIELD(vc, connection_guard) = 0;
+#endif
/* rank is sent as wr_id and used to obtain vc in poll */
MPID_nem_ib_conns[vc->pg_rank].vc = vc;
- MPIU_ERR_CHKANDJUMP(MPID_nem_ib_npollingset + 1 > MPID_NEM_IB_MAX_POLLINGSET, mpi_errno,
- MPI_ERR_OTHER, "**MPID_nem_ib_npollingset");
- MPID_nem_ib_pollingset[MPID_nem_ib_npollingset++] = vc;
- //printf("vc_init,%d->%d,vc=%p,npollingset=%d\n", MPID_nem_ib_myrank, vc->pg_rank, vc, MPID_nem_ib_npollingset);
+
+#ifndef MPID_NEM_IB_ONDEMAND
+ MPID_nem_ib_vc_onconnect(vc);
/* wait until you-to-me eager-send QP becomes RTR */
MPID_nem_ib_com_t *MPID_nem_ib_com_scratch_pad;
@@ -942,20 +870,19 @@ int MPID_nem_ib_vc_init(MPIDI_VC_t * vc)
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_irecv");
}
+#endif
MPIDI_CHANGE_VC_STATE(vc, ACTIVE);
-
+#if 0 /* dead code */
uint32_t max_msg_sz;
MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conns[vc->pg_rank].fd,
MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ, &max_msg_sz,
sizeof(max_msg_sz));
- VC_FIELD(vc, pending_sends) = 0;
-#ifdef MPID_NEM_IB_ONDEMAND
- VC_FIELD(vc, is_connected) = 0;
#endif
+ VC_FIELD(vc, pending_sends) = 0;
- MPIU_Assert(sizeof(MPID_nem_ib_sz_hdrmagic_t) == 8); /* assumption in ib_ibcom.h */
- MPIU_Assert(sizeof(MPID_nem_ib_tailmagic_t) == 1); /* assumption in ib_ibcom.h */
+ //MPIU_Assert(sizeof(MPID_nem_ib_netmod_hdr_t) == 8); /* assumption in ib_ibcom.h */
+ MPIU_Assert(sizeof(MPID_nem_ib_netmod_trailer_t) == 1); /* assumption in ib_ibcom.h */
uint32_t sz;
#if 0
@@ -994,6 +921,9 @@ int MPID_nem_ib_vc_init(MPIDI_VC_t * vc)
vc_ch->pkt_handler = MPID_nem_ib_pkt_handler;
vc_ch->num_pkt_handlers = MPIDI_NEM_IB_PKT_NUM_PKT_HANDLERS;
MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_EAGER_SEND] = MPID_nem_ib_PktHandler_EagerSend;
+#if 0 /* modification of mpid_nem_lmt.c is required */
+ MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_LMT_RTS] = MPID_nem_ib_pkt_RTS_handler;
+#endif
MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_PUT] = MPID_nem_ib_PktHandler_Put;
MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_GET] = MPID_nem_ib_PktHandler_Get;
MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_GET_RESP] = MPID_nem_ib_PktHandler_GetResp;
@@ -1021,7 +951,7 @@ int MPID_nem_ib_vc_init(MPIDI_VC_t * vc)
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_VC_INIT);
return mpi_errno;
- fn_fail:
+ //fn_fail:
goto fn_exit;
}
@@ -1045,10 +975,11 @@ int MPID_nem_ib_vc_destroy(MPIDI_VC_t * vc)
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
{
- dprintf("ib_vc_terminate,enter\n");
+ dprintf("ib_vc_terminate,pg_rank=%d\n", vc->pg_rank);
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
int req_errno = MPI_SUCCESS;
+ int i;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_VC_TERMINATE);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_VC_TERMINATE);
@@ -1059,16 +990,16 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
* and control transactions always proceed after receiveing reply */
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- dprintf("init,before,%d->%d,r rdmaocc=%d,l rdmaocc=%d,sendq=%d,ncqe=%d,pending_sends=%d\n",
+ dprintf("vc_terminate,before,%d->%d,diff-rsr=%d,l diff-lsr=%d,sendq_empty=%d,ncqe=%d,pending_sends=%d\n",
MPID_nem_ib_myrank, vc->pg_rank,
- MPID_nem_ib_diff32(vc_ib->ibcom->rsr_seq_num_tail,
+ MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
vc_ib->ibcom->rsr_seq_num_tail_last_sent),
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
MPID_nem_ib_sendq_empty(vc_ib->sendq), MPID_nem_ib_ncqe, VC_FIELD(vc, pending_sends));
/* update remote RDMA-write-to buffer occupancy */
#if 0 /* we can't send it when the other party has closed QP */
- while (MPID_nem_ib_diff32
+ while (MPID_nem_ib_diff16
(vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent) > 0) {
MPID_nem_ib_send_reply_seq_num(vc);
}
@@ -1076,21 +1007,39 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
/* update local RDMA-write-to buffer occupancy */
#if 0
- while (MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail) > 0) {
+ while (MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail) > 0) {
MPID_nem_ib_poll_eager(vc);
}
#endif
- /* drain sendq */
+ /* Empty sendq */
while (!MPID_nem_ib_sendq_empty(vc_ib->sendq)) {
- MPID_nem_ib_send_progress(vc_ib);
+ /* mimic ib_poll because vc_terminate might be called from ib_poll_eager */
+ mpi_errno = MPID_nem_ib_send_progress(vc);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_progress");
+ ibcom_errno = MPID_nem_ib_drain_scq(0);
+#ifdef MPID_NEM_IB_ONDEMAND
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_drain_scq");
+ ibcom_errno = MPID_nem_ib_cm_poll_syn();
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_poll_syn");
+ ibcom_errno = MPID_nem_ib_cm_poll();
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_poll");
+ ibcom_errno = MPID_nem_ib_cm_progress();
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_progress");
+ ibcom_errno = MPID_nem_ib_cm_drain_scq();
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_scq");
+#endif
+ ibcom_errno = MPID_nem_ib_ringbuf_progress();
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_progress");
+
+ MPID_nem_ib_progress_engine_vt += 1; /* Progress virtual time */
}
dprintf("init,middle,%d->%d,r rdmaocc=%d,l rdmaocc=%d,sendq=%d,ncqe=%d,pending_sends=%d\n",
MPID_nem_ib_myrank, vc->pg_rank,
- MPID_nem_ib_diff32(vc_ib->ibcom->rsr_seq_num_tail,
+ MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
vc_ib->ibcom->rsr_seq_num_tail_last_sent),
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
MPID_nem_ib_sendq_empty(vc_ib->sendq), MPID_nem_ib_ncqe, VC_FIELD(vc, pending_sends));
if (MPID_nem_ib_ncqe > 0 || VC_FIELD(vc, pending_sends) > 0) {
@@ -1099,9 +1048,9 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
}
dprintf("init,middle2,%d->%d,r rdmaocc=%d,l rdmaocc=%d,sendq=%d,ncqe=%d,pending_sends=%d\n",
MPID_nem_ib_myrank, vc->pg_rank,
- MPID_nem_ib_diff32(vc_ib->ibcom->rsr_seq_num_tail,
+ MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
vc_ib->ibcom->rsr_seq_num_tail_last_sent),
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
MPID_nem_ib_sendq_empty(vc_ib->sendq), MPID_nem_ib_ncqe, VC_FIELD(vc, pending_sends));
if (MPID_nem_ib_ncqe > 0 || VC_FIELD(vc, pending_sends) > 0) {
@@ -1120,21 +1069,55 @@ int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
dprintf("init,after ,%d->%d,r rdmaocc=%d,l rdmaocc=%d,sendq=%d,ncqe=%d,pending_sends=%d\n",
MPID_nem_ib_myrank, vc->pg_rank,
- MPID_nem_ib_diff32(vc_ib->ibcom->rsr_seq_num_tail,
+ MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
vc_ib->ibcom->rsr_seq_num_tail_last_sent),
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
MPID_nem_ib_sendq_empty(vc_ib->sendq), MPID_nem_ib_ncqe, VC_FIELD(vc, pending_sends));
/* drain scratch-pad scq */
+#ifdef MPID_NEM_IB_ONDEMAND
+ ibcom_errno = MPID_nem_ib_cm_drain_scq();
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_drain_scq");
+ dprintf("init,scratch_pad,ncqe=%d,to_drain=%d\n", MPID_nem_ib_ncqe_scratch_pad, MPID_nem_ib_ncqe_scratch_pad_to_drain);
+ dprintf("init,scratch_pad,ncom_scratch_pad=%d\n", MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->ncom_scratch_pad);
+#else
ibcom_errno = MPID_nem_ib_drain_scq_scratch_pad();
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
"**MPID_nem_ib_drain_scq_scratch_pad");
+#endif
mpi_errno = MPIDI_CH3U_Handle_connection(vc, MPIDI_VC_EVENT_TERMINATED);
if (mpi_errno) {
MPIU_ERR_POP(mpi_errno);
}
+ /* Destroy VC QP */
+ /* Check connection status stored in VC when on-demand connection is used */
+ ibcom_errno = MPID_nem_ib_com_close(vc_ib->sc.fd);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_com_close");
+
+ /* Release scratch-pad */
+ ibcom_errno =
+ MPID_nem_ib_com_free(scratch_pad_fds[vc->pg_rank],
+ MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_qp_state_t));
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_com_free");
+
+ /* Destroy scratch-pad QP */
+ ibcom_errno =
+ MPID_nem_ib_com_close(scratch_pad_fds[vc->pg_rank]);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_com_close");
+
+ /* Destroy array of scratch-pad QPs */
+ MPIU_Assert(MPID_nem_ib_scratch_pad_fds_ref_count > 0);
+ if(--MPID_nem_ib_scratch_pad_fds_ref_count == 0) {
+ MPIU_Free(MPID_nem_ib_scratch_pad_fds);
+ }
+ dprintf("vc_terminate,exit\n");
+
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_VC_TERMINATE);
return mpi_errno;
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
index e5da2f9..95297d4 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
@@ -99,7 +99,7 @@ int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_p
/* prepare magic */
//*((uint32_t*)(write_from_buf + data_sz - sizeof(tailmagic_t))) = MPID_NEM_IB_COM_MAGIC;
-#if 1 /* embed RDMA-write-to buffer occupancy information */
+#if 0 /* moving to packet header */ /* embed RDMA-write-to buffer occupancy information */
dprintf("lmt_initiate_lmt,rsr_seq_num_tail=%d\n", vc_ib->ibcom->rsr_seq_num_tail);
/* embed RDMA-write-to buffer occupancy information */
s_cookie_buf->seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail;
@@ -109,7 +109,7 @@ int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_p
#endif
/* put IB rkey */
- struct ibv_mr *mr = MPID_nem_ib_com_reg_mr_fetch(write_from_buf, data_sz);
+ struct ibv_mr *mr = MPID_nem_ib_com_reg_mr_fetch(write_from_buf, data_sz, 0);
MPIU_ERR_CHKANDJUMP(!mr, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
#ifdef HAVE_LIBDCFA
s_cookie_buf->addr = (void *) mr->host_addr;
@@ -159,6 +159,7 @@ int MPID_nem_ib_lmt_start_recv_core(struct MPID_Request *req, void *raddr, uint3
req, req->ch.lmt_data_sz, write_to_buf, REQ_FIELD(req, lmt_pack_buf), req->dev.user_buf,
raddr, rkey, write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t),
*((uint8_t *) (write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t))));
+ //fflush(stdout);
#ifdef MPID_NEM_IB_LMT_GET_CQE
MPID_nem_ib_ncqe_to_drain += 1; /* use CQE instead of polling */
@@ -270,12 +271,13 @@ int MPID_nem_ib_lmt_start_recv(struct MPIDI_VC *vc, struct MPID_Request *req, MP
MPID_nem_ib_sendq_enqueue(&vc_ib->sendq, req);
}
+#if 0 /* moving to packet header */
/* extract embeded RDMA-write-to buffer occupancy information */
dprintf("lmt_start_recv,old lsr_seq_num=%d,s_cookie_buf->seq_num_tail=%d\n",
vc_ib->ibcom->lsr_seq_num_tail, s_cookie_buf->seq_num_tail);
- vc_ib->ibcom->lsr_seq_num_tail =
- MPID_NEM_IB_MAX(vc_ib->ibcom->lsr_seq_num_tail, s_cookie_buf->seq_num_tail);
+ vc_ib->ibcom->lsr_seq_num_tail = s_cookie_buf->seq_num_tail;
//dprintf("lmt_start_recv,new lsr_seq_num=%d\n", vc_ib->ibcom->lsr_seq_num_tail);
+#endif
#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
/* change remote notification policy of RDMA-write-to buf */
@@ -283,20 +285,20 @@ int MPID_nem_ib_lmt_start_recv(struct MPIDI_VC *vc, struct MPID_Request *req, MP
MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, &vc_ib->ibcom->lsr_seq_num_tail);
//dprintf("lmt_start_recv,reply_seq_num,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
#endif
- //dprintf("lmt_start_recv,reply_seq_num,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
+ //dprintf("lmt_start_recv,reply_seq_num,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
/* try to send from sendq because at least one RDMA-write-to buffer has been released */
//dprintf("lmt_start_recv,reply_seq_num,send_progress\n");
if (!MPID_nem_ib_sendq_empty(vc_ib->sendq)) {
dprintf("lmt_start_recv,ncom=%d,ncqe=%d,diff=%d\n",
vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num,
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
vc_ib->ibcom->lsr_seq_num_tail) < MPID_NEM_IB_COM_RDMABUF_NSEG);
}
if (!MPID_nem_ib_sendq_empty(vc_ib->sendq) && MPID_nem_ib_sendq_ready_to_send_head(vc_ib)) {
dprintf("lmt_start_recv,send_progress\n");
fflush(stdout);
- MPID_nem_ib_send_progress(vc_ib);
+ MPID_nem_ib_send_progress(vc);
}
fn_exit:
@@ -440,6 +442,7 @@ int MPID_nem_ib_lmt_done_send(struct MPIDI_VC *vc, struct MPID_Request *req)
"**MPID_nem_ib_lmt_done_send");
dprintf("lmt_done_send,1,req=%p,pcc=%d\n", req, MPIDI_CH3I_progress_completion_count.v);
MPIDI_CH3U_Request_complete(req);
+ dprintf("lmt_done_send,complete,req=%p\n", req);
dprintf("lmt_done_send,2,req=%p,pcc=%d\n", req, MPIDI_CH3I_progress_completion_count.v);
//dprintf("lmt_done_send, mark completion on sreq\n");
@@ -498,6 +501,7 @@ int MPID_nem_ib_lmt_done_recv(struct MPIDI_VC *vc, struct MPID_Request *rreq)
dprintf("lmt_done_recv,1,req=%p,pcc=%d\n", rreq, MPIDI_CH3I_progress_completion_count.v);
MPIDI_CH3U_Request_complete(rreq);
+ dprintf("lmt_done_recv,complete,req=%p\n", rreq);
dprintf("lmt_done_recv,2,pcc=%d\n", MPIDI_CH3I_progress_completion_count.v);
fn_exit:
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
index de1905c..15a5596 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
@@ -20,21 +20,30 @@
static int entered_drain_scq = 0;
#if 0
-#define MPID_NEM_IB_SEND_PROGRESS_POLLINGSET MPID_nem_ib_send_progress(vc_ib);
+#define MPID_NEM_IB_SEND_PROGRESS_POLLINGSET MPID_nem_ib_send_progress(vc);
#else
#define MPID_NEM_IB_SEND_PROGRESS_POLLINGSET { \
- int n; \
- for(n = 0; n < MPID_nem_ib_npollingset; n++) { \
- MPIDI_VC_t *vc_n = MPID_nem_ib_pollingset[n]; \
- /*MPID_nem_ib_debug_current_vc_ib = vc_ib;*/ \
- MPID_nem_ib_send_progress(VC_IB(vc_n)); \
- } \
+ int n; \
+ for (n = 0; n < MPID_NEM_IB_NRINGBUF; n++) { \
+ if (((MPID_nem_ib_ringbuf_allocated[n / 64] >> (n & 63)) & 1) == 0) { \
+ continue; \
+ } \
+ mpi_errno = MPID_nem_ib_poll_eager(&MPID_nem_ib_ringbuf[n]); \
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_poll_eager"); \
+ } \
}
+// int n; \
+// for(n = 0; n < MPID_nem_ib_npollingset; n++) { \
+// MPIDI_VC_t *vc_n = MPID_nem_ib_pollingset[n]; \
+// /*MPID_nem_ib_debug_current_vc_ib = vc_ib;*/ \
+// MPID_nem_ib_send_progress(vc_n); \
+// } \
+
#endif
#if 1
#define MPID_NEM_IB_CHECK_AND_SEND_PROGRESS \
if (!MPID_nem_ib_sendq_empty(vc_ib->sendq) && MPID_nem_ib_sendq_ready_to_send_head(vc_ib)) { \
- MPID_nem_ib_send_progress(vc_ib); \
+ MPID_nem_ib_send_progress(vc); \
}
#else
#define MPID_NEM_IB_CHECK_AND_SEND_PROGRESS MPID_NEM_IB_SEND_PROGRESS_POLLINGSET
@@ -62,6 +71,14 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
}
entered_drain_scq = 1;
+#ifdef MPID_NEM_IB_ONDEMAND
+ /* nobody created QP */
+ if(!MPID_nem_ib_rc_shared_scq) {
+ dprintf("drain_scq,CQ is null\n");
+ goto fn_exit;
+ }
+#endif
+
#if 0 /*def HAVE_LIBDCFA */
result = ibv_poll_cq(MPID_nem_ib_rc_shared_scq, 1, &cqe[0]);
#else
@@ -72,7 +89,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
MPIU_ERR_CHKANDJUMP(result < 0, mpi_errno, MPI_ERR_OTHER, "**netmod,ib,ibv_poll_cq");
if (result > 0) {
- dprintf("poll,scq,result=%d\n", result);
+ dprintf("drain_scq,result=%d\n", result);
}
for (i = 0; i < result; i++) {
dprintf("drain_scq,i=%d\n", i);
@@ -97,21 +114,22 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
#ifdef HAVE_LIBDCFA
if (cqe[i].status != IBV_WC_SUCCESS) {
- dprintf("drain_scq,kind=%d,req_type=%d,msg_type=%d,cqe.status=%08x\n", kind, req_type,
+ printf("drain_scq,kind=%d,req_type=%d,msg_type=%d,cqe.status=%08x\n", kind, req_type,
msg_type, cqe[i].status);
}
#else
if (cqe[i].status != IBV_WC_SUCCESS) {
- dprintf("drain_scq,kind=%d,req_type=%d,msg_type=%d,comm=%p,cqe.status=%08x,%s\n", kind,
- req_type, msg_type, req->comm, cqe[i].status, ibv_wc_status_str(cqe[i].status));
+ printf("drain_scq,kind=%d,req_type=%d,msg_type=%d,comm=%p,cqe.status=%08x,%s,sseq_num=%d\n", kind,
+ req_type, msg_type, req->comm, cqe[i].status, ibv_wc_status_str(cqe[i].status),
+ VC_FIELD(req->ch.vc, ibcom->sseq_num));
}
#endif
- MPIU_ERR_CHKANDJUMP(cqe[i].status != IBV_WC_SUCCESS, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_drain_scq");
+ MPID_NEM_IB_ERR_FATAL(cqe[i].status != IBV_WC_SUCCESS, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_drain_scq");
/*
* packets generated by MPIDI_CH3_iStartMsgv has req_type of RECV
- * lmt_initiate_lmt, lmt_put_cts_to_sender, lmt_put_rts_to_receiver, lmt_send_put_done
+ * lmt_initiate_lmt, lmt_send_put_done
*/
if (
//req_type == MPIDI_REQUEST_TYPE_SEND
@@ -122,7 +140,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
req->comm, cqe[i].opcode);
MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
- dprintf("drain_scq,MPIDI_REQUEST_EAGER_MSG,%d->%d,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_myrank, req->ch.vc->pg_rank, MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail)); /* moved before MPID_Request_release because this references req->ch.vc */
+ dprintf("drain_scq,MPIDI_REQUEST_EAGER_MSG,%d->%d,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_myrank, req->ch.vc->pg_rank, MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail)); /* moved before MPID_Request_release because this references req->ch.vc */
/* free temporal buffer for eager-send non-contiguous data.
* MPIDI_Request_create_sreq (in mpid_isend.c) sets req->dev.datatype
@@ -149,6 +167,9 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
/* decrement the number of entries in IB command queue */
vc_ib->ibcom->ncom -= 1;
MPID_nem_ib_ncqe -= 1;
+ MPID_nem_ib_rdmawr_from_free(REQ_FIELD(req, buf_from), REQ_FIELD(req, buf_from_sz));
+ dprintf("drain_scq,afree=%p,sz=%d\n", REQ_FIELD(req, buf_from), REQ_FIELD(req, buf_from_sz));
+
dprintf("drain_scq,eager-send,ncqe=%d\n", MPID_nem_ib_ncqe);
MPIU_Assert(req->ref_count == 1 || req->ref_count == 2);
@@ -166,9 +187,10 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
reqFn = req->dev.OnDataAvail;
if (!reqFn) {
MPIDI_CH3U_Request_complete(req);
+ dprintf("drain_scq,complete,req=%p\n", req);
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
- dprintf("drain_scq,complete,req=%p,pcc incremented to %d\n", req,
- MPIDI_CH3I_progress_completion_count.v);
+ //dprintf("drain_scq,complete,req=%p,pcc incremented to %d\n", req,
+ //MPIDI_CH3I_progress_completion_count.v);
}
else {
dprintf("drain_scq,reqFn isn't zero\n");
@@ -183,6 +205,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
}
else {
MPID_Request_release(req);
+ dprintf("drain_scq,relese,req=%p\n", req);
}
/* try to send from sendq */
//dprintf("ib_poll,SCQ,!lmt,send_progress\n");
@@ -190,9 +213,9 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
dprintf("drain_scq,eager-send,ncom=%d,ncqe=%d,diff=%d\n",
vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num,
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
vc_ib->ibcom->lsr_seq_num_tail) <
- MPID_NEM_IB_COM_RDMABUF_NSEG);
+ vc_ib->ibcom->local_ringbuf_nslot);
MPID_Request *sreq = MPID_nem_ib_sendq_head(vc_ib->sendq);
int msg_type_sreq = MPIDI_Request_get_msg_type(sreq);
@@ -224,11 +247,13 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
req->comm, cqe[i].opcode);
MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
- dprintf("drain_scq,MPIDI_REQUEST_EAGER_MSG,%d->%d,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_myrank, req->ch.vc->pg_rank, MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail)); /* moved before MPID_Request_release because this references req->ch.vc */
+ dprintf("drain_scq,MPIDI_REQUEST_EAGER_MSG,%d->%d,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_myrank, req->ch.vc->pg_rank, MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail)); /* moved before MPID_Request_release because this references req->ch.vc */
/* decrement the number of entries in IB command queue */
vc_ib->ibcom->ncom -= 1;
MPID_nem_ib_ncqe -= 1;
+ MPID_nem_ib_rdmawr_from_free(REQ_FIELD(req, buf_from) , REQ_FIELD(req, buf_from_sz));
+
dprintf("drain_scq,GET_RESP,ncqe=%d\n", MPID_nem_ib_ncqe);
MPIU_Assert(req->ref_count == 1 || req->ref_count == 2);
@@ -245,9 +270,10 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
reqFn = req->dev.OnDataAvail;
if (!reqFn) {
MPIDI_CH3U_Request_complete(req);
+ dprintf("drain_scq,complete,req=%p\n", req);
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
- dprintf("drain_scq,complete,req=%p,pcc incremented to %d\n", req,
- MPIDI_CH3I_progress_completion_count.v);
+ //dprintf("drain_scq,complete,req=%p,pcc incremented to %d\n", req,
+ //MPIDI_CH3I_progress_completion_count.v);
}
else {
dprintf("drain_scq,reqFn isn't zero\n");
@@ -319,7 +345,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
#endif
/* unmark "lmt is going on" */
- //dprintf("ib_poll,SCQ,lmt,%d->%d,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_myrank, req->ch.vc->pg_rank, MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail)); /* moved before MPID_Request_release because this references req->ch.vc */
+ //dprintf("ib_poll,SCQ,lmt,%d->%d,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_myrank, req->ch.vc->pg_rank, MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail)); /* moved before MPID_Request_release because this references req->ch.vc */
/* decrement the number of entries in IB command queue */
vc_ib->ibcom->ncom -= 1;
@@ -330,6 +356,7 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
dprintf("drain_scq,GET_CQE,Request_complete\n");
/* mark completion on rreq */
MPIDI_CH3U_Request_complete(req);
+ dprintf("drain_scq,complete,req=%p\n", req);
#else /* GET, and !GET_CQE */
int is_contig;
@@ -350,15 +377,16 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
/* lmt_start_recv increments ref_count
* drain_scq and ib_poll is not ordered, so both can decrement ref_count */
MPID_Request_release(req);
+ dprintf("drain_scq,relese,req=%p\n", req);
#endif
/* try to send from sendq */
if (!MPID_nem_ib_sendq_empty(vc_ib->sendq)) {
dprintf("drain_scq,GET,ncom=%d,ncqe=%d,diff=%d\n",
vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num,
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
vc_ib->ibcom->lsr_seq_num_tail) <
- MPID_NEM_IB_COM_RDMABUF_NSEG);
+ vc_ib->ibcom->local_ringbuf_nslot);
MPID_Request *sreq = MPID_nem_ib_sendq_head(vc_ib->sendq);
int msg_type_sreq = MPIDI_Request_get_msg_type(sreq);
@@ -370,8 +398,6 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
}
}
//if (!MPID_nem_ib_sendq_empty(vc_ib->sendq) && MPID_nem_ib_sendq_ready_to_send_head(vc_ib)) {
- dprintf("drain_scq,GET,send_progress\n");
- fflush(stdout);
//MPID_NEM_IB_SEND_PROGRESS_POLLINGSET
//}
}
@@ -397,118 +423,6 @@ int MPID_nem_ib_drain_scq(int dont_call_progress)
}
#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_drain_scq_lmt_put
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_drain_scq_lmt_put()
-{
-
- int mpi_errno = MPI_SUCCESS;
- int result;
- int i;
- struct ibv_wc cqe[MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN];
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_DRAIN_SCQ_LMT_PUT);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_DRAIN_SCQ_LMT_PUT);
-
-#if 0 /*def HAVE_LIBDCFA */
- result = ibv_poll_cq(MPID_nem_ib_rc_shared_scq_lmt_put, 1, &cqe[0]);
-#else
- result =
- ibv_poll_cq(MPID_nem_ib_rc_shared_scq_lmt_put, MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN,
- &cqe[0]);
-#endif
- MPIU_ERR_CHKANDJUMP(result < 0, mpi_errno, MPI_ERR_OTHER, "**netmod,ib,ibv_poll_cq");
-
- if (result > 0) {
- dprintf("drain_scq_lmt_put,found,result=%d\n", result);
- }
- for (i = 0; i < result; i++) {
-
- MPID_Request *req;
- MPID_Request_kind_t kind;
- int req_type, msg_type;
-
-#ifdef HAVE_LIBDCFA
- if (cqe[i].status != IBV_WC_SUCCESS) {
- dprintf("drain_scq_lmt_put,status=%08x\n", cqe[i].status);
- }
-#else
- if (cqe[i].status != IBV_WC_SUCCESS) {
- dprintf("drain_scq_lmt_put,status=%08x,%s\n", cqe[i].status,
- ibv_wc_status_str(cqe[i].status));
- }
-#endif
- MPIU_ERR_CHKANDJUMP(cqe[i].status != IBV_WC_SUCCESS, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_drain_scq_lmt_put");
-
- /* Obtain sreq */
- req = (MPID_Request *) cqe[i].wr_id;
- dprintf("drain_scq_lmt_put,req=%p,req->ref_count=%d\n", req, req->ref_count);
- MPIU_Assert(req->ref_count > 0);
-
- kind = req->kind;
- req_type = MPIDI_Request_get_type(req);
- msg_type = MPIDI_Request_get_msg_type(req);
-
-
- if (req_type == MPIDI_REQUEST_TYPE_RECV && msg_type == MPIDI_REQUEST_RNDV_MSG) {
- /* lmt-put */
- /* MPIDI_Request_set_type is not performed when
- * MPID_Isend --> FDU_or_AEP --> recv_posted --> ib_poll --> PUTCTS packet-handler */
-
- dprintf("drain_scq_lmt_put,lmt-put found\n");
-
-#if 0 /* moving to just after put */ /*implementing back-to-back put and done */
-#endif
-
- /* decrement the number of entries in IB command queue */
- MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
- vc_ib->ibcom->ncom_lmt_put -= 1;
- MPID_nem_ib_ncqe_lmt_put -= 1;
- dprintf("drain_scq_lmt_put,rndv,ncqe=%d\n", MPID_nem_ib_ncqe_lmt_put); /*suspicious */
- int (*reqFn) (MPIDI_VC_t *, MPID_Request *, int *);
-
- (VC_FIELD(req->ch.vc, pending_sends)) -= 1;
-
- /* as in the template */
- reqFn = req->dev.OnDataAvail;
- if (!reqFn) {
- MPIDI_CH3U_Request_complete(req); /* decrement cc, signal_completion, decrement ref_count, free */
- dprintf("drain_scq,lmt-put,req=%p,cc incremented to %d\n", req,
- MPIDI_CH3I_progress_completion_count.v);
- MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
- }
- else {
- MPIDI_VC_t *vc = req->ch.vc;
- int complete = 0;
- mpi_errno = reqFn(vc, req, &complete);
- if (mpi_errno)
- MPIU_ERR_POP(mpi_errno);
- /* not-completed case is not implemented */
- MPIU_Assert(complete == TRUE);
- MPIU_Assert(0); /* decrement ref_count and free sreq causes problem */
- }
- }
- else {
- dprintf("drain_scq_lmt_put,unknown kind=%d,req_type=%d,msg_type=%d\n", kind, req_type,
- msg_type);
-#if 1 // lazy consulting of completion queue
- MPIU_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_drain_scq_lmt_put");
-#else
- //printf("kind=%d\n", kind);
-#endif
- }
- }
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_DRAIN_SCQ_LMT_PUT);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
#define FUNCNAME MPID_nem_ib_drain_scq_scratch_pad
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
@@ -567,11 +481,13 @@ int MPID_nem_ib_drain_scq_scratch_pad()
#define FUNCNAME MPID_nem_ib_poll_eager
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_poll_eager(MPIDI_VC_t * vc)
+int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t *ringbuf)
{
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
+ struct MPIDI_VC * vc;
+ MPID_nem_ib_vc_area * vc_ib;
int result;
struct ibv_wc cqe[MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN];
uint64_t tscs, tsce;
@@ -581,33 +497,40 @@ int MPID_nem_ib_poll_eager(MPIDI_VC_t * vc)
//MPID_nem_ib_tsc_poll = MPID_nem_ib_rdtsc();
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- //dprintf("ib_poll,ld,rsr_seq_num_poll=%d\n", vc_ib->ibcom->rsr_seq_num_poll);
- volatile void *buf =
- (uint8_t *) vc_ib->ibcom->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO] +
- MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint32_t) vc_ib->ibcom->rsr_seq_num_poll %
- MPID_NEM_IB_COM_RDMABUF_NSEG);
- volatile MPID_nem_ib_sz_hdrmagic_t *sz_hdrmagic = (MPID_nem_ib_sz_hdrmagic_t *) buf;
- if (sz_hdrmagic->magic != MPID_NEM_IB_COM_MAGIC) {
+ uint16_t * remote_poll;
+ switch(ringbuf->type) {
+ case MPID_NEM_IB_RINGBUF_EXCLUSIVE:
+ remote_poll = &VC_FIELD(ringbuf->vc, ibcom->rsr_seq_num_poll);
+ break;
+ case MPID_NEM_IB_RINGBUF_SHARED:
+ remote_poll = &MPID_nem_ib_remote_poll_shared;
+ break;
+ default:
+ printf("unknown ringbuf->type\n");
+ }
+
+ void *buf =
+ (uint8_t *) ringbuf->start +
+ MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t)(*remote_poll % ringbuf->nslot));
+ volatile uint64_t *head_flag = MPID_NEM_IB_NETMOD_HDR_HEAD_FLAG_PTR(buf);
+ if (*head_flag == 0) {
goto fn_exit;
}
- //dprintf("ib_poll_eager,buf=%p,sz=%d\n", buf, sz_hdrmagic->sz);
+ dprintf("ib_poll_eager,remote_poll=%d,buf=%p,sz=%d\n", *remote_poll, buf, MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf));
- /* unmark magic */
- sz_hdrmagic->magic = 0 /*0xdead */ ;
#if 0
ibcom_errno = MPID_nem_ib_com_poll_cq(MPID_NEM_IB_COM_RC_SHARED_RCQ, &cqe, &result);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_poll_cq");
#endif
- dprintf("ib_poll_eager,eager-send,found\n");
+ dprintf("ib_poll_eager,eager-send,found\n");fflush(stdout);
//MPIU_ERR_CHKANDJUMP1(cqe.status != IBV_WC_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_poll_cq", "**MPID_nem_ib_com_poll_cq %s", MPID_nem_ib_com_strerror(ibcom_errno));
- int sz_data_pow2;
- MPID_NEM_IB_SZ_DATA_POW2(sz_hdrmagic->sz);
- volatile MPID_nem_ib_tailmagic_t *tailmagic =
- (MPID_nem_ib_tailmagic_t *) ((uint8_t *) buf + sz_data_pow2);
- dprintf("poll,sz_data_pow2=%d,tailmagic=%p,sz=%d\n", sz_data_pow2, tailmagic, sz_hdrmagic->sz);
+ int off_pow2_aligned;
+ MPID_NEM_IB_OFF_POW2_ALIGNED(MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf));
+ volatile MPID_nem_ib_netmod_trailer_t *netmod_trailer =
+ (MPID_nem_ib_netmod_trailer_t *) ((uint8_t *) buf + off_pow2_aligned);
+ dprintf("poll,off_pow2_aligned=%d,netmod_trailer=%p,sz=%d\n", off_pow2_aligned, netmod_trailer, MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf));
int k = 0;
//tsce = MPID_nem_ib_rdtsc(); printf("9,%ld\n", tsce - tscs); // 55 for 512-byte
//tscs = MPID_nem_ib_rdtsc();
@@ -615,15 +538,16 @@ int MPID_nem_ib_poll_eager(MPIDI_VC_t * vc)
#ifdef MPID_NEM_IB_TLBPREF_POLL
int tlb_pref_ahd = (uint64_t) tailmagic + 4096 * MPID_NEM_IB_TLBPREF_POLL - (uint64_t) buf;
#endif
- while (tailmagic->magic != MPID_NEM_IB_COM_MAGIC) {
+ while (netmod_trailer->tail_flag != MPID_NEM_IB_COM_MAGIC) {
//k++;
#if 0 /* pre-fetch next RDMA-write-buf slot to cover TLB miss latency */
__asm__ __volatile__
("movq %0, %%rsi;"
- "movq 0(%%rsi), %%rsi;"::"r"(vc_ib->ibcom->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO] +
- MPID_NEM_IB_COM_RDMABUF_SZSEG *
- ((vc_ib->ibcom->rsr_seq_num_poll +
- 1) % MPID_NEM_IB_COM_RDMABUF_NSEG)):"%rsi");
+ "movq 0(%%rsi), %%rsi;"
+ :
+ :"r"(ringbuf->start + MPID_NEM_IB_COM_RDMABUF_SZSEG *
+ ((uint16_t)((*remote_poll + 1) % MPID_NEM_IB_COM_RDMABUF_NSEG)))
+ :"%rsi");
#endif
#ifdef MPID_NEM_IB_TLBPREF_POLL
__asm__ __volatile__
@@ -645,43 +569,63 @@ int MPID_nem_ib_poll_eager(MPIDI_VC_t * vc)
#if 1
void *rsi;
- for (rsi = (void *) buf; rsi < (uint8_t *) buf + sz_hdrmagic->sz;
+ for (rsi = (void *) buf; rsi < (uint8_t *) buf + MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf);
rsi = (uint8_t *) rsi + 64 * 4) {
#ifdef __MIC__
__asm__ __volatile__
("movq %0, %%rsi;"
"vprefetch0 0x00(%%rsi);"
- "vprefetch0 0x40(%%rsi);" "vprefetch0 0x80(%%rsi);" "vprefetch0 0xc0(%%rsi);"::"r"(rsi)
- :"%rsi");
+ "vprefetch0 0x40(%%rsi);"
+ "vprefetch0 0x80(%%rsi);"
+ "vprefetch0 0xc0(%%rsi);"
+ :
+ : "r"(rsi)
+ : "%rsi");
#else
__asm__ __volatile__
("movq %0, %%rsi;"
"prefetchnta 0x00(%%rsi);"
"prefetchnta 0x40(%%rsi);"
- "prefetchnta 0x80(%%rsi);" "prefetchnta 0xc0(%%rsi);"::"r"(rsi)
- :"%rsi");
+ "prefetchnta 0x80(%%rsi);"
+ "prefetchnta 0xc0(%%rsi);"
+ :
+ : "r"(rsi)
+ : "%rsi");
#endif
}
#endif
+ /* VC is stored in the packet for shared ring buffer */
+ switch(ringbuf->type) {
+ case MPID_NEM_IB_RINGBUF_EXCLUSIVE:
+ vc = ringbuf->vc;
+ break;
+ case MPID_NEM_IB_RINGBUF_SHARED:
+ vc = MPID_NEM_IB_NETMOD_HDR_VC_GET(buf);
+ break;
+ default:
+ printf("unknown ringbuf->type\n");
+ }
+ vc_ib = VC_IB(vc);
+ dprintf("poll_eager,vc=%p\n", vc);
+
MPIDI_CH3_Pkt_eager_send_t *pkt =
- (MPIDI_CH3_Pkt_eager_send_t *) ((uint8_t *) buf + sizeof(MPID_nem_ib_sz_hdrmagic_t));
- MPIU_Assert(sz_hdrmagic->sz >=
- sizeof(MPID_nem_ib_sz_hdrmagic_t) + sizeof(MPIDI_CH3_Pkt_t) +
- sizeof(MPID_nem_ib_tailmagic_t));
- MPIDI_CH3_Pkt_eager_send_t *pkt2 =
- (MPIDI_CH3_Pkt_eager_send_t *) ((uint8_t *) buf + sizeof(MPID_nem_ib_sz_hdrmagic_t) +
- sizeof(MPID_nem_ib_pkt_prefix_t));
+ (MPIDI_CH3_Pkt_eager_send_t *) ((uint8_t *) buf + MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf));
+ dprintf("pkt=%p,sizeof=%ld\n", pkt, MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf));
+ MPIU_Assert(MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) >=
+ MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf) + sizeof(MPIDI_CH3_Pkt_t) +
+ sizeof(MPID_nem_ib_netmod_trailer_t));
dprintf
("handle_pkt,before,%d<-%d,id=%d,pkt->type=%d,pcc=%d,MPIDI_NEM_PKT_END=%d,pkt=%p,subtype=%d\n",
- MPID_nem_ib_myrank, vc->pg_rank, vc_ib->ibcom->rsr_seq_num_poll, pkt->type,
+ MPID_nem_ib_myrank, vc->pg_rank, *remote_poll, pkt->type,
MPIDI_CH3I_progress_completion_count.v, MPIDI_NEM_PKT_END, pkt,
((MPID_nem_pkt_netmod_t *) pkt)->subtype);
/* see MPIDI_CH3_PktHandler_EagerSend (in src/mpid/ch3/src/ch3u_eager.c) */
mpi_errno =
- MPID_nem_handle_pkt(vc, (char *) ((uint8_t *) buf + sizeof(MPID_nem_ib_sz_hdrmagic_t)),
- (MPIDI_msg_sz_t) (sz_hdrmagic->sz - sizeof(MPID_nem_ib_sz_hdrmagic_t) -
- sizeof(MPID_nem_ib_tailmagic_t)));
+ MPID_nem_handle_pkt(vc, (char *) ((uint8_t *) buf + MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf)),
+ (MPIDI_msg_sz_t) (MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) -
+ MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf) -
+ sizeof(MPID_nem_ib_netmod_trailer_t)));
if (mpi_errno) {
MPIU_ERR_POP(mpi_errno);
}
@@ -691,11 +635,10 @@ int MPID_nem_ib_poll_eager(MPIDI_VC_t * vc)
/* this includes local RDMA-wr-to buf occupation
* because MPID_nem_handle_pkt releases RDMA-wr-to buf by copying data out */
/* responder releases resource and then embed largest sequence number into MPI message bound to initiator */
- //dprintf("after handle_pkt,rsr_seq_num_tail=%d\n", vc_ib->ibcom->rsr_seq_num_tail);
#if 1
dprintf
("handle_pkt,after,%d<-%d,id=%d,pkt->type=%d,eagershort=%d,close=%d,rts=%d,piggy-backed-eagersend=%d\n",
- MPID_nem_ib_myrank, vc->pg_rank, vc_ib->ibcom->rsr_seq_num_poll, pkt->type,
+ MPID_nem_ib_myrank, vc->pg_rank, *remote_poll, pkt->type,
MPIDI_CH3_PKT_EAGERSHORT_SEND, MPIDI_CH3_PKT_CLOSE, MPIDI_NEM_PKT_LMT_RTS,
MPIDI_NEM_IB_PKT_EAGER_SEND);
@@ -703,15 +646,20 @@ int MPID_nem_ib_poll_eager(MPIDI_VC_t * vc)
ibcom_errno =
MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(MPID_nem_ib_conns[vc->pg_rank].fd,
¬ify_rate);
- dprintf("poll_eager,sendq=%d,ncom=%d,ncqe=%d,ldiff=%d(%d-%d),rdiff=%d(%d-%d),rate=%d\n",
+ dprintf("poll_eager,sendq=%d,ncom=%d,ncqe=%d,ldiff=%d(%d-%d),rate=%d\n",
MPID_nem_ib_sendq_empty(vc_ib->sendq),
vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail,
- MPID_nem_ib_diff32(vc_ib->ibcom->rsr_seq_num_tail,
- vc_ib->ibcom->rsr_seq_num_tail_last_sent),
- vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent, notify_rate);
+ notify_rate);
+
+ if(ringbuf->type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
+ dprintf("poll_eager,rdiff=%d(%d-%d)\n",
+ MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
+ vc_ib->ibcom->rsr_seq_num_tail_last_sent),
+ vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent);
+ }
//dprintf("ib_poll,current pcc=%d\n", MPIDI_CH3I_progress_completion_count.v);
@@ -727,13 +675,13 @@ int MPID_nem_ib_poll_eager(MPIDI_VC_t * vc)
MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM);
MPID_nem_ib_recv_buf_released(vc,
(void *) ((uint8_t *) buf +
- sizeof(MPID_nem_ib_sz_hdrmagic_t) +
+ MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf) +
sizeof(MPIDI_CH3_Pkt_t)));
}
else {
- if (sz_hdrmagic->sz ==
- sizeof(MPID_nem_ib_sz_hdrmagic_t) + sizeof(MPIDI_CH3_Pkt_t) +
- sizeof(MPID_nem_ib_tailmagic_t)) {
+ if (MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) ==
+ MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf) + sizeof(MPIDI_CH3_Pkt_t) +
+ sizeof(MPID_nem_ib_netmod_trailer_t)) {
if (pkt->type == MPIDI_CH3_PKT_EAGERSHORT_SEND
//|| pkt->type == MPIDI_CH3_PKT_GET
) {
@@ -747,8 +695,19 @@ int MPID_nem_ib_poll_eager(MPIDI_VC_t * vc)
}
#endif
- vc_ib->ibcom->rsr_seq_num_poll += 1;
- dprintf("ib_poll,inc,rsr_seq_num_poll=%d\n", vc_ib->ibcom->rsr_seq_num_poll);
+ dprintf("ib_poll,hdr_ringbuf_type=%d\n", MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf));
+ if(MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf) & MPID_NEM_IB_RINGBUF_RELINDEX) {
+ vc_ib->ibcom->lsr_seq_num_tail = MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf);
+ dprintf("ib_poll,local_tail is updated to %d\n",
+ MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf));
+ }
+
+ /* Clear flag */
+ MPID_NEM_IB_NETMOD_HDR_HEAD_FLAG_SET(buf, 0);
+
+ (*remote_poll) += 1;
+ dprintf("ib_poll,inc,remote_poll=%d\n", *remote_poll);
+ dprintf("ib_poll_eager,3,MPIR_Process.comm_self->vcrt->ref_count=%d\n", MPIR_Process.comm_self->vcrt->ref_count);
fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_POLL_EAGER);
@@ -906,6 +865,7 @@ int MPID_nem_ib_poll(int in_blocking_poll)
* get-lmt: ib_poll, drain_scq, wait
* put-lmt: ib_poll, wait */
MPID_Request_release(tmp_rreq);
+ dprintf("ib_poll,relese,req=%p\n", tmp_rreq);
dprintf("ib_poll,lmt,after release,tmp_rreq=%p,rreq->ref_count=%d,comm=%p\n",
tmp_rreq, tmp_rreq->ref_count, tmp_rreq->comm);
@@ -929,27 +889,34 @@ int MPID_nem_ib_poll(int in_blocking_poll)
}
#endif
int ncom_almost_full = 0;
- for (i = 0; i < MPID_nem_ib_npollingset; i++) {
+
+ /* [MPID_NEM_IB_NRINGBUF-1] stores shared ring buffer */
+ for (i = 0; i < MPID_NEM_IB_NRINGBUF; i++) {
+ if (((MPID_nem_ib_ringbuf_allocated[i / 64] >> (i & 63)) & 1) == 0) {
+ continue;
+ }
//tscs = MPID_nem_ib_rdtsc();
- MPIDI_VC_t *vc = MPID_nem_ib_pollingset[i];
- mpi_errno = MPID_nem_ib_poll_eager(vc);
+ //dprintf("poll,kicking progress engine for %d\n", i);
+ mpi_errno = MPID_nem_ib_poll_eager(&MPID_nem_ib_ringbuf[i]);
if (mpi_errno) {
MPIU_ERR_POP(mpi_errno);
}
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
-
/* without this, command in sendq doesn't have a chance
* to perform send_progress
- * when send and progress_send call drain_scq but asking it
+ * when send and progress_send call drain_scq asking it
* for not performing send_progress and make the CQ empty */
- MPID_nem_ib_send_progress(vc_ib);
+ if(MPID_nem_ib_ringbuf[i].type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
+ mpi_errno = MPID_nem_ib_send_progress(MPID_nem_ib_ringbuf[i].vc);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_progress");
+
+ ncom_almost_full |= (VC_FIELD(MPID_nem_ib_ringbuf[i].vc, ibcom->ncom) >= MPID_NEM_IB_COM_MAX_SQ_HEIGHT_DRAIN);
+ }
- ncom_almost_full |= (vc_ib->ibcom->ncom >= MPID_NEM_IB_COM_MAX_SQ_HEIGHT_DRAIN);
#if 0
/* aggressively perform drain_scq */
- ncom_almost_full |= !(MPID_nem_ib_sendq_empty(vc_ib->sendq));
+ ncom_almost_full |= !(MPID_nem_ib_sendq_empty(VC_FIELD(MPID_nem_ib_ringbuf[i].vc, sendq));
#endif
}
#if defined (MPID_NEM_IB_TIMER_WAIT_IB_POLL)
@@ -987,23 +954,27 @@ int MPID_nem_ib_poll(int in_blocking_poll)
ibcom_errno = MPID_nem_ib_drain_scq(0);
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_drain_scq");
#endif
- /* detect completion of lmt-put when MPI_Wait kicks ib_poll */
- if (MPID_nem_ib_ncqe_lmt_put > 0) {
- ibcom_errno = MPID_nem_ib_drain_scq_lmt_put();
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_drain_scq_lmt_put");
- }
#ifdef MPID_NEM_IB_ONDEMAND
/* process incoming connection request */
- MPID_nem_ib_cm_accept();
+ MPID_nem_ib_cm_poll_syn();
+ MPID_nem_ib_cm_poll();
+ //dprintf("ib_poll,MPID_nem_ib_ncqe_scratch_pad_to_drain=%d\n",
+ //MPID_nem_ib_ncqe_scratch_pad_to_drain);
/* process outgoing connection request */
- if (MPID_nem_ib_ncqe_connect >= MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN) {
- ibcom_errno = MPID_nem_ib_cm_drain_scq(0);
+ if (MPID_nem_ib_ncqe_scratch_pad_to_drain > 0 ||
+ MPID_nem_ib_ncqe_scratch_pad >= MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN) {
+ ibcom_errno = MPID_nem_ib_cm_drain_scq();
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_scq");
}
+
+ /* Kick progress engine because time elapsed and it'd fire an event in the send queue */
+ MPID_nem_ib_cm_progress();
#endif
+ MPID_nem_ib_ringbuf_progress();
+
+ MPID_nem_ib_progress_engine_vt += 1; /* Progress virtual time */
#if 1
/* if polling on eager-send and lmt would repeat frequently, perform "pause" to yield instruction issue bandwidth to other logical-core */
@@ -1038,6 +1009,12 @@ int MPID_nem_ib_recv_posted(struct MPIDI_VC *vc, struct MPID_Request *req)
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RECV_POSTED);
dprintf("recv_posted,enter,%d->%d,req=%p\n", MPID_nem_ib_myrank, vc->pg_rank, req);
+#ifdef MPID_NEM_IB_ONDEMAND
+ if (vc_ib->connection_state != MPID_NEM_IB_CM_ESTABLISHED) {
+ goto fn_exit;
+ }
+#endif
+
#if 0
int ibcom_errno;
ibcom_errno = MPID_nem_ib_com_irecv(vc_ib->sc->fd, (uint64_t) vc->pg_rank);
@@ -1057,7 +1034,7 @@ int MPID_nem_ib_recv_posted(struct MPIDI_VC *vc, struct MPID_Request *req)
if (data_sz + sizeof(MPIDI_CH3_Pkt_eager_send_t) > vc->eager_max_msg_sz) {
//if (MPID_nem_ib_tsc_poll - MPID_nem_ib_rdtsc() > MPID_NEM_IB_POLL_PERIOD_RECV_POSTED) {
#if 1
- mpi_errno = MPID_nem_ib_poll_eager(vc);
+ mpi_errno = MPID_nem_ib_poll_eager(VC_FIELD(vc, ibcom->remote_ringbuf));
#else
mpi_errno = MPID_nem_ib_poll(0);
#endif
@@ -1071,7 +1048,7 @@ int MPID_nem_ib_recv_posted(struct MPIDI_VC *vc, struct MPID_Request *req)
/* anticipating received message finds matching request in the posted-queue */
//if (MPID_nem_ib_tsc_poll - MPID_nem_ib_rdtsc() > MPID_NEM_IB_POLL_PERIOD_RECV_POSTED) {
#if 1
- mpi_errno = MPID_nem_ib_poll_eager(vc);
+ mpi_errno = MPID_nem_ib_poll_eager(VC_FIELD(vc, ibcom->remote_ringbuf));
#else
mpi_errno = MPID_nem_ib_poll(0);
#endif
@@ -1111,8 +1088,8 @@ int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data)
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RECV_BUF_RELEASED);
dprintf("recv_buf_released,%d<-%d,user_data=%p\n", MPID_nem_ib_myrank, vc->pg_rank, user_data);
#if 1 /* moving from ib_poll */
- /* unmark magic */
- /* magic is located at MPID_NEM_IB_COM_INLINE_DATA boundary and variable length entails multiple prospective locations for the future use */
+ /* Clear all possible tail flag slots */
+ /* tail flag is located at MPID_NEM_IB_COM_INLINE_DATA boundary and variable length entails multiple prospective locations for the future use */
/* see MPIDI_CH3_PktHandler_EagerShortSend (in src/mpid/ch3/src/ch3u_eager.c */
/* eager-send with zero-length data is released in poll
@@ -1123,39 +1100,39 @@ int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data)
goto fn_exit;
}
- MPIU_Assert((uint8_t *) vc_ib->ibcom->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO] <=
- (uint8_t *) user_data &&
- (uint8_t *) user_data <
- (uint8_t *) vc_ib->ibcom->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO] +
- MPID_NEM_IB_COM_RDMABUF_SZ);
+ if(MPID_nem_ib_rdmawr_to_alloc_start > user_data &&
+ user_data >= MPID_nem_ib_rdmawr_to_alloc_start +
+ MPID_NEM_IB_COM_RDMABUF_SZ * MPID_NEM_IB_NRINGBUF) {
+ MPID_nem_ib_segv;
+ }
unsigned long mod =
(unsigned long) ((uint8_t *) user_data -
- (uint8_t *) vc_ib->ibcom->
- icom_mem[MPID_NEM_IB_COM_RDMAWR_TO]) & (MPID_NEM_IB_COM_RDMABUF_SZSEG - 1);
+ (uint8_t *) vc_ib->ibcom->remote_ringbuf->start) &
+ (MPID_NEM_IB_COM_RDMABUF_SZSEG - 1);
void *buf = (void *) ((uint8_t *) user_data - mod);
//dprintf("recv_buf_released,clearing,buf=%p\n", buf);
- MPID_nem_ib_sz_hdrmagic_t *sz_hdrmagic = (MPID_nem_ib_sz_hdrmagic_t *) buf;
- int sz_data_pow2;
- MPID_NEM_IB_SZ_DATA_POW2(sz_hdrmagic->sz);
- //dprintf("recv_buf_released,sz=%d,pow2=%d\n", sz_hdrmagic->sz, sz_data_pow2);
+ int off_pow2_aligned;
+ MPID_NEM_IB_OFF_POW2_ALIGNED(MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf));
+ //dprintf("recv_buf_released,sz=%d,pow2=%d\n", MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf), off_pow2_aligned);
#if 1
uint32_t offset;
- for (offset = 0;;
+ for (offset = 15;;
offset =
- offset ? ((((offset + 1) << 1) - 1) >
- MPID_NEM_IB_MAX_DATA_POW2 ? MPID_NEM_IB_MAX_DATA_POW2 : (((offset + 1) << 1) -
- 1)) : 15) {
- volatile MPID_nem_ib_tailmagic_t *ptr =
- (MPID_nem_ib_tailmagic_t *) ((uint8_t *) buf + offset);
- MPIU_Assert((uint8_t *) vc_ib->ibcom->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO] <=
- (uint8_t *) ptr &&
- (uint8_t *) ptr <
- (uint8_t *) vc_ib->ibcom->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO] +
- MPID_NEM_IB_COM_RDMABUF_SZ);
- ptr->magic = 0 /*0xde */ ;
- if (offset == sz_data_pow2) {
+ (((offset + 1) << 1) - 1) > MPID_NEM_IB_MAX_OFF_POW2_ALIGNED ?
+ MPID_NEM_IB_MAX_OFF_POW2_ALIGNED :
+ (((offset + 1) << 1) - 1) ) {
+ MPID_nem_ib_netmod_trailer_t *netmod_trailer =
+ (MPID_nem_ib_netmod_trailer_t *) ((uint8_t *) buf + offset);
+ if(MPID_nem_ib_rdmawr_to_alloc_start > (uint8_t *) netmod_trailer &&
+ (uint8_t *) netmod_trailer >=
+ MPID_nem_ib_rdmawr_to_alloc_start +
+ MPID_NEM_IB_COM_RDMABUF_SZ * MPID_NEM_IB_NRINGBUF) {
+ MPID_nem_ib_segv;
+ }
+ netmod_trailer->tail_flag = 0;
+ if (offset == off_pow2_aligned) {
break;
}
}
@@ -1164,26 +1141,39 @@ int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data)
#if 1 /* moving from ib_poll */
/* mark that one eager-send RDMA-write-to buffer has been released */
- int index_slot =
+ uint16_t index_slot =
(unsigned long) ((uint8_t *) user_data -
- (uint8_t *) vc_ib->ibcom->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO]) /
+ (uint8_t *) vc_ib->ibcom->remote_ringbuf->start) /
MPID_NEM_IB_COM_RDMABUF_SZSEG;
- MPIU_Assert(0 <= index_slot && index_slot < MPID_NEM_IB_COM_RDMABUF_NSEG);
- //dprintf("user_data=%p,mem=%p,sub=%08lx,index_slot=%d\n", user_data, vc_ib->ibcom->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO], (unsigned long)user_data - (unsigned long)vc_ib->ibcom->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO], index_slot);
- //dprintf("index_slot=%d,released=%016lx\n", index_slot, vc_ib->ibcom->rsr_seq_num_released[index_slot / 64]);
- vc_ib->ibcom->rsr_seq_num_released[index_slot / 64] |= (1ULL << (index_slot & 63));
- //dprintf("released[index_slot/64]=%016lx\n", vc_ib->ibcom->rsr_seq_num_released[index_slot / 64]);
- // int index_tail = (vc_ib->ibcom->rsr_seq_num_tail + 1) & (MPID_NEM_IB_COM_RDMABUF_NSEG-1);
- int index_tail = (vc_ib->ibcom->rsr_seq_num_tail + 1) % MPID_NEM_IB_COM_RDMABUF_NSEG;
- //dprintf("tail+1=%d,index_tail=%d\n", vc_ib->ibcom->rsr_seq_num_tail + 1, index_tail);
- //dprintf("released=%016lx\n", vc_ib->ibcom->rsr_seq_num_released[index_tail / 64]);
- if (1 || (index_tail & 7) || MPID_nem_ib_diff32(index_slot, index_tail) >= MPID_NEM_IB_COM_RDMABUF_NSEG - 8) { /* avoid wrap-around */
+ MPIU_Assert(0 <= index_slot && index_slot < vc_ib->ibcom->remote_ringbuf->nslot);
+ dprintf("released,user_data=%p,mem=%p,sub=%08lx,index_slot=%d\n", user_data, vc_ib->ibcom->remote_ringbuf->start, (unsigned long)user_data - (unsigned long)vc_ib->ibcom->remote_ringbuf->start, index_slot);
+ dprintf("released,index_slot=%d,released=%016lx\n", index_slot, vc_ib->ibcom->remote_ringbuf->remote_released[index_slot / 64]);
+ vc_ib->ibcom->remote_ringbuf->remote_released[index_slot / 64] |= (1ULL << (index_slot & 63));
+ dprintf("released,after bitset,%016lx\n", vc_ib->ibcom->remote_ringbuf->remote_released[index_slot / 64]);
+ // int index_tail = (vc_ib->ibcom->rsr_seq_num_tail + 1) & (vc_ib->ibcom->local_ringbuf_nslot-1);
+
+ MPID_nem_ib_ringbuf_headtail_t * headtail =
+ (MPID_nem_ib_ringbuf_headtail_t *) ((uint8_t *) MPID_nem_ib_scratch_pad +
+ MPID_NEM_IB_RINGBUF_OFF_HEAD);
+
+ uint16_t index_tail = vc_ib->ibcom->remote_ringbuf->type == MPID_NEM_IB_RINGBUF_EXCLUSIVE ?
+ ((uint16_t)(vc_ib->ibcom->rsr_seq_num_tail + 1) % vc_ib->ibcom->remote_ringbuf->nslot) :
+ ((uint16_t)(headtail->tail + 1) % vc_ib->ibcom->remote_ringbuf->nslot);
+ dprintf("released,index_tail=%d\n", index_tail);
+ dprintf("released,%016lx\n", vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64]);
+ if (1 || (index_tail & 7) || MPID_nem_ib_diff16(index_slot, index_tail) >= vc_ib->ibcom->remote_ringbuf->nslot - 8) { /* avoid wrap-around */
while (1) {
- if (((vc_ib->ibcom->rsr_seq_num_released[index_tail / 64] >> (index_tail & 63)) & 1) ==
+ if (((vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64] >> (index_tail & 63)) & 1) ==
1) {
- vc_ib->ibcom->rsr_seq_num_tail += 1;
- vc_ib->ibcom->rsr_seq_num_released[index_tail / 64] &= ~(1ULL << (index_tail & 63));
- dprintf("rsr_seq_num_tail,incremented to %d\n", vc_ib->ibcom->rsr_seq_num_tail);
+ if(vc_ib->ibcom->remote_ringbuf->type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
+ vc_ib->ibcom->rsr_seq_num_tail += 1;
+ dprintf("exclusive ringbuf,remote_tail,incremented to %d\n", vc_ib->ibcom->rsr_seq_num_tail);
+ } else {
+ headtail->tail += 1;
+ dprintf("shared ringbuf,tail,incremented to %d,head=%ld\n",
+ headtail->tail, headtail->head);
+ }
+ vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64] &= ~(1ULL << (index_tail & 63));
}
else {
break;
@@ -1191,11 +1181,11 @@ int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data)
}
}
else {
- if (((vc_ib->ibcom->rsr_seq_num_released[index_tail / 64] >> (index_tail & 63)) & 0xff) ==
+ if (((vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64] >> (index_tail & 63)) & 0xff) ==
0xff) {
vc_ib->ibcom->rsr_seq_num_tail += 8;
- vc_ib->ibcom->rsr_seq_num_released[index_tail / 64] &= ~(0xffULL << (index_tail & 63));
- //dprintf("released[index_tail/64]=%016lx\n", vc_ib->ibcom->rsr_seq_num_released[index_tail / 64]);
+ vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64] &= ~(0xffULL << (index_tail & 63));
+ //dprintf("released[index_tail/64]=%016lx\n", vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64]);
}
}
@@ -1209,11 +1199,12 @@ int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data)
"**MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get");
/* if you missed the chance to make eager-send message piggy-back it */
- if (MPID_nem_ib_diff32
+ if (vc_ib->ibcom->remote_ringbuf->type == MPID_NEM_IB_RINGBUF_EXCLUSIVE &&
+ MPID_nem_ib_diff16
(vc_ib->ibcom->rsr_seq_num_tail,
vc_ib->ibcom->rsr_seq_num_tail_last_sent) >
MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_RATE_DELAY_MULTIPLIER(notify_rate)
- //|| MPID_nem_ib_diff32(lsr_seq_num_head, vc_ib->ibcom->lsr_seq_num_tail_last_sent) == MPID_NEM_IB_COM_RDMABUF_NSEG
+ //|| MPID_nem_ib_diff16(lsr_seq_num_head, vc_ib->ibcom->lsr_seq_num_tail_last_sent) == vc_ib->ibcom->local_ringbuf_nslot
) {
MPID_Request *sreq;
sreq = MPID_nem_ib_sendq_head(vc_ib->sendq);
@@ -1226,7 +1217,7 @@ int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data)
goto skip;
}
}
- //printf("recv_buf_released,sending reply_seq_num,diff=%d,rate=%d,id=%d\n", MPID_nem_ib_diff32(vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent), notify_rate + (notify_rate>>1), vc_ib->ibcom->sseq_num);
+ //printf("recv_buf_released,sending reply_seq_num,diff=%d,rate=%d,id=%d\n", MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent), notify_rate + (notify_rate>>1), vc_ib->ibcom->sseq_num);
MPID_nem_ib_send_reply_seq_num(vc);
skip:;
}
@@ -1321,15 +1312,11 @@ int MPID_nem_ib_PktHandler_EagerSend(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
//}
/* Update occupation status of local SR (send request) queue */
- int *lsr_seq_num_tail;
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- ibcom_errno = MPID_nem_ib_com_lsr_seq_num_tail_get(vc_ib->sc->fd, &lsr_seq_num_tail);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_lsr_seq_num_tail_get");
dprintf("MPID_nem_ib_PktHandler_EagerSend,lsr_seq_num_tail=%d,netmod_pkt->seq_num_tail=%d\n",
- *lsr_seq_num_tail, netmod_pkt->seq_num_tail);
- *lsr_seq_num_tail = MPID_NEM_IB_MAX(*lsr_seq_num_tail, netmod_pkt->seq_num_tail);
- dprintf("MPID_nem_ib_PktHandler_EagerSend,lsr_seq_num_tail updated to %d\n", *lsr_seq_num_tail);
+ vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
+ vc_ib->ibcom->lsr_seq_num_tail = netmod_pkt->seq_num_tail;
+ dprintf("MPID_nem_ib_PktHandler_EagerSend,lsr_seq_num_tail updated to %d\n", vc_ib->ibcom->lsr_seq_num_tail);
#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
/* change remote notification policy of RDMA-write-to buf */
@@ -1340,7 +1327,7 @@ int MPID_nem_ib_PktHandler_EagerSend(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
dprintf("pkthandler,eagersend,sendq_empty=%d,ncom=%d,rdmabuf_occ=%d\n",
MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom,
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
/* try to send from sendq because at least one RDMA-write-to buffer has been released */
/* calling drain_scq from progress_send deprives of chance
* for ib_poll to drain sendq using ncqe
@@ -1378,6 +1365,73 @@ int MPID_nem_ib_PktHandler_EagerSend(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
goto fn_exit;
}
+#if 0 /* modification of mpid_nem_lmt.c is required */
+
+/* Temporary fix because it's static */
+int pkt_RTS_handler(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt, MPIDI_msg_sz_t *buflen, MPID_Request **rreqp);
+
+/* packet handler for wrapper packet of MPIDI_NEM_PKT_LMT_RTS */
+/* see pkt_RTS_handler (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c) */
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_pkt_RTS_handler
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_pkt_RTS_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
+ MPIDI_msg_sz_t * buflen /* out */ ,
+ MPID_Request ** rreqp /* out */)
+{
+ MPID_nem_ib_pkt_prefix_t *netmod_pkt = (MPID_nem_ib_pkt_prefix_t *) pkt;
+ MPIDI_CH3_Pkt_t *ch3_pkt =
+ (MPIDI_CH3_Pkt_t *) ((uint8_t *) pkt + sizeof(MPID_nem_ib_pkt_prefix_t));
+ int mpi_errno = MPI_SUCCESS;
+ int ibcom_errno;
+
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKT_RTS_HANDLER);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKT_RTS_HANDLER);
+
+ /* Update occupation status of local SR (send request) queue */
+ MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
+ dprintf("MPID_nem_ib_pkt_RTS_handler,lsr_seq_num_tail=%d,netmod_pkt->seq_num_tail=%d\n",
+ vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
+ vc_ib->ibcom->lsr_seq_num_tail = netmod_pkt->seq_num_tail;
+ dprintf("MPID_nem_ib_pkt_RTS_handler,lsr_seq_num_tail updated to %d\n",
+ vc_ib->ibcom->lsr_seq_num_tail);
+
+#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
+ /* change remote notification policy of RDMA-write-to buf */
+ dprintf("pkthandler,rts,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
+ MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, &vc_ib->ibcom->lsr_seq_num_tail);
+ dprintf("pkthandler,rts,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
+#endif
+
+ dprintf("pkthandler,rts,sendq_empty=%d,ncom=%d,rdmabuf_occ=%d\n",
+ MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom,
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
+ /* try to send from sendq because at least one RDMA-write-to buffer has been released */
+ dprintf("pkthandler,eagersend,send_progress\n");
+ fflush(stdout);
+ MPID_NEM_IB_CHECK_AND_SEND_PROGRESS;
+
+ /* fall back to the original handler */
+ /* we don't need to worry about the difference caused by embedding seq_num
+ * because size of MPI-header of MPIDI_CH3_PKT_EAGER_SEND equals to sizeof(MPIDI_CH3_Pkt_t)
+ * see MPID_nem_ib_iSendContig
+ */
+ MPIDI_msg_sz_t ch3_buflen = *buflen - sizeof(MPID_nem_ib_pkt_prefix_t);
+ mpi_errno = pkt_RTS_handler(vc, ch3_pkt, &ch3_buflen, rreqp);
+ *buflen = ch3_buflen + sizeof(MPID_nem_ib_pkt_prefix_t);
+ if (mpi_errno) {
+ MPIU_ERR_POP(mpi_errno);
+ }
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_PKT_RTS_HANDLER);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+}
+#endif
+
#if 1
/* packet handler for wrapper packet of MPIDI_CH3_PKT_PUT */
/* see MPIDI_CH3_PktHandler_EagerSend (in src/mpid/ch3/src/ch3u_rma_sync.c) */
@@ -1407,8 +1461,7 @@ int MPID_nem_ib_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
dprintf("MPID_nem_ib_Pkthandler_Put,lsr_seq_num_tail=%d,put_pkt->seq_num_tail=%d\n",
vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
- vc_ib->ibcom->lsr_seq_num_tail =
- MPID_NEM_IB_MAX(vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
+ vc_ib->ibcom->lsr_seq_num_tail = netmod_pkt->seq_num_tail;
dprintf("MPID_nem_ib_Pkthandler_Put,lsr_seq_num_tail updated to %d\n",
vc_ib->ibcom->lsr_seq_num_tail);
@@ -1420,7 +1473,7 @@ int MPID_nem_ib_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
#endif
dprintf("pkthandler,put,sendq_empty=%d,ncom=%d,rdmabuf_occ=%d\n",
MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom,
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
/* try to send from sendq because at least one RDMA-write-to buffer has been released */
dprintf("pkthandler,put,send_progress\n");
fflush(stdout);
@@ -1474,8 +1527,7 @@ int MPID_nem_ib_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
dprintf("MPID_nem_ib_Pkthandler_Accumulate,lsr_seq_num_tail=%d,accum_pkt->seq_num_tail=%d\n",
vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
- vc_ib->ibcom->lsr_seq_num_tail =
- MPID_NEM_IB_MAX(vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
+ vc_ib->ibcom->lsr_seq_num_tail = netmod_pkt->seq_num_tail;
dprintf("MPID_nem_ib_Pkthandler_Accumulate,lsr_seq_num_tail updated to %d\n",
vc_ib->ibcom->lsr_seq_num_tail);
@@ -1487,7 +1539,7 @@ int MPID_nem_ib_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
#endif
dprintf("pkthandler,put,sendq_empty=%d,ncom=%d,rdmabuf_occ=%d\n",
MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom,
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
/* try to send from sendq because at least one RDMA-write-to buffer has been released */
dprintf("pkthandler,put,send_progress\n");
fflush(stdout);
@@ -1540,8 +1592,7 @@ int MPID_nem_ib_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
dprintf("MPID_nem_ib_Pkthandler_Get,lsr_seq_num_tail=%d,get_pkt->seq_num_tail=%d\n",
vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
- vc_ib->ibcom->lsr_seq_num_tail =
- MPID_NEM_IB_MAX(vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
+ vc_ib->ibcom->lsr_seq_num_tail = netmod_pkt->seq_num_tail;
dprintf("MPID_nem_ib_Pkthandler_Get,lsr_seq_num_tail updated to %d\n",
vc_ib->ibcom->lsr_seq_num_tail);
@@ -1553,7 +1604,7 @@ int MPID_nem_ib_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
#endif
dprintf("pkthandler,put,sendq_empty=%d,ncom=%d,rdmabuf_occ=%d\n",
MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom,
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
/* try to send from sendq because at least one RDMA-write-to buffer has been released */
dprintf("pkthandler,get,send_progress\n");
fflush(stdout);
@@ -1605,8 +1656,7 @@ int MPID_nem_ib_PktHandler_GetResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
dprintf("MPID_nem_ib_Pkthandler_GetResp,lsr_seq_num_tail=%d,get_pkt->seq_num_tail=%d\n",
vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
- vc_ib->ibcom->lsr_seq_num_tail =
- MPID_NEM_IB_MAX(vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
+ vc_ib->ibcom->lsr_seq_num_tail = netmod_pkt->seq_num_tail;
dprintf("MPID_nem_ib_Pkthandler_GetResp,lsr_seq_num_tail updated to %d\n",
vc_ib->ibcom->lsr_seq_num_tail);
@@ -1618,7 +1668,7 @@ int MPID_nem_ib_PktHandler_GetResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
#endif
dprintf("pkthandler,put,sendq_empty=%d,ncom=%d,rdmabuf_occ=%d\n",
MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom,
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
/* try to send from sendq because at least one RDMA-write-to buffer has been released */
dprintf("pkthandler,get,send_progress\n");
fflush(stdout);
@@ -1675,11 +1725,11 @@ int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
case MPIDI_REQUEST_TYPE_RSEND:
case MPIDI_REQUEST_TYPE_SSEND:
case MPIDI_REQUEST_TYPE_BSEND:
+#if 0 /* obsolete, it's in netmod header now */
/* extract embeded RDMA-write-to buffer occupancy information */
dprintf("get_done_handler,old lsr_seq_num_tail=%d,done_pkt->seq_num_tail=%d\n",
vc_ib->ibcom->lsr_seq_num_tail, done_pkt->seq_num_tail);
- vc_ib->ibcom->lsr_seq_num_tail =
- MPID_NEM_IB_MAX(vc_ib->ibcom->lsr_seq_num_tail, done_pkt->seq_num_tail);
+ vc_ib->ibcom->lsr_seq_num_tail = done_pkt->seq_num_tail;
//dprintf("lmt_start_recv,new lsr_seq_num=%d\n", vc_ib->ibcom->lsr_seq_num_tail);
#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
@@ -1689,16 +1739,18 @@ int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
&vc_ib->ibcom->lsr_seq_num_tail);
//dprintf("lmt_start_recv,reply_seq_num,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
#endif
- //dprintf("lmt_start_recv,reply_seq_num,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
+ //dprintf("lmt_start_recv,reply_seq_num,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
+
+#endif
/* try to send from sendq because at least one RDMA-write-to buffer has been released */
//dprintf("lmt_start_recv,reply_seq_num,send_progress\n");
if (!MPID_nem_ib_sendq_empty(vc_ib->sendq)) {
dprintf("get_done_handler,ncom=%d,ncqe=%d,diff=%d(%d-%d)\n",
vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num,
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
vc_ib->ibcom->lsr_seq_num_tail) <
- MPID_NEM_IB_COM_RDMABUF_NSEG, vc_ib->ibcom->sseq_num,
+ vc_ib->ibcom->local_ringbuf_nslot, vc_ib->ibcom->sseq_num,
vc_ib->ibcom->lsr_seq_num_tail);
}
dprintf("get_done_handler,send_progress\n");
@@ -1745,15 +1797,14 @@ int MPID_nem_ib_PktHandler_req_seq_num(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
/* update occupancy info of SR */
/* request piggy-backs seq_num although it's requesting responder's seq_num */
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- vc_ib->ibcom->lsr_seq_num_tail =
- MPID_NEM_IB_MAX(vc_ib->ibcom->lsr_seq_num_tail, req_pkt->seq_num_tail);
+ vc_ib->ibcom->lsr_seq_num_tail = req_pkt->seq_num_tail;
dprintf("PktHandler_req_seq_num,sendq=%d,ncom=%d,ncqe=%d,diff=%d(%d-%d)\n",
MPID_nem_ib_sendq_empty(vc_ib->sendq),
vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) < MPID_NEM_IB_COM_RDMABUF_NSEG,
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
+ vc_ib->ibcom->lsr_seq_num_tail) < vc_ib->ibcom->local_ringbuf_nslot,
vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail);
/* send reply */
@@ -1789,31 +1840,27 @@ int MPID_nem_ib_PktHandler_reply_seq_num(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
*rreqp = NULL;
/* update occupancy info of RDMA-write-buf */
- int *lsr_seq_num_tail;
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
dprintf("pkthandler,reply_seq_num,old lsr_seq_num=%d,reply_pkt->seq_num_tail=%d\n",
vc_ib->ibcom->lsr_seq_num_tail, reply_pkt->seq_num_tail);
- ibcom_errno = MPID_nem_ib_com_lsr_seq_num_tail_get(vc_ib->sc->fd, &lsr_seq_num_tail);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_lsr_seq_num_tail_get");
- *lsr_seq_num_tail = MPID_NEM_IB_MAX(*lsr_seq_num_tail, reply_pkt->seq_num_tail);
+ vc_ib->ibcom->lsr_seq_num_tail = reply_pkt->seq_num_tail;
//dprintf("pkthandler,reply_seq_num,new lsr_seq_num=%d\n", vc_ib->ibcom->lsr_seq_num_tail);
#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
/* change remote notification policy of RDMA-write-to buf */
//dprintf("pkthandler,reply_seq_num,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
- MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, lsr_seq_num_tail);
+ MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, &(vc_ib->ibcom->lsr_seq_num_tail));
//dprintf("pkthandler,reply_seq_num,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
#endif
- //dprintf("pkthandler,reply_seq_num,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
+ //dprintf("pkthandler,reply_seq_num,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
/* try to send from sendq because at least one RDMA-write-to buffer has been released */
//dprintf("pkthandler,reply_seq_num,send_progress\n");
dprintf("pkthandler,reply_seq_num,send_progress\n");
MPID_NEM_IB_CHECK_AND_SEND_PROGRESS fn_exit:
MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REPLY_SEQ_NUM);
return mpi_errno;
- fn_fail:
+ //fn_fail:
goto fn_exit;
}
@@ -1874,12 +1921,16 @@ int MPID_nem_ib_cm_drain_scq()
int result;
int i;
struct ibv_wc cqe[MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN];
-
+ MPID_nem_ib_cm_cmd_shadow_t* shadow_cm;
+ MPID_nem_ib_ringbuf_cmd_shadow_t * shadow_ringbuf;
+
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_DRAIN_SCQ);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_DRAIN_SCQ);
+ dprintf("cm_drain_scq,enter\n");
+
result =
- ibv_poll_cq(MPID_nem_ib_rc_shared_scq_lmt_put, MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN,
+ ibv_poll_cq(MPID_nem_ib_rc_shared_scq_scratch_pad, MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN,
&cqe[0]);
MPIU_ERR_CHKANDJUMP(result < 0, mpi_errno, MPI_ERR_OTHER, "**netmod,ib,ibv_poll_cq");
@@ -1888,36 +1939,236 @@ int MPID_nem_ib_cm_drain_scq()
}
for (i = 0; i < result; i++) {
+ dprintf("cm_drain_scq,wr_id=%p\n", (void *) cqe[i].wr_id);
+
#ifdef HAVE_LIBDCFA
if (cqe[i].status != IBV_WC_SUCCESS) {
dprintf("cm_drain_scq,status=%08x\n", cqe[i].status);
+ MPID_nem_ib_segv;
}
#else
if (cqe[i].status != IBV_WC_SUCCESS) {
dprintf("cm_drain_scq,status=%08x,%s\n", cqe[i].status,
ibv_wc_status_str(cqe[i].status));
+ MPID_nem_ib_segv;
}
#endif
MPIU_ERR_CHKANDJUMP(cqe[i].status != IBV_WC_SUCCESS, mpi_errno, MPI_ERR_OTHER,
"**MPID_nem_ib_cm_drain_scq");
- /* TODO retry a connection request when it timed out */
- if (cqe.wr_id == MPID_NEM_IB_SYN || cqe.wr_id == MPID_NEM_IB_SYNACK) {
-
- MPID_nem_ib_conn_ud_ibcom->ncom_lmt_put -= 1;
- MPID_nem_ib_ncqe_connect -= 1;
-
- /* Try to send from sendq_connect */
- if (!MPID_nem_ib_sendq_empty(sendq_connect) &&
- MPID_nem_ib_ncom_lmt_put < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
- MPID_nem_ib_ncqe_lmt_put < MPID_NEM_IB_COM_MAX_CQ_CAPACITY) {
- MPID_nem_ib_send_progress_connect();
+ MPID_nem_ib_cm_ringbuf_cmd_type_t * type = (MPID_nem_ib_cm_ringbuf_cmd_type_t *) cqe[i].wr_id;
+ switch(*type) {
+ case MPID_NEM_IB_CM_CAS: {
+ shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
+
+ dprintf("cm_drain_scq,cm_cas,req=%p,responder_rank=%d\n",
+ shadow_cm->req, shadow_cm->req->responder_rank);
+
+ /* Check if CAS have succeeded */
+ uint64_t* cas_retval = (uint64_t *) shadow_cm->buf_from;
+ if(*cas_retval == MPID_NEM_IB_CM_RELEASED) {
+ /* CAS succeeded, so write command */
+
+ dprintf("cm_drain_scq,cm_cas,succeeded\n");
+
+ shadow_cm->req->state = MPID_NEM_IB_CM_SYN;
+ if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
+ shadow_cm->req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
+ MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
+ MPID_nem_ib_cm_ringbuf_tail) < MPID_NEM_IB_CM_NSEG) {
+
+ MPID_nem_ib_cm_cmd_syn_t *cmd = (MPID_nem_ib_cm_cmd_syn_t *) shadow_cm->req->ibcom->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
+ MPID_NEM_IB_CM_COMPOSE_SYN(cmd, shadow_cm->req);
+ cmd->responder_ringbuf_index = shadow_cm->req->responder_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
+ dprintf("cm_drain_scq,giving ringbuf_index=%d\n", cmd->responder_ringbuf_index);
+ MPID_nem_ib_cm_ringbuf_head++;
+ cmd->initiator_rank = MPID_nem_ib_myrank;
+
+ MPID_nem_ib_cm_cmd_shadow_t * shadow_syn =
+ (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow_syn->type = shadow_cm->req->state;
+ shadow_syn->req = shadow_cm->req;
+ dprintf("shadow_syn=%p,shadow_syn->req=%p\n", shadow_syn, shadow_syn->req);
+ mpi_errno =
+ MPID_nem_ib_cm_cmd_core(shadow_cm->req->responder_rank, shadow_syn,
+ (void *) cmd, sizeof(MPID_nem_ib_cm_cmd_syn_t),
+ 1 /* syn:1 */, 0);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
+ } else {
+ MPID_NEM_IB_CM_COMPOSE_SYN((MPID_nem_ib_cm_cmd_syn_t *)&(shadow_cm->req->cmd), shadow_cm->req);
+ MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, shadow_cm->req);
+ }
+ } else {
+ dprintf("cm_drain_scq,cm_cas,retval=%016lx,backoff=%ld\n",
+ *cas_retval, shadow_cm->req->retry_backoff);
+ shadow_cm->req->retry_backoff =
+ shadow_cm->req->retry_backoff ?
+ (shadow_cm->req->retry_backoff << 1) :
+ 1;
+ shadow_cm->req->retry_decided = MPID_nem_ib_progress_engine_vt; /* Schedule retry */
+ MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, shadow_cm->req);
+ dprintf("cm_drain_scq,cm_cas,failed,decided=%ld,backoff=%ld\n",
+ shadow_cm->req->retry_decided, shadow_cm->req->retry_backoff);
}
- }
- else {
- printf("unknown command=%d\n", cqe.wr_id);
+ MPID_nem_ib_ncqe_scratch_pad_to_drain -= 1;
+ shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
+ MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
+ MPIU_Free(shadow_cm);
+ break; }
+ case MPID_NEM_IB_CM_SYN:
+ dprintf("cm_drain_scq,syn sent\n");
+ shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
+ shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
+ MPIU_Free(shadow_cm);
+ break;
+ case MPID_NEM_IB_CM_SYNACK:
+ shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
+ dprintf("cm_drain_scq,synack sent,req=%p,initiator_rank=%d\n",
+ shadow_cm->req, shadow_cm->req->initiator_rank);
+ shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
+ MPIU_Free(shadow_cm);
+ break;
+ case MPID_NEM_IB_CM_ACK1:
+ dprintf("cm_drain_scq,ack1 sent\n");
+ shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
+ shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
+
+ /* Finalize protocol because there is no referer in cm_drain_scq and sendq.
+ Note that there might be one in cm_poll.*/
+ MPID_nem_ib_cm_request_release(shadow_cm->req);
+ MPIU_Free(shadow_cm);
+ break;
+ case MPID_NEM_IB_CM_ACK2:
+ shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
+ dprintf("cm_drain_scq,ack2 sent,req=%p,initiator_rank=%p=%d\n",
+ shadow_cm->req, &shadow_cm->req->initiator_rank, shadow_cm->req->initiator_rank);
+ shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
+
+ /* Let the guard down to let the following connection request go. */
+ VC_FIELD(MPID_nem_ib_conns[shadow_cm->req->initiator_rank].vc, connection_guard) = 0;
+
+ /* Finalize protocol because there is no referer in cm_drain_scq, sendq
+ and cm_poll because cm_poll sent ACK2. */
+ MPID_nem_ib_cm_request_release(shadow_cm->req);
+ MPIU_Free(shadow_cm);
+ break;
+ case MPID_NEM_IB_RINGBUF_ASK_FETCH:
+ shadow_ringbuf = (MPID_nem_ib_ringbuf_cmd_shadow_t *) cqe[i].wr_id;
+ memcpy(&shadow_ringbuf->req->fetched,
+ shadow_ringbuf->buf_from,
+ sizeof(MPID_nem_ib_ringbuf_headtail_t));
+ dprintf("cm_drain_scq,ask_fetch sent,%d->%d,req=%p,fetched->head=%ld,tail=%d\n",
+ MPID_nem_ib_myrank,
+ shadow_ringbuf->req->vc->pg_rank,
+ shadow_ringbuf->req, shadow_ringbuf->req->fetched.head,
+ shadow_ringbuf->req->fetched.tail);
+ /* Proceed to cas */
+ MPID_nem_ib_ringbuf_ask_cas(shadow_ringbuf->req->vc, shadow_ringbuf->req);
+ MPID_nem_ib_ncqe_scratch_pad_to_drain -= 1;
+ shadow_ringbuf->req->ibcom->ncom_scratch_pad -= 1;
+ MPID_nem_ib_rdmawr_from_free(shadow_ringbuf->buf_from, shadow_ringbuf->buf_from_sz);
+ MPIU_Free(shadow_ringbuf);
+ break;
+ case MPID_NEM_IB_RINGBUF_ASK_CAS: {
+ shadow_ringbuf = (MPID_nem_ib_ringbuf_cmd_shadow_t *) cqe[i].wr_id;
+ /* Check if CAS have succeeded */
+ MPID_nem_ib_ringbuf_headtail_t* cas_retval =
+ (MPID_nem_ib_ringbuf_headtail_t *) shadow_ringbuf->buf_from;
+ dprintf("cm_drain_scq,ask_cas sent,req=%p,fetched.head=%lx,retval=%lx\n",
+ shadow_ringbuf->req, shadow_ringbuf->req->fetched.head, cas_retval->head);
+ if(cas_retval->head == shadow_ringbuf->req->fetched.head) {
+ /* CAS succeeded */
+ dprintf("cm_drain_scq,ask_cas,cas succeeded,%d->%d,local_head=%d,local_tail=%d,nslot=%d\n",
+ MPID_nem_ib_myrank,
+ shadow_ringbuf->req->vc->pg_rank,
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num),
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail),
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot));
+ if(MPID_nem_ib_diff16(VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num),
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail)) >=
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot)) {
+ dprintf("cm_drain_scq,ask_cas,refill fast path\n");
+ /* Refill now when we don't have any slots */
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num) =
+ (uint16_t)shadow_ringbuf->req->fetched.head;
+ /* Move tail pointer to indicate only one slot is available to us */
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail) =
+ (uint16_t)
+ (VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num) -
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot) + 1);
+ dprintf("cm_drain_scq,ask_cas,after refill,local_head=%d,local_tail=%d,nslot=%d\n",
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num),
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail),
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot));
+ } else {
+ dprintf("cm_drain_scq,ask_cas,refill slow path\n");
+ /* Enqueue slots to avoid overwriting the slots when we have some slots.
+ This happens when two or more asks succeeded before
+ the first queued send is issued. */
+ MPID_nem_ib_ringbuf_sector_t * sector =
+ (MPID_nem_ib_ringbuf_sector_t *) MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_sector_t));
+ sector->type = MPID_NEM_IB_RINGBUF_SHARED;
+ sector->start = VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_start);
+ sector->nslot = VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot);
+ sector->head = (uint16_t)shadow_ringbuf->req->fetched.head;
+ sector->tail = sector->head -
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot) + 1;
+ MPIU_ERR_CHKANDJUMP(!sector, mpi_errno, MPI_ERR_OTHER, "**malloc");
+ MPID_nem_ib_ringbuf_sectorq_enqueue(&VC_FIELD(shadow_ringbuf->req->vc, ibcom->sectorq),
+ sector);
+ }
+ /* Let the guard down so that the following ask-fetch can be issued */
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->ask_guard) = 0;
+
+ /* Kick progress engine */
+ dprintf("cm_drain_scq,call send_progress for %d,ncom=%d,ncqe=%d,local_head=%d,local_tail=%d,nslot=%d\n",
+ shadow_ringbuf->req->vc->pg_rank,
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->ncom),
+ MPID_nem_ib_ncqe,
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num),
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail),
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot)
+ );
+ MPID_nem_ib_send_progress(shadow_ringbuf->req->vc);
+
+ MPIU_Free(shadow_ringbuf->req);
+ } else {
+ /* CAS failed */
+ printf("ask-cas,failed\n");MPID_nem_ib_segv;
+ /* Let the guard down so that this ask-fetch can be issued in ringbuf_progress */
+ VC_FIELD(shadow_ringbuf->req->vc, ibcom->ask_guard) = 0;
+
+ /* Retry from fetch */
+ shadow_ringbuf->req->state = MPID_NEM_IB_RINGBUF_ASK_FETCH;
+
+ /* Schedule retry */
+ dprintf("cm_drain_scq,retval=%08lx,backoff=%ld\n",
+ cas_retval->head, shadow_ringbuf->req->retry_backoff);
+ MPID_NEM_IB_RINGBUF_UPDATE_BACKOFF(shadow_ringbuf->req->retry_backoff);
+ shadow_ringbuf->req->retry_decided = MPID_nem_ib_progress_engine_vt;
+
+ /* Make the ask-fetch in order */
+ MPID_nem_ib_ringbuf_sendq_enqueue_at_head(&MPID_nem_ib_ringbuf_sendq, shadow_ringbuf->req);
+ dprintf("cm_drain_scq,ask_cas,cas failed,decided=%ld,backoff=%ld\n",
+ shadow_ringbuf->req->retry_decided, shadow_ringbuf->req->retry_backoff);
+ }
+ MPID_nem_ib_ncqe_scratch_pad_to_drain -= 1;
+ shadow_ringbuf->req->ibcom->ncom_scratch_pad -= 1;
+ MPID_nem_ib_rdmawr_from_free(shadow_ringbuf->buf_from, shadow_ringbuf->buf_from_sz);
+ MPIU_Free(shadow_ringbuf);
+ break; }
+ default:
+ printf("unknown type=%d\n", *type);
MPIU_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_scq");
+ break;
}
+ MPID_nem_ib_ncqe_scratch_pad -= 1;
+ }
+ /* The number of CQE is reduced or a slot of the ringbuf is released, so kick progress engine */
+ if(result > 0) {
+ MPID_nem_ib_cm_progress();
+ MPID_nem_ib_ringbuf_progress();
}
fn_exit:
@@ -1928,196 +2179,489 @@ int MPID_nem_ib_cm_drain_scq()
}
#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_poll
+#define FUNCNAME MPID_nem_ib_cm_poll_syn
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_cm_poll()
+int MPID_nem_ib_cm_poll_syn()
{
int mpi_errno = MPI_SUCCESS;
+ int ibcom_errno;
+ int ib_port = 1;
+ int i;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_POLL);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_POLL);
-
- dprintf("cm_poll,enter\n");
-
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_POLL_SYN);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_POLL_SYN);
- volatile uint32_t *owner =
- (uint32_t *) (MPID_nem_ib_com_scratch_pad->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_TO]);
- if (*owner == (uint32_t) - 1) {
+ /* Memory layout is (CAS-word:SYN#0:SYN#1:...:SYN#N:CMD#0:CMD#1:...CMD#M) */
+ void* slot = (MPID_nem_ib_scratch_pad + MPID_NEM_IB_CM_OFF_SYN +
+ sizeof(MPID_nem_ib_cm_cmd_t) * (0 % MPID_NEM_IB_CM_NSEG));
+
+ volatile uint8_t *head_flag = (uint8_t *) slot;
+ if (*head_flag == MPID_NEM_IB_CM_HEAD_FLAG_ZERO) {
goto fn_exit;
- } /* not acquired */
-
- MPID_nem_ib_com *ibcom_scratch_pad;
- ibcom_errno =
- MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[*owner], &ibcom_scratch_pad);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
-
- MPID_nem_ib_cm_cmd_t *received =
- (MPID_nem_ib_cm_cmd_t *) (ibcom_scratch_pad->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_TO] +
- sizeof(uint32_t));
- MPID_nem_ib_cm_cmd_t cmd;
- MPID_nem_ib_vc_area *vc_ib;
- switch (received->type) {
- case MPID_NEM_IB_CM_SYN:
- ibcom_errno =
- MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC, &MPID_nem_ib_conns[*owner].fd);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
- cmd.type = MPID_NEM_IB_CM_SYNACK;
- goto common_tail;
- break;
- case MPID_NEM_IB_CM_BUSINESSCARD:
- ibcom_errno =
- MPID_nem_ib_com_rts(MPID_nem_ib_conns[*owner].fd, received->qpnum, received->lid,
- &(received->gid));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_rts");
- ibcom_errno =
- MPID_nem_ib_com_reg_mr_connect(MPID_nem_ib_conns[*owner].fd, received->rmem,
- received->rkey);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_reg_mr_connect");
- VC_FIELD(MPID_nem_ib_conns[owner].vc, is_connected) = 1;
-
- cmd.type = MPID_NEM_IB_CM_ACK;
- common_tail:
- ibcom_errno =
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conns[*owner].fd,
- MPID_NEM_IB_COM_INFOKEY_PORT_LID, &(cmd.lid),
- sizeof(uint16_t));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_get_info_conn");
-
- ibcom_errno =
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conns[*owner].fd,
- MPID_NEM_IB_COM_INFOKEY_PORT_GID, &(cmd.gid),
- sizeof(union ibv_gid));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_get_info_conn");
-
- ibcom_errno =
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conns[*owner].fd,
- MPID_NEM_IB_COM_INFOKEY_QP_QPN, &(cmd.qpnum),
- sizeof(uint32_t));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_get_info_conn");
-
- ibcom_errno =
- MPID_nem_ib_com_get_info_mr(MPID_nem_ib_conns[*owner].fd,
- MPID_NEM_IB_COM_SCRATCH_PAD_TO,
- MPID_NEM_IB_COM_INFOKEY_MR_ADDR, &(cmd.rmem),
- sizeof(void *));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_get_info_mr");
-
- ibcom_errno =
- MPID_nem_ib_com_get_info_mr(MPID_nem_ib_conns[*owner].fd,
- MPID_NEM_IB_COM_SCRATCH_PAD_TO,
- MPID_NEM_IB_COM_INFOKEY_MR_RKEY, &(cmd.rkey), sizeof(int));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_get_info_mr");
-
- *owner = (uint32_t) - 1; /* release */
-
- mpi_errno = MPID_nem_ib_cm_send_core(rank, &cmd);
- MPIU_ERR_CHKANDJUMP(mp_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_send_core");
+ } /* Incoming message hasn't arrived */
+
+ MPID_nem_ib_cm_cmd_t *cmd;
+ switch (*head_flag) {
+ case MPID_NEM_IB_CM_SYN: {
+ volatile MPID_nem_ib_cm_cmd_syn_t *syn_tail_flag =
+ (MPID_nem_ib_cm_cmd_syn_t *) slot;
+ while (syn_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
+ /* __asm__ __volatile__("pause;":::"memory"); */
+ }
+
+ volatile uint64_t *cas_word = (uint64_t *) (MPID_nem_ib_scratch_pad);
+ MPID_nem_ib_cm_cmd_syn_t *syn = (MPID_nem_ib_cm_cmd_syn_t *) slot;
+
+ dprintf("cm_poll_syn,syn detected!,initiator_rank=%d,ringbuf_index=%d\n",
+ syn->initiator_rank, syn->responder_ringbuf_index);
+ /* Skip QP createion on race condition */
+ if(!(VC_FIELD(MPID_nem_ib_conns[syn->initiator_rank].vc, connection_state) &
+ MPID_NEM_IB_CM_LOCAL_QP_RESET)) {
+ ibcom_errno = MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC, &MPID_nem_ib_conns[syn->initiator_rank].fd);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
+ /* store pointer to MPID_nem_ib_com */
+ dprintf("cm_poll_syn,initiator fd=%d\n", MPID_nem_ib_conns[syn->initiator_rank].fd);
+ ibcom_errno = MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[syn->initiator_rank].fd,
+ &VC_FIELD(MPID_nem_ib_conns[syn->initiator_rank].vc, ibcom));
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
+
+ /* Allocate RDMA-write-to ring-buf for remote */
+ mpi_errno = MPID_nem_ib_ringbuf_alloc(MPID_nem_ib_conns[syn->initiator_rank].vc);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_alloc");
+
+ /* Record state transition for race condition detection */
+ VC_FIELD(MPID_nem_ib_conns[syn->initiator_rank].vc, connection_state) |=
+ MPID_NEM_IB_CM_LOCAL_QP_RESET;
+ }
+
+ MPID_nem_ib_cm_req_t* req = MPIU_Malloc(sizeof(MPID_nem_ib_cm_req_t));
+ MPIU_ERR_CHKANDJUMP(!req, mpi_errno, MPI_ERR_OTHER, "**malloc");
+ req->state = MPID_NEM_IB_CM_SYNACK;
+ req->ref_count = 1; /* Released when draining SCQ of ACK2 */
+ req->ringbuf_index = syn->responder_ringbuf_index;
+ req->initiator_rank = syn->initiator_rank;
+ req->responder_rank = MPID_nem_ib_myrank;
+ ibcom_errno =
+ MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[req->initiator_rank],
+ &req->ibcom);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
+
+ if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
+ req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
+ MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
+ MPID_nem_ib_cm_ringbuf_tail) < MPID_NEM_IB_CM_NSEG) {
+
+ MPID_nem_ib_cm_cmd_synack_t *cmd = (MPID_nem_ib_cm_cmd_synack_t *) req->ibcom->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
+ MPID_NEM_IB_CM_COMPOSE_SYNACK(cmd, req, syn->initiator_req);
+ dprintf("cm_poll_syn,composing synack,responder_req=%p,cmd->rmem=%lx,rkey=%08x,ringbuf_nslot=%d,remote_vc=%lx\n",
+ cmd->responder_req, cmd->rmem, cmd->rkey, cmd->ringbuf_nslot, cmd->remote_vc);
+ cmd->initiator_ringbuf_index = req->initiator_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
+ dprintf("cm_poll_syn,giving ringbuf_index=%d\n", cmd->initiator_ringbuf_index);
+ MPID_nem_ib_cm_ringbuf_head++;
+ MPID_nem_ib_cm_cmd_shadow_t * shadow =
+ (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow->type = req->state;
+ shadow->req = req;
+ dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
+ mpi_errno = MPID_nem_ib_cm_cmd_core(req->initiator_rank, shadow, (void *) cmd, sizeof(MPID_nem_ib_cm_cmd_synack_t), 0, req->ringbuf_index);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_send_core");
+
+ } else {
+ dprintf("cm_poll_syn,enqueue,ncqe=%d,ncom=%d,head=%d,tail=%d\n", MPID_nem_ib_ncqe_scratch_pad, req->ibcom->ncom_scratch_pad, MPID_nem_ib_cm_ringbuf_head, MPID_nem_ib_cm_ringbuf_tail);
+ MPID_NEM_IB_CM_COMPOSE_SYNACK((MPID_nem_ib_cm_cmd_synack_t *)&(req->cmd), req, syn->initiator_req);
+ MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
+ }
+ /* Release CAS word because there's no next write on this syn slot */
+ *cas_word = MPID_NEM_IB_CM_RELEASED;
+ }
+ //common_tail:
+ *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
+
+ /* Clear all possible tail-flag slots */
+ ((MPID_nem_ib_cm_cmd_syn_t *)slot)->tail_flag.tail_flag = 0;
break;
default:
printf("unknown connection command\n");
MPIU_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_poll");
+ break;
}
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_POLL_SYN);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+}
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_POLL);
+
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_cm_release
+#undef FCNAME
+int MPID_nem_ib_cm_release(uint16_t index) {
+ int mpi_errno = MPI_SUCCESS;
+ int old_ringbuf_tail = MPID_nem_ib_cm_ringbuf_tail;
+
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_RELEASE);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_RELEASE);
+
+ /* mark that one buffer has been released */
+ MPIU_Assert(0 <= index && index < MPID_NEM_IB_CM_NSEG);
+ //dprintf("user_data=%p,mem=%p,sub=%08lx,index=%d\n", user_data, vc_ib->ibcom->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO], (unsigned long)user_data - (unsigned long)vc_ib->ibcom->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO], index);
+ //dprintf("index=%d,released=%016lx\n", index, vc_ib->ibcom->remote_ringbuf->remote_released[index / 64]);
+ MPID_nem_ib_cm_ringbuf_released[index / 64] |= (1ULL << (index & 63));
+ //dprintf("released[index/64]=%016lx\n", vc_ib->ibcom->remote_ringbuf->remote_released[index / 64]);
+ int index_tail = ((uint16_t)(MPID_nem_ib_cm_ringbuf_tail + 1) % MPID_NEM_IB_CM_NSEG);
+ //dprintf("tail+1=%d,index_tail=%d\n", vc_ib->ibcom->rsr_seq_num_tail + 1, index_tail);
+ //dprintf("released=%016lx\n", vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64]);
+ while (1) {
+ if (((MPID_nem_ib_cm_ringbuf_released[index_tail / 64] >> (index_tail & 63)) & 1) ==
+ 1) {
+ MPID_nem_ib_cm_ringbuf_tail++;
+ MPID_nem_ib_cm_ringbuf_released[index_tail / 64] &= ~(1ULL << (index_tail & 63));
+ dprintf("MPID_nem_ib_cm_ringbuf_tail,incremented to %d\n", MPID_nem_ib_cm_ringbuf_tail);
+ }
+ else {
+ break;
+ }
+ }
+
+ /* A slot of the ringbuf is released, so kick progress engine */
+ if(MPID_nem_ib_cm_ringbuf_tail != old_ringbuf_tail) {
+ MPID_nem_ib_cm_progress();
+ }
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_RELEASE);
return mpi_errno;
- fn_fail:
+ //fn_fail:
goto fn_exit;
+
}
#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_accept
+#define FUNCNAME MPID_nem_ib_cm_poll
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_cm_accept()
+int MPID_nem_ib_cm_poll()
{
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
- int result;
- int i;
- struct ibv_wc cqe[MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN];
+ uint16_t i;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_ACCEPT);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_ACCEPT);
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_POLL);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_POLL);
- result = ibv_poll_cq(MPID_nem_ib_ud_shared_rcq, MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN, &cqe);
- MPIU_ERR_CHKANDJUMP(result < 0, mpi_errno, MPI_ERR_OTHER, "**netmod,ib,ibv_poll_cq");
+ /* Wrap-around tolerant by using "!=" */
+ for(i = MPID_nem_ib_cm_ringbuf_tail + 1; i != MPID_nem_ib_cm_ringbuf_head; i++) {
+
+ /* Memory layout is (CAS-word:SYN#0:SYN#1:...:SYN#N:CMD#0:CMD#1:...CMD#M) */
+ void* slot = (MPID_nem_ib_scratch_pad + MPID_NEM_IB_CM_OFF_CMD +
+ sizeof(MPID_nem_ib_cm_cmd_t) *
+ ((uint16_t)(i % MPID_NEM_IB_CM_NSEG)));
+
+ volatile uint8_t *head_flag = (uint8_t *) slot;
+ if (*head_flag == MPID_NEM_IB_CM_HEAD_FLAG_ZERO) {
+ continue;
+ } /* Incoming message hasn't arrived */
+
+ switch (*head_flag) {
+ case MPID_NEM_IB_CM_SYNACK: {
+ volatile MPID_nem_ib_cm_cmd_synack_t *synack_tail_flag =
+ (MPID_nem_ib_cm_cmd_synack_t *) slot;
+ while (synack_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
+ /* __asm__ __volatile__("pause;":::"memory"); */
+ }
+
+ MPID_nem_ib_cm_cmd_synack_t *synack = (MPID_nem_ib_cm_cmd_synack_t *) slot;
+ MPID_nem_ib_cm_req_t* req = (MPID_nem_ib_cm_req_t *) synack->initiator_req;
+ req->ringbuf_index = synack->initiator_ringbuf_index;
+
+ dprintf("cm_poll,synack detected!,responder_req=%p,responder_rank=%d,ringbuf_index=%d\n",
+ synack->responder_req, req->responder_rank, synack->initiator_ringbuf_index);
+
+ /* Deduce it from the packet */
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_state) |=
+ MPID_NEM_IB_CM_REMOTE_QP_RESET;
+
+ /* Skip QP state transition on race condition */
+ if(!(VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_state) &
+ MPID_NEM_IB_CM_LOCAL_QP_RTS)) {
+ ibcom_errno =
+ MPID_nem_ib_com_rts(MPID_nem_ib_conns[req->responder_rank].fd, synack->qpnum, synack->lid,
+ &(synack->gid));
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_rts");
+
+ /* Connect ring buffer */
+ ibcom_errno =
+ MPID_nem_ib_com_connect_ringbuf(MPID_nem_ib_conns[req->responder_rank].fd,
+ synack->ringbuf_type,
+ synack->rmem, synack->rkey, synack->ringbuf_nslot,
+ synack->remote_vc,
+ 1);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_connect_ringbuf");
+ dprintf("connect_ringbuf,%d-%d=%d\n",
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, ibcom->sseq_num),
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, ibcom->lsr_seq_num_tail),
+ MPID_nem_ib_diff16(VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, ibcom->sseq_num),
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, ibcom->lsr_seq_num_tail))
+ );
+
+ /* Record state transition for race condition detection */
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_state) |=
+ MPID_NEM_IB_CM_LOCAL_QP_RTS;
+ }
- if (result > 0) {
- dprintf("accept,result=%d\n", result);
- }
- for (i = 0; i < result; i++) {
- dprintf("accept,i=%d\n", i);
-
- MPIU_ERR_CHKANDJUMP(cqe.status != IBV_WC_SUCCESS, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_accept");
-
- void *rbuf;
- ibcom_errno = MPID_nem_ib_com_mem_udwr_to(MPID_nem_ib_conn_ud_fd, &rbuf);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_mem_udwr_to");
- MPID_nem_ib_conn_pkt_t *rpkt = (MPID_nem_ib_conn_pkt_t *) (rbuf + 40);
- if (rpkt->type == MPID_NEM_IB_SYN) {
-
- dprintf("accept,%d<-%d,type=%08x\n", MPID_nem_ib_myrank, rpkt->remote_rank, rpkt->type);
-
- void *sbuf;
- ibcom_errno = MPID_nem_ib_com_mem_udwr_from(MPID_nem_ib_conn_ud_fd, &sbuf);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_mem_udwr_from");
- MPID_nem_ib_conn_pkt_t *spkt = (MPID_nem_ib_conn_pkt_t *) (sbuf + 40);
- spkt->remote_rank = MPID_nem_ib_myrank;
- spkt->type = MPID_NEM_IB_SYNACK;
-
- ibcom_errno =
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conns[rpkt->remote_rank].fd,
- MPID_NEM_IB_COM_INFOKEY_QP_QPN, &spkt->qpn,
- sizeof(uint32_t));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_get_info_conn");
-
- ibcom_errno =
- MPID_nem_ib_com_get_info_mr(MPID_nem_ib_conns[remote_rank].fd,
- MPID_NEM_IB_COM_RDMAWR_TO,
- MPID_NEM_IB_COM_INFOKEY_MR_ADDR, &spkt->rmem,
- sizeof(void *));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_get_info_mr");
-
- ibcom_errno =
- MPID_nem_ib_com_get_info_mr(MPID_nem_ib_conns[remote_rank].fd,
- MPID_NEM_IB_COM_RDMAWR_TO,
- MPID_NEM_IB_COM_INFOKEY_MR_RKEY, &spkt->rkey,
- sizeof(int));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_get_info_mr");
-
- /* kokomade. add udsend_core(synack) */
- if (MPID_nem_ib_conn_ibcom->ncom < &&MPID_nem_ib_ncqe_connect <) {
- MPID_nem_ib_conn_send_core(rpkt->remote_rank);
+ req->state = MPID_NEM_IB_CM_ACK1;
+ if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
+ req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY) {
+
+ dprintf("cm_poll,sending ack1,req=%p,ringbuf_index=%d\n", req, req->ringbuf_index);
+ MPID_nem_ib_cm_cmd_ack1_t *cmd = (MPID_nem_ib_cm_cmd_ack1_t *) req->ibcom->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
+ MPID_NEM_IB_CM_COMPOSE_ACK1(cmd, req, synack->responder_req);
+ dprintf("cm_poll,composing ack1,cmd->responder_req=%p,cmd->rmem=%lx,rkey=%08x,ringbuf_nslot=%d,remote_vc=%lx\n",
+ cmd->responder_req, cmd->rmem, cmd->rkey, cmd->ringbuf_nslot, cmd->remote_vc);
+ MPID_nem_ib_cm_cmd_shadow_t * shadow =
+ (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow->type = req->state;
+ shadow->req = req;
+ dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
+ mpi_errno = MPID_nem_ib_cm_cmd_core(req->responder_rank, shadow, (void *) cmd, sizeof(MPID_nem_ib_cm_cmd_ack1_t), 0, req->ringbuf_index);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_send_core");
+ } else {
+ MPID_NEM_IB_CM_COMPOSE_ACK1((MPID_nem_ib_cm_cmd_ack1_t *)&(req->cmd), req, synack->responder_req);
+ MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
}
- else {
- MPID_nem_ib_sendq_conn_entry_t *entry =
- MPIU_Malloc(sizeof(MPID_nem_ib_sendq_conn_entry_t));
- MPIU_ERR_CHKANDJUMP(!entry, mpi_errno, MPI_ERR_OTHER, "**outofmemory");
- entry->pending_pkt = *spkt;
- MPID_nem_ib_conn_sendq_enqueue(MPID_nem_ib_conn_sendq, entry);
+ }
+ goto common_tail;
+ break;
+ case MPID_NEM_IB_CM_ACK1: {
+ volatile MPID_nem_ib_cm_cmd_ack1_t *ack1_tail_flag =
+ (MPID_nem_ib_cm_cmd_ack1_t *) slot;
+ while (ack1_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
+ /* __asm__ __volatile__("pause;":::"memory"); */
+ }
+
+ MPID_nem_ib_cm_cmd_ack1_t *ack1 = (MPID_nem_ib_cm_cmd_ack1_t *) slot;
+ MPID_nem_ib_cm_req_t* req = (MPID_nem_ib_cm_req_t *) ack1->responder_req;
+
+ dprintf("cm_poll,ack1 detected!,responder_req=%p,initiator_rank=%d\n",
+ ack1->responder_req, req->initiator_rank);
+
+ /* Deduce it from the packet */
+ VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, connection_state) |=
+ (MPID_NEM_IB_CM_REMOTE_QP_RESET | MPID_NEM_IB_CM_REMOTE_QP_RTS);
+
+ /* Skip QP creation on race condition */
+ if(!(VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, connection_state) &
+ MPID_NEM_IB_CM_LOCAL_QP_RTS)) {
+ ibcom_errno =
+ MPID_nem_ib_com_rts(MPID_nem_ib_conns[req->initiator_rank].fd,
+ ack1->qpnum, ack1->lid, &(ack1->gid));
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_rts");
+
+ /* Connect ring buffer */
+ ibcom_errno =
+ MPID_nem_ib_com_connect_ringbuf(MPID_nem_ib_conns[req->initiator_rank].fd,
+ ack1->ringbuf_type,
+ ack1->rmem, ack1->rkey, ack1->ringbuf_nslot,
+ ack1->remote_vc,
+ 1);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_connect_ringbuf");
+ dprintf("connect_ringbuf,%d-%d=%d\n",
+ VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, ibcom->sseq_num),
+ VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, ibcom->lsr_seq_num_tail),
+ MPID_nem_ib_diff16(VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, ibcom->sseq_num),
+ VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, ibcom->lsr_seq_num_tail))
+ );
+
+ MPID_nem_ib_vc_onconnect(MPID_nem_ib_conns[req->initiator_rank].vc);
+
+ /* Record state transition for race condition detection */
+ VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, connection_state) |=
+ MPID_NEM_IB_CM_LOCAL_QP_RTS;
+ }
+
+ req->state = MPID_NEM_IB_CM_ACK2;
+ if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
+ req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY) {
+
+ dprintf("cm_poll,sending ack2,req=%p,ringbuf_index=%d,initiator_rank=%d\n",
+ req, req->ringbuf_index, req->initiator_rank);
+
+ MPID_nem_ib_cm_cmd_ack2_t *cmd = (MPID_nem_ib_cm_cmd_ack2_t *) req->ibcom->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
+ MPID_NEM_IB_CM_COMPOSE_ACK2(cmd, ack1->initiator_req);
+ MPID_nem_ib_cm_cmd_shadow_t * shadow =
+ (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow->type = req->state;
+ shadow->req = req;
+ dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
+ mpi_errno = MPID_nem_ib_cm_cmd_core(req->initiator_rank, shadow, (void *) cmd, sizeof(MPID_nem_ib_cm_cmd_ack2_t), 0, req->ringbuf_index);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_send_core");
+ } else {
+ MPID_NEM_IB_CM_COMPOSE_ACK2((MPID_nem_ib_cm_cmd_ack2_t *)&(req->cmd), ack1->initiator_req);
+ MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
}
+ /* The responder releases the slot for the initiator */
+ MPID_nem_ib_cm_release(req->initiator_ringbuf_index);
+ /* Kick ask-send commands waiting for connection */
+ MPID_nem_ib_ringbuf_progress();
+
+ /* Kick send commands waiting for connection.
+ This might be a dupe when running-ahead transaction kicked it when receiving ACK2. */
+ dprintf("cm_poll,kick progress engine for %d\n", req->initiator_rank);
+ MPID_nem_ib_send_progress(MPID_nem_ib_conns[req->initiator_rank].vc);
}
- else {
- dprintf("accept,unknown type=%08x\n", *((uint32_t *) (rbuf + 44)));
+ goto common_tail;
+ break;
+ case MPID_NEM_IB_CM_ACK2: {
+ volatile MPID_nem_ib_cm_cmd_ack2_t *ack2_tail_flag =
+ (MPID_nem_ib_cm_cmd_ack2_t *) slot;
+ while (ack2_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
+ /* __asm__ __volatile__("pause;":::"memory"); */
+ }
+ MPID_nem_ib_cm_cmd_ack2_t *ack2 = (MPID_nem_ib_cm_cmd_ack2_t *) slot;
+ MPID_nem_ib_cm_req_t* req = (MPID_nem_ib_cm_req_t *) ack2->initiator_req;
+
+ dprintf("cm_poll,ack2 detected!,req=%p,responder_rank=%d\n",
+ req, req->responder_rank);
+
+ /* Deduce it from the packet */
+ if(!(VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_state) &
+ MPID_NEM_IB_CM_REMOTE_QP_RTS)) {
+ MPID_nem_ib_vc_onconnect(MPID_nem_ib_conns[req->responder_rank].vc);
+
+ /* Record state transition for race condition detection */
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_state) |=
+ MPID_NEM_IB_CM_REMOTE_QP_RTS;
+ }
+
+ /* The initiator releases the slot for the responder */
+ MPID_nem_ib_cm_release(req->responder_ringbuf_index);
+
+ /* Acquire ring-buffer slot now that it's connected if requested so */
+ if(req->ask_on_connect &&
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc,
+ ibcom->local_ringbuf_type) == MPID_NEM_IB_RINGBUF_SHARED) {
+ dprintf("cm_poll,ack2,ask on connect\n");
+ mpi_errno = MPID_nem_ib_ringbuf_ask_fetch(MPID_nem_ib_conns[req->responder_rank].vc);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_ask_fetch");
+ }
+
+ /* Kick ask-send commands waiting for connection */
+ MPID_nem_ib_ringbuf_progress();
+
+ /* Kick send commands waiting for connection.
+ This might be a dupe when running-ahead transaction kicked it when receiving ACK1. */
+ dprintf("cm_poll,kick progress engine for %d\n", req->responder_rank);
+ MPID_nem_ib_send_progress(MPID_nem_ib_conns[req->responder_rank].vc);
+
+ /* Let the following connection request go */
+ VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_guard) = 0;
+
+ /* Finalize protocol because there is no referrer in cm_poll and sendq.
+ Note that there might be one which sent ACK1 in cm_drain_scq. */
+ MPID_nem_ib_cm_request_release(req);
+ }
+ common_tail:
+ *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
+ /* Clear all possible tail-flag slots */
+ MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
+ break;
+ default:
+ printf("unknown connection command\n");
+ MPIU_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_poll");
+ break;
}
}
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_ACCEPT);
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_POLL);
return mpi_errno;
- fn_fail:
+ fn_fail:
goto fn_exit;
}
+
#endif
+
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_ringbuf_alloc
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_ringbuf_alloc(MPIDI_VC_t * vc)
+{
+ int mpi_errno = MPI_SUCCESS;
+ int ibcom_errno;
+ int i;
+
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_ALLOC);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_ALLOC);
+
+ if(!MPID_nem_ib_ringbuf) {
+ MPID_nem_ib_ringbuf = MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_t) * MPID_NEM_IB_NRINGBUF);
+ MPIU_ERR_CHKANDJUMP(!MPID_nem_ib_ringbuf, mpi_errno, MPI_ERR_OTHER,
+ "**malloc");
+ }
+
+#if 0 /* Debug, "#if 1" to make exclusive ring-buffers not available */
+ //if(MPID_nem_ib_myrank == 0) {
+ for(i = 0; i < MPID_NEM_IB_NRINGBUF - 1; i++) {
+ MPID_nem_ib_ringbuf_acquired[i / 64] |= (1ULL << (i & 63));
+ }
+ //}
+#endif
+
+ int found = 0;
+ /* [MPID_NEM_IB_NRINGBUF-1] holds shared ring buffer */
+ for(i = 0; i < MPID_NEM_IB_NRINGBUF - 1; i++) {
+ if (((MPID_nem_ib_ringbuf_acquired[i / 64] >> (i & 63)) & 1) == 0) {
+ found = 1;
+ break;
+ }
+ }
+
+ if(found) {
+ MPID_nem_ib_ringbuf_acquired[i / 64] |= (1ULL << (i & 63));
+
+ if(!MPID_nem_ib_ringbuf[i].start) {
+ MPID_nem_ib_ringbuf[i].type = MPID_NEM_IB_RINGBUF_EXCLUSIVE;
+ MPID_nem_ib_ringbuf[i].start = MPID_nem_ib_rdmawr_to_alloc(MPID_NEM_IB_RINGBUF_NSLOT);
+ MPIU_ERR_CHKANDJUMP(!MPID_nem_ib_ringbuf[i].start, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_rdma_to_alloc");
+ MPID_nem_ib_ringbuf[i].nslot = MPID_NEM_IB_RINGBUF_NSLOT;
+ memset(MPID_nem_ib_ringbuf[i].remote_released, 0, (MPID_NEM_IB_RINGBUF_NSLOT + 63) / 64);
+ MPID_nem_ib_ringbuf_allocated[i / 64] |= (1ULL << (i & 63));
+ }
+ VC_FIELD(vc, ibcom->remote_ringbuf) = &MPID_nem_ib_ringbuf[i];
+ dprintf("ringbuf_alloc,start=%p\n", MPID_nem_ib_ringbuf[i].start);
+
+ VC_FIELD(vc, ibcom->rsr_seq_num_poll) = 0;
+ VC_FIELD(vc, ibcom->rsr_seq_num_tail) = -1;
+ VC_FIELD(vc, ibcom->rsr_seq_num_tail_last_sent) = -1;
+
+ MPID_nem_ib_ringbuf[i].vc = vc;
+ dprintf("ringbuf_alloc,i=%d,pg_rank=%d,ibcom=%p,ibcom->remote_ringbuf=%p\n",
+ i, vc->pg_rank, VC_FIELD(vc, ibcom), VC_FIELD(vc, ibcom->remote_ringbuf));
+ } else {
+ if(!MPID_nem_ib_ringbuf[i].start) {
+ MPID_nem_ib_ringbuf[i].type = MPID_NEM_IB_RINGBUF_SHARED;
+ MPID_nem_ib_ringbuf[i].start = MPID_nem_ib_rdmawr_to_alloc(MPID_NEM_IB_RINGBUF_NSLOT);
+ MPIU_ERR_CHKANDJUMP(!MPID_nem_ib_ringbuf[i].start, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_rdma_to_alloc");
+ MPID_nem_ib_ringbuf[i].nslot = MPID_NEM_IB_RINGBUF_NSLOT;
+ memset(MPID_nem_ib_ringbuf[i].remote_released, 0, (MPID_NEM_IB_RINGBUF_NSLOT + 63) / 64);
+ MPID_nem_ib_ringbuf_allocated[i / 64] |= (1ULL << (i & 63));
+ }
+ VC_FIELD(vc, ibcom->remote_ringbuf) = &MPID_nem_ib_ringbuf[i];
+
+ dprintf("ringbuf_alloc,not found\n");
+ }
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_ALLOC);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+}
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
index b401df2..6704b95 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
@@ -25,7 +25,8 @@
#define MPID_NEM_IB_COM_REG_MR_SZPAGE 4096
#define MPID_NEM_IB_COM_REG_MR_LOGSZPAGE 12
-/* arena allocator */
+/* Allocator using reference count at the head of
+ aligned memory area */
#define MPID_NEM_IB_NIALLOCID 32
typedef struct {
@@ -110,7 +111,7 @@ struct MPID_nem_ib_com_reg_mr_cache_entry_t {
static struct MPID_nem_ib_com_reg_mr_listnode_t
MPID_nem_ib_com_reg_mr_cache[MPID_NEM_IB_COM_REG_MR_NLINE];
-__inline__ int MPID_nem_ib_com_hash_func(char *addr)
+static inline int MPID_nem_ib_com_hash_func(char *addr)
{
unsigned int v = (unsigned int) (unsigned long) addr;
//v = v >> MPID_NEM_IB_COM_REG_MR_LOGSZPAGE; /* assume it is page aligned */
@@ -162,7 +163,7 @@ static inline void __lru_queue_display()
}
}
-struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, int len)
+struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, int len, enum ibv_access_flags additional_flags)
{
#if 0 /* debug */
struct ibv_mr *mr;
@@ -239,7 +240,7 @@ struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, int len)
dprintf("MPID_nem_ib_com_reg_mr_fetch,miss,addr=%p,len=%d\n", addr_aligned, len_aligned);
/* register memory */
- ibcom_errno = MPID_nem_ib_com_reg_mr(addr_aligned, len_aligned, &e->mr);
+ ibcom_errno = MPID_nem_ib_com_reg_mr(addr_aligned, len_aligned, &e->mr, additional_flags);
if (ibcom_errno != 0) {
fprintf(stderr, "mrcache,MPID_nem_ib_com_reg_mr\n");
goto fn_fail;
@@ -297,10 +298,13 @@ static void MPID_nem_ib_com_reg_mr_dereg(struct ibv_mr *mr)
//e->refc, offset);
}
-void MPID_nem_ib_com_register_cache_init()
+int MPID_nem_ib_com_register_cache_init()
{
+ int ibcom_errno = 0;
int i;
+ ref_cout++;
+
/* Using the address to the start node to express the end of the list
* instead of using NULL */
for (i = 0; i < MPID_NEM_IB_COM_REG_MR_NLINE; i++) {
@@ -311,13 +315,25 @@ void MPID_nem_ib_com_register_cache_init()
}
dprintf("[MrCache] cache initializes %d entries\n", MPID_NEM_IB_COM_REG_MR_NLINE);
+
+ fn_exit:
+ return ibcom_errno;
+ //fn_fail:
+ goto fn_exit;
}
-void MPID_nem_ib_com_register_cache_destroy()
+int MPID_nem_ib_com_register_cache_release()
{
+ int ibcom_errno = 0;
+ int ib_errno;
struct MPID_nem_ib_com_reg_mr_cache_entry_t *p;
int i = 0, cnt = 0;
+ MPIU_Assert(ref_count > 0) {
+ if(--ref_count > 0) {
+ goto fn_exit;
+ }
+
for (i = 0; i < MPID_NEM_IB_COM_REG_MR_NLINE; i++) {
for (p =
(struct MPID_nem_ib_com_reg_mr_cache_entry_t *) MPID_nem_ib_com_reg_mr_cache[i].
@@ -325,7 +341,8 @@ void MPID_nem_ib_com_register_cache_destroy()
p != (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) &MPID_nem_ib_com_reg_mr_cache[i];
p = (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) p->lru_next) {
if (p && p->addr > 0) {
- MPID_nem_ib_com_dereg_mr(p->mr);
+ ib_errno = MPID_nem_ib_com_dereg_mr(p->mr);
+ MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, printf("MPID_nem_ib_com_dereg_mr"));
afree(p, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
cnt++;
}
@@ -335,4 +352,8 @@ void MPID_nem_ib_com_register_cache_destroy()
//__lru_queue_display();
dprintf("[MrCache] cache destroyed %d entries\n", cnt);
+ fn_exit:
+ return ibcom_errno;
+ fn_fail:
+ goto fn_exit;
}
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
index 48b9b83..303964f 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
@@ -91,13 +91,13 @@ void MPID_nem_ib_cm_map_set(MPID_nem_ib_cm_map_t * map, char *key, int key_lengt
int MPID_nem_ib_cm_map_get(MPID_nem_ib_cm_map_t * map, char *key, int key_length, int *val)
{
- int llc_errno = LLC_SUCCESS;
+ int mpi_errno = MPI_SUCCESS;
char *pTree = map->data;
dprintf("MPID_nem_ib_cm_map_get,key=%s\n", key);
if (!pTree) {
- llc_errno = -1;
+ mpi_errno = -1;
dprintf("pTree is empty\n");
goto fn_fail;
}
@@ -116,7 +116,7 @@ int MPID_nem_ib_cm_map_get(MPID_nem_ib_cm_map_t * map, char *key, int key_length
else if (s1_minus_s2 < 0 || !s1_minus_s2 && residual < 0) {
// psArg is "smaller" OR same substring, psArg is shorter
if (MPID_NEM_IB_MAP_LPTR(pTree) == 0) {
- llc_errno = -1;
+ mpi_errno = -1;
dprintf("left is null\n");
goto fn_fail;
}
@@ -125,7 +125,7 @@ int MPID_nem_ib_cm_map_get(MPID_nem_ib_cm_map_t * map, char *key, int key_length
else {
// psArg is "larger" OR same substring, psArg is longer
if (MPID_NEM_IB_MAP_RPTR(pTree) == 0) {
- llc_errno = -1;
+ mpi_errno = -1;
dprintf("right is null\n");
goto fn_fail;
}
@@ -133,7 +133,7 @@ int MPID_nem_ib_cm_map_get(MPID_nem_ib_cm_map_t * map, char *key, int key_length
}
}
fn_exit:
- return llc_errno;
+ return mpi_errno;
fn_fail:
goto fn_exit;
}
@@ -150,28 +150,15 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
int ibcom_errno;
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
MPID_nem_ib_pkt_prefix_t pkt_netmod;
- void *netmod_hdr;
- int sz_netmod_hdr;
+ void *prefix;
+ int sz_prefix;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_ISENDCONTIG_CORE);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_ISENDCONTIG_CORE);
/* piggy-back SR occupancy info might copy and modify given header */
- /* remote SR sequence number which is last sent */
- int *rsr_seq_num_tail;
- ibcom_errno = MPID_nem_ib_com_rsr_seq_num_tail_get(vc_ib->sc->fd, &rsr_seq_num_tail);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_rsr_seq_num_tail_get");
-
- /* remote SR sequence number which is last sent */
- int *rsr_seq_num_tail_last_sent;
- ibcom_errno =
- MPID_nem_ib_com_rsr_seq_num_tail_last_sent_get(vc_ib->sc->fd, &rsr_seq_num_tail_last_sent);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_rsr_seq_num_tail_last_sent_get");
-
- //dprintf("isendcontig,rsr_seq_num_tail=%d,rsr_seq_num_tail_last_sent=%d\n", *rsr_seq_num_tail, *rsr_seq_num_tail_last_sent);
+ //dprintf("isendcontig,rsr_seq_num_tail=%d,rsr_seq_num_tail_last_sent=%d\n", vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent);
int notify_rate;
ibcom_errno = MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(vc_ib->sc->fd, ¬ify_rate);
@@ -181,12 +168,17 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
/* send RDMA-write-to buffer occupancy information */
/* embed SR occupancy information and remember the last one sent */
MPIDI_CH3_Pkt_t *ch3_hdr = (MPIDI_CH3_Pkt_t *) hdr;
- if (MPID_nem_ib_diff32(*rsr_seq_num_tail, *rsr_seq_num_tail_last_sent) > notify_rate) {
-#if 1 /* debug, disabling piggy-back */
+ if (MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent) > notify_rate) {
+#if 0 /* debug, disabling piggy-back */
switch (ch3_hdr->type) {
case MPIDI_CH3_PKT_EAGER_SEND:
pkt_netmod.subtype = MPIDI_NEM_IB_PKT_EAGER_SEND;
goto common_tail;
+#if 0 /* modification of mpid_nem_lmt.c is required */
+ case MPIDI_NEM_PKT_LMT_RTS:
+ pkt_netmod.subtype = MPIDI_NEM_IB_PKT_LMT_RTS;
+ goto common_tail;
+#endif
case MPIDI_CH3_PKT_PUT:
pkt_netmod.subtype = MPIDI_NEM_IB_PKT_PUT;
goto common_tail;
@@ -200,24 +192,24 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
pkt_netmod.subtype = MPIDI_NEM_IB_PKT_GET_RESP;
common_tail:
pkt_netmod.type = MPIDI_NEM_PKT_NETMOD;
- pkt_netmod.seq_num_tail = *rsr_seq_num_tail;
- *rsr_seq_num_tail_last_sent = *rsr_seq_num_tail;
- netmod_hdr = (void *) &pkt_netmod;
- sz_netmod_hdr = sizeof(MPID_nem_ib_pkt_prefix_t);
+ pkt_netmod.seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail;
+ vc_ib->ibcom->rsr_seq_num_tail_last_sent = vc_ib->ibcom->rsr_seq_num_tail;
+ prefix = (void *) &pkt_netmod;
+ sz_prefix = sizeof(MPID_nem_ib_pkt_prefix_t);
break;
default:
- netmod_hdr = NULL;
- sz_netmod_hdr = 0;
+ prefix = NULL;
+ sz_prefix = 0;
break;
}
#else
- netmod_hdr = NULL;
- sz_netmod_hdr = 0;
+ prefix = NULL;
+ sz_prefix = 0;
#endif
}
else {
- netmod_hdr = NULL;
- sz_netmod_hdr = 0;
+ prefix = NULL;
+ sz_prefix = 0;
}
/* packet handlers including MPIDI_CH3_PktHandler_EagerSend and MPID_nem_handle_pkt assume this */
@@ -249,8 +241,8 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
int msg_type = MPIDI_Request_get_msg_type(sreq);
dprintf
- ("isendcontig_core,netmod_hdr=%p,sz_netmod_hdr=%d,hdr=%p,sz_hdr=%ld,data=%p,sz_data=%d\n",
- netmod_hdr, sz_netmod_hdr, hdr, hdr_sz, data, (int) data_sz);
+ ("isendcontig_core,sreq=%p,prefix=%p,sz_prefix=%d,hdr=%p,sz_hdr=%ld,data=%p,sz_data=%d,remote_ringbuf->type=%d\n",
+ sreq, prefix, sz_prefix, hdr, hdr_sz, data, (int) data_sz, vc_ib->ibcom->remote_ringbuf->type);
if (sizeof(MPIDI_CH3_Pkt_t) != hdr_sz) {
printf("type=%d,subtype=%d\n", ((MPID_nem_pkt_netmod_t *) hdr)->type,
@@ -259,8 +251,14 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
int copied;
ibcom_errno =
- MPID_nem_ib_com_isend(vc_ib->sc->fd, (uint64_t) sreq, netmod_hdr, sz_netmod_hdr, hdr,
- hdr_sz, data, (int) data_sz, &copied);
+ MPID_nem_ib_com_isend(vc_ib->sc->fd,
+ (uint64_t) sreq,
+ prefix, sz_prefix,
+ hdr, hdr_sz,
+ data, (int) data_sz,
+ &copied,
+ vc_ib->ibcom->local_ringbuf_type, vc_ib->ibcom->remote_ringbuf->type,
+ &REQ_FIELD(sreq, buf_from), &REQ_FIELD(sreq, buf_from_sz));
MPIU_ERR_CHKFATALANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_isend");
MPID_nem_ib_ncqe += 1;
//dprintf("isendcontig_core,ncqe=%d\n", MPID_nem_ib_ncqe);
@@ -357,7 +355,7 @@ static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, vo
vc_ib->ibcom->lsr_seq_num_tail, MPID_NEM_IB_COM_RDMABUF_HIGH_WATER_MARK);
/* if the number of slots in RMDA-write-to buffer have hit the high water-mark */
if (*notify_rstate == MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_LW &&
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num,
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
vc_ib->ibcom->lsr_seq_num_tail) >
MPID_NEM_IB_COM_RDMABUF_HIGH_WATER_MARK) {
dprintf("changing notify_rstate,id=%d\n", vc_ib->ibcom->sseq_num);
@@ -395,11 +393,34 @@ int MPID_nem_ib_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "ib_iSendContig");
MPIDI_DBG_Print_packet((MPIDI_CH3_Pkt_t *) hdr);
-#ifdef MPID_NEM_IB_ONDEMAND
- if (!vc_ib->is_connected) {
- MPID_nem_ib_send_syn(vc);
+ if (vc_ib->connection_state == MPID_NEM_IB_CM_CLOSED) {
+ if(vc_ib->connection_guard == 0) {
+ vc_ib->connection_guard = 1;
+ /* connected=no,ringbuf-type=shared,slot-available=no,
+ going-to-be-enqueued=yes case */
+ MPID_nem_ib_cm_cas(vc, 0); /* Call ask_fetch just after it's connected */
+ }
+
+ }
+ if (vc_ib->connection_state != MPID_NEM_IB_CM_ESTABLISHED) {
+ /* connected=closed/transit,ringbuf-type=shared,slot-available=no,
+ going-to-be-enqueued=yes case */
+ REQ_FIELD(sreq, ask) = 0;
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_ask_fetch");
+ } else {
+ /* connected=established,ringbuf-type=shared,slot-available=no,
+ going-to-be-enqueued=yes case */
+ if (vc_ib->ibcom->local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED &&
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
+ vc_ib->ibcom->lsr_seq_num_tail) >= vc_ib->ibcom->local_ringbuf_nslot) {
+ dprintf("isendcontig,RINGBUF_SHARED and full,asking\n");
+ mpi_errno = MPID_nem_ib_ringbuf_ask_fetch(vc);
+ REQ_FIELD(sreq, ask) = 1;
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_ask_fetch");
+ } else {
+ REQ_FIELD(sreq, ask) = 0;
+ }
}
-#endif
#if 0
/* aggressively perform drain_scq */
@@ -416,6 +437,14 @@ int MPID_nem_ib_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
/* set it for drain_scq */
MPIDI_Request_set_msg_type(sreq, MPIDI_REQUEST_EAGER_MSG);
+
+
+#ifdef MPID_NEM_IB_ONDEMAND
+ if (vc_ib->connection_state != MPID_NEM_IB_CM_ESTABLISHED) {
+ goto enqueue;
+ }
+#endif
+
#if 0
/* anticipating received message releases RDMA-write-to buffer or IB command-queue entry */
/* Unexpected state MPIDI_VC_STATE_CLOSED in vc 0xf1fed0 (expecting MPIDI_VC_STATE_ACTIVE)
@@ -429,62 +458,50 @@ int MPID_nem_ib_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
}
#endif
- int *lsr_seq_num_tail;
- /* sequence number of (largest) completed send command */
- ibcom_errno = MPID_nem_ib_com_lsr_seq_num_tail_get(vc_ib->sc->fd, &lsr_seq_num_tail);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_lsr_seq_num_tail_get");
-
- int lsr_seq_num_head;
- /* sequence number of (largest) in-flight send command */
- ibcom_errno = MPID_nem_ib_com_sseq_num_get(vc_ib->sc->fd, &lsr_seq_num_head);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_sseq_num_get");
-
- dprintf("isendcontig,%d->%d,type=%d,subtype=%d,data_sz=%ld,ldiff=%d(%d-%d),rdiff=%d(%d-%d)\n",
- MPID_nem_ib_myrank, vc->pg_rank, ((MPIDI_CH3_Pkt_t *) hdr)->type,
+ dprintf("isendcontig,%d->%d,req=%p,type=%d,subtype=%d,data_sz=%ld,ldiff=%d(%d-%d),rdiff=%d(%d-%d)\n",
+ MPID_nem_ib_myrank, vc->pg_rank,
+ sreq,
+ ((MPIDI_CH3_Pkt_t *) hdr)->type,
((MPID_nem_pkt_netmod_t *) hdr)->subtype, data_sz,
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail,
- MPID_nem_ib_diff32(vc_ib->ibcom->rsr_seq_num_tail,
+ MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
vc_ib->ibcom->rsr_seq_num_tail_last_sent),
vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent);
dprintf("isendcontig,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n",
MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe,
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
/* if IB command overflow-queue is empty AND local IB command queue isn't full AND remote RDMA-write-to buf isn't getting overrun */
MPIDI_CH3_Pkt_t *ch3_hdr = (MPIDI_CH3_Pkt_t *) hdr;
- MPID_nem_pkt_netmod_t *netmod_hdr = (MPID_nem_pkt_netmod_t *) hdr;
+ MPID_nem_pkt_netmod_t *prefix = (MPID_nem_pkt_netmod_t *) hdr;
/* reserve one slot for control packet bringing sequence number
* to avoid dead-lock */
int slack = ((ch3_hdr->type != MPIDI_NEM_PKT_NETMOD ||
- netmod_hdr->subtype != MPIDI_NEM_IB_PKT_REQ_SEQ_NUM) &&
+ prefix->subtype != MPIDI_NEM_IB_PKT_REQ_SEQ_NUM) &&
(ch3_hdr->type != MPIDI_NEM_PKT_NETMOD ||
- netmod_hdr->subtype != MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM) &&
+ prefix->subtype != MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM) &&
(ch3_hdr->type != MPIDI_NEM_PKT_NETMOD ||
- netmod_hdr->subtype != MPIDI_NEM_IB_PKT_LMT_GET_DONE) &&
+ prefix->subtype != MPIDI_NEM_IB_PKT_LMT_GET_DONE) &&
ch3_hdr->type != MPIDI_NEM_PKT_LMT_RTS &&
ch3_hdr->type != MPIDI_NEM_PKT_LMT_CTS) ? MPID_NEM_IB_COM_AMT_SLACK : 0;
/* make control packet bringing sequence number go ahead of
* queued packets to avoid dead-lock */
int goahead =
(ch3_hdr->type == MPIDI_NEM_PKT_NETMOD &&
- netmod_hdr->subtype == MPIDI_NEM_IB_PKT_REQ_SEQ_NUM)
+ prefix->subtype == MPIDI_NEM_IB_PKT_REQ_SEQ_NUM)
|| (ch3_hdr->type == MPIDI_NEM_PKT_NETMOD &&
- netmod_hdr->subtype == MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM) ||
+ prefix->subtype == MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM) ||
(ch3_hdr->type == MPIDI_NEM_PKT_NETMOD &&
- netmod_hdr->subtype == MPIDI_NEM_IB_PKT_LMT_GET_DONE)
+ prefix->subtype == MPIDI_NEM_IB_PKT_LMT_GET_DONE)
? 1 : 0;
dprintf("isendcontig,slack=%d,goahead=%d\n", slack, goahead);
if (
-#ifdef MPID_NEM_IB_ONDEMAND
- vc_ib->is_connected &&
-#endif
- (goahead || MPID_nem_ib_sendq_empty(vc_ib->sendq)) &&
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack &&
- MPID_nem_ib_diff32(lsr_seq_num_head,
- *lsr_seq_num_tail) < MPID_NEM_IB_COM_RDMABUF_NSEG - slack) {
+ (goahead || MPID_nem_ib_sendq_empty(vc_ib->sendq)) &&
+ vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
+ MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack &&
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
+ vc_ib->ibcom->lsr_seq_num_tail) < vc_ib->ibcom->local_ringbuf_nslot - slack) {
mpi_errno = MPID_nem_ib_iSendContig_core(vc, sreq, hdr, hdr_sz, data, data_sz);
if (mpi_errno) {
@@ -493,15 +510,17 @@ int MPID_nem_ib_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
}
else {
-
/* enqueue command into send_queue */
- dprintf("isendcontig,enqueuing,sendq=%d,ncom=%d,ncqe=%d,ldiff=%d(%d-%d),slack=%d\n",
- (goahead || MPID_nem_ib_sendq_empty(vc_ib->sendq)),
+ dprintf("isendcontig,enqueuing,goahead=%d,sendq=%d,ncom=%d,ncqe=%d,ldiff=%d(%d-%d),slack=%d\n",
+ goahead, MPID_nem_ib_sendq_empty(vc_ib->sendq),
vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack,
MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack,
- MPID_nem_ib_diff32(lsr_seq_num_head, *lsr_seq_num_tail),
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail, slack);
+ /* The enqueue label is placed after the dprintf above, because that message refers to values that are undefined when jumping here */
+ enqueue:
+
/* store required info. see MPIDI_CH3_iSendv in src/mpid/ch3/channels/nemesis/src/ch3_isendv.c */
sreq->dev.pending_pkt = *(MPIDI_CH3_Pkt_t *) hdr;
sreq->dev.iov[0].MPID_IOV_BUF = (char *) &sreq->dev.pending_pkt;
@@ -625,19 +644,16 @@ int MPID_nem_ib_iStartContigMsg(MPIDI_VC_t * vc, void *hdr, MPIDI_msg_sz_t hdr_s
//tsce = MPID_nem_ib_rdtsc(); printf("rc,%ld\n", tsce - tscs); // 124.15 cycles
#if 0
- ibcom_errno = MPID_nem_ib_com_sseq_num_get(vc_ib->sc->fd, &sseq_num);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_sseq_num_get");
-
if (hdr) {
MPIDI_CH3_Pkt_t *pkt = (MPIDI_CH3_Pkt_t *) hdr;
MPIDI_CH3_Pkt_close_t *close_pkt = &pkt->close;
dprintf("isend(istartcontig),%d->%d,seq_num=%d,type=%d,ack=%d\n", MPID_nem_ib_myrank,
- vc->pg_rank, sseq_num, close_pkt->type, close_pkt->ack);
+ vc->pg_rank, vc_ib->ibcom->sseq_num, close_pkt->type, close_pkt->ack);
}
else {
dprintf("isend(istartcontig),%d->%d,seq_num=%d\n", MPID_nem_ib_myrank, vc->pg_rank,
- sseq_num);
+ vc_ib->ibcom->sseq_num);
}
#endif
@@ -666,7 +682,6 @@ static int MPID_nem_ib_SendNoncontig_core(MPIDI_VC_t * vc, MPID_Request * sreq,
int ibcom_errno;
MPIDI_msg_sz_t last;
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- int sseq_num;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_SENDNONCONTIG_CORE);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_SENDNONCONTIG_CORE);
@@ -687,16 +702,24 @@ static int MPID_nem_ib_SendNoncontig_core(MPIDI_VC_t * vc, MPID_Request * sreq,
MPIR_Request_add_ref(sreq);
}
- ibcom_errno = MPID_nem_ib_com_sseq_num_get(vc_ib->sc->fd, &sseq_num);
- MPIU_ERR_CHKANDJUMP(ibcom_errno != 0, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_sseq_num_get");
+ if (sizeof(MPIDI_CH3_Pkt_t) != hdr_sz) {
+ printf("type=%d,subtype=%d\n", ((MPID_nem_pkt_netmod_t *) hdr)->type,
+ ((MPID_nem_pkt_netmod_t *) hdr)->subtype);
+ }
int copied;
- dprintf("sendnoncontig_core,isend,%d->%d,seq_num=%d\n", MPID_nem_ib_myrank, vc->pg_rank,
- sseq_num);
+ dprintf("sendnoncontig_core,isend,%d->%d,seq_num=%d,remote_ringbuf->type=%d\n", MPID_nem_ib_myrank, vc->pg_rank,
+ vc_ib->ibcom->sseq_num, vc_ib->ibcom->remote_ringbuf->type);
+
ibcom_errno =
- MPID_nem_ib_com_isend(vc_ib->sc->fd, (uint64_t) sreq, NULL, 0, hdr, sizeof(MPIDI_CH3_Pkt_t),
- (void *) REQ_FIELD(sreq, lmt_pack_buf), (int) last, &copied);
+ MPID_nem_ib_com_isend(vc_ib->sc->fd,
+ (uint64_t) sreq,
+ NULL, 0,
+ hdr, hdr_sz,
+ (void *) REQ_FIELD(sreq, lmt_pack_buf), (int) last,
+ &copied,
+ vc_ib->ibcom->local_ringbuf_type, vc_ib->ibcom->remote_ringbuf->type,
+ &REQ_FIELD(sreq, buf_from), &REQ_FIELD(sreq, buf_from_sz));
MPIU_ERR_CHKANDJUMP(ibcom_errno != 0, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_isend");
MPID_nem_ib_ncqe += 1;
dprintf("sendnoncontig_core,ncqe=%d\n", MPID_nem_ib_ncqe);
@@ -758,7 +781,7 @@ static int MPID_nem_ib_SendNoncontig_core(MPIDI_VC_t * vc, MPID_Request * sreq,
vc_ib->ibcom->lsr_seq_num_tail, MPID_NEM_IB_COM_RDMABUF_HIGH_WATER_MARK);
/* if the number of slots in RMDA-write-to buffer have hit the high water-mark */
if (*notify_rstate == MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_LW &&
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num,
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
vc_ib->ibcom->lsr_seq_num_tail) >
MPID_NEM_IB_COM_RDMABUF_HIGH_WATER_MARK) {
dprintf("changing notify_rstate,id=%d\n", vc_ib->ibcom->sseq_num);
@@ -790,16 +813,43 @@ int MPID_nem_ib_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
int ibcom_errno;
MPIDI_msg_sz_t last;
MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- int sseq_num;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_SENDNONCONTIG);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_SENDNONCONTIG);
MPIU_Assert(hdr_sz <= sizeof(MPIDI_CH3_Pkt_t));
MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "ib_SendNoncontig");
+ if (vc_ib->connection_state == MPID_NEM_IB_CM_CLOSED) {
+ if(vc_ib->connection_guard == 0) {
+ vc_ib->connection_guard = 1;
+ /* connected=closed,ringbuf-type=shared,slot-available=no,
+ going-to-be-enqueued=yes case */
+ MPID_nem_ib_cm_cas(vc, 0); /* Call ask_fetch just after it's connected */
+ }
+ }
+ if (vc_ib->connection_state != MPID_NEM_IB_CM_ESTABLISHED) {
+ /* connected=closed/transit,ringbuf-type=shared,slot-available=no,
+ going-to-be-enqueued=yes case */
+ REQ_FIELD(sreq, ask) = 0;
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_ask_fetch");
+ } else {
+ /* connected=established,ringbuf-type=shared,slot-available=no,
+ going-to-be-enqueued=yes case */
+ if (vc_ib->ibcom->local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED &&
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
+ vc_ib->ibcom->lsr_seq_num_tail) >= vc_ib->ibcom->local_ringbuf_nslot) {
+ dprintf("sendnoncontig,RINGBUF_SHARED and full,asking\n");
+ REQ_FIELD(sreq, ask) = 1;
+ mpi_errno = MPID_nem_ib_ringbuf_ask_fetch(vc);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_ask_fetch");
+ } else {
+ REQ_FIELD(sreq, ask) = 0;
+ }
+ }
+
dprintf("sendnoncontig,%d->%d,sendq_empty=%d,ncom=%d,rdmabuf_occ=%d\n", MPID_nem_ib_myrank,
vc->pg_rank, MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom,
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
#if 0
/* aggressively perform drain_scq */
/* try to clear the road blocks, i.e. ncom, ncqe */
@@ -818,11 +868,16 @@ int MPID_nem_ib_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
/* if IB command overflow-queue is empty AND local IB command queue isn't full AND remote RDMA-write-to buf isn't getting overrun */
/* set it for drain_scq */
int slack = MPID_NEM_IB_COM_AMT_SLACK; /* slack for control packet bringing sequence number */
- if (MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
+
+ if (
+#ifdef MPID_NEM_IB_ONDEMAND
+ vc_ib->connection_state == MPID_NEM_IB_CM_ESTABLISHED &&
+#endif
+ MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack &&
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) < MPID_NEM_IB_COM_RDMABUF_NSEG - slack) {
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
+ vc_ib->ibcom->lsr_seq_num_tail) < vc_ib->ibcom->local_ringbuf_nslot - slack) {
mpi_errno = MPID_nem_ib_SendNoncontig_core(vc, sreq, hdr, hdr_sz);
if (mpi_errno) {
@@ -833,7 +888,7 @@ int MPID_nem_ib_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
else {
/* enqueue command into send_queue */
dprintf("sendnoncontig, enqueuing");
-
+ //enqueue:
/* store required info. see MPIDI_CH3_iSendv in src/mpid/ch3/channels/nemesis/src/ch3_isendv.c */
sreq->dev.pending_pkt = *(MPIDI_CH3_Pkt_t *) hdr;
sreq->dev.iov[0].MPID_IOV_BUF = (char *) &sreq->dev.pending_pkt;
@@ -859,21 +914,29 @@ int MPID_nem_ib_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
#define FUNCNAME MPID_nem_ib_send_progress
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_send_progress(MPID_nem_ib_vc_area * vc_ib)
+int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
{
int mpi_errno = MPI_SUCCESS;
int ibcom_errno;
+ MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
MPID_IOV *iov;
int n_iov;
MPID_Request *sreq, *prev_sreq;
int again = 0;
- int msg_type;
+ int req_type, msg_type;
MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_SEND_PROGRESS);
MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_SEND_PROGRESS);
//dprintf("send_progress,enter\n");
+#ifdef MPID_NEM_IB_ONDEMAND
+ if(vc_ib->connection_state != MPID_NEM_IB_CM_ESTABLISHED) {
+ //dprintf("send_progress,connection_state=%08x\n", vc_ib->connection_state);
+ goto fn_exit;
+ }
+#endif
+
/* prevent a call path send_progress -> drain_scq -> send_progress */
if (entered_send_progress) {
goto fn_exit;
@@ -897,50 +960,110 @@ int MPID_nem_ib_send_progress(MPID_nem_ib_vc_area * vc_ib)
"**MPID_nem_ib_drain_scq");
}
#endif
+ req_type = MPIDI_Request_get_type(sreq);
msg_type = MPIDI_Request_get_msg_type(sreq);
MPIDI_CH3_Pkt_t *ch3_hdr = (MPIDI_CH3_Pkt_t *) sreq->dev.iov[0].MPID_IOV_BUF;
- MPID_nem_pkt_netmod_t *netmod_hdr =
+ MPID_nem_pkt_netmod_t *netmod_pkt =
(MPID_nem_pkt_netmod_t *) sreq->dev.iov[0].MPID_IOV_BUF;
int slack = (msg_type == MPIDI_REQUEST_EAGER_MSG) ? /* guard from RDMA-read or RDMA-write */
(((ch3_hdr->type != MPIDI_NEM_PKT_NETMOD ||
- netmod_hdr->subtype != MPIDI_NEM_IB_PKT_REQ_SEQ_NUM) &&
+ netmod_pkt->subtype != MPIDI_NEM_IB_PKT_REQ_SEQ_NUM) &&
(ch3_hdr->type != MPIDI_NEM_PKT_NETMOD ||
- netmod_hdr->subtype != MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM) &&
+ netmod_pkt->subtype != MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM) &&
(ch3_hdr->type != MPIDI_NEM_PKT_NETMOD ||
- netmod_hdr->subtype != MPIDI_NEM_IB_PKT_LMT_GET_DONE) &&
+ netmod_pkt->subtype != MPIDI_NEM_IB_PKT_LMT_GET_DONE) &&
ch3_hdr->type != MPIDI_NEM_PKT_LMT_RTS &&
ch3_hdr->type !=
MPIDI_NEM_PKT_LMT_CTS) ? MPID_NEM_IB_COM_AMT_SLACK : 0) :
MPID_NEM_IB_COM_AMT_SLACK;
+
+ /* Temporary fix until removing slack */
+ if(vc_ib->ibcom->local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED) {
+ slack = 0;
+ }
+
+ /* Refill slots from queue
+ We don't need refill code in sendcontig because
+ there is an order where (1) send, (2) it's queued, (3) then ask obtains slots,
+ (4) then we can refill them here. */
+
+ if(vc_ib->ibcom->local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED &&
+ (msg_type == MPIDI_REQUEST_EAGER_MSG &&
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
+ vc_ib->ibcom->lsr_seq_num_tail) >= vc_ib->ibcom->local_ringbuf_nslot)) {
+ /* Prevent RDMA-read for rendezvous protocol from issuing ask */
+
+ if(!REQ_FIELD(sreq, ask)) {
+ /* Transitioning from exclusive to shared and need to issue ask.
+ This case is detected because exclusive entries in the queue are deleted
+ and deprived of slots of exclusive and the last state is set to
+ shared when deciding a transition from exclusive to shared
+ and an issued or queued ask must be in the queue or ringbuf_sendq
+ when staying shared. */
+ dprintf("send_progress,call ask_fetch,%d->%d\n",
+ MPID_nem_ib_myrank, vc->pg_rank);
+ mpi_errno = MPID_nem_ib_ringbuf_ask_fetch(vc);
+ REQ_FIELD(sreq, ask) = 1;
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_ringbuf_ask_fetch");
+ } else if(!MPID_nem_ib_ringbuf_sectorq_empty(vc_ib->ibcom->sectorq)) {
+ /* Staying shared or transitioning from shared to exclusive.
+ We need to consume acquired slots in the latter case.
+ Transitioning from shared to exclusive is achieved by
+ finding an exclusive entry. */
+ MPID_nem_ib_ringbuf_sector_t* sector =
+ MPID_nem_ib_ringbuf_sectorq_head(vc_ib->ibcom->sectorq);
+
+ vc_ib->ibcom->local_ringbuf_type = sector->type;
+ vc_ib->ibcom->local_ringbuf_start = sector->start;
+ vc_ib->ibcom->local_ringbuf_nslot = sector->nslot;
+ vc_ib->ibcom->sseq_num = sector->head;
+ vc_ib->ibcom->lsr_seq_num_tail = sector->tail;
+
+ MPID_nem_ib_ringbuf_sectorq_dequeue(&vc_ib->ibcom->sectorq, &sector);
+ MPIU_Free(sector);
+
+ dprintf("send_progress,refill,next type=%d,start=%p,local_head=%d,local_tail=%d\n",
+ vc_ib->ibcom->local_ringbuf_type,
+ vc_ib->ibcom->local_ringbuf_start,
+ vc_ib->ibcom->sseq_num,
+ vc_ib->ibcom->lsr_seq_num_tail
+ );
+ }
+ }
+
if (vc_ib->ibcom->ncom >= MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack ||
MPID_nem_ib_ncqe >= MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack ||
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) >=
- MPID_NEM_IB_COM_RDMABUF_NSEG - slack) {
- break;
+ (msg_type == MPIDI_REQUEST_EAGER_MSG &&
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
+ vc_ib->ibcom->lsr_seq_num_tail) >=
+ vc_ib->ibcom->local_ringbuf_nslot - slack)) {
+ /* Exit when full because this reduces the search cost.
+ Note that RDMA-read for rendezvous protocol can be issued even
+ when no ring-buffer slot is available. */
+ goto fn_exit;
}
-
if (vc_ib != MPID_nem_ib_debug_current_vc_ib) {
dprintf("send_progress,vc_ib != MPID_nem_ib_debug_current_vc_ib\n");
}
- dprintf("send_progress,kind=%d,msg_type=%d\n", sreq->kind, msg_type);
+ dprintf("send_progress,req=%p,kind=%d,msg_type=%d\n", sreq, sreq->kind, msg_type);
if (msg_type == MPIDI_REQUEST_EAGER_MSG) {
- dprintf("send_progress,type=%d\n", ch3_hdr->type);
+ dprintf("send_progress,ch3_hdr->type=%d\n", ch3_hdr->type);
}
dprintf("send_progress,%d->%d,rdiff=%d(%d-%d),ldiff=%d(%d-%d),slack=%d\n",
MPID_nem_ib_myrank, sreq->ch.vc->pg_rank,
- MPID_nem_ib_diff32(vc_ib->ibcom->rsr_seq_num_tail,
+ MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
vc_ib->ibcom->rsr_seq_num_tail_last_sent),
vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent,
- MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num,
+ MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
vc_ib->ibcom->lsr_seq_num_tail),
vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail, slack);
if (sreq->kind == MPID_REQUEST_SEND && msg_type == MPIDI_REQUEST_EAGER_MSG) {
if (!sreq->ch.noncontig) {
dprintf
- ("send_progress,contig,type=%d,sseq_num=%d,MPIDI_NEM_PKT_LMT_RTS=%d,MPIDI_NEM_IB_PKT_LMT_GET_DONE=%d\n",
+ ("send_progress,contig,ch3_hdr->type=%d,sseq_num=%d,MPIDI_NEM_PKT_LMT_RTS=%d,MPIDI_NEM_IB_PKT_LMT_GET_DONE=%d\n",
ch3_hdr->type, vc_ib->ibcom->sseq_num, MPIDI_NEM_PKT_LMT_RTS,
MPIDI_NEM_IB_PKT_LMT_GET_DONE);
if (sreq->dev.iov[1].MPID_IOV_LEN > 0) {
@@ -959,11 +1082,13 @@ int MPID_nem_ib_send_progress(MPID_nem_ib_vc_area * vc_ib)
(MPID_nem_ib_lmt_cookie_t *) sreq->dev.iov[1].MPID_IOV_BUF;
dprintf("send_progress,MPIDI_NEM_PKT_LMT_RTS,rsr_seq_num_tail=%d\n",
vc_ib->ibcom->rsr_seq_num_tail);
+#if 0 /* moving to packet header */
/* embed RDMA-write-to buffer occupancy information */
s_cookie_buf->seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail;
/* remember the last one sent */
vc_ib->ibcom->rsr_seq_num_tail_last_sent =
vc_ib->ibcom->rsr_seq_num_tail;
+#endif
break;
}
@@ -972,11 +1097,13 @@ int MPID_nem_ib_send_progress(MPID_nem_ib_vc_area * vc_ib)
(MPID_nem_ib_lmt_cookie_t *) sreq->dev.iov[1].MPID_IOV_BUF;
dprintf("send_progress,MPIDI_NEM_PKT_LMT_CTS,rsr_seq_num_tail=%d\n",
vc_ib->ibcom->rsr_seq_num_tail);
+#if 0 /* moving to packet header */
/* embed RDMA-write-to buffer occupancy information */
s_cookie_buf->seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail;
/* remember the last one sent */
vc_ib->ibcom->rsr_seq_num_tail_last_sent =
vc_ib->ibcom->rsr_seq_num_tail;
+#endif
break;
}
@@ -984,10 +1111,11 @@ int MPID_nem_ib_send_progress(MPID_nem_ib_vc_area * vc_ib)
}
if (ch3_hdr->type == MPIDI_NEM_PKT_NETMOD) {
- switch (netmod_hdr->subtype) {
+ switch (netmod_pkt->subtype) {
/* send current rsr_seq_num_tail because message from target to initiator
* might have happened while being queued */
case MPIDI_NEM_IB_PKT_LMT_GET_DONE:{
+#if 0
MPID_nem_ib_pkt_lmt_get_done_t *_done_pkt =
(MPID_nem_ib_pkt_lmt_get_done_t *) sreq->dev.
iov[0].MPID_IOV_BUF;
@@ -999,6 +1127,7 @@ int MPID_nem_ib_send_progress(MPID_nem_ib_vc_area * vc_ib)
/* remember the last one sent */
vc_ib->ibcom->rsr_seq_num_tail_last_sent =
vc_ib->ibcom->rsr_seq_num_tail;
+#endif
break;
}
case MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM:{
@@ -1057,7 +1186,7 @@ int MPID_nem_ib_send_progress(MPID_nem_ib_vc_area * vc_ib)
else if (sreq->kind == MPID_REQUEST_SEND && msg_type == MPIDI_REQUEST_RNDV_MSG) {
}
else {
- dprintf("send_progress,unknown sreq->type=%d,msg_type=%d\n", sreq->kind, msg_type);
+ dprintf("send_progress,unknown sreq=%p,sreq->kind=%d,msg_type=%d\n", sreq, sreq->kind, msg_type);
assert(0);
MPIU_ERR_INTERNALANDJUMP(mpi_errno, "send_progress,unknown type");
}
@@ -1082,10 +1211,13 @@ int MPID_nem_ib_send_progress(MPID_nem_ib_vc_area * vc_ib)
prev_sreq = sreq;
sreq = MPID_nem_ib_sendq_next(sreq);
next_unlinked:;
+ if(!sreq) {
+ dprintf("send_progress,sendq has got empty!\n");
+ }
} while (sreq);
}
- //dprintf("send_progress,exit,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
+ //dprintf("send_progress,exit,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
fn_exit:
entered_send_progress = 0;
@@ -1096,53 +1228,627 @@ int MPID_nem_ib_send_progress(MPID_nem_ib_vc_area * vc_ib)
}
#ifdef MPID_NEM_IB_ONDEMAND
-int MPID_nem_ib_cm_send_core(int rank, MPID_nem_ib_cm_cmd_t * cmd)
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_cm_progress
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_cm_progress()
{
- MPID_nem_ib_com *ibcom_scratch_pad;
+ int mpi_errno = MPI_SUCCESS;
+ int ibcom_errno;
+ MPID_nem_ib_cm_req_t *sreq, *prev_sreq;
+ MPID_nem_ib_cm_cmd_shadow_t* shadow;
+
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_PROGRESS);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_PROGRESS);
+
+ //dprintf("cm_send_progress,enter\n");
+
+ sreq = MPID_nem_ib_cm_sendq_head(MPID_nem_ib_cm_sendq);
+ if (sreq) {
+ prev_sreq = NULL;
+ do {
+
+ if (sreq->ibcom->ncom >= MPID_NEM_IB_COM_MAX_SQ_CAPACITY ||
+ MPID_nem_ib_ncqe_scratch_pad >= MPID_NEM_IB_COM_MAX_CQ_CAPACITY) {
+ goto next;
+ }
+
+ switch (sreq->state) {
+ case MPID_NEM_IB_CM_CAS:
+ /* This comparison is OK if the diff is within 63-bit range */
+ if (MPID_nem_ib_diff63(MPID_nem_ib_progress_engine_vt, sreq->retry_decided) <
+ sreq->retry_backoff) {
+#if 0
+ dprintf("cm_progress,vt=%ld,retry_decided=%ld,diff=%ld,backoff=%ld\n",
+ MPID_nem_ib_progress_engine_vt, sreq->retry_decided,
+ MPID_nem_ib_diff63(MPID_nem_ib_progress_engine_vt, sreq->retry_decided),
+ sreq->retry_backoff);
+#endif
+ goto next;
+ }
+ dprintf("cm_progress,retry CAS,responder_rank=%d,req=%p,decided=%ld,vt=%ld,backoff=%ld\n",
+ sreq->responder_rank, sreq, sreq->retry_decided,
+ MPID_nem_ib_progress_engine_vt, sreq->retry_backoff);
+ shadow =
+ (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow->type = sreq->state;
+ shadow->req = sreq;
+ mpi_errno =
+ MPID_nem_ib_cm_cas_core(sreq->responder_rank, shadow);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_connect_cas_core");
+ break;
+ case MPID_NEM_IB_CM_SYN:
+ /* The initiator acquires a slot for the responder when sending syn */
+ if(MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
+ MPID_nem_ib_cm_ringbuf_tail) >= MPID_NEM_IB_CM_NSEG) {
+ goto next;
+ }
+ ((MPID_nem_ib_cm_cmd_syn_t*)&sreq->cmd)->responder_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
+ MPID_nem_ib_cm_ringbuf_head++;
+ shadow =
+ (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow->type = sreq->state;
+ shadow->req = sreq;
+ dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
+ mpi_errno =
+ MPID_nem_ib_cm_cmd_core(sreq->responder_rank, shadow,
+ (void *)(&sreq->cmd),
+ sizeof(MPID_nem_ib_cm_cmd_synack_t), 1 /* syn:1 */, 0);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
+ break;
+ case MPID_NEM_IB_CM_SYNACK:
+ /* The responder acquires a slot for the initiator when sending synack */
+ if(MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
+ MPID_nem_ib_cm_ringbuf_tail) >= MPID_NEM_IB_CM_NSEG) {
+ goto next;
+ }
+ ((MPID_nem_ib_cm_cmd_synack_t*)&sreq->cmd)->initiator_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
+ MPID_nem_ib_cm_ringbuf_head++;
+ shadow =
+ (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow->type = sreq->state;
+ shadow->req = sreq;
+ dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
+ mpi_errno =
+ MPID_nem_ib_cm_cmd_core(sreq->initiator_rank, shadow,
+ (void *)(&sreq->cmd),
+ sizeof(MPID_nem_ib_cm_cmd_synack_t), 0, sreq->ringbuf_index);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
+ break;
+ case MPID_NEM_IB_CM_ACK1:
+ shadow =
+ (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow->type = sreq->state;
+ shadow->req = sreq;
+ dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
+ mpi_errno =
+ MPID_nem_ib_cm_cmd_core(sreq->responder_rank, shadow,
+ (void *)(&sreq->cmd),
+ sizeof(MPID_nem_ib_cm_cmd_ack1_t), 0, sreq->ringbuf_index);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
+ break;
+ case MPID_NEM_IB_CM_ACK2:
+ shadow =
+ (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow->type = sreq->state;
+ shadow->req = sreq;
+ dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
+ mpi_errno =
+ MPID_nem_ib_cm_cmd_core(sreq->initiator_rank, shadow,
+ (void *)(&sreq->cmd),
+ sizeof(MPID_nem_ib_cm_cmd_ack2_t), 0, sreq->ringbuf_index);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_cm_send_core");
+ break;
+ default:
+ dprintf("cm_progress,unknown state=%d\n", sreq->state);
+ assert(0);
+ MPIU_ERR_INTERNALANDJUMP(mpi_errno, "cm_progress,unknown state");
+ }
+
+ /* unlink sreq */
+ if (prev_sreq != NULL) {
+ MPID_nem_ib_cm_sendq_next(prev_sreq) = MPID_nem_ib_cm_sendq_next(sreq);
+ }
+ else {
+ MPID_nem_ib_cm_sendq_head(MPID_nem_ib_cm_sendq) = MPID_nem_ib_cm_sendq_next(sreq);
+ }
+ if (MPID_nem_ib_cm_sendq_next(sreq) == NULL) {
+ MPID_nem_ib_cm_sendq.tail = prev_sreq;
+ }
+
+ /* save sreq->dev.next (and sreq) because decrementing reference-counter might free sreq */
+ MPID_nem_ib_cm_req_t *tmp_sreq = sreq;
+ sreq = MPID_nem_ib_cm_sendq_next(sreq);
+ goto next_unlinked;
+ next:
+ prev_sreq = sreq;
+ sreq = MPID_nem_ib_cm_sendq_next(sreq);
+ next_unlinked:;
+ } while (sreq);
+ }
+
+ fn_exit:
+ entered_send_progress = 0;
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_PROGRESS);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_cm_cas_core
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_cm_cas_core(int rank, MPID_nem_ib_cm_cmd_shadow_t* shadow)
+{
+ int mpi_errno = MPI_SUCCESS;
+ int ibcom_errno;
+ int val;
+
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_CAS_CORE);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_CAS_CORE);
+
+ dprintf("cm_cas_core,enter\n");
+
+ /* Compare-and-swap rank to acquire communication manager port */
ibcom_errno =
- MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[rank], &ibcom_scratch_pad);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
+ MPID_nem_ib_com_cas_scratch_pad(MPID_nem_ib_scratch_pad_fds[rank],
+ (uint64_t) shadow,
+ 0,
+ MPID_NEM_IB_CM_RELEASED, rank,
+ &shadow->buf_from, &shadow->buf_from_sz);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_cas_scratch_pad");
+ MPID_nem_ib_ncqe_scratch_pad += 1;
+
+ /* Direct poll to drain CQ to check CAS result */
+ MPID_nem_ib_ncqe_scratch_pad_to_drain += 1;
+ dprintf("ringbuf_cm_cas_core,scratch_pad_to_drain=%d\n",
+ MPID_nem_ib_ncqe_scratch_pad_to_drain);
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_CAS_CORE);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_cm_cas
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_cm_cas(MPIDI_VC_t * vc, uint32_t ask_on_connect)
+{
+ int mpi_errno = MPI_SUCCESS;
+ int ibcom_errno;
+ int val;
- if (MPID_nem_ib_ncqe_scratch_pad >= MPID_NEM_IB_COM_MAX_SQ_CAPACITY ||
- ibcom_scratch_pad->ncom_scratch_pad >= MPID_NEM_IB_COM_MAX_CQ_CAPACITY) {
- mpi_errno = MPID_nem_ib_drain_scq_scratch_pad();
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_CAS);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_CAS);
+
+ dprintf("cm_cas,enter\n");
+
+ /* Prepare request structure for enqueued case */
+ MPID_nem_ib_cm_req_t* req = MPIU_Malloc(sizeof(MPID_nem_ib_cm_req_t));
+ MPIU_ERR_CHKANDJUMP(!req, mpi_errno, MPI_ERR_OTHER, "**malloc");
+ dprintf("req=%p\n", req);
+ req->state = MPID_NEM_IB_CM_CAS;
+ req->ref_count = 2; /* Released on receiving ACK2 and draining SCQ of ACK1 */
+ req->retry_backoff = 0;
+ req->initiator_rank = MPID_nem_ib_myrank;
+ req->responder_rank = vc->pg_rank;
+ req->ask_on_connect = ask_on_connect;
+ ibcom_errno =
+ MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
+ &req->ibcom);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
+ dprintf("req->ibcom=%p\n", req->ibcom);
+
+ /* Acquire remote scratch pad */
+ if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
+ req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
+ MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head, MPID_nem_ib_cm_ringbuf_tail) < MPID_NEM_IB_CM_NSEG) {
+ MPID_nem_ib_cm_cmd_shadow_t * shadow =
+ (MPID_nem_ib_cm_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
+ shadow->type = req->state;
+ shadow->req = req;
+
+ mpi_errno = MPID_nem_ib_cm_cas_core(req->responder_rank, shadow);
MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_drain_scq_scratch_pad");
+ "**MPID_nem_ib_cm_cas");
+ } else {
+ dprintf("cm_cas,enqueue\n");
+ req->retry_decided = MPID_nem_ib_progress_engine_vt;
+ MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
}
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_CAS);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+}
+
+/* We're trying to send SYN when syn is one */
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_cm_cmd_core
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_cm_cmd_core(int rank, MPID_nem_ib_cm_cmd_shadow_t* shadow, void* buf, MPIDI_msg_sz_t sz, uint32_t syn, uint16_t ringbuf_index)
+{
+ int mpi_errno = MPI_SUCCESS;
+ int ibcom_errno;
+ int val;
+ MPID_nem_ib_cm_cmd_t cmd;
+ int ib_port = 1;
+
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_CMD_CORE);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_CMD_CORE);
+
+ dprintf("cm_cmd_core,enter,syn=%d\n", syn);
+
+ shadow->req->ibcom = MPID_nem_ib_scratch_pad_ibcoms[rank];
ibcom_errno =
MPID_nem_ib_com_put_scratch_pad(MPID_nem_ib_scratch_pad_fds[rank],
- (uint64_t) ibcom_scratch_pad, sizeof(uint32_t),
- sizeof(MPID_nem_ib_cm_cmd_t), (void *) cmd);
+ (uint64_t) shadow,
+ syn ? MPID_NEM_IB_CM_OFF_SYN :
+ MPID_NEM_IB_CM_OFF_CMD +
+ sizeof(MPID_nem_ib_cm_cmd_t) *
+ ((uint16_t)(ringbuf_index % MPID_NEM_IB_CM_NSEG)),
+ sz,
+ buf);
+
MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_put_scratch_pad");
MPID_nem_ib_ncqe_scratch_pad += 1;
- /* atomic write to doorbell */
+ if(syn) {
+ /* Skip QP creation on race condition */
+ if(!(VC_FIELD(MPID_nem_ib_conns[rank].vc, connection_state) &
+ MPID_NEM_IB_CM_LOCAL_QP_RESET)) {
+
+ /* Prepare QP (RESET). Attempting to overlap it with preparing QP (RESET) on the responder side */
+ ibcom_errno = MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC, &MPID_nem_ib_conns[rank].fd);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
+ VC_FIELD(MPID_nem_ib_conns[rank].vc, connection_state) |=
+ MPID_NEM_IB_CM_LOCAL_QP_RESET;
+
+ /* Store pointer to MPID_nem_ib_com */
+ ibcom_errno = MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[rank].fd,
+ &VC_FIELD(MPID_nem_ib_conns[rank].vc, ibcom));
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
+
+ /* Allocate RDMA-write-to ring-buf for remote */
+ mpi_errno = MPID_nem_ib_ringbuf_alloc(MPID_nem_ib_conns[rank].vc);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ring_alloc");
+ }
+ }
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_CMD_CORE);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
}
#endif /* MPID_NEM_IB_ONDEMAND */
+/* RDMA-read the head pointer of the shared ring buffer */
#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_connect
+#define FUNCNAME MPID_nem_ib_ringbuf_ask_fetch_core
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_cm_connect(MPIDI_VC_t * vc)
+int MPID_nem_ib_ringbuf_ask_fetch_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t* shadow, MPIDI_msg_sz_t sz)
{
int mpi_errno = MPI_SUCCESS;
+ int ibcom_errno;
+
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_FETCH_CORE);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_FETCH_CORE);
+
+ dprintf("ringbuf_ask_fetch_core,req=%p\n", shadow->req);
+
+ ibcom_errno =
+ MPID_nem_ib_com_get_scratch_pad(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
+ (uint64_t) shadow,
+ MPID_NEM_IB_RINGBUF_OFF_HEAD,
+ sz,
+ &shadow->buf_from, &shadow->buf_from_sz);
+
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_get_scratch_pad");
+ MPID_nem_ib_ncqe_scratch_pad += 1;
+
+ /* Direct poll to drain CQ to issue CAS */
+ MPID_nem_ib_ncqe_scratch_pad_to_drain += 1;
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_FETCH_CORE);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_ringbuf_ask_fetch
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_ringbuf_ask_fetch(MPIDI_VC_t * vc)
+{
+ int mpi_errno = MPI_SUCCESS;
+ int ibcom_errno;
int val;
- MPID_nem_ib_cm_cmd_t cmd;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_CONNECT);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_CONNECT);
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_FETCH);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_FETCH);
- dprintf("connect,enter\n");
+ dprintf("ringbuf_ask_fetch,enter\n");
- cmd.type = MPID_NEM_IB_CM_SYN;
- mpi_errno = MPID_nem_ib_cm_send_core(rank, &cmd);
- MPIU_ERR_CHKANDJUMP(mp_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_put");
+ /* Prepare state of ask-send */
+ MPID_nem_ib_ringbuf_req_t* req = MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_req_t));
+ MPIU_ERR_CHKANDJUMP(!req, mpi_errno, MPI_ERR_OTHER, "**malloc");
+ dprintf("ask_fetch,req=%p\n", req);
+ req->state = MPID_NEM_IB_RINGBUF_ASK_FETCH;
+ req->retry_backoff = 0;
+ req->vc = vc;
+ ibcom_errno =
+ MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
+ &req->ibcom);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
+
+ dprintf("ask_fetch,connection=%08x,ncqe=%d,ncom=%d,guard=%d\n",
+ VC_FIELD(vc, connection_state),
+ MPID_nem_ib_ncqe_scratch_pad,
+ req->ibcom->ncom_scratch_pad,
+ VC_FIELD(vc, ibcom->ask_guard)
+ );
+
+ /* Acquire remote scratch pad */
+ if (VC_FIELD(vc, connection_state) == MPID_NEM_IB_CM_ESTABLISHED &&
+ MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
+ req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
+ !VC_FIELD(vc, ibcom->ask_guard)) {
+
+ /* Let the guard up here to prevent CAS conflicts between consecutive asks
+ from the same process */
+ VC_FIELD(vc, ibcom->ask_guard) = 1;
+
+ MPID_nem_ib_ringbuf_cmd_shadow_t * shadow =
+ (MPID_nem_ib_ringbuf_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_cmd_shadow_t));
+ shadow->type = req->state;
+ shadow->req = req;
+
+ mpi_errno =
+ MPID_nem_ib_ringbuf_ask_fetch_core(req->vc, shadow,
+ sizeof(MPID_nem_ib_ringbuf_headtail_t));
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_ringbuf_ask_fetch");
+ } else {
+ dprintf("ask_fetch,enqueue,req=%p\n", req);
+ MPID_nem_ib_ringbuf_sendq_enqueue(&MPID_nem_ib_ringbuf_sendq, req);
+ }
fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_CONNECT);
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_FETCH);
return mpi_errno;
fn_fail:
goto fn_exit;
}
+
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_ringbuf_ask_cas_core
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_ringbuf_ask_cas_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t* shadow, uint64_t head)
+{
+ int mpi_errno = MPI_SUCCESS;
+ int ibcom_errno;
+ int val;
+
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_CAS_CORE);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_CAS_CORE);
+
+ dprintf("ringbuf_ask_cas_core,req=%p,head=%ld\n", shadow->req, head);
+
+ /* Compare-and-swap to increment head pointer */
+ ibcom_errno =
+ MPID_nem_ib_com_cas_scratch_pad(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
+ (uint64_t) shadow,
+ MPID_NEM_IB_RINGBUF_OFF_HEAD,
+ head, head + 1,
+ &shadow->buf_from, &shadow->buf_from_sz);
+ MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_cas_scratch_pad");
+ MPID_nem_ib_ncqe_scratch_pad += 1;
+
+ /* Direct poll to drain CQ to check CAS result */
+ MPID_nem_ib_ncqe_scratch_pad_to_drain += 1;
+ dprintf("ringbuf_ask_cas_core,scratch_pad_to_drain=%d\n",
+ MPID_nem_ib_ncqe_scratch_pad_to_drain);
+
+ /* Let the guard down here to overlap CAS with a fetch of the following request
+ when CAS fails, out-of-order acquire may happen, but it's OK */
+ VC_FIELD(vc, ibcom->ask_guard) = 0;
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_CAS_CORE);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_ringbuf_ask_cas
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_ringbuf_ask_cas(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_req_t* req)
+{
+ int mpi_errno = MPI_SUCCESS;
+ int ibcom_errno;
+ int val;
+
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_CAS);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_CAS);
+
+ dprintf("ask_cas,ncqe=%d,ncom=%d,head=%ld,tail=%d,diff=%d,nslot=%d\n",
+ MPID_nem_ib_ncqe_scratch_pad,
+ req->ibcom->ncom_scratch_pad,
+ req->fetched.head, req->fetched.tail,
+ MPID_nem_ib_diff16(req->fetched.head, req->fetched.tail),
+ VC_FIELD(vc, ibcom->local_ringbuf_nslot)
+ );
+
+ /* Acquire one slot of the shared ring buffer */
+ if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
+ req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY) {
+
+ if(MPID_nem_ib_diff16(req->fetched.head, req->fetched.tail) <
+ VC_FIELD(vc, ibcom->local_ringbuf_nslot)) {
+
+ dprintf("ask_cas,core\n");
+ req->state = MPID_NEM_IB_RINGBUF_ASK_CAS;
+ MPID_nem_ib_ringbuf_cmd_shadow_t * shadow =
+ (MPID_nem_ib_ringbuf_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_cmd_shadow_t));
+ shadow->type = req->state;
+ shadow->req = req;
+ mpi_errno = MPID_nem_ib_ringbuf_ask_cas_core(vc, shadow, (uint64_t)req->fetched.head);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_ringbuf_ask_cas");
+ } else {
+ dprintf("ask_cas,ringbuf full,enqueue\n");
+ /* Ring-buffer is full */
+
+ /* Let the guard down so that this ask-fetch can be issued in ringbuf_progress */
+#if 0 /*debug*/
+ VC_FIELD(vc, ibcom->ask_guard) = 0;
#endif
+ /* Retry from fetch */
+
+ /* Schedule retry */
+ req->retry_decided = MPID_nem_ib_progress_engine_vt;
+ req->retry_backoff = 0;
+
+ /* Make the ask-fetch in order */
+ MPID_nem_ib_ringbuf_sendq_enqueue_at_head(&MPID_nem_ib_ringbuf_sendq, req);
+ }
+ } else {
+ dprintf("ask_cas,ncqe or ncom full,enqueue\n");
+ req->retry_decided = MPID_nem_ib_progress_engine_vt;
+ req->retry_backoff = 0;
+ MPID_nem_ib_ringbuf_sendq_enqueue(&MPID_nem_ib_ringbuf_sendq, req);
+ }
+
+ fn_exit:
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_CAS);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPID_nem_ib_ringbuf_progress
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPID_nem_ib_ringbuf_progress()
+{
+ int mpi_errno = MPI_SUCCESS;
+ int ibcom_errno;
+ MPID_nem_ib_ringbuf_req_t *sreq, *prev_sreq;
+ MPID_nem_ib_ringbuf_cmd_shadow_t* shadow;
+
+ MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_PROGRESS);
+ MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_PROGRESS);
+
+ //dprintf("rinbguf_send_progress,enter\n");
+
+ sreq = MPID_nem_ib_ringbuf_sendq_head(MPID_nem_ib_ringbuf_sendq);
+ if (sreq) {
+ prev_sreq = NULL;
+ do {
+ if (VC_FIELD(sreq->vc, connection_state) != MPID_NEM_IB_CM_ESTABLISHED ||
+ sreq->ibcom->ncom >= MPID_NEM_IB_COM_MAX_SQ_CAPACITY ||
+ MPID_nem_ib_ncqe_scratch_pad >= MPID_NEM_IB_COM_MAX_CQ_CAPACITY) {
+ goto next;
+ }
+
+ switch (sreq->state) {
+ case MPID_NEM_IB_RINGBUF_ASK_CAS:
+ dprintf("ringbuf_progress,ask_cas,req=%p\n",
+ sreq);
+ shadow =
+ (MPID_nem_ib_ringbuf_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_cmd_shadow_t));
+ shadow->type = sreq->state;
+ shadow->req = sreq;
+ mpi_errno =
+ MPID_nem_ib_ringbuf_ask_cas_core(sreq->vc, shadow, (uint64_t)sreq->fetched.head);
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_ringbuf_connect_cas_core");
+ break;
+ case MPID_NEM_IB_RINGBUF_ASK_FETCH:
+ if (MPID_nem_ib_diff63(MPID_nem_ib_progress_engine_vt, sreq->retry_decided) <
+ sreq->retry_backoff) {
+ dprintf("ringbuf_progress,vt=%ld,retry_decided=%ld,diff=%ld,backoff=%ld\n",
+ MPID_nem_ib_progress_engine_vt, sreq->retry_decided,
+ MPID_nem_ib_diff63(MPID_nem_ib_progress_engine_vt, sreq->retry_decided),
+ sreq->retry_backoff);
+ goto next;
+ }
+ //dprintf("ringbuf_progress,ask_fetch,decided=%ld,vt=%ld,backoff=%ld\n",
+ //sreq->retry_decided, MPID_nem_ib_progress_engine_vt, sreq->retry_backoff);
+
+ /* Enqueued speculatively, so discard if not needed. */
+ if (VC_FIELD(sreq->vc, ibcom->local_ringbuf_type) == MPID_NEM_IB_RINGBUF_SHARED) {
+ if (VC_FIELD(sreq->vc, ibcom->ask_guard)) {
+ goto next;
+ }
+ dprintf("ringbuf_progress,ask_fetch,req=%p\n",
+ sreq);
+ VC_FIELD(sreq->vc, ibcom->ask_guard) = 1;
+ shadow =
+ (MPID_nem_ib_ringbuf_cmd_shadow_t *)MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_cmd_shadow_t));
+ shadow->type = sreq->state;
+ shadow->req = sreq;
+ mpi_errno =
+ MPID_nem_ib_ringbuf_ask_fetch_core(sreq->vc, shadow,
+ sizeof(MPID_nem_ib_ringbuf_headtail_t));
+ MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
+ "**MPID_nem_ib_ringbuf_send_core");
+ }
+ break;
+ default:
+ dprintf("ringbuf_progress,unknown state=%d\n", sreq->state);
+ assert(0);
+ MPIU_ERR_INTERNALANDJUMP(mpi_errno, "ringbuf_progress,unknown state");
+ }
+
+ /* unlink sreq */
+ if (prev_sreq != NULL) {
+ MPID_nem_ib_ringbuf_sendq_next(prev_sreq) = MPID_nem_ib_ringbuf_sendq_next(sreq);
+ }
+ else {
+ MPID_nem_ib_ringbuf_sendq_head(MPID_nem_ib_ringbuf_sendq) = MPID_nem_ib_ringbuf_sendq_next(sreq);
+ }
+ if (MPID_nem_ib_ringbuf_sendq_next(sreq) == NULL) {
+ MPID_nem_ib_ringbuf_sendq.tail = prev_sreq;
+ }
+
+ /* save sreq->dev.next (and sreq) because decrementing reference-counter might free sreq */
+ MPID_nem_ib_ringbuf_req_t *tmp_sreq = sreq;
+ sreq = MPID_nem_ib_ringbuf_sendq_next(sreq);
+
+ goto next_unlinked;
+ next:
+ prev_sreq = sreq;
+ sreq = MPID_nem_ib_ringbuf_sendq_next(sreq);
+ next_unlinked:;
+ } while (sreq);
+ }
+
+ fn_exit:
+ entered_send_progress = 0;
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_PROGRESS);
+ return mpi_errno;
+ fn_fail:
+ goto fn_exit;
+}
-----------------------------------------------------------------------
Summary of changes:
.../ch3/channels/nemesis/netmod/ib/Makefile.mk | 3 +-
.../ch3/channels/nemesis/netmod/ib/errnames.txt | 61 +-
src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c | 1059 +++++++----
src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h | 415 ++++-
src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h | 430 ++++-
src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c | 469 +++--
src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c | 19 +-
.../ch3/channels/nemesis/netmod/ib/ib_malloc.c | 472 +++++
src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c | 2108 ++++++++++++++------
.../ch3/channels/nemesis/netmod/ib/ib_reg_mr.c | 83 +-
src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c | 1140 +++++++++--
11 files changed, 4765 insertions(+), 1494 deletions(-)
create mode 100644 src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c
hooks/post-receive
--
MPICH primary repository
More information about the commits
mailing list