[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.2b3-49-g8b8b7f6
Service Account
noreply at mpich.org
Thu Jun 11 02:07:38 CDT 2015
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".
The branch, master, has been updated
via 8b8b7f67523602c38b31cf8fe0ad1d312c25af15 (commit)
via a8ac5b4e166db86c09732524a6c4a600718ee7e6 (commit)
from f02bffbe7a73d6cf073d7a4f33ef4e53bf8eb078 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/8b8b7f67523602c38b31cf8fe0ad1d312c25af15
commit 8b8b7f67523602c38b31cf8fe0ad1d312c25af15
Author: Pavan Balaji <balaji at anl.gov>
Date: Thu Jun 11 01:32:07 2015 -0500
Edit CHANGES and README.vin to remove IB mentions.
Signed-off-by: Sangmin Seo <sseo at anl.gov>
diff --git a/CHANGES b/CHANGES
index 0af3592..af5616d 100644
--- a/CHANGES
+++ b/CHANGES
@@ -15,9 +15,6 @@
# Added support for the Mellanox HCOLL interface for collectives.
(thanks to Mellanox for the code contribution).
- # Added support for OFED IB on Xeon and Xeon Phi. (thanks to RIKEN
- and University of Tokyo for the contribution).
-
# Significant stability improvements to the MPICH/portals4
implementation.
diff --git a/README.vin b/README.vin
index 61c147a..eae47e1 100644
--- a/README.vin
+++ b/README.vin
@@ -502,45 +502,6 @@ to "error" instead of the default "warn" by using:
MXM_LOG_LEVEL=error
export MXM_LOG_LEVEL
-ib network module
-`````````````````
-The IB netmod provides support for InfiniBand on x86_64 platforms
-(including Xeon Phi). It can be built in the following
-configurations:
-
-1. InfiniBand Open Fabrics, x86_64 (host), Linux
-
- For this mode, configure with the following option:
-
- --with-device=ch3:nemesis:ib
-
-2. InfiniBand Open Fabrics, Xeon Phi, Intel MPSS Linux
-
- For this mode, configure with the following options:
-
- --with-device=ch3:nemesis:ib
- --with-cross=<mpich-source>/src/mpid/ch3/channels/nemesis/netmod/ib/cross_values.txt
- --host=x86_64-k1om-linux
- --with-ib=/opt/intel/mic/ofed/card/usr
- CC=icc CXX=icpc FC=ifort
- CFLAGS=-mmic CXXFLAGS=-mmic FCFLAGS=-mmic LDFLAGS=-mmic
-
-3. InfiniBand Open Fabrics, Xeon Phi, McKernel (developed by
- University of Tokyo)
-
- For this mode, configure with the following options:
-
- --with-device=ch3:nemesis:ib
- --with-cross=<mpich-source>/src/mpid/ch3/channels/nemesis/netmod/ib/cross_values.txt
- --host=x86_64-k1om-linux
- --with-ib=<mckernel-source>/attached
- --disable-shared
- CC=icc CXX=icpc FC=ifort
- CFLAGS=-mmic CXXFLAGS=-mmic FCFLAGS=-mmic LDFLAGS=-mmic
-
- Note that shared builds are not supported for the third
- configuration right now.
-
portals4 network module
```````````````````````
http://git.mpich.org/mpich.git/commitdiff/a8ac5b4e166db86c09732524a6c4a600718ee7e6
commit a8ac5b4e166db86c09732524a6c4a600718ee7e6
Author: Pavan Balaji <balaji at anl.gov>
Date: Thu Jun 11 01:30:22 2015 -0500
Remove the IB netmod.
Signed-off-by: Sangmin Seo <sseo at anl.gov>
diff --git a/src/mpid/ch3/channels/nemesis/netmod/Makefile.mk b/src/mpid/ch3/channels/nemesis/netmod/Makefile.mk
index b7c17ee..5030d23 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/Makefile.mk
+++ b/src/mpid/ch3/channels/nemesis/netmod/Makefile.mk
@@ -9,7 +9,6 @@
include $(top_srcdir)/src/mpid/ch3/channels/nemesis/netmod/tcp/Makefile.mk
include $(top_srcdir)/src/mpid/ch3/channels/nemesis/netmod/none/Makefile.mk
include $(top_srcdir)/src/mpid/ch3/channels/nemesis/netmod/portals4/Makefile.mk
-include $(top_srcdir)/src/mpid/ch3/channels/nemesis/netmod/ib/Makefile.mk
include $(top_srcdir)/src/mpid/ch3/channels/nemesis/netmod/mxm/Makefile.mk
include $(top_srcdir)/src/mpid/ch3/channels/nemesis/netmod/ofi/Makefile.mk
include $(top_srcdir)/src/mpid/ch3/channels/nemesis/netmod/llc/Makefile.mk
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/Makefile.mk b/src/mpid/ch3/channels/nemesis/netmod/ib/Makefile.mk
deleted file mode 100644
index 89ac755..0000000
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/Makefile.mk
+++ /dev/null
@@ -1,24 +0,0 @@
-## -*- Mode: Makefile; -*-
-## vim: set ft=automake :
-##
-## (C) 2013 by Argonne National Laboratory.
-## See COPYRIGHT in top-level directory.
-##
-
-if BUILD_NEMESIS_NETMOD_IB
-
-mpi_core_sources += \
- src/mpid/ch3/channels/nemesis/netmod/ib/ib_finalize.c \
- src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c \
- src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c \
- src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c \
- src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c \
- src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c \
- src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c \
- src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c
-
-noinst_HEADERS += \
- src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h \
- src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
-
-endif BUILD_NEMESIS_NETMOD_IB
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/cross_values.txt b/src/mpid/ch3/channels/nemesis/netmod/ib/cross_values.txt
deleted file mode 100644
index 88a2171..0000000
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/cross_values.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-# The Fortran related cross compilation values.
-# This file is generated with mpich/maint/fcrosscompile/configure
-# with CC/F77/FC set to "icc/ifort -mmic".
-CROSS_F77_SIZEOF_INTEGER="4"
-CROSS_F77_SIZEOF_REAL="4"
-CROSS_F77_SIZEOF_DOUBLE_PRECISION="8"
-CROSS_F77_TRUE_VALUE="-1"
-CROSS_F77_FALSE_VALUE="0"
-CROSS_F90_ADDRESS_KIND="8"
-CROSS_F90_OFFSET_KIND="8"
-CROSS_F90_INTEGER_KIND="4"
-CROSS_F90_REAL_MODEL=" 6 , 37"
-CROSS_F90_DOUBLE_MODEL=" 15 , 307"
-CROSS_F90_INTEGER_MODEL=" 9"
-CROSS_F90_ALL_INTEGER_MODELS=" 2 , 1, 4 , 2, 9 , 4, 18 , 8,"
-CROSS_F90_INTEGER_MODEL_MAP=" { 2 , 1 , 1 }, { 4 , 2 , 2 }, { 9 , 4 , 4 }, { 18 , 8 , 8 },"
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt b/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
deleted file mode 100644
index f11ac62..0000000
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
+++ /dev/null
@@ -1,69 +0,0 @@
-
-**MPIDI_PG_GetConnKVSname:MPIDI_PG_GetConnKVSname failed
-**MPID_nem_ib_cm_cas:MPID_nem_ib_cm_cas failed
-**MPID_nem_ib_cm_cas_release:MPID_nem_ib_cm_cas_release failed
-**MPID_nem_ib_cm_cas_release_core:MPID_nem_ib_cm_cas_release_core failed
-**MPID_nem_ib_cm_connect_cas_core:MPID_nem_ib_cm_connect_cas_core failed
-**MPID_nem_ib_cm_drain_rcq:MPID_nem_ib_cm_drain_rcq failed
-**MPID_nem_ib_cm_drain_scq:MPID_nem_ib_cm_drain_scq failed
-**MPID_nem_ib_cm_poll:MPID_nem_ib_cm_poll failed
-**MPID_nem_ib_cm_poll_syn:MPID_nem_ib_cm_poll_syn failed
-**MPID_nem_ib_cm_progress:MPID_nem_ib_cm_progress failed
-**MPID_nem_ib_cm_send_core:MPID_nem_ib_cm_send_core failed
-**MPID_nem_ib_cm_notify_send:MPID_nem_ib_cm_notify_send failed
-**MPID_nem_ib_com_alloc:MPID_nem_ib_com_alloc failed
-**MPID_nem_ib_com_cas_scratch_pad:MPID_nem_ib_com_cas_scratch_pad failed
-**MPID_nem_ib_com_close:MPID_nem_ib_com_close failed
-**MPID_nem_ib_com_connect_ringbuf:MPID_nem_ib_com_connect_ringbuf failed
-**MPID_nem_ib_com_free:MPID_nem_ib_com_free failed
-**MPID_nem_ib_com_get_info_conn:MPID_nem_ib_com_get_info_conn failed
-**MPID_nem_ib_com_get_info_mr:MPID_nem_ib_com_get_info_mr failed
-**MPID_nem_ib_com_get_scratch_pad:MPID_nem_ib_com_get_scratch_pad failed
-**MPID_nem_ib_com_irecv:MPID_nem_ib_com_irecv failed
-**MPID_nem_ib_com_isend:MPID_nem_ib_com_isend failed
-**MPID_nem_ib_com_lrecv:MPID_nem_ib_com_lrecv failed
-**MPID_nem_ib_com_obtain_pointer:MPID_nem_ib_com_obtain_pointer failed
-**MPID_nem_ib_com_open:MPID_nem_ib_com_open failed
-**MPID_nem_ib_com_poll_cq %s:MPID_nem_ib_com_poll_cq failed with error %s
-**MPID_nem_ib_com_poll_cq:MPID_nem_ib_com_poll_cq failed
-**MPID_nem_ib_com_put_scratch_pad:MPID_nem_ib_com_put_scratch_pad failed
-**MPID_nem_ib_com_rdmabuf_occupancy_notify_lstate_get:MPID_nem_ib_com_rdmabuf_occupancy_notify_lstate_get failed
-**MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get:MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get failed
-**MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get:MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get failed
-**MPID_nem_ib_com_reg_mr_connect:MPID_nem_ib_com_reg_mr_connect failed
-**MPID_nem_ib_com_reg_mr_fetch:MPID_nem_ib_com_reg_mr_fetch failed
-**MPID_nem_ib_com_rts:MPID_nem_ib_com_rts failed
-**MPID_nem_ib_com_sq_occupancy_notify_rate_get:MPID_nem_ib_com_sq_occupancy_notify_rate_get failed
-**MPID_nem_ib_com_wr_scratch_pad:MPID_nem_ib_com_wr_scratch_pad failed
-**MPID_nem_ib_drain_scq:MPID_nem_ib_drain_scq failed
-**MPID_nem_ib_drain_scq_scratch_pad:MPID_nem_ib_drain_scq_scratch_pad failed
-**MPID_nem_ib_handle_pkt_bh:MPID_nem_ib_handle_pkt_bh failed
-**MPID_nem_ib_kvs_put_binary:MPID_nem_ib_kvs_put_binary failed
-**MPID_nem_ib_lmt_done_recv:MPID_nem_ib_lmt_done_recv failed
-**MPID_nem_ib_lmt_done_send:MPID_nem_ib_lmt_done_send failed
-**MPID_nem_ib_lmt_send_GET_DONE:MPID_nem_ib_lmt_send_GET_DONE failed
-**MPID_nem_ib_lmt_send_RTS:MPID_nem_ib_lmt_send_RTS failed
-**MPID_nem_ib_npollingset:MPID_nem_ib_npollingset failed
-**MPID_nem_ib_poll:MPID_nem_ib_poll failed
-**MPID_nem_ib_poll_eager:MPID_nem_ib_poll_eager failed
-**MPID_nem_ib_rdma_to_alloc:MPID_nem_ib_rdma_to_alloc failed
-**MPID_nem_ib_ring_alloc:MPID_nem_ib_ring_alloc failed
-**MPID_nem_ib_ringbuf_alloc:MPID_nem_ib_ringbuf_alloc failed
-**MPID_nem_ib_ringbuf_ask_cas:MPID_nem_ib_ringbuf_ask_cas failed
-**MPID_nem_ib_ringbuf_ask_fetch:MPID_nem_ib_ringbuf_ask_fetch failed
-**MPID_nem_ib_ringbuf_connect_cas_core:MPID_nem_ib_ringbuf_connect_cas_core failed
-**MPID_nem_ib_ringbuf_free:MPID_nem_ib_ringbuf_free failed
-**MPID_nem_ib_ringbuf_progress:MPID_nem_ib_ringbuf_progress failed
-**MPID_nem_ib_ringbuf_send_core:MPID_nem_ib_ringbuf_send_core failed
-**MPID_nem_ib_send_change_rdmabuf_occupancy_notify_state:MPID_nem_ib_send_change_rdmabuf_occupancy_notify_state failed
-**MPID_nem_ib_send_progress:MPID_nem_ib_send_progress failed
-**MPID_nem_ib_send_reply_seq_num:MPID_nem_ib_send_reply_seq_num failed
-**MPID_nem_ib_send_req_seq_num:MPID_nem_ib_send_req_seq_num failed
-**PMI_Barrier:PMI_Barrier failed
-**PMI_KVS_Put:PMI_KVS_Put failed
-**PMS_KVS_Get:PMS_KVS_Get failed
-**malloc:malloc failed
-**netmod,ib,ibv_poll_cq:netmod,ib,ibv_poll_cq failed
-**notimplemented:notimplemented failed
-**outofmemory:outofmemory failed
-**sizeof(MPIDI_CH3_Pkt_t):sizeof(MPIDI_CH3_Pkt_t) failed
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_finalize.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_finalize.c
deleted file mode 100644
index b9b560e..0000000
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_finalize.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
-/*
- * (C) 2012 NEC Corporation
- * (C) 2014 RIKEN AICS
- *
- * See COPYRIGHT in top-level directory.
- */
-
-#include "ib_impl.h"
-
-//#define MPID_NEM_IB_DEBUG_FINALIZE
-#ifdef dprintf /* avoid redefinition with src/mpid/ch3/include/mpidimpl.h */
-#undef dprintf
-#endif
-#ifdef MPID_NEM_IB_DEBUG_FINALIZE
-#define dprintf printf
-#else
-#define dprintf(...)
-#endif
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_finalize
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_finalize(void)
-{
- int mpi_errno = MPI_SUCCESS;
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_FINALIZE);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_FINALIZE);
-
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_FINALIZE);
-
- fn_exit:
- return mpi_errno;
- //fn_fail:
- goto fn_exit;
-}
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
deleted file mode 100644
index 21c080f..0000000
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
+++ /dev/null
@@ -1,2548 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
-/*
- * (C) 2012 NEC Corporation
- * (C) 2012 University of Tokyo
- * (C) 2014-2015 RIKEN AICS
- *
- * See COPYRIGHT in top-level directory.
- */
-
-/*
- * TODO:
- * - MPID_nem_ib_com_clean might not clean all allocated memory area. Need to FIX it.
- * - During error processing in each function, some memory area might not
- * be deallocated. Look at all functions.
- */
-#include "ib_ibcom.h"
-//#include <sys/ipc.h>
-//#include <sys/shm.h>
-#include <sys/types.h>
-#include <assert.h>
-#include <linux/mman.h> /* make it define MAP_ANONYMOUS */
-#include <sys/mman.h>
-
-//#define MPID_NEM_IB_DEBUG_IBCOM
-#ifdef dprintf /* avoid redefinition with src/mpid/ch3/include/mpidimpl.h */
-#undef dprintf
-#endif
-#ifdef MPID_NEM_IB_DEBUG_IBCOM
-#define dprintf printf
-#else
-#define dprintf(...)
-#endif
-
-static MPID_nem_ib_com_t contab[MPID_NEM_IB_COM_SIZE];
-static int ib_initialized = 0;
-static int maxcon;
-static struct ibv_device **ib_devlist;
-static struct ibv_context *ib_ctx;
-struct ibv_context *MPID_nem_ib_ctx_export; /* for SC13 demo connector */
-static struct ibv_port_attr ib_pattr;
-static struct ibv_pd *ib_pd;
-struct ibv_pd *MPID_nem_ib_pd_export; /* for SC13 demo connector */
-struct ibv_cq *MPID_nem_ib_rc_shared_scq;
-static int MPID_nem_ib_rc_shared_scq_ref_count;
-struct ibv_cq *MPID_nem_ib_rc_shared_scq_scratch_pad;
-static int MPID_nem_ib_rc_shared_scq_scratch_pad_ref_count;
-static struct ibv_cq *MPID_nem_ib_ud_shared_scq;
-static int MPID_nem_ib_ud_shared_scq_ref_count;
-static struct ibv_cq *MPID_nem_ib_rc_shared_rcq;
-static int MPID_nem_ib_rc_shared_rcq_ref_count;
-struct ibv_cq *MPID_nem_ib_rc_shared_rcq_scratch_pad;
-static int MPID_nem_ib_rc_shared_rcq_scratch_pad_ref_count;
-struct ibv_cq *MPID_nem_ib_ud_shared_rcq;
-static int MPID_nem_ib_ud_shared_rcq_ref_count;
-uint8_t *MPID_nem_ib_scratch_pad = 0;
-int MPID_nem_ib_scratch_pad_ref_count;
-char *MPID_nem_ib_rdmawr_from_alloc_free_list_front[MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID] = { 0 };
-char *MPID_nem_ib_rdmawr_from_alloc_arena_free_list[MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID] = { 0 };
-
-struct ibv_mr *MPID_nem_ib_rdmawr_to_alloc_mr;
-uint8_t *MPID_nem_ib_rdmawr_to_alloc_start;
-uint8_t *MPID_nem_ib_rdmawr_to_alloc_free_list;
-
-#define MPID_NEM_IB_RANGE_CHECK(condesc, conp) \
-{ \
- if (condesc < 0 || condesc >= MPID_NEM_IB_COM_SIZE) return; \
- conp = &contab[condesc]; \
- if (conp->icom_used != 1) return; \
-}
-
-#define MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp) \
-{ \
- if (condesc < 0 || condesc >= MPID_NEM_IB_COM_SIZE) { \
- dprintf("condesc=%d\n", condesc); \
- MPID_nem_ib_segv; \
- return -1; \
- } \
- conp = &contab[condesc]; \
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_used != 1, -1, dprintf("MPID_NEM_IB_RANGE_CHECK_WITH_ERROR,conp->icom_used=%d\n", conp->icom_used)); \
-}
-
-/* Allocator for RDMA write to buffer
- - Allocate performs dequeue
- - Slow to "malloc" (two load and one store instructions)
- - Free performs enqueue
- - Slow to "free" (one load and two store instructions)
- - No flagmentation occurs
- - munmap unit is small (4KB)
- - Less header when compared to reference count
- - Refill never happens because IB-registers whole pool at the beginning
- - Fast when first-time allocs occur
- - Free list is a linked list
- - Fast to find a empty slot (one load instruction)
- */
-static int MPID_nem_ib_rdmawr_to_init(uint64_t sz)
-{
- int ibcom_errno = 0;
- void *start;
- void *cur;
- start = (void *) mmap(0, sz, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(start == (void *) -1, -1, printf("mmap failed\n"));
- dprintf("rdmawr_to_init,sz=%ld,start=%p\n", sz, start);
-
- memset(start, 0, sz);
-
- MPID_nem_ib_rdmawr_to_alloc_mr =
- MPID_nem_ib_com_reg_mr_fetch(start, sz, 0, MPID_NEM_IB_COM_REG_MR_STICKY);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!MPID_nem_ib_rdmawr_to_alloc_mr, -1,
- printf("MPID_nem_ib_com_reg_mr_fetchibv_reg_mr failed\n"));
- dprintf("rdmawr_to_init,rkey=%08x\n", MPID_nem_ib_rdmawr_to_alloc_mr->rkey);
-
- MPID_nem_ib_rdmawr_to_alloc_start = start;
- MPID_nem_ib_rdmawr_to_alloc_free_list = start;
- for (cur = start;
- cur < (void *) ((uint8_t *) start + sz - MPID_NEM_IB_COM_RDMABUF_SZSEG);
- cur = (uint8_t *) cur + MPID_NEM_IB_COM_RDMABUF_SZSEG) {
- //dprintf("rdmawr_to_init,cur=%p\n", cur);
- ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) cur)->next =
- (uint8_t *) cur + MPID_NEM_IB_COM_RDMABUF_SZSEG;
- }
- ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) cur)->next = 0;
-
- fn_exit:
- return ibcom_errno;
- fn_fail:
- goto fn_exit;
-}
-
-void *MPID_nem_ib_rdmawr_to_alloc(int nslots)
-{
- dprintf("rdmawr_to_alloc,nslots=%d\n", nslots);
- void *start = NULL;
- int i;
- for (i = 0; i < nslots; i++) {
- //dprintf("MPID_nem_ib_rdmawr_to_alloc,free_list=%p\n", MPID_nem_ib_rdmawr_to_alloc_free_list);
- if (MPID_nem_ib_rdmawr_to_alloc_free_list) {
- if (i == 0) {
- start = MPID_nem_ib_rdmawr_to_alloc_free_list;
- }
- MPID_nem_ib_rdmawr_to_alloc_free_list =
- ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) MPID_nem_ib_rdmawr_to_alloc_free_list)->next;
- }
- else {
- printf("out of rdmawr_to bufer\n");
- return 0;
- }
- }
- return start;
-}
-
-void MPID_nem_ib_rdmawr_to_free(void *p, int nslots)
-{
- void *q;
- ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *)
- ((uint8_t *) p + MPID_NEM_IB_COM_RDMABUF_SZSEG * (nslots - 1)))->next =
- MPID_nem_ib_rdmawr_to_alloc_free_list;
- for (q = (uint8_t *) p + MPID_NEM_IB_COM_RDMABUF_SZSEG * (nslots - 2);
- q >= p; q = (uint8_t *) q - MPID_NEM_IB_COM_RDMABUF_SZSEG) {
- ((MPID_nem_ib_rdmawr_to_alloc_hdr_t *) q)->next =
- (uint8_t *) q + MPID_NEM_IB_COM_RDMABUF_SZSEG;
- }
- MPID_nem_ib_rdmawr_to_alloc_free_list = p;
-}
-
-int MPID_nem_ib_rdmawr_to_munmap(void *p, int nslots)
-{
- int retval;
- int ibcom_errno = 0;
- retval = munmap(p, MPID_NEM_IB_COM_RDMABUF_SZSEG * nslots);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, printf("munmap failed\n"));
- fn_exit:
- return ibcom_errno;
- fn_fail:
- goto fn_exit;
-}
-
-static int modify_qp_to_init(struct ibv_qp *qp, int ib_port, int additional_flags)
-{
- struct ibv_qp_attr attr;
- int flags;
- int rc;
-
- memset(&attr, 0, sizeof(attr));
- attr.qp_state = IBV_QPS_INIT;
- attr.port_num = ib_port;
- attr.pkey_index = 0;
- attr.qp_access_flags =
- IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ |
- IBV_ACCESS_REMOTE_WRITE | additional_flags;
- flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS;
- rc = ibv_modify_qp(qp, &attr, flags);
- if (rc) {
- fprintf(stderr, "failed to modify QP state to INIT\n");
- }
- return rc;
-}
-
-static int modify_qp_to_rtr(struct ibv_qp *qp, uint32_t remote_qpn, uint16_t dlid,
- union ibv_gid *dgid, int ib_port, int gid_idx)
-{
- struct ibv_qp_attr attr;
- int flags;
- int rc;
-
- memset(&attr, 0, sizeof(attr));
- attr.qp_state = IBV_QPS_RTR;
- attr.path_mtu = IBV_MTU_2048;
- //attr.path_mtu = IBV_MTU_1024;
- //attr.path_mtu = IBV_MTU_256; /* DCFA */
- attr.dest_qp_num = remote_qpn;
- attr.rq_psn = 0;
- attr.max_dest_rd_atomic = MPID_NEM_IB_COM_MAX_RD_ATOMIC;
- //attr.max_dest_rd_atomic = 1;
- //attr.max_dest_rd_atomic = 0; /* DCFA */
-
- /* Default is 0x12 (= 5.12ms) see IB Spec. Rel. 1.2, Vol. 1, 9.7.5.2.8 */
- attr.min_rnr_timer = 0x12;
-
- attr.ah_attr.dlid = dlid;
- attr.ah_attr.sl = 0;
- attr.ah_attr.src_path_bits = 0;
- attr.ah_attr.is_global = 0;
- attr.ah_attr.port_num = ib_port;
-
- /* In dcfa gid is not set and for testing here it is also not set */
-#ifdef HAVE_LIBDCFA /* DCFA doesn't use gid */
-#else
- if (gid_idx >= 0) {
- attr.ah_attr.is_global = 1;
- attr.ah_attr.port_num = ib_port;
- memcpy(&attr.ah_attr.grh.dgid, dgid, 16);
- attr.ah_attr.grh.flow_label = 0;
- attr.ah_attr.grh.hop_limit = 1;
- attr.ah_attr.grh.sgid_index = gid_idx;
- attr.ah_attr.grh.traffic_class = 0;
- }
-#endif
-
- flags = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN
- | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER;
- rc = ibv_modify_qp(qp, &attr, flags);
- if (rc) {
- dprintf("failed to modify QP state to RTR\n");
- }
- return rc;
-}
-
-static int modify_qp_to_rts(struct ibv_qp *qp)
-{
- struct ibv_qp_attr attr;
- int flags;
- int rc;
-
- memset(&attr, 0, sizeof(attr));
- attr.qp_state = IBV_QPS_RTS;
- attr.timeout = (0x14); /* timeout 4.096us * 2^x */
- attr.retry_cnt = 7;
- attr.rnr_retry = 7;
- attr.sq_psn = 0;
- attr.max_rd_atomic = MPID_NEM_IB_COM_MAX_RD_ATOMIC;
- //attr.max_rd_atomic = 1;
- //attr.max_rd_atomic = 0; /* DCFA */
-
- flags = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT
- | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC;
- rc = ibv_modify_qp(qp, &attr, flags);
- if (rc) {
- fprintf(stderr, "failed to modify QP state to RTS\n");
- }
- return rc;
-}
-
-/* called from MPID_nem_ib_com_open if needed */
-static int MPID_nem_ib_com_device_init()
-{
- int ibcom_errno = 0;
- int dev_num;
- char *dev_name;
- int i;
- int ib_port = 1;
-
- if (ib_initialized == 1) {
- dprintf("MPID_nem_ib_com_device_init,already initialized\n");
- return 0;
- }
- if (ib_initialized == -1)
- return -1;
-
- /* Get the device list */
- ib_devlist = ibv_get_device_list(&dev_num);
- if (!ib_devlist || !dev_num) {
- fprintf(stderr, "No IB device is found\n");
- return -1;
- }
-
-#ifdef HAVE_LIBDCFA
- for (i = 0; i < dev_num; i++) {
- if (ib_devlist[i]) {
- goto dev_found;
- }
- }
-#else
- for (i = 0; i < dev_num; i++) {
- if (!strcmp(ibv_get_device_name(ib_devlist[i]), "mlx4_0") ||
- !strcmp(ibv_get_device_name(ib_devlist[i]), "mlx5_0") ||
- !strcmp(ibv_get_device_name(ib_devlist[i]), "qib0")) {
- goto dev_found;
- }
- }
-#endif
- MPID_NEM_IB_COM_ERR_SETANDJUMP(-1, printf("IB device not found"));
- dev_found:
-
- /* Open the requested device */
- if (MPID_nem_ib_ctx_export) {
- ib_ctx = MPID_nem_ib_ctx_export;
- }
- else {
- ib_ctx = ibv_open_device(ib_devlist[i]);
-
- if (ib_ctx) {
- /* get port attribute */
- if (ibv_query_port(ib_ctx, ib_port, &ib_pattr)) {
- dprintf("ibv_query_port on port %d failed\n", ib_port);
- goto err_exit;
- }
- }
- }
- dprintf("MPID_nem_ib_com_device_init,MPID_nem_ib_ctx_export=%p,ib_ctx=%p\n",
- MPID_nem_ib_ctx_export, ib_ctx);
- if (!ib_ctx) {
- fprintf(stderr, "failed to open IB device\n");
- goto err_exit;
- }
- MPID_nem_ib_ctx_export = ib_ctx;
-#ifdef HAVE_LIBDCFA
-#else
- dev_name = MPIU_Strdup(ibv_get_device_name(ib_devlist[i]));
- dprintf("MPID_nem_ib_com_device_init,dev_name=%s\n", dev_name);
- MPIU_Free(dev_name);
-#endif
- /* Create a PD */
- if (MPID_nem_ib_pd_export) {
- ib_pd = MPID_nem_ib_pd_export;
- }
- else {
- ib_pd = ibv_alloc_pd(ib_ctx);
- }
- dprintf("MPID_nem_ib_com_device_init,MPID_nem_ib_pd_export=%p,ib_pd=%p\n",
- MPID_nem_ib_pd_export, ib_pd);
- if (!ib_pd) {
- fprintf(stderr, "ibv_alloc_pd failed\n");
- goto err_exit;
- }
- MPID_nem_ib_pd_export = ib_pd;
-
- ib_initialized = 1;
- fn_exit:
- return ibcom_errno;
-
- err_exit:
- ib_initialized = -1;
- if (ib_devlist)
- ibv_free_device_list(ib_devlist);
- if (ib_ctx)
- ibv_close_device(ib_ctx);
- return -1;
- fn_fail:
- goto fn_exit;
-}
-
-static int MPID_nem_ib_com_clean(MPID_nem_ib_com_t * conp)
-{
- int i;
- int ibcom_errno = 0;
- int ib_errno;
- int retval;
-
- if (conp->icom_qp) {
- ibv_destroy_qp(conp->icom_qp);
- conp->icom_qp = NULL;
- }
- if (conp->icom_mrlist && conp->icom_mrlen > 0) {
- switch (conp->open_flag) {
- case MPID_NEM_IB_COM_OPEN_RC:
- MPIU_Assert(MPID_nem_ib_rc_shared_scq_ref_count > 0);
- if (--MPID_nem_ib_rc_shared_scq_ref_count == 0) {
- dprintf("ibcom,destroy MPID_nem_ib_rc_shared_scq\n");
- ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_scq);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq failed\n"));
-
- /* Tell drain_scq that CQ is destroyed because
- * drain_scq is called after poll_eager calls vc_terminate */
- MPID_nem_ib_rc_shared_scq = NULL;
- }
- MPIU_Assert(MPID_nem_ib_rc_shared_rcq_ref_count > 0);
- if (--MPID_nem_ib_rc_shared_rcq_ref_count == 0) {
- dprintf("ibcom,destroy MPID_nem_ib_rc_shared_rcq\n");
- ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_rcq);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq failed\n"));
-
- MPID_nem_ib_rc_shared_rcq = NULL;
- }
-
- MPIU_Free(conp->icom_mrlist);
- MPIU_Free(conp->icom_mem);
- MPIU_Free(conp->icom_msize);
-
- MPIU_Free(conp->icom_rmem);
- MPIU_Free(conp->icom_rsize);
- MPIU_Free(conp->icom_rkey);
- for (i = 0; i < MPID_NEM_IB_COM_SMT_INLINE_NCHAIN; i++) {
-#ifndef HAVE_LIBDCFA
- MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].sg_list);
-#endif
- }
- MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list);
- MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list);
- MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].sg_list);
- MPIU_Free(conp->icom_sr);
- MPIU_Free(conp->icom_rr);
- break;
- case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
- MPIU_Assert(MPID_nem_ib_rc_shared_scq_scratch_pad_ref_count > 0);
- if (--MPID_nem_ib_rc_shared_scq_scratch_pad_ref_count == 0) {
- ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_scq_scratch_pad);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq failed\n"));
- /* Tell drain_scq that CQ is destroyed because
- * drain_scq is called after poll_eager calls vc_terminate */
- MPID_nem_ib_rc_shared_scq_scratch_pad = NULL;
- }
- MPIU_Assert(MPID_nem_ib_rc_shared_rcq_scratch_pad_ref_count > 0);
- if (--MPID_nem_ib_rc_shared_rcq_scratch_pad_ref_count == 0) {
- ib_errno = ibv_destroy_cq(MPID_nem_ib_rc_shared_rcq_scratch_pad);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq failed\n"));
- }
- retval = munmap(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM],
- MPID_NEM_IB_COM_SCRATCH_PAD_FROM_SZ);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, dprintf("munmap"));
-
- MPIU_Free(conp->icom_mrlist);
- MPIU_Free(conp->icom_mem);
- MPIU_Free(conp->icom_msize);
-
- MPIU_Free(conp->icom_rmem);
- MPIU_Free(conp->icom_rsize);
- MPIU_Free(conp->icom_rkey);
-
-#ifndef HAVE_LIBDCFA
- MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list);
- MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list);
- MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list);
- MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].sg_list);
-#endif
- MPIU_Free(conp->icom_sr);
-#ifndef HAVE_LIBDCFA
- MPIU_Free(conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER].sg_list);
-#endif
- MPIU_Free(conp->icom_rr);
- break;
- case MPID_NEM_IB_COM_OPEN_UD:
- MPIU_Assert(MPID_nem_ib_ud_shared_scq_ref_count > 0);
- if (--MPID_nem_ib_ud_shared_scq_ref_count == 0) {
- ib_errno = ibv_destroy_cq(MPID_nem_ib_ud_shared_scq);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq"));
- /* Tell drain_scq that CQ is destroyed because
- * drain_scq is called after poll_eager calls vc_terminate */
- MPID_nem_ib_ud_shared_scq = NULL;
- }
- MPIU_Assert(MPID_nem_ib_ud_shared_rcq_ref_count > 0);
- if (--MPID_nem_ib_ud_shared_rcq_ref_count == 0) {
- ib_errno = ibv_destroy_cq(MPID_nem_ib_ud_shared_rcq);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_destroy_cq"));
- }
- retval = munmap(conp->icom_mem[MPID_NEM_IB_COM_UDWR_FROM], MPID_NEM_IB_COM_UDBUF_SZ);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, dprintf("munmap"));
- retval = munmap(conp->icom_mem[MPID_NEM_IB_COM_UDWR_TO], MPID_NEM_IB_COM_UDBUF_SZ);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(retval, -1, dprintf("munmap"));
-
- MPIU_Free(conp->icom_mrlist);
- MPIU_Free(conp->icom_mem);
- MPIU_Free(conp->icom_msize);
-
- MPIU_Free(conp->icom_ah_attr);
-#ifndef HAVE_LIBDCFA
- MPIU_Free(conp->icom_sr[MPID_NEM_IB_COM_UD_INITIATOR].sg_list);
-#endif
- MPIU_Free(conp->icom_sr);
-
- MPIU_Free(conp->icom_rr[MPID_NEM_IB_COM_UD_RESPONDER].sg_list);
- MPIU_Free(conp->icom_rr);
- break;
- }
- }
- memset(conp, 0, sizeof(MPID_nem_ib_com_t));
-
- fn_exit:
- return ibcom_errno;
- fn_fail:
- goto fn_exit;
-}
-
-int MPID_nem_ib_com_open(int ib_port, int open_flag, int *condesc)
-{
- int ibcom_errno = 0, ib_errno;
- MPID_nem_ib_com_t *conp;
- struct ibv_qp_init_attr qp_init_attr;
- struct ibv_sge *sge;
- int i;
-
- dprintf("MPID_nem_ib_com_open,port=%d,flag=%08x\n", ib_port, open_flag);
-
- int open_flag_conn = open_flag;
- if (open_flag_conn != MPID_NEM_IB_COM_OPEN_RC &&
- open_flag_conn != MPID_NEM_IB_COM_OPEN_UD &&
- open_flag_conn != MPID_NEM_IB_COM_OPEN_SCRATCH_PAD) {
- dprintf("MPID_nem_ib_com_open,bad flag\n");
- ibcom_errno = -1;
- goto fn_fail;
- }
-
- /* Increment reference counter of ibv_reg_mr cache */
- ibcom_errno = MPID_nem_ib_com_register_cache_init();
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ibcom_errno, -1, dprintf("MPID_nem_ib_com_register_cache_init"));
-
- /* device open error */
- if (MPID_nem_ib_com_device_init() < 0) {
- ibcom_errno = -1;
- goto fn_fail;
- }
-
- /* no more connection can be estabilished */
- if (maxcon == MPID_NEM_IB_COM_SIZE) {
- ibcom_errno = -1;
- goto fn_fail;
- }
-
- for (*condesc = 0; *condesc < MPID_NEM_IB_COM_SIZE; (*condesc)++) {
- //dprintf("*condesc=%d,used=%d\n", *condesc, contab[*condesc].icom_used);
- if (contab[*condesc].icom_used == 0) {
- goto ok_cont;
- }
- }
- /* count says not full, but we couldn't fine vacant slot */
- dprintf("contable has inconsistent\n");
- ibcom_errno = -1;
- goto fn_fail;
-
- ok_cont:
- dprintf("MPID_nem_ib_com_open,condesc=%d\n", *condesc);
- conp = &contab[*condesc];
- memset(conp, 0, sizeof(MPID_nem_ib_com_t));
- conp->icom_used = 1;
- conp->icom_port = ib_port;
- conp->open_flag = open_flag;
- conp->rsr_seq_num_poll = 0; /* it means slot 0 is polled */
- conp->rsr_seq_num_tail = -1; /* it means slot 0 is not released */
- conp->rsr_seq_num_tail_last_sent = -1;
- conp->lsr_seq_num_tail_last_requested = -2;
- conp->rdmabuf_occupancy_notify_rstate = MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_LW;
- conp->rdmabuf_occupancy_notify_lstate = MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_LW;
- conp->ask_guard = 0;
- //dprintf("MPID_nem_ib_com_open,ptr=%p,rsr_seq_num_poll=%d\n", conp, conp->rsr_seq_num_poll);
-
-#ifdef HAVE_LIBDCFA
-#else
- if (ibv_query_port(ib_ctx, ib_port, &conp->icom_pattr)) {
- dprintf("ibv_query_port on port %u failed\n", ib_port);
- goto err_exit;
- }
-#endif
-
- /* Create send/recv CQ */
- switch (open_flag) {
- case MPID_NEM_IB_COM_OPEN_RC:
- MPID_nem_ib_rc_shared_scq_ref_count++;
- if (!MPID_nem_ib_rc_shared_scq) {
-#ifdef HAVE_LIBDCFA
- MPID_nem_ib_rc_shared_scq = ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
-#else
- MPID_nem_ib_rc_shared_scq =
- ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY, NULL, NULL, 0);
-#endif
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!MPID_nem_ib_rc_shared_scq, -1,
- dprintf("MPID_nem_ib_rc_shared_scq"));
- }
- conp->icom_scq = MPID_nem_ib_rc_shared_scq;
-
- MPID_nem_ib_rc_shared_rcq_ref_count++;
- if (!MPID_nem_ib_rc_shared_rcq) {
-#ifdef HAVE_LIBDCFA
- MPID_nem_ib_rc_shared_rcq = ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
-#else
- MPID_nem_ib_rc_shared_rcq =
- ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY, NULL, NULL, 0);
-#endif
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!MPID_nem_ib_rc_shared_rcq, -1,
- dprintf("MPID_nem_ib_rc_shared_rcq"));
- }
- conp->icom_rcq = MPID_nem_ib_rc_shared_rcq;
- break;
- case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
- MPID_nem_ib_rc_shared_scq_scratch_pad_ref_count++;
- if (!MPID_nem_ib_rc_shared_scq_scratch_pad) {
-#ifdef HAVE_LIBDCFA
- MPID_nem_ib_rc_shared_scq_scratch_pad =
- ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
-#else
- MPID_nem_ib_rc_shared_scq_scratch_pad =
- ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY, NULL, NULL, 0);
-#endif
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!MPID_nem_ib_rc_shared_scq_scratch_pad, -1,
- dprintf("MPID_nem_ib_rc_shared_scq"));
- }
- conp->icom_scq = MPID_nem_ib_rc_shared_scq_scratch_pad;
-
- MPID_nem_ib_rc_shared_rcq_scratch_pad_ref_count++;
- if (!MPID_nem_ib_rc_shared_rcq_scratch_pad) {
-#ifdef HAVE_LIBDCFA
- MPID_nem_ib_rc_shared_rcq_scratch_pad =
- ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
-#else
- MPID_nem_ib_rc_shared_rcq_scratch_pad =
- ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY, NULL, NULL, 0);
-#endif
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!MPID_nem_ib_rc_shared_rcq_scratch_pad, -1,
- dprintf("MPID_nem_ib_rc_shared_rcq"));
- }
- conp->icom_rcq = MPID_nem_ib_rc_shared_rcq_scratch_pad;
- break;
- case MPID_NEM_IB_COM_OPEN_UD:
- MPID_nem_ib_ud_shared_scq_ref_count++;
- if (!MPID_nem_ib_ud_shared_scq) {
-#ifdef HAVE_LIBDCFA
- MPID_nem_ib_ud_shared_scq = ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
-#else
- MPID_nem_ib_ud_shared_scq =
- ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY, NULL, NULL, 0);
-#endif
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!MPID_nem_ib_ud_shared_scq, -1,
- dprintf("MPID_nem_ib_ud_shared_scq"));
- }
- conp->icom_scq = MPID_nem_ib_ud_shared_scq;
-
- MPID_nem_ib_ud_shared_rcq_ref_count++;
- if (!MPID_nem_ib_ud_shared_rcq) {
-#ifdef HAVE_LIBDCFA
- MPID_nem_ib_ud_shared_rcq = ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
-#else
- MPID_nem_ib_ud_shared_rcq =
- ibv_create_cq(ib_ctx, MPID_NEM_IB_COM_MAX_CQ_CAPACITY, NULL, NULL, 0);
-#endif
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!MPID_nem_ib_ud_shared_rcq, -1,
- dprintf("MPID_nem_ib_ud_shared_rcq"));
- }
- conp->icom_rcq = MPID_nem_ib_ud_shared_rcq;
- break;
- }
-
- /* Create QP */
- memset(&qp_init_attr, 0, sizeof(qp_init_attr));
- qp_init_attr.send_cq = conp->icom_scq;
- qp_init_attr.recv_cq = conp->icom_rcq;
- qp_init_attr.cap.max_send_wr = MPID_NEM_IB_COM_MAX_SQ_CAPACITY;
- qp_init_attr.cap.max_recv_wr = MPID_NEM_IB_COM_MAX_RQ_CAPACITY;
- qp_init_attr.cap.max_send_sge = MPID_NEM_IB_COM_MAX_SGE_CAPACITY;
- qp_init_attr.cap.max_recv_sge = MPID_NEM_IB_COM_MAX_SGE_CAPACITY;
- qp_init_attr.cap.max_inline_data = MPID_NEM_IB_COM_INLINE_DATA;
- switch (open_flag) {
- case MPID_NEM_IB_COM_OPEN_RC:
- case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
- qp_init_attr.qp_type = IBV_QPT_RC;
- break;
- case MPID_NEM_IB_COM_OPEN_UD:
- qp_init_attr.qp_type = IBV_QPT_UD;
- break;
- default:
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(1, -1, dprintf("invalid open_flag\n"));
- break;
- }
- qp_init_attr.sq_sig_all = 1;
-
- conp->icom_qp = ibv_create_qp(ib_pd, &qp_init_attr);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_qp, -1, printf("ibv_create_qp\n"));
-
- conp->max_send_wr = qp_init_attr.cap.max_send_wr;
- conp->max_recv_wr = qp_init_attr.cap.max_recv_wr;
- conp->max_inline_data = qp_init_attr.cap.max_inline_data;
-
- dprintf("MPID_nem_ib_com_open,max_send_wr=%d,max_recv_wr=%d,max_inline_data=%d\n",
- qp_init_attr.cap.max_send_wr, qp_init_attr.cap.max_recv_wr,
- qp_init_attr.cap.max_inline_data);
- dprintf("MPID_nem_ib_com_open,fd=%d,qpn=%08x\n", *condesc, conp->icom_qp->qp_num);
-#ifdef HAVE_LIBDCFA
- dprintf("MPID_nem_ib_com_open,fd=%d,lid=%04x\n", *condesc, ib_ctx->lid);
-#else
- dprintf("MPID_nem_ib_com_open,fd=%d,lid=%04x\n", *condesc, conp->icom_pattr.lid);
-#endif
-
-#ifdef HAVE_LIBDCFA
- /* DCFA doesn't use gid */
- for (i = 0; i < 16; i++) {
- conp->icom_gid.raw[i] = 0;
- }
-#else
- ib_errno = ibv_query_gid(ib_ctx, ib_port, 0, &conp->icom_gid);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, dprintf("ibv_query_gid\n"));
-
- dprintf("MPID_nem_ib_com_open,fd=%d,my_gid=", *condesc);
- for (i = 0; i < 16; i++) {
- dprintf("%02x", (int) conp->icom_gid.raw[i]);
- }
- dprintf("\n");
-#endif
-
- /* buffers */
- switch (open_flag) {
- case MPID_NEM_IB_COM_OPEN_RC:
- /* RDMA-write-from and -to local memory area */
- conp->icom_mrlist = MPIU_Malloc(sizeof(struct ibv_mr *) * MPID_NEM_IB_COM_NBUF_RDMA);
- memset(conp->icom_mrlist, 0, sizeof(struct ibv_mr *) * MPID_NEM_IB_COM_NBUF_RDMA);
- conp->icom_mrlen = MPID_NEM_IB_COM_NBUF_RDMA;
- conp->icom_mem = (void **) MPIU_Malloc(sizeof(void **) * MPID_NEM_IB_COM_NBUF_RDMA);
- //printf("open,icom_mem=%p\n", conp->icom_mem);
- memset(conp->icom_mem, 0, sizeof(void **) * MPID_NEM_IB_COM_NBUF_RDMA);
- conp->icom_msize = (int *) MPIU_Malloc(sizeof(int *) * MPID_NEM_IB_COM_NBUF_RDMA);
- memset(conp->icom_msize, 0, sizeof(int *) * MPID_NEM_IB_COM_NBUF_RDMA);
-
- /* RDMA-write-to local memory area */
- conp->icom_msize[MPID_NEM_IB_COM_RDMAWR_TO] = MPID_NEM_IB_COM_RDMABUF_SZ;
-
- /* ibv_reg_mr all memory area for all ring buffers
- * including shared and exclusive ones */
- if (!MPID_nem_ib_rdmawr_to_alloc_start) {
- ibcom_errno =
- MPID_nem_ib_rdmawr_to_init(MPID_NEM_IB_COM_RDMABUF_SZ * MPID_NEM_IB_NRINGBUF);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ibcom_errno, -1, printf("MPID_nem_ib_rdmawr_to_init"));
- dprintf("ib_com_open,MPID_nem_ib_rdmawr_to_alloc_free_list=%p\n",
- MPID_nem_ib_rdmawr_to_alloc_free_list);
- }
-
- conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO] = MPID_nem_ib_rdmawr_to_alloc_start;
- //mmap(0, MPID_NEM_IB_COM_RDMABUF_SZ, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE,
- //-1, 0);
- dprintf("MPID_nem_ib_com_open,mmap=%p,len=%d\n", conp->icom_mem[MPID_NEM_IB_COM_RDMAWR_TO],
- MPID_NEM_IB_COM_RDMABUF_SZ);
-
-#ifdef HAVE_LIBDCFA
- dprintf("MPID_nem_ib_com_open,fd=%d,rmem=%p\n", *condesc,
- MPID_nem_ib_rdmawr_to_alloc_mr->buf);
-#else
- dprintf("MPID_nem_ib_com_open,fd=%d,rmem=%p\n", *condesc,
- MPID_nem_ib_rdmawr_to_alloc_mr->addr);
-#endif
- dprintf("MPID_nem_ib_com_open,fd=%d,rkey=%08x\n", *condesc,
- MPID_nem_ib_rdmawr_to_alloc_mr->rkey);
-
- /* RDMA-write-to remote memory area */
- conp->icom_rmem = (void **) MPIU_Malloc(sizeof(void **) * MPID_NEM_IB_COM_NBUF_RDMA);
- if (conp->icom_rmem == 0)
- goto err_exit;
- memset(conp->icom_rmem, 0, sizeof(void **) * MPID_NEM_IB_COM_NBUF_RDMA);
-
- conp->icom_rsize = (size_t *) MPIU_Malloc(sizeof(void **) * MPID_NEM_IB_COM_NBUF_RDMA);
- if (conp->icom_rsize == 0)
- goto err_exit;
- memset(conp->icom_rsize, 0, sizeof(void **) * MPID_NEM_IB_COM_NBUF_RDMA);
-
- conp->icom_rkey = (int *) MPIU_Malloc(sizeof(int) * MPID_NEM_IB_COM_NBUF_RDMA);
- if (conp->icom_rkey == 0)
- goto err_exit;
- memset(conp->icom_rkey, 0, sizeof(int) * MPID_NEM_IB_COM_NBUF_RDMA);
- break;
- case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
- /* RDMA-write-from and -to local memory area */
- conp->icom_mrlist =
- (struct ibv_mr **) MPIU_Malloc(sizeof(struct ibv_mr *) *
- MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
- memset(conp->icom_mrlist, 0, sizeof(struct ibv_mr *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
- conp->icom_mrlen = MPID_NEM_IB_COM_NBUF_SCRATCH_PAD;
- conp->icom_mem = (void **) MPIU_Malloc(sizeof(void *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
- memset(conp->icom_mem, 0, sizeof(void *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
- conp->icom_msize = (int *) MPIU_Malloc(sizeof(int) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
- memset(conp->icom_msize, 0, sizeof(int) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
-
- /* RDMA-write-from local memory area */
- conp->icom_msize[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] = MPID_NEM_IB_COM_SCRATCH_PAD_FROM_SZ;
- conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] =
- mmap(0, MPID_NEM_IB_COM_SCRATCH_PAD_FROM_SZ, PROT_READ | PROT_WRITE,
- MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] ==
- (void *) -1, -1, printf("mmap failed\n"));
-
- conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_FROM] =
- MPID_nem_ib_com_reg_mr_fetch(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM],
- conp->icom_msize[MPID_NEM_IB_COM_SCRATCH_PAD_FROM], 0,
- MPID_NEM_IB_COM_REG_MR_STICKY);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_FROM], -1,
- printf("ibv_reg_mr failed\n"));
-
- /* RDMA-write-to remote memory area */
- conp->icom_rmem = (void **) MPIU_Malloc(sizeof(void *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_rmem == 0, -1, dprintf("malloc failed\n"));
- memset(conp->icom_rmem, 0, sizeof(void *) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
-
- conp->icom_rsize =
- (size_t *) MPIU_Malloc(sizeof(size_t) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_rsize == 0, -1, dprintf("malloc failed\n"));
- memset(conp->icom_rsize, 0, sizeof(size_t) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
-
- conp->icom_rkey = (int *) MPIU_Malloc(sizeof(int) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_rkey == 0, -1, dprintf("malloc failed\n"));
- memset(conp->icom_rkey, 0, sizeof(int) * MPID_NEM_IB_COM_NBUF_SCRATCH_PAD);
- break;
-
- case MPID_NEM_IB_COM_OPEN_UD:
- /* UD-write-from and -to local memory area */
- conp->icom_mrlist = MPIU_Malloc(sizeof(struct ibv_mr *) * MPID_NEM_IB_COM_NBUF_UD);
- memset(conp->icom_mrlist, 0, sizeof(struct ibv_mr *) * MPID_NEM_IB_COM_NBUF_UD);
- conp->icom_mrlen = MPID_NEM_IB_COM_NBUF_UD;
- conp->icom_mem = (void **) MPIU_Malloc(sizeof(void **) * MPID_NEM_IB_COM_NBUF_UD);
- memset(conp->icom_mem, 0, sizeof(void **) * MPID_NEM_IB_COM_NBUF_UD);
- conp->icom_msize = (int *) MPIU_Malloc(sizeof(int *) * MPID_NEM_IB_COM_NBUF_UD);
- memset(conp->icom_msize, 0, sizeof(int *) * MPID_NEM_IB_COM_NBUF_UD);
-
- /* UD-write-from local memory area */
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(MPID_NEM_IB_COM_UDBUF_SZ <= 40, -1,
- dprintf("buf_size too short\n"));
- conp->icom_msize[MPID_NEM_IB_COM_UDWR_FROM] = MPID_NEM_IB_COM_UDBUF_SZ;
- conp->icom_mem[MPID_NEM_IB_COM_UDWR_FROM] =
- mmap(0, MPID_NEM_IB_COM_UDBUF_SZ, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE,
- -1, 0);
- dprintf("MPID_nem_ib_com_open,mmap=%p,len=%d\n", conp->icom_mem[MPID_NEM_IB_COM_UDWR_FROM],
- MPID_NEM_IB_COM_UDBUF_SZ);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_mem[MPID_NEM_IB_COM_UDWR_FROM] == (void *) -1, -1,
- dprintf("failed to allocate buffer\n"));
- memset(conp->icom_mem[MPID_NEM_IB_COM_UDWR_FROM], 0,
- conp->icom_msize[MPID_NEM_IB_COM_UDWR_FROM]);
-
- conp->icom_mrlist[MPID_NEM_IB_COM_UDWR_FROM] =
- MPID_nem_ib_com_reg_mr_fetch(conp->icom_mem[MPID_NEM_IB_COM_UDWR_FROM],
- conp->icom_msize[MPID_NEM_IB_COM_UDWR_FROM], 0,
- MPID_NEM_IB_COM_REG_MR_STICKY);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_mrlist[MPID_NEM_IB_COM_UDWR_FROM], -1,
- printf("ibv_reg_mr failed\n"));
-
- /* UD-write-to local memory area */
- /* addr to addr+39 are not filled, addr+40 to addr+length-1 are filled with payload */
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(MPID_NEM_IB_COM_UDBUF_SZ <= 40, -1,
- dprintf("buf_size too short\n"));
- conp->icom_msize[MPID_NEM_IB_COM_UDWR_TO] = MPID_NEM_IB_COM_UDBUF_SZ;
- conp->icom_mem[MPID_NEM_IB_COM_UDWR_TO] =
- mmap(0, MPID_NEM_IB_COM_UDBUF_SZ, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE,
- -1, 0);
- dprintf("MPID_nem_ib_com_open,mmap=%p,len=%d\n", conp->icom_mem[MPID_NEM_IB_COM_UDWR_TO],
- MPID_NEM_IB_COM_UDBUF_SZ);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->icom_mem[MPID_NEM_IB_COM_UDWR_TO] == (void *) -1, -1,
- dprintf("failed to allocate buffer\n"));
- memset(conp->icom_mem[MPID_NEM_IB_COM_UDWR_TO], 0,
- conp->icom_msize[MPID_NEM_IB_COM_UDWR_TO]);
-
- conp->icom_mrlist[MPID_NEM_IB_COM_UDWR_TO] =
- MPID_nem_ib_com_reg_mr_fetch(conp->icom_mem[MPID_NEM_IB_COM_UDWR_TO],
- conp->icom_msize[MPID_NEM_IB_COM_UDWR_TO], 0,
- MPID_NEM_IB_COM_REG_MR_STICKY);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_mrlist[MPID_NEM_IB_COM_UDWR_TO], -1,
- dprintf("ibv_reg_mr failed with mr_flags=0x%x\n", mr_flags));
-
- /* initialize arena allocator for MPID_NEM_IB_COM_UDWR_TO */
- //MPID_nem_ib_com_udbuf_init(conp->icom_mem[MPID_NEM_IB_COM_UDWR_TO]);
-
- dprintf("MPID_nem_ib_com_open,ud,fd=%d,lkey=%08x\n", *condesc,
- conp->icom_mrlist[MPID_NEM_IB_COM_UDWR_TO]->lkey);
- break;
- default:
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(1, -1, dprintf("invalid open_flag\n"));
- break;
-
- }
-
- /* command templates */
- switch (open_flag) {
- case MPID_NEM_IB_COM_OPEN_RC:
-
- /* SR (send request) template */
- conp->icom_sr =
- (struct ibv_send_wr *) MPIU_Malloc(sizeof(struct ibv_send_wr) *
- MPID_NEM_IB_COM_RC_SR_NTEMPLATE);
- memset(conp->icom_sr, 0, sizeof(struct ibv_send_wr) * MPID_NEM_IB_COM_RC_SR_NTEMPLATE);
-
- for (i = 0; i < MPID_NEM_IB_COM_SMT_INLINE_NCHAIN; i++) {
- /* SGE (RDMA-send-from memory) template */
-#ifdef HAVE_LIBDCFA
- memset(&(conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].sg_list[0]), 0,
- sizeof(struct ibv_sge) * WR_SG_NUM);
-#else
- sge =
- (struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge) *
- MPID_NEM_IB_COM_SMT_INLINE_INITIATOR_NSGE);
- memset(sge, 0, sizeof(struct ibv_sge) * MPID_NEM_IB_COM_SMT_INLINE_INITIATOR_NSGE);
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].next =
- (i ==
- MPID_NEM_IB_COM_SMT_INLINE_NCHAIN -
- 1) ? NULL : &conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i + 1];
-#ifdef HAVE_LIBDCFA
-#else
- conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].sg_list = sge;
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].opcode = IBV_WR_RDMA_WRITE;
- conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + i].send_flags =
- IBV_SEND_SIGNALED | IBV_SEND_INLINE;
- }
-
- {
-#ifdef HAVE_LIBDCFA
- memset(&(conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[0]), 0,
- sizeof(struct ibv_sge) * WR_SG_NUM);
-#else
- sge =
- (struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge) *
- MPID_NEM_IB_COM_SMT_NOINLINE_INITIATOR_NSGE);
- memset(sge, 0, sizeof(struct ibv_sge) * MPID_NEM_IB_COM_SMT_NOINLINE_INITIATOR_NSGE);
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].next = NULL;
-#ifdef HAVE_LIBDCFA
-#else
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list = sge;
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].opcode = IBV_WR_RDMA_WRITE;
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].send_flags = IBV_SEND_SIGNALED;
- }
- {
- /* SR (send request) template for MPID_NEM_IB_COM_LMT_INITIATOR */
-#ifdef HAVE_LIBDCFA
- memset(&(conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[0]), 0,
- sizeof(struct ibv_sge) * WR_SG_NUM);
-#else
- sge =
- (struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge) *
- MPID_NEM_IB_COM_LMT_INITIATOR_NSGE);
- memset(sge, 0, sizeof(struct ibv_sge) * MPID_NEM_IB_COM_LMT_INITIATOR_NSGE);
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].next = NULL;
-#ifdef HAVE_LIBDCFA
-#else
- conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list = sge;
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].opcode = IBV_WR_RDMA_READ;
- conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].send_flags = IBV_SEND_SIGNALED;
- }
-
- /* SR (send request) template for MPID_NEM_IB_COM_LMT_PUT *//* for lmt-put-done */
-#ifdef HAVE_LIBDCFA
- memset(&(conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].sg_list[0]), 0,
- sizeof(struct ibv_sge) * WR_SG_NUM);
-#else
- sge = (struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge) * MPID_NEM_IB_COM_LMT_PUT_NSGE);
- memset(sge, 0, sizeof(struct ibv_sge) * MPID_NEM_IB_COM_LMT_PUT_NSGE);
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].next = NULL;
-#ifdef HAVE_LIBDCFA
-#else
- conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].sg_list = sge;
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].opcode = IBV_WR_RDMA_WRITE;
- conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].send_flags = IBV_SEND_SIGNALED;
-
- /* SR (send request) template for MPID_NEM_IB_COM_RDMAWR_FRMFIXED */
- /* not implemented */
-
- /* SGE (scatter gather element) template for recv */
- /* nothing is required for RDMA-write */
-
- /* RR (receive request) template for MPID_NEM_IB_COM_RDMAWR_RESPONDER */
- conp->icom_rr =
- (struct ibv_recv_wr *) MPIU_Malloc(sizeof(struct ibv_recv_wr) *
- MPID_NEM_IB_COM_RC_RR_NTEMPLATE);
- memset(conp->icom_rr, 0, sizeof(struct ibv_recv_wr) * MPID_NEM_IB_COM_RC_RR_NTEMPLATE);
-
- /* create one dummy RR to ibv_post_recv */
- conp->icom_rr[MPID_NEM_IB_COM_RDMAWR_RESPONDER].next = NULL;
-#ifdef HAVE_LIBDCFA
-#else
- conp->icom_rr[MPID_NEM_IB_COM_RDMAWR_RESPONDER].sg_list = NULL;
-#endif
- conp->icom_rr[MPID_NEM_IB_COM_RDMAWR_RESPONDER].num_sge = 0;
- break;
-
- case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:{
- /* SR (send request) template */
- conp->icom_sr =
- (struct ibv_send_wr *) MPIU_Malloc(sizeof(struct ibv_send_wr) *
- MPID_NEM_IB_COM_SCRATCH_PAD_SR_NTEMPLATE);
- memset(conp->icom_sr, 0,
- sizeof(struct ibv_send_wr) * MPID_NEM_IB_COM_SCRATCH_PAD_SR_NTEMPLATE);
-
- /* SR (send request) template for MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR */
-#ifdef HAVE_LIBDCFA
- memset(&(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0]), 0,
- sizeof(struct ibv_sge) * WR_SG_NUM);
-#else
- sge =
- (struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge) *
- MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR_NSGE);
- memset(sge, 0, sizeof(struct ibv_sge) * MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR_NSGE);
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].next = NULL;
-#ifdef HAVE_LIBDCFA
-#else
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list = sge;
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].num_sge = 1;
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].opcode = IBV_WR_RDMA_WRITE;
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].send_flags =
- IBV_SEND_SIGNALED | IBV_SEND_INLINE;
-
-
-#ifdef HAVE_LIBDCFA
- memset(&(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0]),
- 0, sizeof(struct ibv_sge) * WR_SG_NUM);
-#else
- sge =
- (struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge) *
- MPID_NEM_IB_COM_SCRATCH_PAD_GET_NSGE);
- memset(sge, 0, sizeof(struct ibv_sge) * MPID_NEM_IB_COM_SCRATCH_PAD_GET_NSGE);
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].next = NULL;
-#ifdef HAVE_LIBDCFA
-#else
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list = sge;
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].num_sge = 1;
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].opcode = IBV_WR_RDMA_READ;
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].send_flags = IBV_SEND_SIGNALED;
-
-
-#ifdef HAVE_LIBDCFA
- memset(&(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0]),
- 0, sizeof(struct ibv_sge) * WR_SG_NUM);
-#else
- sge =
- (struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge) *
- MPID_NEM_IB_COM_SCRATCH_PAD_CAS_NSGE);
- memset(sge, 0, sizeof(struct ibv_sge) * MPID_NEM_IB_COM_SCRATCH_PAD_CAS_NSGE);
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].next = NULL;
-#ifdef HAVE_LIBDCFA
-#else
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list = sge;
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].num_sge = 1;
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].opcode = IBV_WR_ATOMIC_CMP_AND_SWP;
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].send_flags = IBV_SEND_SIGNALED;
-
-#ifdef HAVE_LIBDCFA
- memset(&(conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].sg_list[0]),
- 0, sizeof(struct ibv_sge) * WR_SG_NUM);
-#else
- sge = (struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge));
- memset(sge, 0, sizeof(struct ibv_sge));
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].next = NULL;
-#ifdef HAVE_LIBDCFA
-#else
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].sg_list = sge;
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].num_sge = 1;
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].opcode = IBV_WR_SEND;
- conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].send_flags = IBV_SEND_SIGNALED;
-
- /* RR (receive request) template */
- conp->icom_rr =
- (struct ibv_recv_wr *) MPIU_Malloc(sizeof(struct ibv_recv_wr) *
- MPID_NEM_IB_COM_SCRATCH_PAD_RR_NTEMPLATE);
- memset(conp->icom_rr, 0,
- sizeof(struct ibv_recv_wr) * MPID_NEM_IB_COM_SCRATCH_PAD_RR_NTEMPLATE);
-
- /* RR (receive request) template for MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER */
-#ifdef HAVE_LIBDCFA
- memset(&(conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER].sg_list[0]), 0,
- sizeof(struct ibv_sge) * WR_SG_NUM);
-#else
- sge = (struct ibv_sge *) MPIU_Malloc(sizeof(struct ibv_sge));
- memset(sge, 0, sizeof(struct ibv_sge));
-#endif
- conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER].next = NULL;
-#ifdef HAVE_LIBDCFA
-#else
- conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER].sg_list = sge;
-#endif
- conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER].num_sge = 1;
- break;
- }
-
- case MPID_NEM_IB_COM_OPEN_UD:
- /* SGE (RDMA-send-from memory) template for MPID_NEM_IB_COM_UD_INITIATOR */
-#ifdef HAVE_LIBDCFA
- sge = &(conp->icom_sr[MPID_NEM_IB_COM_UD_INITIATOR].sg_list[0]);
- memset(sge, 0, sizeof(struct ibv_sge) * WR_SG_NUM);
-#else
- sge = (struct ibv_sge *) MPIU_Calloc(1, sizeof(struct ibv_sge));
-#endif
- /* addr to addr + length - 1 will be on the payload, but search backword for "<= 40" */
- sge[0].addr = (uint64_t) conp->icom_mem[MPID_NEM_IB_COM_UDWR_FROM] + 40;
- sge[0].length = MPID_NEM_IB_COM_UDBUF_SZSEG - 40;
- sge[0].lkey = conp->icom_mrlist[MPID_NEM_IB_COM_UDWR_FROM]->lkey;
-
-
- conp->icom_ah_attr =
- (struct ibv_ah_attr *) MPIU_Calloc(MPID_NEM_IB_COM_UD_SR_NTEMPLATE,
- sizeof(struct ibv_ah_attr));
-
- conp->icom_ah_attr[MPID_NEM_IB_COM_UD_INITIATOR].sl = 0;
- conp->icom_ah_attr[MPID_NEM_IB_COM_UD_INITIATOR].src_path_bits = 0;
- conp->icom_ah_attr[MPID_NEM_IB_COM_UD_INITIATOR].static_rate = 0; /* not limit on static rate (100% port speed) */
- conp->icom_ah_attr[MPID_NEM_IB_COM_UD_INITIATOR].is_global = 0;
- conp->icom_ah_attr[MPID_NEM_IB_COM_UD_INITIATOR].port_num = conp->icom_port;
-
- /* SR (send request) template for MPID_NEM_IB_COM_UD_INITIATOR */
- conp->icom_sr =
- (struct ibv_send_wr *) MPIU_Calloc(MPID_NEM_IB_COM_UD_SR_NTEMPLATE,
- sizeof(struct ibv_send_wr));
-
- conp->icom_sr[MPID_NEM_IB_COM_UD_INITIATOR].next = NULL;
-#ifdef HAVE_LIBDCFA
-#else
- conp->icom_sr[MPID_NEM_IB_COM_UD_INITIATOR].sg_list = sge;
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_UD_INITIATOR].num_sge = 1;
- conp->icom_sr[MPID_NEM_IB_COM_UD_INITIATOR].opcode = IBV_WR_SEND;
- conp->icom_sr[MPID_NEM_IB_COM_UD_INITIATOR].send_flags = IBV_SEND_SIGNALED;
-
- conp->icom_sr[MPID_NEM_IB_COM_UD_INITIATOR].wr.ud.remote_qkey = MPID_NEM_IB_COM_QKEY;
-
- /* SGE (scatter gather element) template for recv */
-#ifdef HAVE_LIBDCFA
- sge = &(conp->icom_rr[MPID_NEM_IB_COM_UD_RESPONDER].sg_list[0]);
- memset(sge, 0, sizeof(struct ibv_sge) * WR_SG_NUM);
-#else
- sge = (struct ibv_sge *) MPIU_Calloc(1, sizeof(struct ibv_sge));
-#endif
- sge[0].addr = (uint64_t) conp->icom_mem[MPID_NEM_IB_COM_UDWR_TO];
- sge[0].length = MPID_NEM_IB_COM_UDBUF_SZ;
- sge[0].lkey = conp->icom_mrlist[MPID_NEM_IB_COM_UDWR_TO]->lkey;
-
- /* RR (receive request) template for MPID_NEM_IB_COM_UD_RESPONDER */
- conp->icom_rr =
- (struct ibv_recv_wr *) MPIU_Calloc(MPID_NEM_IB_COM_UD_RR_NTEMPLATE,
- sizeof(struct ibv_recv_wr));
-
- /* create one dummy RR to ibv_post_recv */
- conp->icom_rr[MPID_NEM_IB_COM_UD_RESPONDER].next = NULL;
-#ifdef HAVE_LIBDCFA
-#else
- conp->icom_rr[MPID_NEM_IB_COM_UD_RESPONDER].sg_list = sge;
-#endif
- conp->icom_rr[MPID_NEM_IB_COM_UD_RESPONDER].num_sge = 1;
- break;
- }
-
- maxcon++;
-
- fn_exit:
- return ibcom_errno;
- err_exit:
- return -1;
- fn_fail:
- goto fn_exit;
-}
-
-/* Attach the shared scratch pad to connection `condesc`.
- *
- * 1. On first use, mmap() an anonymous, zero-filled area of `sz` bytes and
- *    publish it as MPID_nem_ib_scratch_pad; later calls reuse that mapping
- *    (the existing mapping is not resized to the new `sz`).
- * 2. Fetch a memory region for the pad with MPID_nem_ib_com_reg_mr_fetch(),
- *    requesting IBV_ACCESS_REMOTE_ATOMIC, and store it in
- *    conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_TO].
- *
- * Only connections opened with MPID_NEM_IB_COM_OPEN_SCRATCH_PAD are accepted.
- * Returns ibcom_errno: 0 on success; failures go through
- * MPID_NEM_IB_COM_ERR_CHKANDJUMP, which presumably stores -1 (macro defined
- * outside this chunk -- TODO confirm). */
-int MPID_nem_ib_com_alloc(int condesc, int sz)
-{
-    MPID_nem_ib_com_t *conp;
-    int ibcom_errno = 0;
-#ifdef MPID_NEM_IB_DEBUG_IBCOM
-    /* only referenced from a dprintf() below */
-    int mr_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE;
-#endif
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-
-    switch (conp->open_flag) {
-
-    case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
-        /* RDMA-write-to local memory area */
-        MPID_nem_ib_scratch_pad_ref_count++;
-        if (!MPID_nem_ib_scratch_pad) {
-            /* Map into a local first: publishing a failed mmap() result would
-             * leave the global at MAP_FAILED (non-NULL), and the next call
-             * would then skip the mapping and use (void *) -1 as a buffer. */
-            void *pad = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-            dprintf("MPID_nem_ib_com_alloc,mmap=%p,len=%d\n", pad, sz);
-            MPID_NEM_IB_COM_ERR_CHKANDJUMP(pad == MAP_FAILED, -1,
-                                           dprintf("failed to allocate buffer\n"));
-            dprintf("MPID_nem_ib_com_alloc,MPID_nem_ib_scratch_pad=%p\n", pad);
-            memset(pad, 0, sz);
-            MPID_nem_ib_scratch_pad = pad;
-        }
-        conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_TO] = MPID_nem_ib_scratch_pad;
-        conp->icom_msize[MPID_NEM_IB_COM_SCRATCH_PAD_TO] = sz;
-
-        conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_TO] =
-            MPID_nem_ib_com_reg_mr_fetch(conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_TO],
-                                         conp->icom_msize[MPID_NEM_IB_COM_SCRATCH_PAD_TO],
-                                         IBV_ACCESS_REMOTE_ATOMIC, MPID_NEM_IB_COM_REG_MR_STICKY);
-        MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_TO], -1,
-                                       dprintf("ibv_reg_mr failed with mr_flags=0x%x\n", mr_flags));
-
-#ifdef HAVE_LIBDCFA
-        dprintf("MPID_nem_ib_com_alloc,fd=%d,rmem=%p\n", condesc,
-                conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_TO]->buf);
-#else
-        dprintf("MPID_nem_ib_com_alloc,fd=%d,rmem=%p\n", condesc,
-                conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_TO]->addr);
-#endif
-        dprintf("MPID_nem_ib_com_alloc,fd=%d,rkey=%08x\n", condesc,
-                conp->icom_mrlist[MPID_NEM_IB_COM_SCRATCH_PAD_TO]->rkey);
-        break;
-    default:
-        MPID_NEM_IB_COM_ERR_CHKANDJUMP(1, -1,
-                                       dprintf("MPID_nem_ib_com_alloc, invalid open_flag=%d\n",
-                                               conp->open_flag));
-        break;
-    }
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/* Drop one reference on the shared scratch pad on behalf of connection
- * `condesc`; when the last reference goes away the pad is munmap()ed using
- * the caller-supplied length `sz` and the global pointer is reset to NULL.
- *
- * Connections not opened with MPID_NEM_IB_COM_OPEN_SCRATCH_PAD are rejected.
- * Returns ibcom_errno (0 on success). */
-int MPID_nem_ib_com_free(int condesc, int sz)
-{
-    int ibcom_errno = 0;
-    int unmap_rc;
-    MPID_nem_ib_com_t *conp;
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-
-    if (conp->open_flag == MPID_NEM_IB_COM_OPEN_SCRATCH_PAD) {
-        MPIU_Assert(MPID_nem_ib_scratch_pad_ref_count > 0);
-        MPID_nem_ib_scratch_pad_ref_count -= 1;
-        if (MPID_nem_ib_scratch_pad_ref_count == 0) {
-            /* last user is gone: give the mapping back */
-            unmap_rc = munmap(MPID_nem_ib_scratch_pad, sz);
-            MPID_NEM_IB_COM_ERR_CHKANDJUMP(unmap_rc, -1, dprintf("munmap"));
-            MPID_nem_ib_scratch_pad = NULL;
-            dprintf("ib_com_free,MPID_nem_ib_scratch_pad is freed\n");
-        }
-    }
-    else {
-        MPID_NEM_IB_COM_ERR_CHKANDJUMP(1, -1,
-                                       dprintf("MPID_nem_ib_com_free, invalid open_flag=%d\n",
-                                               conp->open_flag));
-    }
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-/* Close the connection identified by condesc.
- *
- * Calls MPID_nem_ib_com_register_cache_release() and then MPID_nem_ib_com_clean()
- * on the connection; the clean-up call is made before the release result is
- * checked, so it happens even when the release reported an error.
- *
- * Returns ibcom_errno: 0 when the release succeeded.  On a release failure
- * MPID_NEM_IB_COM_ERR_CHKANDJUMP presumably stores -1 and jumps to fn_fail, in
- * which case maxcon is left untouched (macro is defined outside this chunk;
- * TODO confirm).
- */
-int MPID_nem_ib_com_close(int condesc)
-{
- MPID_nem_ib_com_t *conp;
- int ibcom_errno = 0;
-
- dprintf("MPID_nem_ib_com_close,condesc=%d\n", condesc);
-
- MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp); /* resolves condesc into conp; presumably
-                                                      * bails out on an invalid descriptor --
-                                                      * confirm against the macro definition */
- ibcom_errno = MPID_nem_ib_com_register_cache_release();
- MPID_nem_ib_com_clean(conp); /* runs regardless of the release result above */
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ibcom_errno, -1,
- printf("MPID_nem_ib_com_register_cache_release"));
- --maxcon; /* counterpart of the maxcon++ done at the end of MPID_nem_ib_com_open */
-
- fn_exit:
- return ibcom_errno;
- fn_fail:
- goto fn_exit;
-}
-
-/* Drive the queue pair of connection `condesc` through INIT -> RTR -> RTS
- * and mark the connection as connected.
- *
- * - RC and scratch-pad connections use the modify_qp_to_init/rtr/rts helpers;
- *   scratch-pad QPs additionally request IBV_ACCESS_REMOTE_ATOMIC at INIT.
- *   remote_qpnum, remote_lid and remote_gid are handed to the RTR transition.
- * - UD connections issue the three ibv_modify_qp() calls directly and do not
- *   use the remote_* arguments.
- * - Any other open_flag is rejected instead of silently marking the
- *   connection as connected, matching MPID_nem_ib_com_alloc/free.
- *
- * Returns 0 on success, -1 when the connection is already connected, the
- * helper's error code when an RC/scratch-pad transition fails, and whatever
- * MPID_NEM_IB_COM_ERR_CHKANDJUMP stores (presumably -1 -- macro defined
- * outside this chunk) for UD failures and invalid flags. */
-int MPID_nem_ib_com_rts(int condesc, int remote_qpnum, uint16_t remote_lid,
-                        union ibv_gid *remote_gid)
-{
-    MPID_nem_ib_com_t *conp;
-    int ibcom_errno = 0;
-    int ib_errno;
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-    if (conp->icom_connected == 1) {
-        ibcom_errno = -1;
-        goto fn_fail;
-    }
-
-    struct ibv_qp_attr attr;
-    int flags;
-
-    switch (conp->open_flag) {
-    case MPID_NEM_IB_COM_OPEN_SCRATCH_PAD:
-    case MPID_NEM_IB_COM_OPEN_RC:
-        /* Init QP: the two connection types differ only in the access flags */
-        ib_errno =
-            modify_qp_to_init(conp->icom_qp, conp->icom_port,
-                              (conp->open_flag == MPID_NEM_IB_COM_OPEN_SCRATCH_PAD) ?
-                              IBV_ACCESS_REMOTE_ATOMIC : 0);
-        if (ib_errno) {
-            fprintf(stderr, "change QP state to INIT failed\n");
-            ibcom_errno = ib_errno;
-            goto fn_fail;
-        }
-        /* Modify QP TO RTR status */
-        ib_errno =
-            modify_qp_to_rtr(conp->icom_qp, remote_qpnum, remote_lid, remote_gid, conp->icom_port,
-                             0);
-        conp->remote_lid = remote_lid;  /* for debug */
-        if (ib_errno) {
-            fprintf(stderr, "failed to modify QP state to RTR\n");
-            ibcom_errno = ib_errno;
-            goto fn_fail;
-        }
-        /* Modify QP TO RTS status */
-        ib_errno = modify_qp_to_rts(conp->icom_qp);
-        if (ib_errno) {
-            fprintf(stderr, "failed to modify QP state to RTS\n");
-            ibcom_errno = ib_errno;
-            goto fn_fail;
-        }
-        break;
-    case MPID_NEM_IB_COM_OPEN_UD:
-        /* INIT */
-        memset(&attr, 0, sizeof(attr));
-        attr.qp_state = IBV_QPS_INIT;
-        attr.port_num = conp->icom_port;
-        attr.pkey_index = 0;
-        attr.qkey = MPID_NEM_IB_COM_QKEY;
-        flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY;
-        ib_errno = ibv_modify_qp(conp->icom_qp, &attr, flags);
-        MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, perror("ibv_modify_qp"));
-
-        /* RTR */
-        memset(&attr, 0, sizeof(attr));
-        attr.qp_state = IBV_QPS_RTR;
-        flags = IBV_QP_STATE;
-        ib_errno = ibv_modify_qp(conp->icom_qp, &attr, flags);
-        MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, perror("ibv_modify_qp"));
-
-        /* RTS */
-        memset(&attr, 0, sizeof(attr));
-        attr.qp_state = IBV_QPS_RTS;
-        attr.sq_psn = 0;
-        flags = IBV_QP_STATE | IBV_QP_SQ_PSN;
-        ib_errno = ibv_modify_qp(conp->icom_qp, &attr, flags);
-        MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, perror("ibv_modify_qp"));
-        break;
-    default:
-        /* no QP transition was performed, so do not report the connection as connected */
-        MPID_NEM_IB_COM_ERR_CHKANDJUMP(1, -1,
-                                       dprintf("MPID_nem_ib_com_rts, invalid open_flag=%d\n",
-                                               conp->open_flag));
-        break;
-    }
-    conp->icom_connected = 1;
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-#define MPID_NEM_IB_ENABLE_INLINE
-/* <buf_from_out, buf_from_sz_out>: Free the slot in drain_scq */
-int MPID_nem_ib_com_isend(int condesc,
- uint64_t wr_id,
- void *prefix, int sz_prefix,
- void *hdr, int sz_hdr,
- void *data, int sz_data,
- int *copied,
- uint32_t local_ringbuf_type, uint32_t remote_ringbuf_type,
- void **buf_from_out, uint32_t * buf_from_sz_out)
-{
- MPID_nem_ib_com_t *conp;
- int ibcom_errno = 0;
- struct ibv_send_wr *bad_wr;
- int ib_errno;
- int num_sge;
-
- dprintf
- ("MPID_nem_ib_com_isend,prefix=%p,sz_prefix=%d,hdr=%p,sz_hdr=%d,data=%p,sz_data=%d,local_ringbuf_type=%d,remote_ringbuf_type=%d\n",
- prefix, sz_prefix, hdr, sz_hdr, data, sz_data, local_ringbuf_type, remote_ringbuf_type);
-
- MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
- if (conp->icom_connected == 0) {
- return -1;
- }
-
-
- int off_pow2_aligned;
- MPID_NEM_IB_OFF_POW2_ALIGNED(MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix +
- sz_hdr + sz_data);
- uint32_t sumsz = off_pow2_aligned + sizeof(MPID_nem_ib_netmod_trailer_t);
- int sz_pad =
- off_pow2_aligned - (MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr +
- sz_data);
-
- uint32_t buf_from_sz = MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr +
- sz_pad + sizeof(MPID_nem_ib_netmod_trailer_t);
- *buf_from_sz_out = buf_from_sz;
- void *buf_from = MPID_nem_ib_rdmawr_from_alloc(buf_from_sz);
- dprintf("isend,rdmawr_from_alloc=%p,sz=%d\n", buf_from, buf_from_sz);
- *buf_from_out = buf_from;
- struct ibv_mr *mr_rdmawr_from = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR(buf_from);
-
- if (sz_data > 16000) {
- //dprintf("MPID_nem_ib_com_isend,sz_data=%d,off_pow2_aligned=%d,sz_max=%ld\n", sz_data, off_pow2_aligned, MPID_NEM_IB_MAX_DATA_POW2);
- }
-
- num_sge = 0;
- uint32_t hdr_ringbuf_type = local_ringbuf_type;
- MPID_NEM_IB_NETMOD_HDR_SZ_SET(buf_from,
- MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) +
- sz_prefix + sz_hdr + sz_data);
- if (remote_ringbuf_type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
- hdr_ringbuf_type |= MPID_NEM_IB_RINGBUF_RELINDEX;
- MPID_NEM_IB_NETMOD_HDR_RELINDEX_SET(buf_from, conp->rsr_seq_num_tail);
- conp->rsr_seq_num_tail_last_sent = conp->rsr_seq_num_tail;
- dprintf("isend,rsr_seq_num_tail=%d\n", MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf_from));
- }
- if (local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED) {
- MPID_NEM_IB_NETMOD_HDR_VC_SET(buf_from, conp->remote_vc);
- dprintf("isend,remote_vc=%p\n", MPID_NEM_IB_NETMOD_HDR_VC_GET(buf_from));
- }
- MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_SET(buf_from, hdr_ringbuf_type);
- dprintf("isend,hdr_ringbuf_type=%08x\n", MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf_from));
-
- /* memcpy hdr is needed because hdr resides in stack when sending close-VC command */
- /* memcpy is performed onto MPID_NEM_IB_COM_RDMAWR_FROM buffer */
- void *hdr_copy = (uint8_t *) buf_from + MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type);
- memcpy(hdr_copy, prefix, sz_prefix);
- memcpy((uint8_t *) hdr_copy + sz_prefix, hdr, sz_hdr);
-#ifdef HAVE_LIBDCFA
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].mic_addr = (uint64_t) buf_from;
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr =
- mr_rdmawr_from->host_addr +
- ((uint64_t) buf_from - (uint64_t) MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(buf_from));
-
-#else
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr = (uint64_t) buf_from;
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].length =
- MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix + sz_hdr;
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].lkey = mr_rdmawr_from->lkey;
- num_sge += 1;
-
- struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache = NULL;
- if (sz_data) {
- //dprintf("MPID_nem_ib_com_isend,data=%p,sz_data=%d\n", data, sz_data);
- mr_cache = MPID_nem_ib_com_reg_mr_fetch(data, sz_data, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_cache, -1,
- printf("MPID_nem_ib_com_isend,ibv_reg_mr_fetch failed\n"));
- struct ibv_mr *mr_data = mr_cache->mr;
-#ifdef HAVE_LIBDCFA
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].mic_addr = (uint64_t) data;
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr =
- mr_data->host_addr + ((uint64_t) data - (uint64_t) data);
-#else
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr = (uint64_t) data;
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].length = sz_data;
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].lkey = mr_data->lkey;
- num_sge += 1;
- }
-
- MPID_nem_ib_netmod_trailer_t *netmod_trailer =
- (MPID_nem_ib_netmod_trailer_t *) ((uint8_t *) buf_from +
- MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) +
- sz_prefix + sz_hdr + sz_pad);
- netmod_trailer->tail_flag = MPID_NEM_IB_COM_MAGIC;
-#ifdef HAVE_LIBDCFA
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].mic_addr =
- (uint64_t) buf_from + MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix +
- sz_hdr;
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr =
- mr_rdmawr_from->host_addr + ((uint64_t) buf_from +
- MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix +
- sz_hdr - (uint64_t)
- MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(buf_from));
-#else
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].addr =
- (uint64_t) buf_from + MPID_NEM_IB_NETMOD_HDR_SIZEOF(local_ringbuf_type) + sz_prefix +
- sz_hdr;
-#endif
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].length =
- sz_pad + sizeof(MPID_nem_ib_netmod_trailer_t);
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].sg_list[num_sge].lkey = mr_rdmawr_from->lkey;
- num_sge += 1;
- dprintf("MPID_nem_ib_com_isend,sz_data=%d,pow2=%d,sz_pad=%d,num_sge=%d\n", sz_data,
- off_pow2_aligned, sz_pad, num_sge);
-
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].num_sge = num_sge;
- MPID_nem_ib_rc_send_request *wrap_wr_id = MPIU_Malloc(sizeof(MPID_nem_ib_rc_send_request));
- wrap_wr_id->wr_id = wr_id;
- wrap_wr_id->mf = MPID_NEM_IB_LAST_PKT;
- wrap_wr_id->mr_cache = (void *) mr_cache;
-
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr_id = (uint64_t) wrap_wr_id;
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr =
- (uint64_t) conp->local_ringbuf_start +
- MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t) (conp->sseq_num % conp->local_ringbuf_nslot));
- dprintf("isend,ringbuf_start=%p,local_head=%04ux,nslot=%d,rkey=%08x,remote_addr=%lx\n",
- conp->local_ringbuf_start, conp->sseq_num, conp->local_ringbuf_nslot,
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.rkey,
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr);
- if (conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr <
- (uint64_t) conp->local_ringbuf_start ||
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr >=
- (uint64_t) conp->local_ringbuf_start +
- MPID_NEM_IB_COM_RDMABUF_SZSEG * conp->local_ringbuf_nslot) {
- MPID_nem_ib_segv;
- }
- /* rkey is defined in MPID_nem_ib_com_connect_ringbuf */
-
- //dprintf("MPID_nem_ib_com_isend,condesc=%d,num_sge=%d,opcode=%08x,imm_data=%08x,wr_id=%016lx, raddr=%p, rkey=%08x\n", condesc, conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].num_sge, conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].opcode, conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].imm_data, conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr_id, conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.remote_addr, conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.rkey);
-
-#ifdef MPID_NEM_IB_ENABLE_INLINE
- if (sumsz <= conp->max_inline_data) {
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].send_flags |= IBV_SEND_INLINE;
- *copied = 1;
- }
- else {
- *copied = 0;
- }
-#endif
-#ifdef HAVE_LIBDCFA
- ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE]);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
- dprintf("MPID_nem_ib_com_isend, ibv_post_send, rc=%d\n",
- ib_errno));
-#else
- ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE], &bad_wr);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
- dprintf
- ("MPID_nem_ib_com_isend, ibv_post_send, rc=%d, bad_wr=%p\n",
- ib_errno, bad_wr));
-#endif
-#ifdef MPID_NEM_IB_ENABLE_INLINE
- if (sumsz <= conp->max_inline_data) {
- conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].send_flags &= ~IBV_SEND_INLINE;
- }
-#endif
-
- conp->sseq_num += 1;
- conp->ncom += 1;
- fn_exit:
- return ibcom_errno;
- fn_fail:
- goto fn_exit;
-}
-
-/*
- * Post the RDMA-write responder receive request of a connection.
- *
- * condesc: connection descriptor; MPID_NEM_IB_RANGE_CHECK_WITH_ERROR turns it
- *          into the connection entry `conp`.
- * wr_id:   work-request id written into the receive request before posting.
- *
- * Returns 0 when the request was posted.  When ibv_post_recv reports an error,
- * a message is printed to stderr and that error code is returned as is.
- */
-int MPID_nem_ib_com_irecv(int condesc, uint64_t wr_id)
-{
-    int ibcom_errno = 0;
-    int post_rc;
-    struct ibv_recv_wr *bad_wr;
-    struct ibv_recv_wr *rr;
-    MPID_nem_ib_com_t *conp;
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-
-    rr = &conp->icom_rr[MPID_NEM_IB_COM_RDMAWR_RESPONDER];
-    rr->wr_id = wr_id;
-
-#ifdef HAVE_LIBDCFA
-    /* the DCFA variant of ibv_post_recv takes no bad-request out parameter */
-    post_rc = ibv_post_recv(conp->icom_qp, rr);
-    if (post_rc != 0) {
-        fprintf(stderr, "MPID_nem_ib_com_irecv: failed to post receive, ib_errno=%d\n", post_rc);
-        ibcom_errno = post_rc;
-        goto fn_fail;
-    }
-#else
-    post_rc = ibv_post_recv(conp->icom_qp, rr, &bad_wr);
-    if (post_rc != 0) {
-        fprintf(stderr, "MPID_nem_ib_com_irecv: failed to post receive, ib_errno=%d,bad_wr=%p\n",
-                post_rc, bad_wr);
-        ibcom_errno = post_rc;
-        goto fn_fail;
-    }
-#endif
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/*
- * Post the UD initiator send request of a connection toward a remote UD
- * queue pair.
- *
- * condesc:    connection descriptor (resolved by
- *             MPID_NEM_IB_RANGE_CHECK_WITH_ERROR).
- * remote_gid: not used by this function.
- * remote_lid: destination LID stored into the address-handle attributes.
- * remote_qpn: destination queue-pair number.
- * imm_data:   immediate data stored into the send request.
- * wr_id:      work-request id stored into the send request.
- *
- * Returns 0 on success and -1 on failure.  In a HAVE_LIBDCFA build the
- * function always fails because address handles cannot be created there.
- *
- * NOTE(review): a new address handle is created on every call and nothing in
- * this function destroys it; confirm that a completion path releases it,
- * otherwise each UD send leaks one handle.
- */
-int MPID_nem_ib_com_udsend(int condesc, union ibv_gid *remote_gid, uint16_t remote_lid,
-                           uint32_t remote_qpn, uint32_t imm_data, uint64_t wr_id)
-{
-    MPID_nem_ib_com_t *conp;
-    struct ibv_send_wr *bad_wr;
-    int ibcom_errno = 0, ib_errno;
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-
-#ifdef HAVE_LIBDCFA
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(1, -1,
-                                   dprintf
-                                   ("MPID_nem_ib_com_udsend not supported by DCFA because DCFA doesn't have ibv_create_ah\n"));
-#else
-    /* prepare ibv_ah_attr */
-    conp->icom_ah_attr[MPID_NEM_IB_COM_UD_INITIATOR].dlid = remote_lid;
-
-    /* prepare ibv_ah */
-    struct ibv_ah *ah;
-    ah = ibv_create_ah(ib_pd, &conp->icom_ah_attr[MPID_NEM_IB_COM_UD_INITIATOR]);
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(!ah, -1, dprintf("ibv_create_ah\n"));
-
-    conp->icom_sr[MPID_NEM_IB_COM_UD_INITIATOR].wr.ud.ah = ah;
-    conp->icom_sr[MPID_NEM_IB_COM_UD_INITIATOR].wr.ud.remote_qpn = remote_qpn;
-    /* qkey is defined in open */
-
-    /* recv doesn't know the length, so the scatter/gather length is left as
-     * it was prepared */
-
-    conp->icom_sr[MPID_NEM_IB_COM_UD_INITIATOR].wr_id = wr_id;
-    conp->icom_sr[MPID_NEM_IB_COM_UD_INITIATOR].imm_data = imm_data;
-
-    /* This branch is only compiled without HAVE_LIBDCFA, so only the
-     * three-argument form of ibv_post_send is needed here. */
-    ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_UD_INITIATOR], &bad_wr);
-    /* ibv_post_send hands back the error code as its return value instead of
-     * setting errno, so format the returned code rather than calling perror */
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
-                                   fprintf(stderr, "ibv_post_send: %s\n", strerror(ib_errno)));
-#endif /* DCFA */
-
-    conp->ncom += 1;
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/*
- * Post the UD responder receive request of a connection, with its
- * work-request id reset to 0.
- *
- * condesc: connection descriptor (resolved by
- *          MPID_NEM_IB_RANGE_CHECK_WITH_ERROR).
- *
- * Returns 0 on success, -1 when ibv_post_recv reports an error.
- */
-int MPID_nem_ib_com_udrecv(int condesc)
-{
-    int ibcom_errno = 0;
-    int post_rc;
-    struct ibv_recv_wr *bad_wr;
-    struct ibv_recv_wr *rr;
-    MPID_nem_ib_com_t *conp;
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-
-    /* fill in the receive request */
-    rr = &conp->icom_rr[MPID_NEM_IB_COM_UD_RESPONDER];
-    rr->wr_id = 0;
-
-    /* hand it to the receive queue */
-#ifdef HAVE_LIBDCFA
-    post_rc = ibv_post_recv(conp->icom_qp, rr);
-#else
-    post_rc = ibv_post_recv(conp->icom_qp, rr, &bad_wr);
-#endif
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(post_rc, -1, dprintf("ibv_post_recv ib_errno=%d\n", post_rc));
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/*
- * Post the LMT initiator work request of a connection, describing a transfer
- * between the remote region (raddr, rkey) and the local buffer laddr.
- *
- * condesc: connection descriptor (resolved by
- *          MPID_NEM_IB_RANGE_CHECK_WITH_ERROR).
- * wr_id:   caller's request id; it is wrapped in a heap-allocated
- *          MPID_nem_ib_rc_send_request whose address becomes the verbs wr_id.
- * raddr:   remote address of the transfer.
- * sz_data: number of bytes; must be non-zero.
- * rkey:    remote key for raddr.
- * laddr:   local buffer; it is registered through the memory-region cache.
- * last:    stored in the `mf` field of the wrapper.
- *
- * Returns 0 on success, -1 on failure (not connected, zero size, failed
- * registration, failed allocation, or failed post).
- *
- * NOTE(review): on success the wrapper is presumably released by the code
- * that handles the completion -- confirm.  On a failed post no completion
- * will carry it, so it is freed here.
- */
-int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, long sz_data, uint32_t rkey,
-                          void *laddr, int last)
-{
-    MPID_nem_ib_com_t *conp;
-    int ibcom_errno = 0;
-    struct ibv_send_wr *bad_wr;
-    int ib_errno;
-    int num_sge = 0;
-
-    dprintf("MPID_nem_ib_com_lrecv,enter,raddr=%p,sz_data=%ld,laddr=%p\n", raddr, sz_data, laddr);
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_connected, -1,
-                                   dprintf("MPID_nem_ib_com_lrecv,not connected\n"));
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(!sz_data, -1, dprintf("MPID_nem_ib_com_lrecv,sz_data==0\n"));
-
-    /* register memory area containing data */
-    struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
-        MPID_nem_ib_com_reg_mr_fetch(laddr, sz_data, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_cache, -1,
-                                   dprintf("MPID_nem_ib_com_lrecv,ibv_reg_mr_fetch failed\n"));
-    struct ibv_mr *mr_data = mr_cache->mr;
-
-    /* wrap the caller's id together with the cache entry and the `last` flag */
-    MPID_nem_ib_rc_send_request *wrap_wr_id = MPIU_Malloc(sizeof(MPID_nem_ib_rc_send_request));
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(!wrap_wr_id, -1,
-                                   dprintf("MPID_nem_ib_com_lrecv,malloc failed\n"));
-    wrap_wr_id->wr_id = wr_id;
-    wrap_wr_id->mf = last;
-    wrap_wr_id->mr_cache = (void *) mr_cache;
-
-    num_sge = 0;
-
-#ifdef HAVE_LIBDCFA
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].mic_addr = (uint64_t) laddr;
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].addr =
-        mr_data->host_addr + ((uint64_t) laddr - (uint64_t) laddr);
-#else
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].addr = (uint64_t) laddr;
-#endif
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].length = sz_data;
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].sg_list[num_sge].lkey = mr_data->lkey;
-    num_sge += 1;
-
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].num_sge = num_sge;
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].wr_id = (uint64_t) wrap_wr_id;
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].wr.rdma.remote_addr = (uint64_t) raddr;
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR].wr.rdma.rkey = rkey;
-
-#ifdef HAVE_LIBDCFA
-    ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR]);
-    if (ib_errno) {
-        /* the request never reached the queue, so nobody else will free it */
-        MPIU_Free(wrap_wr_id);
-    }
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
-                                   dprintf("MPID_nem_ib_com_lrecv, ibv_post_send, rc=%d\n",
-                                           ib_errno));
-#else
-    ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_LMT_INITIATOR], &bad_wr);
-    if (ib_errno) {
-        /* the request never reached the queue, so nobody else will free it */
-        MPIU_Free(wrap_wr_id);
-    }
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
-                                   dprintf
-                                   ("MPID_nem_ib_com_lrecv, ibv_post_send, rc=%d, bad_wr=%p\n",
-                                    ib_errno, bad_wr));
-#endif
-
-    conp->ncom += 1;
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/*
- * Post the LMT put work request of a connection, describing a transfer from
- * the local buffer laddr to the remote region (raddr, rkey).
- * Uses the same QP as isend.
- *
- * condesc: connection descriptor (resolved by
- *          MPID_NEM_IB_RANGE_CHECK_WITH_ERROR).
- * wr_id:   caller's request id; it is wrapped in a heap-allocated
- *          MPID_nem_ib_rc_send_request whose address becomes the verbs wr_id.
- * raddr:   remote address of the transfer.
- * sz_data: number of bytes; must be non-zero.
- * rkey:    remote key for raddr.
- * laddr:   local buffer; it is registered through the memory-region cache.
- *
- * Returns 0 on success, -1 on failure (not connected, zero size, failed
- * registration, failed allocation, or failed post).
- *
- * NOTE(review): on success the wrapper is presumably released by the code
- * that handles the completion -- confirm.  On a failed post no completion
- * will carry it, so it is freed here.
- */
-int MPID_nem_ib_com_put_lmt(int condesc, uint64_t wr_id, void *raddr, int sz_data, uint32_t rkey,
-                            void *laddr)
-{
-    MPID_nem_ib_com_t *conp;
-    int ibcom_errno = 0;
-    struct ibv_send_wr *bad_wr;
-    int ib_errno;
-    int num_sge;
-
-    dprintf("MPID_nem_ib_com_put_lmt,enter,sz_data=%d,laddr=%p\n", sz_data, laddr);
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_connected, -1,
-                                   dprintf("MPID_nem_ib_com_put_lmt,not connected\n"));
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(!sz_data, -1, dprintf("MPID_nem_ib_com_put_lmt,sz_data==0\n"));
-
-    num_sge = 0;
-
-    /* register memory area containing data */
-    struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
-        MPID_nem_ib_com_reg_mr_fetch(laddr, sz_data, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(!mr_cache, -1,
-                                   dprintf("MPID_nem_ib_com_put_lmt,ibv_reg_mr_fetch failed\n"));
-    struct ibv_mr *mr_data = mr_cache->mr;
-
-    /* wrap the caller's id together with the cache entry; allocate before
-     * touching the shared work request so a failed allocation changes nothing */
-    MPID_nem_ib_rc_send_request *wrap_wr_id = MPIU_Malloc(sizeof(MPID_nem_ib_rc_send_request));
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(!wrap_wr_id, -1,
-                                   dprintf("MPID_nem_ib_com_put_lmt,malloc failed\n"));
-    wrap_wr_id->wr_id = wr_id;
-    wrap_wr_id->mf = MPID_NEM_IB_LAST_PKT;
-    wrap_wr_id->mr_cache = (void *) mr_cache;
-
-#ifdef HAVE_LIBDCFA
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].sg_list[num_sge].mic_addr = (uint64_t) laddr;
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].sg_list[num_sge].addr =
-        mr_data->host_addr + ((uint64_t) laddr - (uint64_t) laddr);
-#else
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].sg_list[num_sge].addr = (uint64_t) laddr;
-#endif
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].sg_list[num_sge].length = sz_data;
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].sg_list[num_sge].lkey = mr_data->lkey;
-    num_sge += 1;
-
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].num_sge = num_sge;
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].wr_id = (uint64_t) wrap_wr_id;
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].wr.rdma.remote_addr = (uint64_t) raddr;
-    conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT].wr.rdma.rkey = rkey;
-
-#ifdef HAVE_LIBDCFA
-    ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT]);
-    if (ib_errno) {
-        /* the request never reached the queue, so nobody else will free it */
-        MPIU_Free(wrap_wr_id);
-    }
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
-                                   dprintf("MPID_nem_ib_com_put_lmt, ibv_post_send, rc=%d\n",
-                                           ib_errno));
-#else
-    ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_LMT_PUT], &bad_wr);
-    if (ib_errno) {
-        /* the request never reached the queue, so nobody else will free it */
-        MPIU_Free(wrap_wr_id);
-    }
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
-                                   dprintf
-                                   ("MPID_nem_ib_com_put_lmt, ibv_post_send, rc=%d, bad_wr=%p\n",
-                                    ib_errno, bad_wr));
-#endif
-
-    conp->ncom += 1;
-    dprintf("MPID_nem_ib_com_put_lmt,exit\n");
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/*
- * Allocate a receive buffer of sz_data bytes from the RDMA-write-from
- * allocator and post it with the scratch-pad responder receive request.
- * The buffer address doubles as the work-request id.
- *
- * condesc: connection descriptor (resolved by
- *          MPID_NEM_IB_RANGE_CHECK_WITH_ERROR).
- * sz_data: size in bytes of the buffer to allocate and post.
- *
- * Returns 0 on success, -1 when ibv_post_recv reports an error.
- *
- * NOTE(review): the buffer is not released here when the post fails --
- * confirm whether the caller is expected to do so.
- */
-int MPID_nem_ib_com_scratch_pad_recv(int condesc, int sz_data)
-{
-    int ibcom_errno = 0;
-    int post_rc;
-    struct ibv_recv_wr *bad_wr;
-    struct ibv_recv_wr *rr;
-    MPID_nem_ib_com_t *conp;
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-
-    void *recv_buf = MPID_nem_ib_rdmawr_from_alloc(sz_data);
-    struct ibv_mr *recv_mr = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR(recv_buf);
-
-    /* describe the buffer in the receive request */
-    rr = &conp->icom_rr[MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER];
-    rr->wr_id = (uint64_t) recv_buf;
-    rr->sg_list[0].lkey = recv_mr->lkey;
-    rr->sg_list[0].length = sz_data;
-#ifdef HAVE_LIBDCFA
-    rr->sg_list[0].mic_addr = (uint64_t) recv_buf;
-    rr->sg_list[0].addr =
-        recv_mr->host_addr + ((uint64_t) recv_buf -
-                              (uint64_t) MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(recv_buf));
-#else
-    rr->sg_list[0].addr = (uint64_t) recv_buf;
-#endif
-
-    /* hand it to the receive queue */
-#ifdef HAVE_LIBDCFA
-    post_rc = ibv_post_recv(conp->icom_qp, rr);
-#else
-    post_rc = ibv_post_recv(conp->icom_qp, rr, &bad_wr);
-#endif
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(post_rc, -1, dprintf("ibv_post_recv ib_errno=%d\n", post_rc));
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/*
- * Copy sz bytes from laddr into a freshly allocated RDMA-write-from buffer
- * and post the scratch-pad initiator work request that targets the remote
- * scratch pad at the given offset.
- *
- * condesc:         connection descriptor; the connection must have been
- *                  opened with MPID_NEM_IB_COM_OPEN_SCRATCH_PAD and be
- *                  connected.
- * wr_id:           work-request id stored into the send request.
- * offset:          byte offset added to the remote scratch-pad base address.
- * sz:              number of bytes to send; must be non-zero.
- * laddr:           source of the data; copied, so it may live on the stack.
- * buf_from_out:    receives the address of the allocated staging buffer.
- * buf_from_sz_out: receives sz.
- *
- * Returns 0 on success, -1 on failure.  The output parameters are written
- * before the request is posted, so they are valid even when the post fails.
- */
-int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset, int sz,
-                                    void *laddr, void **buf_from_out, uint32_t * buf_from_sz_out)
-{
-    MPID_nem_ib_com_t *conp;
-    int ibcom_errno = 0;
-    struct ibv_send_wr *bad_wr;
-    int ib_errno;
-
-    dprintf("MPID_nem_ib_com_put_scratch_pad,enter,wr_id=%llx,offset=%llx,sz=%d,laddr=%p\n",
-            (unsigned long long) wr_id, (unsigned long long) offset, sz, laddr);
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(conp->open_flag != MPID_NEM_IB_COM_OPEN_SCRATCH_PAD, -1,
-                                   dprintf("MPID_nem_ib_com_put_scratch_pad,invalid open_flag=%d\n",
-                                           conp->open_flag));
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(!conp->icom_connected, -1,
-                                   dprintf("MPID_nem_ib_com_put_scratch_pad,not connected\n"));
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(!sz, -1, dprintf("MPID_nem_ib_com_put_scratch_pad,sz==0\n"));
-
-    /* peek at the payload only after the arguments have been validated */
-    dprintf("MPID_nem_ib_com_put_scratch_pad,data=%08x\n", *((uint32_t *) laddr));
-
-    /* Instead of using the pre-mmaped memory
-     * (conp->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM]), a staging buffer is
-     * allocated; when cm_progress calls this function, that memory and laddr
-     * are not equal. */
-    void *buf_from = MPID_nem_ib_rdmawr_from_alloc(sz);
-    memcpy(buf_from, laddr, sz);
-    dprintf("put_scratch_pad,rdmawr_from_alloc=%p,sz=%d\n", buf_from, sz);
-    struct ibv_mr *mr_rdmawr_from = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR(buf_from);
-
-    *buf_from_out = buf_from;
-    *buf_from_sz_out = sz;
-
-#ifdef HAVE_LIBDCFA
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].mic_addr = (uint64_t) buf_from;
-    /* the memory region belongs to the whole arena, so the host address is
-     * the region's base plus the buffer's offset inside the arena */
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].addr =
-        mr_rdmawr_from->host_addr +
-        ((uint64_t) buf_from - (uint64_t) MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(buf_from));
-#else
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].addr = (uint64_t) buf_from;
-#endif
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].length = sz;
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].sg_list[0].lkey = mr_rdmawr_from->lkey;
-
-    /* num_sge is defined in MPID_nem_ib_com_open */
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].wr_id = wr_id;
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].wr.rdma.remote_addr =
-        (uint64_t) conp->icom_rmem[MPID_NEM_IB_COM_SCRATCH_PAD_TO] + offset;
-    /* rkey is defined in MPID_nem_ib_com_reg_mr_connect */
-
-    dprintf("MPID_nem_ib_com_put_scratch_pad,wr.rdma.remote_addr=%llx\n",
-            (unsigned long long) conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].wr.rdma.
-            remote_addr);
-
-#ifdef HAVE_LIBDCFA
-    ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR]);
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
-                                   dprintf
-                                   ("MPID_nem_ib_com_put_scratch_pad, ibv_post_send, rc=%d\n",
-                                    ib_errno));
-#else
-    ib_errno =
-        ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR],
-                      &bad_wr);
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
-                                   dprintf
-                                   ("MPID_nem_ib_com_put_scratch_pad, ibv_post_send, rc=%d, bad_wr=%p\n",
-                                    ib_errno, bad_wr));
-#endif
-
-    conp->ncom_scratch_pad += 1;
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/*
- * Allocate a local buffer of sz bytes and post the scratch-pad "get" work
- * request that targets the remote scratch pad at the given offset.
- *
- * condesc:         connection descriptor (resolved by
- *                  MPID_NEM_IB_RANGE_CHECK_WITH_ERROR).
- * wr_id:           work-request id stored into the send request.
- * offset:          byte offset added to the remote scratch-pad base address.
- * sz:              number of bytes to transfer.
- * buf_from_out:    receives the address of the allocated local buffer.
- * buf_from_sz_out: receives sz.
- *
- * Returns 0 on success, -1 on failure.  The output parameters are written
- * before the request is posted, so they are valid even when the post fails.
- */
-int MPID_nem_ib_com_get_scratch_pad(int condesc,
-                                    uint64_t wr_id,
-                                    uint64_t offset, int sz,
-                                    void **buf_from_out, uint32_t * buf_from_sz_out)
-{
-    MPID_nem_ib_com_t *conp;
-    int ibcom_errno = 0;
-    struct ibv_send_wr *bad_wr;
-    int ib_errno;
-
-    dprintf("MPID_nem_ib_com_get_scratch_pad,enter,wr_id=%llx,offset=%llx,sz=%d\n",
-            (unsigned long long) wr_id, (unsigned long long) offset, sz);
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-
-    *buf_from_sz_out = sz;
-    void *buf_from = MPID_nem_ib_rdmawr_from_alloc(sz);
-    dprintf("get_scratch_pad,rdmawr_from_alloc=%p,sz=%d\n", buf_from, sz);
-    *buf_from_out = buf_from;
-    struct ibv_mr *mr_rdmawr_from = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR(buf_from);
-
-#ifdef HAVE_LIBDCFA
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0].mic_addr = (uint64_t) buf_from;
-    /* the memory region belongs to the whole arena, so the host address is
-     * the region's base plus the buffer's offset inside the arena */
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0].addr =
-        mr_rdmawr_from->host_addr +
-        ((uint64_t) buf_from - (uint64_t) MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(buf_from));
-#else
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0].addr = (uint64_t) buf_from;
-#endif
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0].length = sz;
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].sg_list[0].lkey = mr_rdmawr_from->lkey;
-
-    /* num_sge is defined in MPID_nem_ib_com_open */
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].wr_id = wr_id;
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].wr.rdma.remote_addr =
-        (uint64_t) conp->icom_rmem[MPID_NEM_IB_COM_SCRATCH_PAD_TO] + offset;
-    /* rkey is defined in MPID_nem_ib_com_reg_mr_connect */
-
-    dprintf("MPID_nem_ib_com_get_scratch_pad,wr.rdma.remote_addr=%llx\n",
-            (unsigned long long) conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].wr.rdma.
-            remote_addr);
-
-    /* diagnostics below name this function (they used to name put_scratch_pad) */
-#ifdef HAVE_LIBDCFA
-    ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET]);
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
-                                   dprintf
-                                   ("MPID_nem_ib_com_get_scratch_pad, ibv_post_send, rc=%d\n",
-                                    ib_errno));
-#else
-    ib_errno =
-        ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET], &bad_wr);
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
-                                   dprintf
-                                   ("MPID_nem_ib_com_get_scratch_pad, ibv_post_send, rc=%d, bad_wr=%p\n",
-                                    ib_errno, bad_wr));
-#endif
-
-    conp->ncom_scratch_pad += 1;
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/*
- * Allocate an 8-byte local buffer and post the scratch-pad compare-and-swap
- * work request that targets the remote scratch pad at the given offset.
- *
- * condesc:         connection descriptor (resolved by
- *                  MPID_NEM_IB_RANGE_CHECK_WITH_ERROR).
- * wr_id:           work-request id stored into the send request.
- * offset:          byte offset added to the remote scratch-pad base address.
- * compare:         value stored in wr.atomic.compare_add.
- * swap:            value stored in wr.atomic.swap.
- * buf_from_out:    receives the address of the allocated local buffer.
- * buf_from_sz_out: receives sizeof(uint64_t).
- *
- * Returns 0 on success, -1 on failure.  The output parameters are written
- * before the request is posted, so they are valid even when the post fails.
- */
-int MPID_nem_ib_com_cas_scratch_pad(int condesc,
-                                    uint64_t wr_id, uint64_t offset,
-                                    uint64_t compare, uint64_t swap,
-                                    void **buf_from_out, uint32_t * buf_from_sz_out)
-{
-    MPID_nem_ib_com_t *conp;
-    int ibcom_errno = 0;
-    struct ibv_send_wr *bad_wr;
-    int ib_errno;
-    uint32_t sz = sizeof(uint64_t);
-
-    dprintf("MPID_nem_ib_com_cas_scratch_pad,enter,wr_id=%llx,offset=%llx\n",
-            (unsigned long long) wr_id, (unsigned long long) offset);
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-
-    *buf_from_sz_out = sz;
-    void *buf_from = MPID_nem_ib_rdmawr_from_alloc(sz);
-    dprintf("cas_scratch_pad,rdmawr_from_alloc=%p,sz=%d\n", buf_from, sz);
-    *buf_from_out = buf_from;
-    struct ibv_mr *mr_rdmawr_from = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR(buf_from);
-
-#ifdef HAVE_LIBDCFA
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0].mic_addr = (uint64_t) buf_from;
-    /* the memory region belongs to the whole arena, so the host address is
-     * the region's base plus the buffer's offset inside the arena */
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0].addr =
-        mr_rdmawr_from->host_addr +
-        ((uint64_t) buf_from - (uint64_t) MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(buf_from));
-#else
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0].addr = (uint64_t) buf_from;
-#endif
-    /* same size as the buffer that was allocated and reported to the caller */
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0].length = sz;
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].sg_list[0].lkey = mr_rdmawr_from->lkey;
-
-    /* num_sge is defined in MPID_nem_ib_com_open */
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].wr_id = wr_id;
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].wr.atomic.remote_addr =
-        (uint64_t) conp->icom_rmem[MPID_NEM_IB_COM_SCRATCH_PAD_TO] + offset;
-    /* atomic.rkey is defined in MPID_nem_ib_com_reg_mr_connect */
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].wr.atomic.compare_add = compare;
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].wr.atomic.swap = swap;
-
-    /* report the member that was actually written above (wr.atomic) */
-    dprintf("MPID_nem_ib_com_cas_scratch_pad,wr.rdma.remote_addr=%llx\n",
-            (unsigned long long) conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].wr.atomic.
-            remote_addr);
-
-#ifdef HAVE_LIBDCFA
-    ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS]);
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
-                                   dprintf
-                                   ("MPID_nem_ib_com_cas_scratch_pad, ibv_post_send, rc=%d\n",
-                                    ib_errno));
-#else
-    ib_errno =
-        ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS], &bad_wr);
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
-                                   dprintf
-                                   ("MPID_nem_ib_com_cas_scratch_pad, ibv_post_send, rc=%d, bad_wr=%p\n",
-                                    ib_errno, bad_wr));
-#endif
-
-    conp->ncom_scratch_pad += 1;
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/*
- * Post the scratch-pad "WR" work request using a caller-supplied buffer.
- * Only the scatter/gather entry and the work-request id are filled in here;
- * the remote address of the request is not modified by this function.
- *
- * condesc:     connection descriptor (resolved by
- *              MPID_NEM_IB_RANGE_CHECK_WITH_ERROR).
- * wr_id:       work-request id stored into the send request.
- * buf_from:    local buffer; its memory region is looked up with
- *              MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR, so it presumably has
- *              to come from MPID_nem_ib_rdmawr_from_alloc -- confirm.
- * buf_from_sz: number of bytes in buf_from.
- *
- * Returns 0 on success, -1 when ibv_post_send reports an error.
- */
-int MPID_nem_ib_com_wr_scratch_pad(int condesc, uint64_t wr_id,
-                                   void *buf_from, uint32_t buf_from_sz)
-{
-    MPID_nem_ib_com_t *conp;
-    int ibcom_errno = 0;
-    struct ibv_send_wr *bad_wr;
-    int ib_errno;
-
-    dprintf("MPID_nem_ib_com_wr_scratch_pad,enter,wr_id=%llx,buf=%llx,sz=%d\n",
-            (unsigned long long) wr_id, (unsigned long long) buf_from, buf_from_sz);
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-
-    struct ibv_mr *mr_rdmawr_from = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR(buf_from);
-
-#ifdef HAVE_LIBDCFA
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].sg_list[0].mic_addr = (uint64_t) buf_from;
-    /* the memory region belongs to the whole arena, so the host address is
-     * the region's base plus the buffer's offset inside the arena */
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].sg_list[0].addr =
-        mr_rdmawr_from->host_addr +
-        ((uint64_t) buf_from - (uint64_t) MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(buf_from));
-#else
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].sg_list[0].addr = (uint64_t) buf_from;
-#endif
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].sg_list[0].length = buf_from_sz;
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].sg_list[0].lkey = mr_rdmawr_from->lkey;
-
-    /* num_sge is defined in MPID_nem_ib_com_open */
-    conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].wr_id = wr_id;
-
-    dprintf("MPID_nem_ib_com_wr_scratch_pad,wr.rdma.remote_addr=%llx\n",
-            (unsigned long long) conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR].wr.rdma.remote_addr);
-
-#ifdef HAVE_LIBDCFA
-    ib_errno = ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR]);
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
-                                   dprintf
-                                   ("MPID_nem_ib_com_wr_scratch_pad, ibv_post_send, rc=%d\n",
-                                    ib_errno));
-#else
-    ib_errno =
-        ibv_post_send(conp->icom_qp, &conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_WR], &bad_wr);
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1,
-                                   dprintf
-                                   ("MPID_nem_ib_com_wr_scratch_pad, ibv_post_send, rc=%d, bad_wr=%p\n",
-                                    ib_errno, bad_wr));
-#endif
-
-    conp->ncom_scratch_pad += 1;
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/*
- * Poll one of the shared completion queues for at most one completion.
- *
- * which_cq: selects the queue: MPID_NEM_IB_COM_RC_SHARED_RCQ,
- *           MPID_NEM_IB_COM_RC_SHARED_SCQ, MPID_NEM_IB_COM_UD_SHARED_RCQ or
- *           MPID_NEM_IB_COM_UD_SHARED_SCQ.
- * wc:       receives the completion entry, if any.
- * result:   receives the ibv_poll_cq return value; set to -1 when which_cq
- *           names no known queue.
- *
- * Returns 0 on success.  Returns the negative ibv_poll_cq result when polling
- * fails, and -1 for an unknown which_cq.
- */
-int MPID_nem_ib_com_poll_cq(int which_cq, struct ibv_wc *wc, int *result)
-{
-    int ibcom_errno = 0;
-
-    switch (which_cq) {
-    case MPID_NEM_IB_COM_RC_SHARED_RCQ:
-        *result = ibv_poll_cq(MPID_nem_ib_rc_shared_rcq, 1, wc);
-        break;
-    case MPID_NEM_IB_COM_RC_SHARED_SCQ:
-        *result = ibv_poll_cq(MPID_nem_ib_rc_shared_scq, 1, wc);
-        break;
-    case MPID_NEM_IB_COM_UD_SHARED_RCQ:
-        *result = ibv_poll_cq(MPID_nem_ib_ud_shared_rcq, 1, wc);
-        break;
-    case MPID_NEM_IB_COM_UD_SHARED_SCQ:
-        *result = ibv_poll_cq(MPID_nem_ib_ud_shared_scq, 1, wc);
-        break;
-    default:
-        /* without this branch *result would be tested below although nothing
-         * was ever stored in it */
-        dprintf("MPID_nem_ib_com_poll_cq,unknown which_cq=%d\n", which_cq);
-        *result = -1;
-        ibcom_errno = -1;
-        goto fn_fail;
-    }
-
-    if (*result < 0) {
-        dprintf
-            ("MPID_nem_ib_com_poll_cq,status=%08x,vendor_err=%08x,len=%d,opcode=%08x,wr_id=%016lx\n",
-             wc->status, wc->vendor_err, wc->byte_len, wc->opcode, wc->wr_id);
-        ibcom_errno = *result;
-        goto fn_fail;
-    }
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/*
- * Record the remote memory address and remote key of a connection and copy
- * the key into the prepared work requests that use it.
- *
- * condesc: connection descriptor (resolved by
- *          MPID_NEM_IB_RANGE_CHECK_WITH_ERROR).
- * rmem:    remote memory base address.
- * rkey:    remote key for that memory.
- *
- * For an RC connection the key goes to the non-inline send request and to
- * every chained inline send request; for a scratch-pad connection it goes to
- * the initiator, get and compare-and-swap requests.  Any other open_flag is
- * only reported through dprintf; the function still returns 0 in that case.
- */
-int MPID_nem_ib_com_reg_mr_connect(int condesc, void *rmem, int rkey)
-{
-    MPID_nem_ib_com_t *conp;
-    int ibcom_errno = 0;
-    int chain;
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-
-    if (conp->open_flag == MPID_NEM_IB_COM_OPEN_RC) {
-        conp->icom_rmem[MPID_NEM_IB_COM_RDMAWR_TO] = rmem;
-        conp->icom_rkey[MPID_NEM_IB_COM_RDMAWR_TO] = rkey;
-
-        conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.rkey =
-            conp->icom_rkey[MPID_NEM_IB_COM_RDMAWR_TO];
-        chain = 0;
-        while (chain < MPID_NEM_IB_COM_SMT_INLINE_NCHAIN) {
-            conp->icom_sr[MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 + chain].wr.rdma.rkey =
-                conp->icom_rkey[MPID_NEM_IB_COM_RDMAWR_TO];
-            chain++;
-        }
-    }
-    else if (conp->open_flag == MPID_NEM_IB_COM_OPEN_SCRATCH_PAD) {
-        conp->icom_rmem[MPID_NEM_IB_COM_SCRATCH_PAD_TO] = rmem;
-        conp->icom_rkey[MPID_NEM_IB_COM_SCRATCH_PAD_TO] = rkey;
-
-        conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR].wr.rdma.rkey =
-            conp->icom_rkey[MPID_NEM_IB_COM_SCRATCH_PAD_TO];
-        conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_GET].wr.rdma.rkey =
-            conp->icom_rkey[MPID_NEM_IB_COM_SCRATCH_PAD_TO];
-        conp->icom_sr[MPID_NEM_IB_COM_SCRATCH_PAD_CAS].wr.atomic.rkey =
-            conp->icom_rkey[MPID_NEM_IB_COM_SCRATCH_PAD_TO];
-    }
-    else {
-        dprintf("invalid open_flag=%d\n", conp->open_flag);
-    }
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/*
- * Attach a connection to a ring buffer described by its type, start address
- * and slot count.
- *
- * condesc:      connection descriptor (resolved by
- *               MPID_NEM_IB_RANGE_CHECK_WITH_ERROR).
- * ringbuf_type: MPID_NEM_IB_RINGBUF_EXCLUSIVE or MPID_NEM_IB_RINGBUF_SHARED;
- *               any other value only prints a message.
- * start, nslot: address and number of slots of the ring buffer.
- * rkey:         remote key; used only when alloc_new_mr is non-zero.
- * remote_vc:    stored on the connection for a shared ring buffer.
- * alloc_new_mr:
- *   0: the new ring buffer lives in the same IB Memory Region as the
- *      previous one.  This happens when the connection switches to a
- *      smaller ring buffer.
- *   1: the new ring buffer lives in a new IB Memory Region.  This happens
- *      when the memory area shrank and then grew again.
- *
- * Returns 0 unless the descriptor check fails.
- */
-int MPID_nem_ib_com_connect_ringbuf(int condesc,
-                                    uint32_t ringbuf_type,
-                                    void *start, int rkey, int nslot,
-                                    MPIDI_VC_t * remote_vc, uint32_t alloc_new_mr)
-{
-    MPID_nem_ib_com_t *conp;
-    int ibcom_errno = 0;
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-
-    /* type, address and size */
-    conp->local_ringbuf_type = ringbuf_type;
-    conp->local_ringbuf_start = start;
-    conp->local_ringbuf_nslot = nslot;
-
-    if (conp->local_ringbuf_type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
-        /* reset head and tail pointers */
-        conp->sseq_num = 0;
-        conp->lsr_seq_num_tail = -1;
-    }
-    else if (conp->local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED) {
-        /* mark the buffer as full so that the sender has to ask */
-        conp->lsr_seq_num_tail = conp->sseq_num - conp->local_ringbuf_nslot;
-        conp->remote_vc = remote_vc;
-    }
-    else {
-        printf("unknown ringbuf type");
-    }
-
-    if (alloc_new_mr != 0) {
-        conp->local_ringbuf_rkey = rkey;
-        conp->icom_sr[MPID_NEM_IB_COM_SMT_NOINLINE].wr.rdma.rkey = rkey;
-    }
-
-    dprintf
-        ("connect_ringbuf,ringbuf_type=%d,rkey=%08x,start=%p,nslot=%d,sseq_num=%d,lsr_seq_num_tail=%d,remote_vc=%p,alloc_new_mr=%d\n",
-         conp->local_ringbuf_type, conp->local_ringbuf_rkey, conp->local_ringbuf_start,
-         conp->local_ringbuf_nslot, conp->sseq_num, conp->lsr_seq_num_tail, conp->remote_vc,
-         alloc_new_mr);
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/*
- * Copy one attribute of a connection into a caller-supplied buffer.
- *
- * condesc: connection descriptor (resolved by
- *          MPID_NEM_IB_RANGE_CHECK_WITH_ERROR).
- * key:     MPID_NEM_IB_COM_INFOKEY_QP_QPN, _PORT_LID, _PORT_GID or
- *          _PATTR_MAX_MSG_SZ.
- * out:     destination buffer.
- * out_len: capacity of `out` in bytes.
- *
- * At most out_len bytes are copied, and never more than the size of the
- * attribute itself, so an oversized out_len cannot read past the source.
- *
- * Returns 0 on success, -1 for an unknown key.
- */
-int MPID_nem_ib_com_get_info_conn(int condesc, int key, void *out, uint32_t out_len)
-{
-    int ibcom_errno = 0;
-    MPID_nem_ib_com_t *conp;
-    const void *src = NULL;
-    size_t src_len = 0;
-#ifdef HAVE_LIBDCFA
-    uint32_t max_msg_sz = 1073741824;   /* ConnectX-3 */
-#endif
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-
-    /* pick the source field and its size; the copy happens once, below */
-    switch (key) {
-    case MPID_NEM_IB_COM_INFOKEY_QP_QPN:
-        src = &conp->icom_qp->qp_num;
-        src_len = sizeof(conp->icom_qp->qp_num);
-        break;
-    case MPID_NEM_IB_COM_INFOKEY_PORT_LID:
-#ifdef HAVE_LIBDCFA
-        dprintf("MPID_nem_ib_com_get_info_conn,lid=%04x\n", ib_ctx->lid);
-        src = &ib_ctx->lid;
-        src_len = sizeof(ib_ctx->lid);
-#else
-        dprintf("MPID_nem_ib_com_get_info_conn,lid=%04x\n", conp->icom_pattr.lid);
-        src = &conp->icom_pattr.lid;
-        src_len = sizeof(conp->icom_pattr.lid);
-#endif
-        break;
-    case MPID_NEM_IB_COM_INFOKEY_PORT_GID:
-        src = &conp->icom_gid;
-        src_len = sizeof(conp->icom_gid);
-        break;
-    case MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ:
-#ifdef HAVE_LIBDCFA
-        src = &max_msg_sz;
-        src_len = sizeof(max_msg_sz);
-#else
-        src = &conp->icom_pattr.max_msg_sz;
-        src_len = sizeof(conp->icom_pattr.max_msg_sz);
-#endif
-        break;
-    default:
-        ibcom_errno = -1;
-        goto fn_fail;
-    }
-
-    /* clamp to the size of the source field to avoid reading past it */
-    memcpy(out, src, out_len < src_len ? out_len : src_len);
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/*
- * Copy one port attribute into a caller-supplied buffer.
- *
- * key:     only MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ is supported.
- * out:     destination buffer.
- * out_len: capacity of `out` in bytes.
- *
- * At most out_len bytes are copied, and never more than the size of the
- * attribute itself, so an oversized out_len cannot read past the source.
- *
- * Returns 0 on success, -1 for an unknown key.
- */
-int MPID_nem_ib_com_get_info_pattr(int key, void *out, uint32_t out_len)
-{
-    int ibcom_errno = 0;
-    const void *src = NULL;
-    size_t src_len = 0;
-#ifdef HAVE_LIBDCFA
-    uint32_t max_msg_sz = 1073741824;   /* ConnectX-3 */
-#endif
-
-    switch (key) {
-    case MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ:
-#ifdef HAVE_LIBDCFA
-        src = &max_msg_sz;
-        src_len = sizeof(max_msg_sz);
-#else
-        src = &ib_pattr.max_msg_sz;
-        src_len = sizeof(ib_pattr.max_msg_sz);
-#endif
-        break;
-    default:
-        ibcom_errno = -1;
-        goto fn_fail;
-    }
-
-    /* clamp to the size of the source field to avoid reading past it */
-    memcpy(out, src, out_len < src_len ? out_len : src_len);
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/*
- * Copy one attribute of a memory region of a connection into a
- * caller-supplied buffer.
- *
- * condesc: connection descriptor (resolved by
- *          MPID_NEM_IB_RANGE_CHECK_WITH_ERROR).
- * memid:   index into the connection's memory-region list; must satisfy
- *          0 <= memid < conp->icom_mrlen.
- * key:     MPID_NEM_IB_COM_INFOKEY_MR_ADDR, _MR_LENGTH or _MR_RKEY.
- * out:     destination buffer.
- * out_len: capacity of `out` in bytes; must not be negative.
- *
- * At most out_len bytes are copied, and never more than the size of the
- * attribute itself, so an oversized out_len cannot read past the source.
- *
- * Returns 0 on success, -1 for a bad memid, a negative out_len or an
- * unknown key.
- */
-int MPID_nem_ib_com_get_info_mr(int condesc, int memid, int key, void *out, int out_len)
-{
-    int ibcom_errno = 0;
-    MPID_nem_ib_com_t *conp;
-    struct ibv_mr *mr;
-    const void *src = NULL;
-    size_t src_len = 0;
-#ifdef HAVE_LIBDCFA
-    size_t length;
-#endif
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, conp);
-    /* a negative memid would index in front of the list */
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(memid < 0 || memid >= conp->icom_mrlen, -1,
-                                   dprintf("MPID_nem_ib_com_get_info_mr,wrong mem_id=%d\n", memid));
-    /* a negative length would turn into a huge size when passed to memcpy */
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(out_len < 0, -1,
-                                   dprintf("MPID_nem_ib_com_get_info_mr,wrong out_len=%d\n",
-                                           out_len));
-    mr = conp->icom_mrlist[memid];
-
-    switch (key) {
-    case MPID_NEM_IB_COM_INFOKEY_MR_ADDR:
-#ifdef HAVE_LIBDCFA
-        /* host_addr is created by ibv_reg_mr in MPID_nem_ib_com_open, */
-        /* ib_init read this host-addr, put it into KVS, the counter-party read it through KVS */
-        src = &mr->host_addr;
-        src_len = sizeof(mr->host_addr);
-#else
-        src = &mr->addr;
-        src_len = sizeof(mr->addr);
-#endif
-        break;
-    case MPID_NEM_IB_COM_INFOKEY_MR_LENGTH:
-#ifdef HAVE_LIBDCFA
-        assert(out_len == sizeof(size_t));
-        length = mr->size;      /* type of mr->size is int */
-        src = &length;
-        src_len = sizeof(length);
-#else
-        src = &mr->length;
-        src_len = sizeof(mr->length);
-#endif
-        break;
-    case MPID_NEM_IB_COM_INFOKEY_MR_RKEY:
-        src = &mr->rkey;
-        src_len = sizeof(mr->rkey);
-        break;
-    default:
-        dprintf("MPID_nem_ib_com_get_info_mr,unknown key=%d\n", key);
-        ibcom_errno = -1;
-        goto fn_fail;
-    }
-
-    /* clamp to the size of the source field to avoid reading past it */
-    memcpy(out, src, (size_t) out_len < src_len ? (size_t) out_len : src_len);
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/* Store in *out the address of the RDMA-write-from segment selected by the
- * connection's current send sequence number (sseq_num modulo the number of
- * segments).  Returns ibcom_errno, which is 0 unless the descriptor check
- * jumped to fn_fail. */
-int MPID_nem_ib_com_mem_rdmawr_from(int condesc, void **out)
-{
-    int ibcom_errno = 0;
-    MPID_nem_ib_com_t *com;
-    uint8_t *from_base;
-    uint16_t seg_idx;
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, com);
-
-    from_base = (uint8_t *) com->icom_mem[MPID_NEM_IB_COM_RDMAWR_FROM];
-    seg_idx = (uint16_t) (com->sseq_num % MPID_NEM_IB_COM_RDMABUF_NSEG);
-    *out = from_base + MPID_NEM_IB_COM_RDMABUF_SZSEG * seg_idx;
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/* Store in *out the connection's UD-write-from buffer
- * (icom_mem[MPID_NEM_IB_COM_UDWR_FROM]).  Returns ibcom_errno. */
-int MPID_nem_ib_com_mem_udwr_from(int condesc, void **out)
-{
-    int ibcom_errno = 0;
-    MPID_nem_ib_com_t *com;
-    void *from_buf;
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, com);
-
-    from_buf = com->icom_mem[MPID_NEM_IB_COM_UDWR_FROM];
-    *out = from_buf;
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/* Store in *out the connection's UD-write-to buffer
- * (icom_mem[MPID_NEM_IB_COM_UDWR_TO]).  Returns ibcom_errno. */
-int MPID_nem_ib_com_mem_udwr_to(int condesc, void **out)
-{
-    int ibcom_errno = 0;
-    MPID_nem_ib_com_t *com;
-    void *to_buf;
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, com);
-
-    to_buf = com->icom_mem[MPID_NEM_IB_COM_UDWR_TO];
-    *out = to_buf;
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/* Translate the connection's local occupancy-notify state into the matching
- * notify rate: the _HW state gives ..._NOTIFY_RATE_HW and the _LW state gives
- * ..._NOTIFY_RATE_LW.  Any other state leaves *notify_rate untouched and
- * returns -1. */
-int MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(int condesc, int *notify_rate)
-{
-    int ibcom_errno = 0;
-    MPID_nem_ib_com_t *com;
-    int lstate;
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, com);
-
-    lstate = com->rdmabuf_occupancy_notify_lstate;
-    if (lstate == MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_HW) {
-        *notify_rate = MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_RATE_HW;
-    }
-    else if (lstate == MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_LW) {
-        *notify_rate = MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_RATE_LW;
-    }
-    else {
-        ibcom_errno = -1;
-        goto fn_fail;
-    }
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/* Store in *rstate a pointer to the connection's
- * rdmabuf_occupancy_notify_rstate field, so the caller reads and writes the
- * field in place.  Returns ibcom_errno. */
-int MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get(int condesc, int **rstate)
-{
-    int ibcom_errno = 0;
-    MPID_nem_ib_com_t *com;
-    int *field;
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, com);
-
-    field = &com->rdmabuf_occupancy_notify_rstate;
-    *rstate = field;
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/* Store in *lstate a pointer to the connection's
- * rdmabuf_occupancy_notify_lstate field, so the caller reads and writes the
- * field in place.  Returns ibcom_errno. */
-int MPID_nem_ib_com_rdmabuf_occupancy_notify_lstate_get(int condesc, int **lstate)
-{
-    int ibcom_errno = 0;
-    MPID_nem_ib_com_t *com;
-    int *field;
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, com);
-
-    field = &com->rdmabuf_occupancy_notify_lstate;
-    *lstate = field;
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/* Resolve a connection descriptor to its MPID_nem_ib_com_t entry and store
- * that pointer in *MPID_nem_ib_com.  Returns ibcom_errno. */
-int MPID_nem_ib_com_obtain_pointer(int condesc, MPID_nem_ib_com_t ** MPID_nem_ib_com)
-{
-    int ibcom_errno = 0;
-    MPID_nem_ib_com_t *entry;
-
-    MPID_NEM_IB_RANGE_CHECK_WITH_ERROR(condesc, entry);
-
-    *MPID_nem_ib_com = entry;
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-static const char *strerror_tbl[] = {
- [0] = "zero",
- [1] = "one",
- [2] = "two",
- [3] = "three",
-};
-
-char *MPID_nem_ib_com_strerror(int err)
-{
- char *r;
- if (-err > 3) {
- r = MPIU_Malloc(256);
- sprintf(r, "%d", -err);
- goto fn_exit;
- }
- else {
- r = (char *) strerror_tbl[-err];
- }
- fn_exit:
- return r;
- //fn_fail:
- goto fn_exit;
-}
-
-/* Register [addr, addr + len) with the protection domain ib_pd and store the
- * resulting memory region in *mr.
- *
- * Local write, remote write and remote read access are always requested;
- * additional_flags is OR-ed on top.
- *
- * Returns 0 on success.  On failure *mr is NULL and the return value is the
- * errno left by ibv_reg_mr, or -1 if errno was 0, so that a failed
- * registration can never be reported as success. */
-int MPID_nem_ib_com_reg_mr(void *addr, long len, struct ibv_mr **mr,
-                           enum ibv_access_flags additional_flags)
-{
-    int ibcom_errno = 0;
-    int err = -1;
-    dprintf("MPID_nem_ib_com_reg_mr,addr=%p,len=%ld,mr=%p\n", addr, len, mr);
-
-    *mr =
-        ibv_reg_mr(ib_pd, addr, len,
-                   IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
-                   IBV_ACCESS_REMOTE_READ | additional_flags);
-
-    /* copy errno of ibv_reg_mr, but keep the -1 default when errno is 0 */
-    if (*mr == 0 && errno != 0) {
-        err = errno;
-    }
-
-    /* return the errno of ibv_reg_mr when error occurs */
-    MPID_NEM_IB_COM_ERR_CHKANDJUMP(*mr == 0, err,
-                                   dprintf("MPID_nem_ib_com_reg_mr,cannot register memory\n"));
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/* Deregister a memory region previously obtained from ibv_reg_mr.
- *
- * A NULL mr is accepted and treated as a no-op.
- * Returns 0 on success (or for NULL) and -1 when ibv_dereg_mr reports
- * failure. */
-int MPID_nem_ib_com_dereg_mr(struct ibv_mr *mr)
-{
-    int ib_errno;
-    int ibcom_errno = 0;
-
-    if (!mr) {
-        goto fn_exit;
-    }
-
-    /* Trace before deregistering: mr must not be dereferenced after
-     * ibv_dereg_mr has released it */
-#ifdef HAVE_LIBDCFA
-    dprintf("MPID_nem_ib_com_dereg_mr, addr=%p\n", mr->buf);
-#else
-    dprintf("MPID_nem_ib_com_dereg_mr, addr=%p\n", mr->addr);
-#endif
-
-    /* ibv_dereg_mr returns 0 on success and a non-zero value on failure */
-    ib_errno = ibv_dereg_mr(mr);
-    if (ib_errno != 0) {
-        fprintf(stderr, "cannot deregister memory\n");
-        ibcom_errno = -1;
-        goto fn_fail;
-    }
-
-  fn_exit:
-    return ibcom_errno;
-  fn_fail:
-    goto fn_exit;
-}
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
deleted file mode 100644
index ebce06c..0000000
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
+++ /dev/null
@@ -1,785 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
-/*
- * (C) 2012 NEC Corporation
- * (C) 2012 University of Tokyo
- * (C) 2014 RIKEN AICS
- *
- * See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <sys/types.h>
-#include <linux/mman.h> /* make it define MAP_ANONYMOUS */
-#include "mpid_nem_impl.h"
-
-#ifdef HAVE_LIBDCFA
-#include "dcfa.h"
-
-/*
-*** diff -p verbs.h dcfa.h (structures)
-same name, same fields
- struct ibv_device { };
- struct ibv_context { };
- struct ibv_pd { };
- struct ibv_ah_attr { };
-
-same name, different fields
- struct ibv_qp_init_attr {
-- void *qp_context;
-- struct ibv_xrc_domain *xrc_domain;
-};
-
- struct ibv_mr {
-- void *addr;
-+ void *buf;
-+ uint64_t host_addr;
-- size_t length;
-+ int size;
-- uint32_t handle;
-+ uint64_t handle;
-+ int flag; 1: offload
-- uint32_t lkey;
-+ int lkey;
-- uint32_t rkey;
-+ int rkey;
-};
-
- struct ibv_qp {
-+ struct mlx4_buf buf;
-+ int max_inline_data;
-+ int buf_size;
-
-+ uint32_t doorbell_qpn;
-+ uint32_t sq_signal_bits;
-+ int sq_spare_wqes;
-+ struct mlx4_wq sq;
-
-+ uint32_t *db; // doorbell addr for post recv
-+ struct mlx4_wq rq;
-+ ibmic_qp_conn_info_t remote_qp_info;
-
-- uint32_t handle;
-+ uint64_t handle;
-
-- struct ibv_context *context;
-- void *qp_context;
-- uint32_t events_completed;
-- struct ibv_xrc_domain *xrc_domain;
-- pthread_mutex_t mutex;
-- pthread_cond_t cond;
-};
-
- struct ibv_cq {
-- struct ibv_comp_channel *channel;
-- void *cq_context;
-- uint32_t handle;
-- uint32_t comp_events_completed;
-- uint32_t async_events_completed;
-
-- pthread_mutex_t mutex;
-- pthread_cond_t cond;
-
-+ struct mlx4_buf buf;
-+ uint32_t cons_index;
-+ uint32_t wait_index;
-+ uint32_t *set_ci_db;
-+ uint32_t *arm_db;
-+ int arm_sn;
-+ int cqe_size;
-+ uint64_t handle;
-};
-
- struct ibv_wc {
-- uint32_t src_qp;
-- uint16_t pkey_index;
-- uint16_t slid;
-- uint8_t sl;
-- uint8_t dlid_path_bits;
-};
-
- struct ibv_send_wr {
-- struct ibv_sge *sg_list;
-+ struct ibv_sge sg_list[WR_SG_NUM];
-+ uint64_t addr;
-+ uint32_t length;
-+ uint32_t lkey;
- };
-
- struct ibv_recv_wr {
-- struct ibv_sge *sg_list;
-+ struct ibv_sge sg_list[WR_SG_NUM];
- };
-
- struct ibv_sge {
-+ uint64_t mic_addr; // buffer address on mic
- };
-
-non-existent
-- struct ibv_port_attr { };
-
-
-*** diff -p verbs.h dcfa.h (functions)
-
-same name, same arguments
- ibv_get_device_list
- ibv_open_device
- ibv_close_device
- ibv_free_device_list
- ibv_alloc_pd
- ibv_dealloc_pd
- ibv_create_qp
- ibv_destroy_qp
- ibv_reg_mr
- ibv_dereg_mr
- ibv_destroy_cq
- ibv_poll_cq
- ibv_modify_qp
-
-same name, different arguments
-- int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr)
-+ int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr);
-
-- int ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr)
-+ int ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr);
-
-- struct ibv_cq *ibv_create_cq(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
-+ struct ibv_cq *ibv_create_cq(struct ibv_context *context, int cqe_max);
-
-non-existent
-- ibv_get_device_name
-- ibv_query_port
-- ibv_query_gid
-- ibv_create_ah
-struct ibv_ah *ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr);
-*/
-
-#else
-/* Original Infiniband */
-#include <infiniband/verbs.h>
-#endif
-
-static inline unsigned long long MPID_nem_ib_rdtsc_cpuid(void)
-{ /* Returns the 64-bit time-stamp counter, read with RDTSC after a CPUID instruction has been issued. */
- unsigned int lo, hi; /* low and high 32-bit halves of the counter */
- __asm__ __volatile__(// serialize: zero EAX, then execute CPUID before the counter is read
- "xorl %%eax,%%eax \n cpuid":::"%rax", "%rbx", "%rcx", "%rdx");
- __asm__ __volatile__("rdtsc":"=a"(lo), "=d"(hi)); /* "=a" receives EAX into lo, "=d" receives EDX into hi */
- return (unsigned long long) hi << 32 | lo; /* the cast binds before <<, so this is ((64-bit) hi << 32) | lo */
-}
-
-#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID 32
-
-extern struct ibv_cq *MPID_nem_ib_rc_shared_scq;
-extern struct ibv_cq *MPID_nem_ib_rc_shared_scq_scratch_pad;
-extern struct ibv_cq *MPID_nem_ib_rc_shared_rcq_scratch_pad;
-extern struct ibv_cq *MPID_nem_ib_ud_shared_rcq;
-extern uint8_t *MPID_nem_ib_scratch_pad;
-extern int MPID_nem_ib_scratch_pad_ref_count;
-extern char *MPID_nem_ib_rdmawr_from_alloc_free_list_front[MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID];
-extern char *MPID_nem_ib_rdmawr_from_alloc_arena_free_list[MPID_NEM_IB_RDMAWR_FROM_ALLOC_NID];
-extern struct ibv_mr *MPID_nem_ib_rdmawr_to_alloc_mr;
-extern uint8_t *MPID_nem_ib_rdmawr_to_alloc_start;
-extern uint8_t *MPID_nem_ib_rdmawr_to_alloc_free_list;
-
-#define MPID_NEM_IB_COM_SIZE (65536*2) /* Maximum number of QPs. One process uses 2 QPs. */
-#define MPID_NEM_IB_COM_INLINE_DATA (512-64) /* experimented max is 884 */ /* This is a requested lower bound: the value actually set is larger, and grows as this request grows, so check the actual value after QP creation. */
-
-#define MPID_NEM_IB_COM_MAX_SQ_CAPACITY (256/1)
-#define MPID_NEM_IB_COM_MAX_RQ_CAPACITY ((MPID_NEM_IB_COM_MAX_SQ_CAPACITY)+16) /* We pre-post_recv MPID_NEM_IB_COM_MAX_SQ_CAPACITY of commands */
-#define MPID_NEM_IB_COM_MAX_SGE_CAPACITY (32/2) /* maximum for ConnectX-3 looks like 32 */
-#define MPID_NEM_IB_COM_MAX_CQ_CAPACITY MPID_NEM_IB_COM_MAX_RQ_CAPACITY
-#define MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN (((MPID_NEM_IB_COM_MAX_CQ_CAPACITY)>>2)+((MPID_NEM_IB_COM_MAX_CQ_CAPACITY)>>1)) /* drain when reaching this amount */
-#define MPID_NEM_IB_COM_MAX_SQ_HEIGHT_DRAIN (((MPID_NEM_IB_COM_MAX_SQ_CAPACITY)>>2)+((MPID_NEM_IB_COM_MAX_SQ_CAPACITY)>>1)) /* drain when reaching this amount */
-#define MPID_NEM_IB_COM_AMT_CQ_DRAIN ((MPID_NEM_IB_COM_MAX_CQ_CAPACITY)>>2) /* drain this amount */
-#define MPID_NEM_IB_COM_MAX_RD_ATOMIC 4
-
-#define MPID_NEM_IB_COM_MAX_TRIES 1
-#define MPID_NEM_IB_COM_SCQ_FLG 1
-#define MPID_NEM_IB_COM_RCQ_FLG 2
-
-#define MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ 100
-#define MPID_NEM_IB_COM_INFOKEY_MR_ADDR 200
-#define MPID_NEM_IB_COM_INFOKEY_MR_LENGTH 201
-#define MPID_NEM_IB_COM_INFOKEY_MR_RKEY 202
-#define MPID_NEM_IB_COM_INFOKEY_QP_QPN 300
-#define MPID_NEM_IB_COM_INFOKEY_PORT_LID 400
-#define MPID_NEM_IB_COM_INFOKEY_PORT_GID 401
-
-
-/* buffers */
-#define MPID_NEM_IB_COM_NBUF_RDMA 2 /* number of <addr, sz, lkey, rkey> */
-#define MPID_NEM_IB_COM_RDMAWR_FROM 0 /* index to RDMA-write-from buffer */
-#define MPID_NEM_IB_COM_RDMAWR_TO 1 /* index to RDMA-write-to buffer */
-/* assuming that the unit (32768) is equal to the eager-RDMA-write threshold
- assuming that the multiplier (256) is
- equal to the max number of outstanding eager-RDMA-write transactions */
-#define MPID_NEM_IB_COM_RDMABUF_SZSEG (16384/4) //(16384+8+40+1) /* this size minus magics and headers must be 2^n because data might grow to the next 2^m boundary, see ib_impl.h, ib_com.c, src/mpid/ch3/src/mpid_isend.c */
-#define MPID_NEM_IB_COM_RDMABUF_SZ ((MPID_NEM_IB_COM_RDMABUF_SZSEG) * 16) /* (32768 * 256) */
-#define MPID_NEM_IB_COM_RDMABUF_NSEG ((MPID_NEM_IB_COM_RDMABUF_SZ) / (MPID_NEM_IB_COM_RDMABUF_SZSEG))
-
-#define MPID_NEM_IB_RINGBUF_SHARED_SZSEG (16384/4)
-#define MPID_NEM_IB_RINGBUF_SHARED_SZ ((MPID_NEM_IB_RINGBUF_SHARED_SZSEG) * 16)
-#define MPID_NEM_IB_RINGBUF_SHARED_NSEG ((MPID_NEM_IB_RINGBUF_SHARED_SZ) / (MPID_NEM_IB_RINGBUF_SHARED_SZSEG))
-
-#define MPID_NEM_IB_COM_SMT_INLINE_NCHAIN 8 /* maximum number of chained inline-send commands */
-#define MPID_NEM_IB_COM_RDMABUF_HIGH_WATER_MARK (((MPID_NEM_IB_COM_RDMABUF_NSEG)>>1)+((MPID_NEM_IB_COM_RDMABUF_NSEG)>>2))
-#define MPID_NEM_IB_COM_RDMABUF_LOW_WATER_MARK (((MPID_NEM_IB_COM_RDMABUF_NSEG)>>2))
-#define MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_HW 1
-#define MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_LW 2
-#define MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_RATE_HW /*1*/(((MPID_NEM_IB_COM_RDMABUF_NSEG)>>4) == 0 ? 1 : ((MPID_NEM_IB_COM_RDMABUF_NSEG)>>4))
-#define MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_RATE_LW (((MPID_NEM_IB_COM_RDMABUF_NSEG)>>2)) /*12*/ /* receiver tries to notify sender the number of releases when receiver find not-noticed releases of more than this number */
-#define MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_RATE_DELAY_MULTIPLIER(notify_rate) (notify_rate + (notify_rate>>1)) /* (notify_rate) */ /* send seq_num to the sender side if there is no chance to embed seq_num into a packet bound for the sender side for this number of release events */
-
-#define MPID_NEM_IB_COM_NBUF_UD 2 /* number of <addr, sz, lkey, rkey> */
-#define MPID_NEM_IB_COM_UDWR_FROM 0 /* index to UD-write-from buffer */
-#define MPID_NEM_IB_COM_UDWR_TO 1 /* index to UD-write-to buffer */
-#define MPID_NEM_IB_COM_UDBUF_SZ (128 * 8192) /* supporting 100K ranks with 10 rounds */
-#define MPID_NEM_IB_COM_UDBUF_SZSEG (128)
-#define MPID_NEM_IB_COM_UDBUF_NSEG (MPID_NEM_IB_COM_UDBUF_SZ / MPID_NEM_IB_COM_UDBUF_SZSEG)
-
-#define MPID_NEM_IB_COM_NBUF_SCRATCH_PAD 2 /* number of <addr, sz, lkey, rkey> */
-#define MPID_NEM_IB_COM_SCRATCH_PAD_FROM_SZ 4096
-#define MPID_NEM_IB_COM_SCRATCH_PAD_FROM 0
-#define MPID_NEM_IB_COM_SCRATCH_PAD_TO 1 /* index to RDMA-write-to buffer */
-
-/* send command templates */
-#define MPID_NEM_IB_COM_RC_SR_NTEMPLATE (8+1+2) /* number of request templates, 8 for inline-chained-smt, 1 for smt, 1 for lmt */
-#define MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 0 /* index to it */
-#define MPID_NEM_IB_COM_SMT_INLINE_CHAINED7 7
-#define MPID_NEM_IB_COM_SMT_NOINLINE 8
-#define MPID_NEM_IB_COM_LMT_INITIATOR 9 /* FIXME: bad naming */
-#define MPID_NEM_IB_COM_LMT_PUT 10
-
-/* recv command templates */
-#define MPID_NEM_IB_COM_RC_RR_NTEMPLATE 1 /* 1 for smt, */
-#define MPID_NEM_IB_COM_RDMAWR_RESPONDER 0 /* index to recv request template */
-
-/* sge template */
-#define MPID_NEM_IB_COM_SMT_INLINE_INITIATOR_NSGE 4 /* MPI header, (sz;magic), data x1, magic */
-#define MPID_NEM_IB_COM_SMT_NOINLINE_INITIATOR_NSGE 4 /* MPI header, (sz;magic), data x1, magic */
-#define MPID_NEM_IB_COM_LMT_INITIATOR_NSGE 1 /* data x1 */
-#define MPID_NEM_IB_COM_LMT_PUT_NSGE 1 /* data x1 */
-#define MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR_NSGE 1 /* QP state */
-#define MPID_NEM_IB_COM_SCRATCH_PAD_CAS_NSGE 1 /* QP state */
-#define MPID_NEM_IB_COM_SCRATCH_PAD_GET_NSGE 1
-
-#define MPID_NEM_IB_COM_UD_SR_NTEMPLATE 1
-#define MPID_NEM_IB_COM_UD_RR_NTEMPLATE 1
-#define MPID_NEM_IB_COM_UD_INITIATOR 0 /* index to send request template */
-#define MPID_NEM_IB_COM_UD_RESPONDER 0 /* index to recv request template */
-
-#define MPID_NEM_IB_COM_SCRATCH_PAD_SR_NTEMPLATE 4
-#define MPID_NEM_IB_COM_SCRATCH_PAD_RR_NTEMPLATE 1
-#define MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR 0 /* index to send request template */
-#define MPID_NEM_IB_COM_SCRATCH_PAD_CAS 1
-#define MPID_NEM_IB_COM_SCRATCH_PAD_GET 2
-#define MPID_NEM_IB_COM_SCRATCH_PAD_WR 3
-#define MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER 0 /* index to recv request template */
-
-/* Header prepended to the MPI packet */
-#define MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf) ((uint32_t)(((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first >> 61))
-#define MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_SET(buf, val) ((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first = (((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first & ~(7ULL<<61)) | ((uint64_t)(val) << 61)
-
-/* Accessors for bits [47:32] of the first header word (largest index of contiguous released slots). The SET argument is parenthesized before masking so an expression such as `a | b` is masked as a whole. */
-#define MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf) ((int16_t)((((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first >> 32) & 65535))
-#define MPID_NEM_IB_NETMOD_HDR_RELINDEX_SET(buf, val) ((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first = (((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first & ~(65535ULL<<32)) | ((uint64_t)((val) & 65535) << 32)
-
-/* Note that the result is put into [63:32] */
-#define MPID_NEM_IB_NETMOD_HDR_ACQADDRH_GET(buf) ((uint64_t)((((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first << 12) & (((1ULL<<32)-1)<<32)))
-/* Note that the value to put is located in [63:32] */
-#define MPID_NEM_IB_NETMOD_HDR_ACQADDRH_SET(buf, val) ((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first = (((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first & ~(((1ULL<<32)-1)<<20)) | (((val) & (((1ULL<<32)-1)<<32)) >> 12)
-
-/* Full 64-bit start address of acquired slots: the upper half lives in the first header word, the lower 32 bits in `second`. SET is one parenthesized comma expression so it stays a single statement under an unbraced if/else; buf and val are still evaluated more than once. */
-#define MPID_NEM_IB_NETMOD_HDR_ACQADDR_GET(buf) (MPID_NEM_IB_NETMOD_HDR_ACQADDRH_GET(buf)|((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->second)
-#define MPID_NEM_IB_NETMOD_HDR_ACQADDR_SET(buf, val) (MPID_NEM_IB_NETMOD_HDR_ACQADDRH_SET((buf), (val)), ((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->second = ((val) & ((1ULL<<32)-1)))
-
-#define MPID_NEM_IB_NETMOD_HDR_ACQAMTLOG_GET(buf) ((uint32_t)((((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first >> 16) & 15))
-#define MPID_NEM_IB_NETMOD_HDR_ACQAMTLOG_SET(buf, val) ((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first = (((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first & ~(15ULL<<16)) | ((uint64_t)(val) << 16)
-
-/* Accessors for bits [15:0] of the first header word (packet size without padding). SET does not mask val, so callers must pass a value that fits in 16 bits. */
-#define MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) ((uint32_t)(((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first & 65535))
-#define MPID_NEM_IB_NETMOD_HDR_SZ_SET(buf, val) ((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first = (((MPID_nem_ib_netmod_hdr_exclusive_t *)(buf))->first & ~65535ULL) | (val)
-
-#define MPID_NEM_IB_NETMOD_HDR_VC_GET(buf) ((struct MPIDI_VC *)(((uint64_t)((MPID_nem_ib_netmod_hdr_shared_t *)(buf))->third << 32) | (uint64_t)((MPID_nem_ib_netmod_hdr_shared_t *)(buf))->forth))
-#define MPID_NEM_IB_NETMOD_HDR_VC_SET(buf, val) ((MPID_nem_ib_netmod_hdr_shared_t *)(buf))->third = (uint64_t)(val) >> 32; ((MPID_nem_ib_netmod_hdr_shared_t *)(buf))->forth = (uint64_t)(val) & ((1ULL << 32) - 1);
-
-#define MPID_NEM_IB_NETMOD_HDR_SIZEOF(type) (((type) == MPID_NEM_IB_RINGBUF_EXCLUSIVE) ? sizeof(MPID_nem_ib_netmod_hdr_exclusive_t) : sizeof(MPID_nem_ib_netmod_hdr_shared_t))
-#define MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf) ((MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf) & MPID_NEM_IB_RINGBUF_EXCLUSIVE) ? sizeof(MPID_nem_ib_netmod_hdr_exclusive_t) : sizeof(MPID_nem_ib_netmod_hdr_shared_t))
-
-#define MPID_NEM_IB_NETMOD_HDR_HEAD_FLAG_PTR(buf) (&((MPID_nem_ib_netmod_hdr_shared_t *)(buf))->first)
-#define MPID_NEM_IB_NETMOD_HDR_HEAD_FLAG_SET(buf, val) ((MPID_nem_ib_netmod_hdr_shared_t *)(buf))->first = (val);
-
-typedef struct MPID_nem_ib_netmod_hdr_exclusive {
- /*
- * [63:61] ring buffer type
- * remote is exclusive:
- * [47:32] largest index of contiguous released slots 16-bit
- * reply to slot request:
- * [51:20] Start address of acquired slots, MSB part
- * [19:16] Log_2 of amount of acquired slots
- * [15:0] Packet size without padding
- */
- uint64_t first;
- /* jump case:
- * [31:0] Start address of acquired slots, LSB part
- */
- uint32_t second;
-
-} MPID_nem_ib_netmod_hdr_exclusive_t;
-
-typedef struct MPID_nem_ib_netmod_hdr_shared {
- uint64_t first; /* head-flag word: read and written through MPID_NEM_IB_NETMOD_HDR_HEAD_FLAG_PTR / _SET */
- uint32_t second; /* NOTE(review): presumably mirrors `second` of the exclusive header - confirm */
-
- /* remote is one slot:
- * [31:0] VC pointer in remote node, MSB part */
- uint32_t third; /* upper 32 bits of the pointer assembled by MPID_NEM_IB_NETMOD_HDR_VC_GET */
-
- /* remote is one slot:
- * [31:0] VC pointer in remote node, LSB part */
- uint32_t forth; /* lower 32 bits of that pointer; the name (i.e. "fourth") is referenced by the VC_GET/VC_SET macros */
-} MPID_nem_ib_netmod_hdr_shared_t;
-
-typedef struct MPID_nem_ib_netmod_trailer {
- uint8_t tail_flag;
- //uint32_t traits; /* for debug */
-} MPID_nem_ib_netmod_trailer_t;
-
-/* Allocator for RDMA write to buffer */
-typedef struct {
- /* Avoid polluting netmod_hdr and trailer */
- uint8_t padding[sizeof(MPID_nem_ib_netmod_hdr_shared_t)];
- uint8_t *next;
-}
-MPID_nem_ib_rdmawr_to_alloc_hdr_t;
-
-typedef struct {
- uint64_t wr_id; /* address of MPID_Request */
- int mf; /* more fragment (0 means the end of packet) */
- void *mr_cache; /* address of mr_cache_entry; its refc is decremented in drain_scq */
-} MPID_nem_ib_rc_send_request;
-
-#define MPID_NEM_IB_LMT_LAST_PKT 0
-#define MPID_NEM_IB_LMT_SEGMENT_LAST 1
-#define MPID_NEM_IB_LMT_PART_OF_SEGMENT 2
-#define MPID_NEM_IB_LAST_PKT MPID_NEM_IB_LMT_LAST_PKT
-
-/* Ring-buffer to which a remote node RDMA-writes */
-#define MPID_NEM_IB_NRINGBUF 64
-#define MPID_NEM_IB_RINGBUF_NSLOT 16
-
-/* Ring-buffer type. It is set by ringbuf_alloc on the receiver side
- and sent in SYNACK or ACK1 to the sender side and referenced by isend
- on the sender side and by poll on the receiver side */
-/* Exclusive ring buffer has been allocated */
-#define MPID_NEM_IB_RINGBUF_EXCLUSIVE 1
-/* Shared ring buffer has been allocated */
-#define MPID_NEM_IB_RINGBUF_SHARED 2
-#define MPID_NEM_IB_RINGBUF_RELINDEX 4
-
-typedef struct {
- uint32_t type; /* acquiring contiguous slots or a single slot */
- void *start;
- int nslot;
- MPIDI_VC_t *vc;
- uint64_t remote_released[(MPID_NEM_IB_COM_RDMABUF_NSEG + 63) / 64];
- int ref_count; /* number of VCs sharing the ring-buffer */
-} MPID_nem_ib_ringbuf_t;
-
-/* Represent a ring-buffer is exclusively acquired */
-extern uint64_t MPID_nem_ib_ringbuf_acquired[(MPID_NEM_IB_NRINGBUF + 63) / 64];
-
-/* Represent a ring-buffer is ready to poll */
-extern uint64_t MPID_nem_ib_ringbuf_allocated[(MPID_NEM_IB_NRINGBUF + 63) / 64];
-
-extern MPID_nem_ib_ringbuf_t *MPID_nem_ib_ringbuf;
-
-
-/* Next ring-buffer type and slots
- Exclusive slots are sticky.
- Shared slot is consumed.
- Use the type described here because we need to
- use up acquired slots of shared ring-buffer when
- transitioning from share to exclusive.
- The next type is absent means we're transitioning
- from exclusive to shared. */
-typedef struct MPID_nem_ib_ringbuf_sector {
- uint32_t type;
- void *start;
- int nslot;
- uint16_t head;
- uint16_t tail;
-
- struct MPID_nem_ib_ringbuf_sector *sectorq_next;
-} MPID_nem_ib_ringbuf_sector_t;
-
-typedef GENERIC_Q_DECL(MPID_nem_ib_ringbuf_sector_t) MPID_nem_ib_ringbuf_sectorq_t;
-
-#define MPID_nem_ib_ringbuf_sectorq_empty(q) GENERICM_Q_EMPTY (q)
-#define MPID_nem_ib_ringbuf_sectorq_head(q) GENERICM_Q_HEAD (q)
-#define MPID_nem_ib_ringbuf_sectorq_next_field(ep, next_field) ((ep)->next_field)
-#define MPID_nem_ib_ringbuf_sectorq_next(ep) ((ep)->sectorq_next)
-#define MPID_nem_ib_ringbuf_sectorq_enqueue(qp, ep) GENERICM_Q_ENQUEUE (qp, ep, MPID_nem_ib_ringbuf_sectorq_next_field, sectorq_next);
-#define MPID_nem_ib_ringbuf_sectorq_dequeue(qp, epp) GENERICM_Q_DEQUEUE (qp, epp, MPID_nem_ib_ringbuf_sectorq_next_field, sectorq_next);
-
-
-/* IB connection */
-typedef struct MPID_nem_ib_com {
- short icom_used;
- short icom_connected;
- int icom_port;
-#ifdef HAVE_LIBDCFA
-#else
- struct ibv_port_attr icom_pattr; /* IB port attributes */
-#endif
- struct ibv_qp *icom_qp;
- struct ibv_cq *icom_scq;
- struct ibv_cq *icom_rcq;
- struct ibv_mr **icom_mrlist;
- int icom_mrlen;
- union ibv_gid icom_gid;
- void **icom_mem; /* 0: send 1: recv 2..: rdma */
- int *icom_msize; /* 0: send 1: recv 2..: rdma */
- struct ibv_send_wr *icom_sr;
- struct ibv_ah_attr *icom_ah_attr;
- struct ibv_recv_wr *icom_rr;
- void **icom_rmem;
- int *icom_rkey;
- size_t *icom_rsize;
- uint16_t sseq_num;
- uint16_t rsr_seq_num_poll;
- uint16_t rsr_seq_num_tail; /* occupation status of remote Send Request (SR) queue (it covers occupation status of local RDMA-wr-to buffer) */
- uint16_t rsr_seq_num_tail_last_sent; /* latest one sent to remote rank */
- uint16_t lsr_seq_num_tail; /* occupation status of local Send Request (SR) queue */
- int lsr_seq_num_tail_last_requested; /* value when lmt_start_send issued req_seq_num */
- int rdmabuf_occupancy_notify_rstate, rdmabuf_occupancy_notify_lstate;
- int ncom, ncom_scratch_pad; /* number of entries in the command queue */
-
- uint32_t max_inline_data; /* actual value obtained after ibv_create_qp */
- uint32_t max_send_wr;
- uint32_t max_recv_wr;
-
- uint32_t open_flag; /* MPID_NEM_IB_COM_OPEN_UD, ... */
- uint16_t remote_lid; /* for debug */
-
- /* other commands can be executed before RDMA-rd command */
- /* see the "Ordering and the Fence Indicator" section in "InfiniBand Architecture" by William T. Futral */
- uint16_t after_rdma_rd;
-
- /* Ring-buffer information on the receiver side.
- * It's allocated on the receiver side. */
- MPID_nem_ib_ringbuf_t *remote_ringbuf;
-
- /* Ring buffer information on the sender side.
- * The information is passed from the receiver side on connection. */
- uint32_t local_ringbuf_type;
- void *local_ringbuf_start;
- int local_ringbuf_rkey;
- uint16_t local_ringbuf_nslot;
-
- /* VC of remote node. It's embedded in a packet going to the
- * shared ring buffer because no VC information is available on
- * the receiver side in the shared case. c.f. They are stored in
- * the individual exclusive ring-buffers in the exclusive case. */
- MPIDI_VC_t *remote_vc;
-
- /* Delay the fetch of the second ask until the first issues CAS */
- uint8_t ask_guard;
-
- /* Ring buffer sectors obtained through ask-send protocol */
- MPID_nem_ib_ringbuf_sectorq_t sectorq;
-
-
- /* Two transactions, one from each end of a connection,
- * can be outstanding at the same time when they were initiated
- * at the same time. This makes one end try to send ACK2 after
- * freeing the scratch-pad QP for the connection. So we must monitor and
- * wait until all the connection request transactions end before
- * freeing the scratch-pad QP. */
- int outstanding_connection_tx;
- int incoming_connection_tx;
- int notify_outstanding_tx_empty;
-
-} MPID_nem_ib_com_t;
-
-extern void *MPID_nem_ib_rdmawr_to_alloc(int nslots);
-extern void MPID_nem_ib_rdmawr_to_free(void *p, int nslots);
-extern int MPID_nem_ib_rdmawr_to_munmap(void *p, int nslots);
-extern int MPID_nem_ib_com_open(int ib_port, int MPID_nem_ib_com_open_flag, int *condesc);
-extern int MPID_nem_ib_com_close(int);
-extern int MPID_nem_ib_com_alloc(int condesc, int sz);
-extern int MPID_nem_ib_com_free(int condesc, int sz);
-extern int MPID_nem_ib_com_rts(int condesc, int remote_qpnum, uint16_t remote_lid,
- union ibv_gid *remote_gid);
-
-extern int MPID_nem_ib_com_reg_mr_connect(int condesc, void *rmem, int rkey);
-extern int MPID_nem_ib_com_connect_ringbuf(int condesc,
- uint32_t ringbuf_type,
- void *start, int rkey, int nslot,
- MPIDI_VC_t * remote_vc, uint32_t alloc_new_mr);
-
-extern int MPID_nem_ib_com_isend(int condesc,
- uint64_t wr_id,
- void *prefix, int sz_prefix,
- void *hdr, int sz_hdr,
- void *data, int sz_data,
- int *copied,
- uint32_t local_ringbuf_type, uint32_t remote_ringbuf_type,
- void **buf_from_out, uint32_t * buf_from_sz_out);
-extern int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_hdr,
- void *data, int sz_data);
-extern int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset, int sz,
- void *laddr, void **buf_from_out,
- uint32_t * buf_from_sz_out);
-extern int MPID_nem_ib_com_get_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset, int sz,
- void **buf_from_out, uint32_t * buf_from_sz_out);
-extern int MPID_nem_ib_com_cas_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset,
- uint64_t compare, uint64_t swap, void **buf_from_out,
- uint32_t * buf_from_sz_out);
-extern int MPID_nem_ib_com_wr_scratch_pad(int condesc, uint64_t wr_id,
- void *buf_from, uint32_t buf_from_sz);
-
-//extern int MPID_nem_ib_com_isend(int condesc, uint64_t wr_id, void* hdr, int sz_hdr, void* data, int sz_data);
-extern int MPID_nem_ib_com_irecv(int condesc, uint64_t wr_id);
-extern int MPID_nem_ib_com_udsend(int condesc, union ibv_gid *remote_gid, uint16_t remote_lid,
- uint32_t remote_qpn, uint32_t imm_data, uint64_t wr_id);
-extern int MPID_nem_ib_com_udrecv(int condesc);
-extern int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, long sz_data,
- uint32_t rkey, void *laddr, int last);
-extern int MPID_nem_ib_com_put_lmt(int condesc, uint64_t wr_id, void *raddr, int sz_data,
- uint32_t rkey, void *laddr);
-extern int MPID_nem_ib_com_scratch_pad_recv(int condesc, int sz_data);
-extern int MPID_nem_ib_com_poll_cq(int which_cq, struct ibv_wc *wc, int *result);
-
-extern int MPID_nem_ib_com_obtain_pointer(int condesc, MPID_nem_ib_com_t ** MPID_nem_ib_com);
-
-/* for ib_reg_mr.c */
-extern int MPID_nem_ib_com_reg_mr(void *addr, long len, struct ibv_mr **mr,
- enum ibv_access_flags additional_flags);
-extern int MPID_nem_ib_com_dereg_mr(struct ibv_mr *mr);
-
-extern int MPID_nem_ib_com_get_info_conn(int condesc, int key, void *out, uint32_t out_len);
-extern int MPID_nem_ib_com_get_info_pattr(int key, void *out, uint32_t out_len);
-extern int MPID_nem_ib_com_get_info_mr(int condesc, int memid, int key, void *out, int out_len);
-
-extern int MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(int condesc, int *notify_rate);
-extern int MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get(int condesc, int **rstate);
-extern int MPID_nem_ib_com_rdmabuf_occupancy_notify_lstate_get(int condesc, int **lstate);
-
-extern char *MPID_nem_ib_com_strerror(int err);
-
-extern int MPID_nem_ib_com_mem_rdmawr_from(int condesc, void **out);
-//extern int MPID_nem_ib_com_mem_rdmawr_to(int condesc, int seq_num, void **out);
-extern int MPID_nem_ib_com_mem_udwr_from(int condesc, void **out);
-extern int MPID_nem_ib_com_mem_udwr_to(int condesc, void **out);
-
-/* ib_reg_mr.c */
-struct MPID_nem_ib_com_reg_mr_listnode_t {
- struct MPID_nem_ib_com_reg_mr_listnode_t *lru_next;
- struct MPID_nem_ib_com_reg_mr_listnode_t *lru_prev;
-};
-
-struct MPID_nem_ib_com_reg_mr_cache_entry_t {
- /* : public MPID_nem_ib_com_reg_mr_listnode_t */
- struct MPID_nem_ib_com_reg_mr_listnode_t *lru_next;
- struct MPID_nem_ib_com_reg_mr_listnode_t *lru_prev;
- struct MPID_nem_ib_com_reg_mr_listnode_t g_lru;
-
- struct ibv_mr *mr;
- void *addr;
- long len;
- int refc;
-};
-extern int MPID_nem_ib_com_register_cache_init(void);
-extern int MPID_nem_ib_com_register_cache_release(void);
-extern void *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
- enum ibv_access_flags additional_flags, int mode);
-extern void MPID_nem_ib_com_reg_mr_release(struct MPID_nem_ib_com_reg_mr_cache_entry_t *entry);
-#define MPID_NEM_IB_COM_REG_MR_GLOBAL (0)
-#define MPID_NEM_IB_COM_REG_MR_STICKY (1)
-
-#define list_entry(ptr, type, member) \
- ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
-
-extern int MPID_nem_ib_com_udbuf_init(void *q);
-
-#define MPID_NEM_IB_COM_RC_SHARED_RCQ 0
-#define MPID_NEM_IB_COM_RC_SHARED_SCQ 1
-#define MPID_NEM_IB_COM_UD_SHARED_RCQ 2
-#define MPID_NEM_IB_COM_UD_SHARED_SCQ 3
-
-/* flag for open */
-#define MPID_NEM_IB_COM_OPEN_RC 0x01
-/* for MPI control message, eager send, rendezvous protocol,
- so via RC-send/recv or RDMA-write/RDMA-read */
-
-#define MPID_NEM_IB_COM_OPEN_UD 0x02
-/* obsolete, to wait for you-to-me QP to become RTR state
- so via UD-send/recv */
-
-#define MPID_NEM_IB_COM_OPEN_SCRATCH_PAD 0x04
-/* obsolete, to wait for you-to-me QP to become RTR state
- so via RDMA-write */
-
-#define MPID_nem_ib_segv printf("%d\n", *(int32_t*)0);
-#define MPID_NEM_IB_COM_ERR_SETANDJUMP(errno, stmt) { stmt; ibcom_errno = errno; goto fn_fail; }
-#define MPID_NEM_IB_COM_ERR_CHKANDJUMP(cond, errno, stmt) if (cond) { stmt; ibcom_errno = errno; goto fn_fail; }
-#define MPID_NEM_IB_ERR_FATAL(cond, var, val, tag) if (cond) { var = val; printf("%s\n", tag); MPID_nem_ib_segv; }
-
-#define MPID_NEM_IB_COM_QKEY 0x1234
-#define MPID_NEM_IB_COM_MAGIC 0x55
-
-#define MPID_NEM_IB_OFF_POW2_ALIGNED(sz) \
- for(off_pow2_aligned = 15; off_pow2_aligned < (sz); off_pow2_aligned = ((((off_pow2_aligned + 1) << 1) - 1) > MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_netmod_trailer_t)) ? MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_netmod_trailer_t) : (((off_pow2_aligned + 1) << 1) - 1)) { } \
- if (off_pow2_aligned > MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_netmod_trailer_t)) { printf("assertion failed\n"); }; \
-
-#define MPID_NEM_IB_MAX_OFF_POW2_ALIGNED (MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_netmod_trailer_t))
-
-typedef struct MPID_nem_ib_com_qp_state_t {
- uint32_t state;
-} MPID_nem_ib_com_qp_state_t;
-
-#define MPID_NEM_IB_COM_QP_STATE_RTR 0x12345678
-#define MPID_NEM_IB_COM_SZ_MPI_HEADER 48
-#define MPID_NEM_IB_COM_AMT_SLACK (MPID_NEM_IB_COM_RDMABUF_NSEG > 128 ? 1 : 1)
-
-#define MPID_NEM_IB_MAX(a, b) ((a) > (b) ? (a) : (b))
-
-/* Allocator for RDMA write from buffer
- - Allocate performs overflow checks and increments pointer
- - Fast to "malloc" (one load and one store instructions)
- - Free decrements counter at the head of
- aligned memory area. The area is freed when the counter is zero.
- - Fast to "free" (one load and one store instructions)
- - Easy to shrink
- - Refill allocates multiple slots and IB-registers them
- - Fast when first-time allocs occur
- - Free list is pointers for 2^n sizes.
- - Fast to find an empty slot
- */
-typedef struct {
- union {
- uint32_t ref_count;
- char *next;
- } first;
- struct ibv_mr *mr;
-} MPID_nem_ib_rdmawr_from_alloc_hdr_t;
-#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA 65536
-#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_ROUNDUP64(addr, align) ((addr + align - 1) & ~((unsigned long)align - 1))
-#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_ROUNDUP64_ADDR(addr, align) ((char*)(((uint64_t)addr + align - 1) & ~((uint64_t)align - 1)))
-#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_NCLUST_SLAB 1
-#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(p) ((void *) ((uint64_t) (p) & ~(MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA - 1)))
-#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_MR(p) (((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) ((uint64_t) (p) & ~(MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA - 1)))->mr)
-#define MPID_NEM_IB_RDMAWR_FROM_ALLOC_PREPROCESS_SZ \
- if (_sz < 256) { \
- clz = 23; \
- sz = 256; \
- } else { \
- clz = __builtin_clz(_sz); \
- int ctz = __builtin_ctz(_sz); \
- if (clz + ctz == 31) { \
- sz = _sz; \
- } else { \
- sz = (1ULL << (32 - clz)); \
- clz = clz - 1; \
- } \
- }
-
-static inline void *MPID_nem_ib_rdmawr_from_alloc(uint32_t _sz)
-{
- int retval;
- int clz;
- uint32_t sz;
- assert(_sz <= (1ULL << 31));
- MPID_NEM_IB_RDMAWR_FROM_ALLOC_PREPROCESS_SZ;
- char *p = MPID_nem_ib_rdmawr_from_alloc_free_list_front[clz];
- if ((unsigned long) p & (MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA - 1)) {
- MPID_nem_ib_rdmawr_from_alloc_free_list_front[clz] += sz;
- return p;
- }
- else {
- char *q;
- if (MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz]) {
- q = MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz];
- MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz] =
- ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *)
- MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz])->first.next;
- }
- else {
- unsigned long sz_clust =
- MPID_NEM_IB_RDMAWR_FROM_ALLOC_ROUNDUP64(MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA *
- MPID_NEM_IB_RDMAWR_FROM_ALLOC_NCLUST_SLAB,
- 4096);
- char *unaligned = mmap(NULL,
- sz_clust + MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA,
- PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (unaligned == (void *) -1) {
- printf("mmap failed\n");
- MPID_nem_ib_segv;
- }
-
- q = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ROUNDUP64_ADDR(unaligned,
- MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA);
- retval = munmap(unaligned, q - unaligned);
- if (q - unaligned != 0 && retval) {
- printf("munmap failed\n");
- MPID_nem_ib_segv;
- }
- retval = munmap(q + sz_clust, MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA - (q - unaligned));
- if (retval) {
- printf("munmap failed\n");
- MPID_nem_ib_segv;
- }
-
- ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->mr =
- MPID_nem_ib_com_reg_mr_fetch(q, MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA, 0,
- MPID_NEM_IB_COM_REG_MR_STICKY);
- if (!((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->mr) {
- printf("ibv_reg_mr failed\n");
- MPID_nem_ib_segv;
- }
-
-#if MPID_NEM_IB_RDMAWR_FROM_ALLOC_NCLUST_SLAB > 1
- MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz] =
- q + MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA;
- for (p = q + MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA;
- p <
- q + (MPID_NEM_IB_RDMAWR_FROM_ALLOC_NCLUST_SLAB -
- 1) * MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA;
- p += MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA) {
- ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) p)->mr =
- MPID_nem_ib_com_reg_mr_fetch(q, MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA, 0,
- MPID_NEM_IB_COM_REG_MR_STICKY);
- if (!((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) p)->mr) {
- printf("ibv_reg_mr failed\n");
- MPID_nem_ib_segv;
- }
-
- ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) p)->first.next =
- p + MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA;
- }
- ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) p)->first.next = 0;
-#endif
- }
- ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->first.ref_count =
- MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA / sz - 1;
- q += sz + (MPID_NEM_IB_RDMAWR_FROM_ALLOC_SZARENA % sz);
- MPID_nem_ib_rdmawr_from_alloc_free_list_front[clz] = q + sz;
- return q;
- }
-}
-
-static inline void MPID_nem_ib_rdmawr_from_free(const void *p, uint32_t _sz)
-{
- int clz;
- uint32_t sz _UNUSED_;
- assert(_sz <= (1ULL << 31));
- MPID_NEM_IB_RDMAWR_FROM_ALLOC_PREPROCESS_SZ;
- void *q = MPID_NEM_IB_RDMAWR_FROM_ALLOC_ARENA_START(p);
- if (!(--(((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->first.ref_count))) {
- ((MPID_nem_ib_rdmawr_from_alloc_hdr_t *) q)->first.next =
- MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz];
- MPID_nem_ib_rdmawr_from_alloc_arena_free_list[clz] = (char *) q;
- }
-}
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
deleted file mode 100644
index faa3079..0000000
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
+++ /dev/null
@@ -1,1061 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
-/*
- * (C) 2012 NEC Corporation
- * (C) 2014-2015 RIKEN AICS
- *
- * See COPYRIGHT in top-level directory.
- */
-
-#ifndef IB_IMPL_H_INCLUDED
-#define IB_IMPL_H_INCLUDED
-
-#include "mpid_nem_impl.h"
-#include "ib_ibcom.h"
-#include <sys/types.h>
-#include <errno.h>
-#include <linux/mman.h> /* make it define MAP_ANONYMOUS */
-#include <sys/mman.h>
-
-#define MPID_NEM_IB_LMT_GET_CQE /* detect RDMA completion by CQE */
-#define MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
-/* lmt-put:
- (1) receiver sends cts to sender (2) sender RDMA-write to receiver
- (3) sender fetch CQE (4) receiver polls on end-flag
-*/
-#define MPID_NEM_IB_ONDEMAND
-
-#ifdef __GNUC__
-#define _UNUSED_ __attribute__ ((__unused__))
-#else
-#define _UNUSED_
-#endif
-
-typedef struct {
- union ibv_gid gid;
- uint16_t lid;
- uint32_t qpn;
-} MPID_nem_ib_conn_ud_t;
-
-typedef struct {
- int fd;
- MPIDI_VC_t *vc;
-} MPID_nem_ib_conn_t;
-
-/* see src/mpid/ch3/channels/nemesis/include/mpid_nem_generic_queue.h */
-typedef GENERIC_Q_DECL(struct MPID_Request) MPID_nem_ib_sendq_t;
-
-/* The vc provides a generic buffer in which network modules can store
- private fields. This removes all dependencies from the VC structure
- on the network module, facilitating dynamic module loading. */
-typedef struct {
- MPID_nem_ib_conn_t *sc;
- int pending_sends; /* number of send in flight */
- MPID_nem_ib_com_t *ibcom;
- MPID_nem_ib_sendq_t sendq; /* overflow queue for IB commands */
- int connection_state; /* dynamic connection, checked in iSendContig, protocol processed there and in progress engine */
-
- /* Number of outstanding connection sequences started, used to eliminate
- * duplicated connection requests */
- uint8_t connection_guard;
- void *vc_terminate_buf; /* address of ringbuffer which calls vc_terminate */
-} MPID_nem_ib_vc_area;
-
-/* macro for secret area in vc */
-#define VC_CH(vc) ((MPIDI_CH3I_VC *)&(vc)->ch)
-static inline MPID_nem_ib_vc_area *VC_IB(MPIDI_VC_t * vc)
-{
- return (MPID_nem_ib_vc_area *) vc->ch.netmod_area.padding;
-}
-
-#define VC_FIELD(vcp, field) VC_IB(vcp)->field
-
-/* The req provides a generic buffer in which network modules can store
- private fields. This removes all dependencies from the req structure
- on the network module, facilitating dynamic module loading. */
-typedef struct {
- int seq_num; /* NOT USED, DELETE IT: sequence number of SR which RDMA-RD for lmt releases in ib_poll */
- struct MPID_Request *lmt_next; /* for lmtq */
- struct MPID_Request *sendq_next; /* for sendq */
- void *lmt_raddr; /* remember this for sendq, it might be better to use sreq->dev.iov[0].MPID_IOV_BUF instead */
- uint32_t lmt_rkey; /* remember this for sendq, survive over lrecv and referenced when dequeueing from sendq */
- long lmt_szsend; /* remember this for sendq */
- uint8_t lmt_tail, lmt_sender_tail, lmt_receiver_tail; /* survive over lrecv and referenced when polling */
- MPI_Aint lmt_dt_true_lb; /* to locate the last byte of receive buffer */
- void *lmt_write_to_buf; /* user buffer or temporary buffer for pack and remember it for lmt_orderq */
- void *lmt_pack_buf; /* to pack non-contiguous data */
- void *buf_from; /* address of RDMA write from buffer */
- uint32_t buf_from_sz; /* size of RDMA write from buffer. It's set on sending, referenced on freeing */
- uint8_t ask; /* Issued ask or not on send */
- union {
- void *from;
- void *to;
- } buf;
- uint32_t max_msg_sz; /* remember this for sendq, max message size */
- MPIDI_msg_sz_t data_sz;
- int seg_seq_num; /* sequence number of segments */
- int seg_num; /* number of segments */
- int last; /* flag for last packet or not */
- void *lmt_mr_cache; /* address of mr_cache_entry */
-} MPID_nem_ib_req_area;
-
-/* macro for secret area in req */
-static inline MPID_nem_ib_req_area *REQ_IB(MPID_Request * req)
-{
- return (MPID_nem_ib_req_area *) req->ch.netmod_area.padding;
-}
-
-#define REQ_FIELD(reqp, field) (REQ_IB(reqp)->field)
-
-/* see src/mpid/ch3/channels/nemesis/include/mpidi_ch3_impl.h */
-/* sreq is never enqueued into posted-queue nor unexpected-queue, so we can reuse sreq->dev.next */
-#define MPID_nem_ib_sendq_empty(q) GENERICM_Q_EMPTY (q)
-#define MPID_nem_ib_sendq_head(q) GENERICM_Q_HEAD (q)
-#define MPID_nem_ib_sendq_next_field(ep, next_field) REQ_FIELD(ep, next_field)
-#define MPID_nem_ib_sendq_next(ep) REQ_FIELD(ep, sendq_next)
-//#define MPID_nem_ib_sendq_next(ep) (ep->dev.next) /*takagi*/
-#define MPID_nem_ib_sendq_enqueue(qp, ep) GENERICM_Q_ENQUEUE (qp, ep, MPID_nem_ib_sendq_next_field, sendq_next);
-#define MPID_nem_ib_sendq_enqueue_at_head(qp, ep) GENERICM_Q_ENQUEUE_AT_HEAD(qp, ep, MPID_nem_ib_sendq_next_field, sendq_next);
-#define MPID_nem_ib_sendq_dequeue(qp, ep) GENERICM_Q_DEQUEUE (qp, ep, MPID_nem_ib_sendq_next_field, sendq_next);
-
-/* see src/mpid/ch3/channels/nemesis/include/mpid_nem_generic_queue.h */
-typedef GENERIC_Q_DECL(struct MPID_Request) MPID_nem_ib_lmtq_t;
-
-#ifdef MPID_NEM_IB_ONDEMAND
-
-/* States in connection protocol */
-#define MPID_NEM_IB_CM_CLOSED 0
-#define MPID_NEM_IB_CM_LOCAL_QP_RESET 1
-#define MPID_NEM_IB_CM_REMOTE_QP_RESET 2
-#define MPID_NEM_IB_CM_REMOTE_QP_RTS 4
-#define MPID_NEM_IB_CM_LOCAL_QP_RTS 8
-#define MPID_NEM_IB_CM_ESTABLISHED 15
-
-#define is_conn_established(rank) \
- (VC_FIELD(MPID_nem_ib_conns[rank].vc, connection_state) == MPID_NEM_IB_CM_ESTABLISHED)
-
-typedef struct {
- char *data;
- int length;
- int max_length;
-} MPID_nem_ib_cm_map_t;
-
-/* Types of connection protocol packets */
-enum MPID_nem_ib_cm_cmd_types {
- MPID_NEM_IB_CM_HEAD_FLAG_ZERO = 0,
- MPID_NEM_IB_CM_CAS,
- MPID_NEM_IB_CM_CAS_RELEASE,
- MPID_NEM_IB_CM_SYN,
- MPID_NEM_IB_CM_SYNACK,
- MPID_NEM_IB_CM_ACK1,
- MPID_NEM_IB_CM_ACK2,
- MPID_NEM_IB_RINGBUF_ASK_FETCH,
- MPID_NEM_IB_RINGBUF_ASK_CAS,
- MPID_NEM_IB_CM_CAS_RELEASE2,
- MPID_NEM_IB_CM_ALREADY_ESTABLISHED,
- MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING,
- MPID_NEM_IB_NOTIFY_OUTSTANDING_TX_EMPTY
-};
-
-#define NOTIFY_OUTSTANDING_TX_SCQ (1 << 0)
-#define NOTIFY_OUTSTANDING_TX_RCQ (1 << 1)
-#define NOTIFY_OUTSTANDING_TX_COMP (NOTIFY_OUTSTANDING_TX_SCQ | NOTIFY_OUTSTANDING_TX_RCQ)
-
-/* Packet types of connection protocol */
-struct MPID_nem_ib_cm_req;
-
-/* They should have the same type because
- cm commands and ring buffer commands share one CQ */
-typedef uint8_t MPID_nem_ib_cm_ringbuf_cmd_type_t;
-typedef MPID_nem_ib_cm_ringbuf_cmd_type_t MPID_nem_ib_ringbuf_cmd_type_t;
-typedef MPID_nem_ib_cm_ringbuf_cmd_type_t MPID_nem_ib_cm_cmd_type_t;
-
-typedef struct {
- MPID_nem_ib_cm_cmd_type_t type;
- struct MPID_nem_ib_cm_req *initiator_req;
- uint16_t responder_ringbuf_index;
- int initiator_rank;
- MPID_nem_ib_netmod_trailer_t tail_flag;
-} MPID_nem_ib_cm_cmd_syn_t;
-
-typedef struct {
- MPID_nem_ib_cm_cmd_type_t type; /* this is used as head flag as well */
- uint32_t qpnum;
- uint16_t lid;
- union ibv_gid gid;
- void *rmem;
- uint32_t rkey;
- int ringbuf_nslot;
- uint32_t ringbuf_type; /* Ring buffer information sent from receiver side to sender side */
- struct MPID_nem_ib_cm_req *initiator_req;
- struct MPID_nem_ib_cm_req *responder_req;
- uint16_t initiator_ringbuf_index; /* index to connection protocol ring buffer */
- MPIDI_VC_t *remote_vc;
- MPID_nem_ib_netmod_trailer_t tail_flag;
-} MPID_nem_ib_cm_cmd_synack_t;
-
-typedef struct {
- MPID_nem_ib_cm_cmd_type_t type;
- uint32_t qpnum;
- uint16_t lid;
- union ibv_gid gid;
- void *rmem;
- uint32_t rkey;
- int ringbuf_nslot;
- uint32_t ringbuf_type; /* Ring buffer information sent from sender side to receiver side */
- struct MPID_nem_ib_cm_req *initiator_req;
- struct MPID_nem_ib_cm_req *responder_req;
- MPIDI_VC_t *remote_vc;
- MPID_nem_ib_netmod_trailer_t tail_flag;
-} MPID_nem_ib_cm_cmd_ack1_t;
-
-typedef struct {
- MPID_nem_ib_cm_cmd_type_t type;
- struct MPID_nem_ib_cm_req *initiator_req;
- MPID_nem_ib_netmod_trailer_t tail_flag;
-} MPID_nem_ib_cm_cmd_ack2_t;
-
-/* Base class for branching on type
- and used to measure maximum size */
-typedef union {
- MPID_nem_ib_cm_cmd_type_t type;
- MPID_nem_ib_cm_cmd_syn_t syn;
- MPID_nem_ib_cm_cmd_synack_t synack;
- MPID_nem_ib_cm_cmd_ack1_t ack1;
- MPID_nem_ib_cm_cmd_ack2_t ack2;
-} MPID_nem_ib_cm_cmd_t;
-
-/* State store for connection protocol */
-typedef struct MPID_nem_ib_cm_req {
- MPID_nem_ib_cm_cmd_type_t state;
- MPID_nem_ib_com_t *ibcom; /* Referenced in drain_scq */
- uint64_t retry_decided; /* Virtual time when CAS retry is decided */
- uint64_t retry_backoff; /* Back-off duration of retry */
- uint16_t ringbuf_index; /* index of slot where responder writes responds */
- int initiator_rank;
- int responder_rank;
- uint16_t initiator_ringbuf_index; /* responder stores it when acquiring it */
- uint16_t responder_ringbuf_index; /* initiator stores it when acquiring it */
- struct MPID_nem_ib_cm_req *sendq_next;
- MPID_nem_ib_cm_cmd_t cmd; /* buf used only when enqueued */
- uint32_t ask_on_connect; /* Ask ring-buffer slot when connected */
-
- /* We need to track the reference count because the last reference of state
- * is non-deterministic, i.e. it happens either on receiving a packet or on draining the SCQ */
- uint32_t ref_count;
-} MPID_nem_ib_cm_req_t;
-
-/* Track identity of a packet */
-typedef struct {
- MPID_nem_ib_cm_cmd_type_t type; /* Type referenced in drain_scq */
- MPID_nem_ib_cm_req_t *req;
- void *buf_from;
- uint32_t buf_from_sz;
-} MPID_nem_ib_cm_cmd_shadow_t;
-
-typedef struct {
- MPID_nem_ib_cm_cmd_type_t type;
- int initiator_rank;
-} MPID_nem_ib_cm_notify_send_t;
-
-typedef struct MPID_nem_ib_cm_notify_send_req {
- MPID_nem_ib_com_t *ibcom;
- int my_rank;
- int pg_rank;
- struct MPID_nem_ib_cm_notify_send_req *sendq_next;
-} MPID_nem_ib_cm_notify_send_req_t;
-
-#define MPID_NEM_IB_CM_RELEASED ((uint64_t)(-1))
-#define MPID_NEM_IB_CM_OFF_SYN (256) /* Align for 256-byte-write PCI command */
-#define MPID_NEM_IB_CM_OFF_CMD (256*2) /* Align for 256-byte-write PCI command */
-#define MPID_NEM_IB_CM_NSEG 64 /* number of slots to which responder writes its response */
-
-typedef GENERIC_Q_DECL(MPID_nem_ib_cm_req_t) MPID_nem_ib_cm_sendq_t;
-
-#define MPID_nem_ib_cm_sendq_empty(q) GENERICM_Q_EMPTY (q)
-#define MPID_nem_ib_cm_sendq_head(q) GENERICM_Q_HEAD (q)
-#define MPID_nem_ib_cm_sendq_next_field(ep, next_field) ((ep)->next_field)
-#define MPID_nem_ib_cm_sendq_next(ep) ((ep)->sendq_next)
-#define MPID_nem_ib_cm_sendq_enqueue(qp, ep) GENERICM_Q_ENQUEUE (qp, ep, MPID_nem_ib_cm_sendq_next_field, sendq_next);
-
-typedef GENERIC_Q_DECL(MPID_nem_ib_cm_notify_send_req_t) MPID_nem_ib_cm_notify_sendq_t;
-
-#define MPID_nem_ib_cm_notify_sendq_empty(q) GENERICM_Q_EMPTY (q)
-#define MPID_nem_ib_cm_notify_sendq_head(q) GENERICM_Q_HEAD (q)
-#define MPID_nem_ib_cm_notify_sendq_next_field(ep, next_field) ((ep)->next_field)
-#define MPID_nem_ib_cm_notify_sendq_next(ep) ((ep)->sendq_next)
-#define MPID_nem_ib_cm_notify_sendq_enqueue(qp, ep) GENERICM_Q_ENQUEUE (qp, ep, MPID_nem_ib_cm_notify_sendq_next_field, sendq_next);
-
-#ifdef HAVE_LIBDCFA
-#define MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO_MR_ADDR host_adddr
-#else
-#define MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO_MR_ADDR addr
-#endif
-
-#define MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO(cmd, rank) { \
- ibcom_errno = \
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conns[(rank)].fd, MPID_NEM_IB_COM_INFOKEY_PORT_LID, &((cmd)->lid), \
- sizeof(uint16_t)); \
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_get_info_conn"); \
-\
- ibcom_errno = \
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conns[(rank)].fd, MPID_NEM_IB_COM_INFOKEY_PORT_GID, &((cmd)->gid), \
- sizeof(union ibv_gid)); \
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_get_info_conn"); \
- \
- ibcom_errno = \
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conns[(rank)].fd, MPID_NEM_IB_COM_INFOKEY_QP_QPN, &((cmd)->qpnum), \
- sizeof(uint32_t)); \
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_get_info_conn"); \
- \
- (cmd)->rmem = (uint8_t*)MPID_nem_ib_rdmawr_to_alloc_mr->MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO_MR_ADDR + \
- ((uint8_t*)VC_FIELD(MPID_nem_ib_conns[(rank)].vc, ibcom->remote_ringbuf->start) - \
- (uint8_t*)MPID_nem_ib_rdmawr_to_alloc_start) ; \
- (cmd)->rkey = MPID_nem_ib_rdmawr_to_alloc_mr->rkey; \
- (cmd)->ringbuf_nslot = VC_FIELD(MPID_nem_ib_conns[(rank)].vc, ibcom->remote_ringbuf->nslot); \
- }
-
-#define MPID_NEM_IB_CM_COMPOSE_SYN(cmd, req) { \
- (cmd)->type = MPID_NEM_IB_CM_SYN; \
- (cmd)->initiator_req = (req); \
- (cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
-}
-
-#define MPID_NEM_IB_CM_COMPOSE_CAS_RELEASE2(cmd, req) { \
- (cmd)->type = MPID_NEM_IB_CM_CAS_RELEASE2; \
- (cmd)->initiator_req = (req); \
- (cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
-}
-
-#define MPID_NEM_IB_CM_COMPOSE_SYNACK(cmd, req, _initiator_req) { \
- (cmd)->type = MPID_NEM_IB_CM_SYNACK; \
- MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO((cmd), (req)->initiator_rank); \
- (cmd)->ringbuf_type = VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc, ibcom->remote_ringbuf->type); \
- (cmd)->initiator_req = (_initiator_req); \
- (cmd)->responder_req = (req); \
- (cmd)->remote_vc = MPID_nem_ib_conns[req->initiator_rank].vc; \
- (cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
-}
-
-#define MPID_NEM_IB_CM_COMPOSE_END_CM(cmd, req, _initiator_req, _type) { \
- (cmd)->type = _type; \
- (cmd)->initiator_req = (_initiator_req); \
- (cmd)->responder_req = (req); \
- (cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
-}
-
-#define MPID_NEM_IB_CM_COMPOSE_ACK1(cmd, req, _responder_req) { \
- (cmd)->type = MPID_NEM_IB_CM_ACK1; \
- MPID_NEM_IB_CM_COMPOSE_NETWORK_INFO((cmd), (req)->responder_rank); \
- (cmd)->ringbuf_type = VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, ibcom->remote_ringbuf->type); \
- (cmd)->initiator_req = (req); \
- (cmd)->responder_req = (_responder_req); \
- (cmd)->remote_vc = MPID_nem_ib_conns[req->responder_rank].vc; \
- (cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
-}
-
-#define MPID_NEM_IB_CM_COMPOSE_ACK2(cmd, _initiator_req) { \
- (cmd)->type = MPID_NEM_IB_CM_ACK2; \
- (cmd)->initiator_req = (_initiator_req); \
- (cmd)->tail_flag.tail_flag = MPID_NEM_IB_COM_MAGIC; \
-}
-
-#define MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(buf) { \
- ((MPID_nem_ib_cm_cmd_synack_t *)(buf))->tail_flag.tail_flag = 0; \
- ((MPID_nem_ib_cm_cmd_ack1_t *)(buf))->tail_flag.tail_flag = 0; \
- ((MPID_nem_ib_cm_cmd_ack2_t *)(buf))->tail_flag.tail_flag = 0; \
-}
-
-static inline void MPID_nem_ib_cm_request_release(MPID_nem_ib_cm_req_t * req)
-{
- if (req->ref_count == 0) {
- MPID_nem_ib_segv;
- }
- if (--req->ref_count == 0) {
- MPIU_Free(req);
- }
-}
-
-int MPID_nem_ib_cm_progress(void);
-int MPID_nem_ib_cm_release(uint16_t index);
-
-int MPID_nem_ib_cm_notify_send(int pg_rank, int myrank);
-int MPID_nem_ib_cm_notify_progress(void);
-#endif
-
-/* Ring buffer protocol
- including Ask-Send protocol */
-
-uint32_t MPID_nem_ib_ringbuf_local_shared_nseg;
-
-/* It's on the scratch pad, RDMA-read by a process which performs ask-send */
-
-typedef struct {
- uint64_t head; /* CAS size is 64-bit */
- uint16_t tail;
-} MPID_nem_ib_ringbuf_headtail_t;
-
-/* Types of ring buffer protocol packets are included in
- MPID_nem_ib_cm_cmd_types */
-
-/* State store for ring buffer protocol */
-typedef struct MPID_nem_ib_ringbuf_req {
- MPID_nem_ib_ringbuf_cmd_type_t state;
- MPIDI_VC_t *vc; /* You can eliminate this. */
- MPID_nem_ib_com_t *ibcom; /* ibcom of scratch pad, referenced in drain_scq */
-
- /* fetch the head and compare-and-swap head and head + 1
- * to prevent the case where 2^32-1 contiguous fetches, issued while assuming
- * the ring buffer isn't full, corrupt the head pointer */
- MPID_nem_ib_ringbuf_headtail_t fetched;
-
- uint64_t retry_decided; /* Virtual time when CAS retry is decided */
- uint64_t retry_backoff; /* Back-off duration of retry */
- struct MPID_nem_ib_ringbuf_req *sendq_next;
-} MPID_nem_ib_ringbuf_req_t;
-
-/* Track identity of a packet */
-typedef struct {
- MPID_nem_ib_ringbuf_cmd_type_t type; /* Type referenced in drain_scq */
- MPID_nem_ib_ringbuf_req_t *req;
- void *buf_from;
- uint32_t buf_from_sz;
-} MPID_nem_ib_ringbuf_cmd_shadow_t;
-
-/* Location of head of the shared ring buffer */
-#define MPID_NEM_IB_RINGBUF_OFF_HEAD (MPID_NEM_IB_CM_OFF_CMD + sizeof(MPID_nem_ib_cm_cmd_t) * MPID_NEM_IB_CM_NSEG)
-#define MPID_NEM_IB_RINGBUF_UPDATE_BACKOFF(backoff) (backoff) = (backoff) ? ((backoff) << 1) : 1;
-
-typedef GENERIC_Q_DECL(MPID_nem_ib_ringbuf_req_t) MPID_nem_ib_ringbuf_sendq_t;
-
-#define MPID_nem_ib_ringbuf_sendq_empty(q) GENERICM_Q_EMPTY (q)
-#define MPID_nem_ib_ringbuf_sendq_head(q) GENERICM_Q_HEAD (q)
-#define MPID_nem_ib_ringbuf_sendq_next_field(ep, next_field) ((ep)->next_field)
-#define MPID_nem_ib_ringbuf_sendq_next(ep) ((ep)->sendq_next)
-#define MPID_nem_ib_ringbuf_sendq_enqueue(qp, ep) GENERICM_Q_ENQUEUE (qp, ep, MPID_nem_ib_ringbuf_sendq_next_field, sendq_next);
-#define MPID_nem_ib_ringbuf_sendq_enqueue_at_head(qp, ep) GENERICM_Q_ENQUEUE_AT_HEAD(qp, ep, MPID_nem_ib_ringbuf_sendq_next_field, sendq_next);
-
-
-/* see src/mpid/ch3/channels/nemesis/include/mpidi_ch3_impl.h */
-/* TODO: rreq for rendezvous is dequeued from posted-queue nor unexpected-queue when do_cts is called,
- so we can reuse rreq->dev.next */
-#define MPID_nem_ib_lmtq_empty(q) GENERICM_Q_EMPTY(q)
-#define MPID_nem_ib_lmtq_head(q) GENERICM_Q_HEAD(q)
-#define MPID_nem_ib_lmtq_next_field(ep, next_field) REQ_FIELD(ep, next_field)
-#define MPID_nem_ib_lmtq_next(ep) REQ_FIELD(ep, lmt_next)
-#define MPID_nem_ib_lmtq_enqueue(qp, ep) GENERICM_Q_ENQUEUE(qp, ep, MPID_nem_ib_lmtq_next_field, lmt_next);
-#define MPID_nem_ib_diff63(a, b) ((uint64_t)(((a) + (1ULL<<63) - (b)) & ((1ULL<<63)-1)))
-#define MPID_nem_ib_diff16(a, b) ((uint16_t)(((a) + (1ULL<<16) - (b)) & ((1ULL<<16)-1)))
-#define MPID_nem_ib_diff32(a, b) ((uint32_t)(((a) + (1ULL<<32) - (b)) & ((1ULL<<32)-1)))
-#define MPID_nem_ib_sendq_ready_to_send_head(vc_ib) (vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY && MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY && MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail) < MPID_NEM_IB_COM_RDMABUF_NSEG)
-
-/* counting bloom filter to detect multiple lmt-sends in one send-wait period to
- avoid overwriting the last byte in the receive buffer */
-#define MPID_nem_ib_cbf_nslot 16 /* slots */
-#define MPID_nem_ib_cbf_bitsperslot 4 /* one slot can accomodate multiple bits */
-#define MPID_nem_ib_cbf_lognslot 4
-#define MPID_nem_ib_cbf_nhash 3 /* number of hash functions */
-#define MPID_nem_ib_getpos \
- int pos_8b = pos / (8 / MPID_nem_ib_cbf_bitsperslot);\
- assert(0 <= pos_8b && pos_8b < MPID_nem_ib_cbf_nslot * MPID_nem_ib_cbf_bitsperslot / 8);\
- int pos_bps = pos & (8 / MPID_nem_ib_cbf_bitsperslot - 1);
-#define MPID_nem_ib_shift \
- ((array[pos_8b] >> (pos_bps * MPID_nem_ib_cbf_bitsperslot)) & ((1ULL<<MPID_nem_ib_cbf_bitsperslot) - 1))
-#define MPID_nem_ib_maskset \
- array[pos_8b] &= ~(((1ULL<<MPID_nem_ib_cbf_bitsperslot) - 1) << (pos_bps * MPID_nem_ib_cbf_bitsperslot)); \
- array[pos_8b] |= (bits & ((1ULL<<MPID_nem_ib_cbf_bitsperslot)-1)) << (pos_bps * MPID_nem_ib_cbf_bitsperslot)
-static inline int MPID_nem_ib_cbf_get(uint8_t * array, int pos)
-{
- MPID_nem_ib_getpos;
- return MPID_nem_ib_shift;
-}
-
-static inline void MPID_nem_ib_cbf_set(uint8_t * array, int pos, uint16_t bits)
-{
- MPID_nem_ib_getpos;
- MPID_nem_ib_maskset;
-}
-
-static inline void MPID_nem_ib_cbf_inc(uint8_t * array, int pos)
-{
- MPID_nem_ib_getpos;
- int16_t bits = MPID_nem_ib_shift;
- assert(bits != (1ULL << MPID_nem_ib_cbf_bitsperslot) - 1);
- bits++;
- MPID_nem_ib_maskset;
-}
-
-static inline void MPID_nem_ib_cbf_dec(uint8_t * array, int pos)
-{
- MPID_nem_ib_getpos;
- int16_t bits = MPID_nem_ib_shift;
- assert(bits != 0);
- bits--;
- MPID_nem_ib_maskset;
-}
-
-static inline int MPID_nem_ib_cbf_hash1(uint64_t addr)
-{
- return
- (((addr >> (MPID_nem_ib_cbf_lognslot * 0)) & (MPID_nem_ib_cbf_nslot - 1)) ^
- ((addr >> (MPID_nem_ib_cbf_lognslot * 3)) & (MPID_nem_ib_cbf_nslot - 1)) ^
- (((addr >> (MPID_nem_ib_cbf_lognslot * 6)) & (MPID_nem_ib_cbf_nslot - 1))
- + 1)) & (MPID_nem_ib_cbf_nslot - 1);
-}
-
-static inline int MPID_nem_ib_cbf_hash2(uint64_t addr)
-{
- /* adding one because addr tends to have a postfix of "fff" */
- return
- (((addr >> (MPID_nem_ib_cbf_lognslot * 1)) & (MPID_nem_ib_cbf_nslot - 1)) ^
- ((addr >> (MPID_nem_ib_cbf_lognslot * 4)) & (MPID_nem_ib_cbf_nslot - 1)) ^
- (((addr >> (MPID_nem_ib_cbf_lognslot * 7)) & (MPID_nem_ib_cbf_nslot - 1))
- + 1)) & (MPID_nem_ib_cbf_nslot - 1);
-}
-
-static inline int MPID_nem_ib_cbf_hash3(uint64_t addr)
-{
- /* adding two because addr tends to have a postfix of "fff" */
- return
- (((addr >> (MPID_nem_ib_cbf_lognslot * 2)) & (MPID_nem_ib_cbf_nslot - 1)) ^
- ((addr >> (MPID_nem_ib_cbf_lognslot * 5)) & (MPID_nem_ib_cbf_nslot - 1)) ^
- (((addr >> (MPID_nem_ib_cbf_lognslot * 8)) & (MPID_nem_ib_cbf_nslot - 1))
- + 2)) & (MPID_nem_ib_cbf_nslot - 1);
-
-}
-
-static inline void MPID_nem_ib_cbf_add(uint64_t addr, uint8_t * array)
-{
- //dprintf("cbf_add,addr=%08lx,%08x,%08x,%08x\n", addr, MPID_nem_ib_cbf_hash1(addr), MPID_nem_ib_cbf_hash2(addr), MPID_nem_ib_cbf_hash3(addr));
- //dprintf("cbf_add,%d,%d,%d\n", MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash1(addr)), MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash2(addr)), MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash3(addr)));
- MPID_nem_ib_cbf_inc(array, MPID_nem_ib_cbf_hash1(addr));
- MPID_nem_ib_cbf_inc(array, MPID_nem_ib_cbf_hash2(addr));
- MPID_nem_ib_cbf_inc(array, MPID_nem_ib_cbf_hash3(addr));
- //dprintf("cbf_add,%d,%d,%d\n", MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash1(addr)), MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash2(addr)), MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash3(addr)));
-}
-
-static inline void MPID_nem_ib_cbf_delete(uint64_t addr, uint8_t * array)
-{
- //dprintf("cbf_delete,addr=%08lx,%08x,%08x,%08x\n", addr, MPID_nem_ib_cbf_hash1(addr), MPID_nem_ib_cbf_hash2(addr), MPID_nem_ib_cbf_hash3(addr));
- //dprintf("cbf_delete,%d,%d,%d\n", MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash1(addr)), MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash2(addr)), MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash3(addr)));
- MPID_nem_ib_cbf_dec(array, MPID_nem_ib_cbf_hash1(addr));
- MPID_nem_ib_cbf_dec(array, MPID_nem_ib_cbf_hash2(addr));
- MPID_nem_ib_cbf_dec(array, MPID_nem_ib_cbf_hash3(addr));
- //dprintf("cbf_delete,%d,%d,%d\n", MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash1(addr)), MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash2(addr)), MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash3(addr)));
-}
-
-static inline int MPID_nem_ib_cbf_query(uint64_t addr, uint8_t * array)
-{
- //dprintf("cbf_query,addr=%08lx,%08x,%08x,%08x\n", addr, MPID_nem_ib_cbf_hash1(addr), MPID_nem_ib_cbf_hash2(addr), MPID_nem_ib_cbf_hash3(addr));
- //dprintf("cbf_query,%d,%d,%d\n", MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash1(addr)), MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash2(addr)), MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash3(addr)));
- return
- MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash1(addr)) > 0 &&
- MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash2(addr)) > 0 &&
- MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash3(addr)) > 0;
-}
-
-static inline int MPID_nem_ib_cbf_would_overflow(uint64_t addr, uint8_t * array)
-{
- //dprintf("cbf_would_overflow,addr=%08lx,%08x,%08x,%08x\n", addr, MPID_nem_ib_cbf_hash1(addr), MPID_nem_ib_cbf_hash2(addr), MPID_nem_ib_cbf_hash3(addr));
- //dprintf("cbf_would_overflow,%d,%d,%d\n", MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash1(addr)), MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash2(addr)), MPID_nem_ib_cbf_get(array, MPID_nem_ib_cbf_hash3(addr)));
- return
- MPID_nem_ib_cbf_get(array,
- MPID_nem_ib_cbf_hash1(addr)) ==
- (1ULL << MPID_nem_ib_cbf_bitsperslot) - 1 ||
- MPID_nem_ib_cbf_get(array,
- MPID_nem_ib_cbf_hash2(addr)) ==
- (1ULL << MPID_nem_ib_cbf_bitsperslot) - 1 ||
- MPID_nem_ib_cbf_get(array,
- MPID_nem_ib_cbf_hash3(addr)) ==
- (1ULL << MPID_nem_ib_cbf_bitsperslot) - 1;
-}
-
-/* functions */
-uint8_t MPID_nem_ib_rand(void);
-uint64_t MPID_nem_ib_rdtsc(void);
-int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_max_sz_p);
-int MPID_nem_ib_finalize(void);
-int MPID_nem_ib_drain_scq(int dont_call_progress);
-int MPID_nem_ib_drain_scq_scratch_pad(void);
-int MPID_nem_ib_poll(int in_blocking_poll);
-int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t * ringbuf);
-int MPID_nem_ib_ring_alloc(MPIDI_VC_t * vc);
-int MPID_nem_ib_handle_pkt_bh(MPIDI_VC_t * vc, MPID_Request * req, char *buf,
- MPIDI_msg_sz_t buflen);
-
-int MPID_nem_ib_cm_drain_scq(void);
-int MPID_nem_ib_cm_drain_rcq(void);
-int MPID_nem_ib_cm_poll_syn(void);
-int MPID_nem_ib_cm_poll(void);
-
-int MPID_nem_ib_get_business_card(int my_rank, char **bc_val_p, int *val_max_sz_p);
-int MPID_nem_ib_connect_to_root(const char *business_card, MPIDI_VC_t * new_vc);
-int MPID_nem_ib_vc_onconnect(MPIDI_VC_t * vc);
-int MPID_nem_ib_vc_init(MPIDI_VC_t * vc);
-int MPID_nem_ib_vc_destroy(MPIDI_VC_t * vc);
-int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc);
-int MPID_nem_ib_pkthandler_init(MPIDI_CH3_PktHandler_Fcn * pktArray[], int arraySize);
-
-int MPID_nem_ib_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq, void *header,
- MPIDI_msg_sz_t hdr_sz);
-
-/* CH3 send/recv functions */
-int MPID_nem_ib_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
- MPIDI_msg_sz_t hdr_sz, void *data, MPIDI_msg_sz_t data_sz);
-int MPID_nem_ib_iStartContigMsg(MPIDI_VC_t * vc, void *hdr, MPIDI_msg_sz_t hdr_sz, void *data,
- MPIDI_msg_sz_t data_sz, MPID_Request ** sreq_ptr);
-
-int MPID_nem_ib_cm_cas_core(int rank, MPID_nem_ib_cm_cmd_shadow_t * shadow);
-int MPID_nem_ib_cm_cas(MPIDI_VC_t * vc, uint32_t ask_on_connect);
-int MPID_nem_ib_cm_cas_release_core(int rank, MPID_nem_ib_cm_cmd_shadow_t * shadow);
-int MPID_nem_ib_cm_cas_release(MPIDI_VC_t * vc);
-int MPID_nem_ib_cm_cmd_core(int rank, MPID_nem_ib_cm_cmd_shadow_t * shadow, void *buf,
- MPIDI_msg_sz_t sz, uint32_t syn, uint16_t ringbuf_index);
-int MPID_nem_ib_ringbuf_ask_cas(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_req_t * req);
-int MPID_nem_ib_ringbuf_ask_fetch_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t * shadow,
- MPIDI_msg_sz_t sz);
-int MPID_nem_ib_ringbuf_ask_fetch(MPIDI_VC_t * vc);
-int MPID_nem_ib_ringbuf_ask_cas_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t * shadow,
- uint64_t head);
-int MPID_nem_ib_ringbuf_progress(void);
-
-int MPID_nem_ib_ringbuf_alloc(MPIDI_VC_t * vc);
-int MPID_nem_ib_ringbuf_free(MPIDI_VC_t * vc);
-
-/* used by ib_poll.c */
-int MPID_nem_ib_send_progress(MPIDI_VC_t * vc);
-
-/* CH3--lmt send/recv functions */
-int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_pkt,
- struct MPID_Request *req);
-int MPID_nem_ib_lmt_start_recv_core(struct MPID_Request *req, void *raddr, uint32_t rkey, long len,
- void *write_to_buf, uint32_t max_msg_sz, int end);
-int MPID_nem_ib_lmt_start_recv(struct MPIDI_VC *vc, struct MPID_Request *req, MPID_IOV s_cookie);
-int MPID_nem_ib_lmt_handle_cookie(struct MPIDI_VC *vc, struct MPID_Request *req, MPID_IOV cookie);
-int MPID_nem_ib_lmt_switch_send(struct MPIDI_VC *vc, struct MPID_Request *req);
-int MPID_nem_ib_lmt_done_send(struct MPIDI_VC *vc, struct MPID_Request *req);
-int MPID_nem_ib_lmt_done_recv(struct MPIDI_VC *vc, struct MPID_Request *req);
-int MPID_nem_ib_lmt_vc_terminated(struct MPIDI_VC *vc);
-/* overriding functions
- initialize the value of a member named "recv_posted"
- in BSS-variable named "comm_ops" with type of MPIDI_Comm_ops_t
- to "MPID_nem_ib_recv_posted" in ib_init.c
- MPIDI_Comm_ops_t is defined in src/mpid/ch3/include/mpidimpl.h */
-int MPID_nem_ib_recv_posted(struct MPIDI_VC *vc, struct MPID_Request *req);
-int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data);
-
-/* Keys for business cards */
-#define MPID_NEM_IB_GID_KEY "gid"
-#define MPID_NEM_IB_LID_KEY "lid"
-#define MPID_NEM_IB_QPN_KEY "qpn"
-#define MPID_NEM_IB_RKEY_KEY "rkey"
-#define MPID_NEM_IB_RMEM_KEY "rmem"
-
-#define MPID_NEM_IB_RECV_MAX_PKT_LEN 1024
-
-extern int MPID_nem_ib_conn_ud_fd;
-extern MPID_nem_ib_com_t *MPID_nem_ib_conn_ud_ibcom;
-extern MPID_nem_ib_conn_ud_t *MPID_nem_ib_conn_ud;
-extern MPID_nem_ib_conn_t *MPID_nem_ib_conns;
-extern int MPID_nem_ib_conns_ref_count;
-//extern MPIDI_VC_t **MPID_nem_ib_pollingset;
-extern int *MPID_nem_ib_scratch_pad_fds; /* TODO: create structure including fds and ibcoms */
-extern int MPID_nem_ib_scratch_pad_fds_ref_count;
-extern MPID_nem_ib_com_t **MPID_nem_ib_scratch_pad_ibcoms;
-//extern int MPID_nem_ib_npollingset;
-extern void *MPID_nem_ib_fl[18];
-extern int MPID_nem_ib_nranks;
-//extern char *MPID_nem_ib_recv_buf;
-extern int MPID_nem_ib_myrank;
-extern uint64_t MPID_nem_ib_tsc_poll; /* to throttle ib_poll in recv_posted (in ib_poll.c) */
-extern int MPID_nem_ib_ncqe; /* for lazy poll scq */
-extern uint64_t MPID_nem_ib_progress_engine_vt; /* virtual time stamp counter */
-extern uint16_t MPID_nem_ib_remote_poll_shared; /* index to poll for shared ring buffer */
-#ifdef MPID_NEM_IB_ONDEMAND
-extern uint16_t MPID_nem_ib_cm_ringbuf_head; /* head is incremented after assigned */
-extern uint16_t MPID_nem_ib_cm_ringbuf_tail;
-extern uint64_t MPID_nem_ib_cm_ringbuf_released[(MPID_NEM_IB_CM_NSEG + 63) / 64];
-
-/* overflow queue when no more slots for responder to write on are available */
-extern MPID_nem_ib_cm_sendq_t MPID_nem_ib_cm_sendq;
-extern MPID_nem_ib_cm_notify_sendq_t MPID_nem_ib_cm_notify_sendq;
-
-extern MPID_nem_ib_ringbuf_sendq_t MPID_nem_ib_ringbuf_sendq;
-
-#endif
-extern int MPID_nem_ib_ncqe_scratch_pad;
-extern int MPID_nem_ib_ncqe_scratch_pad_to_drain;
-extern int MPID_nem_ib_ncqe_to_drain; /* count put in lmt-put-done protocol */
-extern int MPID_nem_ib_ncqe_nces; /* counting non-copied eager-send */
-extern MPID_nem_ib_lmtq_t MPID_nem_ib_lmtq; /* poll queue for lmt */
-extern MPID_nem_ib_lmtq_t MPID_nem_ib_lmt_orderq; /* force order when two or more rts_to_sender randomizes the last byte of receive buffer */
-extern MPID_nem_ib_vc_area *MPID_nem_ib_debug_current_vc_ib;
-
-/* to detect multiple lmt-sends in one send-wait period to
- avoid overwriting the last byte in the receive buffer */
-extern uint8_t MPID_nem_ib_lmt_tail_addr_cbf[MPID_nem_ib_cbf_nslot *
- MPID_nem_ib_cbf_bitsperslot / 8];
-
-
-//#define MPID_NEM_IB_MAX_POLLINGSET 65536
-
-/* xfer.c manages memory region using memid */
-#define MPID_NEM_IB_MEMID_RDMA 0
-
-/* command using IB UD */
-#define MPID_NEM_IB_SYNC_SYN 0
-#define MPID_NEM_IB_SYNC_SYNACK 1
-#define MPID_NEM_IB_SYNC_NACK 2
-
-#define MPID_NEM_IB_EAGER_MAX_MSG_SZ (MPID_NEM_IB_COM_RDMABUF_SZSEG/*1024*/-sizeof(MPIDI_CH3_Pkt_t)+sizeof(MPIDI_CH3_Pkt_eager_send_t)-sizeof(MPID_nem_ib_netmod_hdr_shared_t)-sizeof(MPID_nem_ib_pkt_prefix_t)-sizeof(MPID_nem_ib_netmod_trailer_t)) /* when > this size, lmt is used. see src/mpid/ch3/src/mpid_isend.c */
-#define MPID_NEM_IB_POLL_PERIOD_RECV_POSTED 2000 /* minimum period from previous ib_poll to ib_poll in recv_posted */
-#define MPID_NEM_IB_POLL_PERIOD_SEND_POSTED 2000
-
-typedef struct {
- void *addr;
- uint32_t rkey;
- uint8_t tail; /* last word of payload */
- uint32_t max_msg_sz; /* max message size */
- int seg_seq_num;
- int seg_num;
-} MPID_nem_ib_lmt_cookie_t;
-
-typedef struct {
- void *addr;
- uint32_t rkey;
- uint8_t tail; /* last word of payload */
- long len;
- MPI_Request sender_req_id; /* request id of sender side */
- MPI_Request receiver_req_id; /* request id of sender side */
- uint32_t max_msg_sz; /* max message size */
- int seg_seq_num;
- int seg_num;
-} MPID_nem_ib_rma_lmt_cookie_t;
-
-typedef enum MPID_nem_ib_pkt_subtype {
- MPIDI_NEM_IB_PKT_EAGER_SEND,
- MPIDI_NEM_IB_PKT_RMA_LMT_RTS,
- MPIDI_NEM_IB_PKT_PUT,
- MPIDI_NEM_IB_PKT_ACCUMULATE,
- MPIDI_NEM_IB_PKT_GET,
- MPIDI_NEM_IB_PKT_GET_RESP,
- MPIDI_NEM_IB_PKT_LMT_GET_DONE,
- MPIDI_NEM_IB_PKT_LMT_RTS,
- MPIDI_NEM_IB_PKT_REQ_SEQ_NUM,
- MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM,
- MPIDI_NEM_IB_PKT_CHG_RDMABUF_OCC_NOTIFY_STATE,
- MPIDI_NEM_IB_PKT_RMA_LMT_GET_DONE,
- MPIDI_NEM_IB_PKT_NUM_PKT_HANDLERS
-} MPID_nem_ib_pkt_subtype_t;
-
-/* derived from MPID_nem_pkt_netmod_t */
-typedef struct MPID_nem_ib_pkt_prefix {
- MPIDI_CH3_Pkt_type_t type;
- unsigned subtype;
- /* additional field */
- int16_t seq_num_tail;
-} MPID_nem_ib_pkt_prefix_t;
-
-/* derived from MPID_nem_pkt_netmod_t and MPID_nem_pkt_lmt_done_t */
-typedef struct MPID_nem_ib_pkt_lmt_get_done {
- MPIDI_CH3_Pkt_type_t type;
- unsigned subtype;
- /* additional field */
- MPI_Request req_id;
- int16_t seq_num_tail;
- MPI_Request receiver_req_id;
-} MPID_nem_ib_pkt_lmt_get_done_t;
-
-typedef struct MPID_nem_ib_pkt_lmt_rts {
- MPIDI_CH3_Pkt_type_t type;
- unsigned subtype;
- /* additional field */
- MPI_Request req_id;
- int16_t seq_num_tail;
- void *addr;
- uint32_t rkey;
- int seg_seq_num;
-} MPID_nem_ib_pkt_lmt_rts_t;
-
-/* derived from MPID_nem_pkt_netmod_t */
-typedef struct MPID_nem_ib_pkt_req_seq_num_t {
- MPIDI_CH3_Pkt_type_t type;
- unsigned subtype;
- /* additional field */
- int16_t seq_num_tail;
-} MPID_nem_ib_pkt_req_seq_num_t;
-
-/* derived from MPID_nem_pkt_netmod_t */
-typedef struct MPID_nem_ib_pkt_reply_seq_num_t {
- MPIDI_CH3_Pkt_type_t type;
- unsigned subtype;
- /* additional field */
- int16_t seq_num_tail;
-} MPID_nem_ib_pkt_reply_seq_num_t;
-
-/* derived from MPID_nem_pkt_netmod_t */
-typedef struct MPID_nem_ib_pkt_change_rdmabuf_occupancy_notify_state_t {
- MPIDI_CH3_Pkt_type_t type;
- unsigned subtype;
- /* additional field */
- int state;
-} MPID_nem_ib_pkt_change_rdmabuf_occupancy_notify_state_t;
-
-int MPID_nem_ib_PktHandler_EagerSend(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen /* out */ ,
- MPID_Request ** rreqp /* out */);
-int MPID_nem_ib_PktHandler_rma_lmt_rts(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen /* out */ ,
- MPID_Request ** rreqp /* out */);
-int MPID_nem_ib_PktHandler_lmt_done(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp);
-int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp);
-int MPID_nem_ib_pkt_RTS_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp);
-int MPID_nem_ib_PktHandler_req_seq_num(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp);
-int MPID_nem_ib_PktHandler_reply_seq_num(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp);
-int MPID_nem_ib_PktHandler_change_rdmabuf_occupancy_notify_state(MPIDI_VC_t * vc,
- MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen,
- MPID_Request ** rreqp);
-int MPID_nem_ib_pkt_rma_lmt_getdone(MPIDI_VC_t * vc,
- MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp);
-
-/* MPID_nem_ib_PktHandler_lmt_done is a wrapper of pkt_DONE_handler and calls it */
-/* pkt_DONE_handler (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c) is not exported */
-int pkt_DONE_handler(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * buflen,
- MPID_Request ** rreqp);
-
-
-#define MPID_nem_ib_send_req_seq_num(vc) do { \
- MPID_PKT_DECL_CAST(_upkt, MPID_nem_ib_pkt_req_seq_num_t, _pkt); \
- MPID_Request *_req; \
- \
- MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending req_seq_num packet"); \
- MPIDI_Pkt_init(_pkt, MPIDI_NEM_PKT_NETMOD); \
- _pkt->subtype = MPIDI_NEM_IB_PKT_REQ_SEQ_NUM; \
- \
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc); \
- _pkt->seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail; \
- vc_ib->ibcom->rsr_seq_num_tail_last_sent = vc_ib->ibcom->rsr_seq_num_tail; \
- \
- mpi_errno = MPIDI_CH3_iStartMsg((vc), _pkt, sizeof(*_pkt), &_req); \
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_req_seq_num"); \
- if (_req != NULL) { \
- MPIU_ERR_CHKANDJUMP(_req->status.MPI_ERROR, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_req_seq_num"); \
- MPID_Request_release(_req); \
- dprintf("send_req_seq_num,release,req=%p\n", _req); \
- } \
- } while (0)
-
-#define MPID_nem_ib_send_reply_seq_num(vc) do { \
- MPID_PKT_DECL_CAST(_upkt, MPID_nem_ib_pkt_reply_seq_num_t, _pkt); \
- MPID_Request *_req; \
- \
- MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending reply_seq_num packet"); \
- MPIDI_Pkt_init(_pkt, MPIDI_NEM_PKT_NETMOD); \
- _pkt->subtype = MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM; \
- _pkt->seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail; \
- vc_ib->ibcom->rsr_seq_num_tail_last_sent = vc_ib->ibcom->rsr_seq_num_tail; \
-\
- mpi_errno = MPIDI_CH3_iStartMsg((vc), _pkt, sizeof(*_pkt), &_req); \
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_reply_seq_num"); \
- if (_req != NULL) { \
- MPIU_ERR_CHKANDJUMP(_req->status.MPI_ERROR, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_reply_seq_num"); \
- MPID_Request_release(_req); \
- dprintf("send_reply_seq_num,release,req=%p\n", _req); \
- } \
- } while (0)
-
-#define MPID_nem_ib_send_change_rdmabuf_occupancy_notify_state(vc, _state) do { \
- MPID_PKT_DECL_CAST(_upkt, MPID_nem_ib_pkt_change_rdmabuf_occupancy_notify_state_t, _pkt); \
- MPID_Request *_req; \
- \
- MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending change_rdmabuf_occupancy_notify_state packet"); \
- MPIDI_Pkt_init(_pkt, MPIDI_NEM_PKT_NETMOD); \
- _pkt->subtype = MPIDI_NEM_IB_PKT_CHG_RDMABUF_OCC_NOTIFY_STATE; \
- _pkt->state = _state; \
- \
- mpi_errno = MPIDI_CH3_iStartMsg((vc), _pkt, sizeof(*_pkt), &_req); \
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_change_rdmabuf_occupancy_notify_state"); \
- if (_req != NULL) { \
- MPIU_ERR_CHKANDJUMP(_req->status.MPI_ERROR, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_change_rdmabuf_occupancy_notify_state"); \
- MPID_Request_release(_req); \
- dprintf("send_change_...,release,req=%p\n", _req); \
- } \
- } while (0)
-
-#define MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, lsr_seq_num_tail) \
- do { \
- int *rdmabuf_occupancy_notify_rstate; \
- ibcom_errno = MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get(vc_ib->sc->fd, &rdmabuf_occupancy_notify_rstate); \
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get"); \
- \
- /*dprintf("notify_policy_lw,head=%d,tail=%d,lw=%d\n", vc_ib->ibcom->sseq_num, *lsr_seq_num_tail, MPID_NEM_IB_COM_RDMABUF_LOW_WATER_MARK);*/ \
- /* if the number of occupied slot of RDMA-write-to buffer have got below the low water-mark */ \
- if (*rdmabuf_occupancy_notify_rstate == MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_HW && \
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, *lsr_seq_num_tail) < MPID_NEM_IB_COM_RDMABUF_LOW_WATER_MARK) { \
- dprintf("changing notify_rstate\n"); \
- /* remember remote notifying policy so that local can know when to change remote policy back to HW */ \
- *rdmabuf_occupancy_notify_rstate = MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_LW; \
- /* change remote notifying policy of RDMA-write-to buf occupancy info */ \
- MPID_nem_ib_send_change_rdmabuf_occupancy_notify_state(vc, MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_LW); \
- } \
- } while (0)
-
-#define MPID_nem_ib_lmt_send_GET_DONE(vc, rreq) do { \
- MPID_PKT_DECL_CAST(_upkt, MPID_nem_ib_pkt_lmt_get_done_t, _done_pkt); \
- MPID_Request *_done_req; \
- \
- MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending rndv DONE packet"); \
- MPIDI_Pkt_init(_done_pkt, MPIDI_NEM_PKT_NETMOD); \
- _done_pkt->subtype = MPIDI_NEM_IB_PKT_LMT_GET_DONE;\
- _done_pkt->req_id = (rreq)->ch.lmt_req_id; \
- _done_pkt->receiver_req_id = (rreq)->handle; \
- /* embed SR occupancy information */ \
- _done_pkt->seq_num_tail = VC_FIELD(vc, ibcom->rsr_seq_num_tail); \
- \
- /* remember the last one sent */ \
- VC_FIELD(vc, ibcom->rsr_seq_num_tail_last_sent) = VC_FIELD(vc, ibcom->rsr_seq_num_tail); \
- \
- mpi_errno = MPIDI_CH3_iStartMsg((vc), _done_pkt, sizeof(*_done_pkt), &_done_req); \
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_lmt_send_GET_DONE"); \
- if (_done_req != NULL) \
- { \
- MPIU_ERR_CHKANDJUMP(_done_req->status.MPI_ERROR, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_lmt_send_GET_DONE"); \
- MPID_Request_release(_done_req); \
- dprintf("send_get_done,release,req=%p\n", _done_req); \
- } \
- } while (0)
-
-#define MPID_nem_ib_lmt_send_RTS(_subtype, vc, _req_id, _addr, _rkey, _seg_seq_num) do { \
- MPID_PKT_DECL_CAST(_upkt, MPID_nem_ib_pkt_lmt_rts_t, _rts_pkt); \
- MPID_Request *_rts_req; \
- \
- MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending rndv RTS segment packet"); \
- MPIDI_Pkt_init(_rts_pkt, MPIDI_NEM_PKT_NETMOD); \
- _rts_pkt->subtype = _subtype;\
- _rts_pkt->req_id = _req_id; \
- _rts_pkt->addr = _addr; \
- _rts_pkt->rkey = _rkey; \
- _rts_pkt->seg_seq_num = _seg_seq_num; \
- /* embed SR occupancy information */ \
- _rts_pkt->seq_num_tail = VC_FIELD(vc, ibcom->rsr_seq_num_tail); \
- \
- /* remember the last one sent */ \
- VC_FIELD(vc, ibcom->rsr_seq_num_tail_last_sent) = VC_FIELD(vc, ibcom->rsr_seq_num_tail); \
- \
- mpi_errno = MPIDI_CH3_iStartMsg((vc), _rts_pkt, sizeof(*_rts_pkt), &_rts_req); \
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_lmt_send_RTS"); \
- if (_rts_req != NULL) \
- { \
- MPIU_ERR_CHKANDJUMP(_rts_req->status.MPI_ERROR, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_lmt_send_RTS"); \
- MPID_Request_release(_rts_req); \
- dprintf("send_rts,release,req=%p\n", _rts_req); \
- } \
- } while (0)
-
-#define MPID_nem_ib_lmt_send_PKT_LMT_DONE(vc, rreq) do { \
- MPID_PKT_DECL_CAST(_upkt, MPID_nem_ib_pkt_lmt_get_done_t, _done_pkt); \
- MPID_Request *_done_req; \
- \
- MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending rndv DONE packet"); \
- MPIDI_Pkt_init(_done_pkt, MPIDI_NEM_PKT_NETMOD); \
- _done_pkt->subtype = MPIDI_NEM_IB_PKT_RMA_LMT_GET_DONE;\
- _done_pkt->req_id = (rreq)->ch.lmt_req_id; \
- _done_pkt->receiver_req_id = (rreq)->handle; \
- /* embed SR occupancy information */ \
- _done_pkt->seq_num_tail = VC_FIELD(vc, ibcom->rsr_seq_num_tail); \
- \
- /* remember the last one sent */ \
- VC_FIELD(vc, ibcom->rsr_seq_num_tail_last_sent) = VC_FIELD(vc, ibcom->rsr_seq_num_tail); \
- \
- mpi_errno = MPIDI_CH3_iStartMsg((vc), _done_pkt, sizeof(*_done_pkt), &_done_req); \
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_lmt_send_GET_DONE"); \
- if (_done_req != NULL) \
- { \
- MPIU_ERR_CHKANDJUMP(_done_req->status.MPI_ERROR, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_lmt_send_GET_DONE"); \
- MPID_Request_release(_done_req); \
- dprintf("send_get_done,release,req=%p\n", _done_req); \
- } \
- } while (0)
-
-/* Allocator for packing buffer for non-contiguous data
- - Allocate performs dequeue
- - Slow to "malloc" (two load and one store instructions)
- - Free preforms enqueue
- - Slow to "free" (one load and two store instructions)
- - Refill allocates a single slot
- - Slow when first-time allocs occur
- - Free list is linked lists and prepared for 2^n sizes.
- - Fast to find a empty slot (one load instruction)
- - Use mmap and munmap for requests of larger than or
- equal to 4KB buffers
- - No unused slots for large requests */
-static inline void *MPID_nem_ib_stmalloc(size_t _sz)
-{
- size_t sz = _sz;
- int i = 0;
- do {
- i++;
- sz >>= 1;
- } while (sz > 0);
- if (i < 12) {
- return MPIU_Malloc(_sz);
- }
- if (i > 30) {
- void *addr = mmap(0, _sz, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
- if (addr == (void *) -1) {
- return NULL;
- }
- else {
- return addr;
- }
- }
- int ndx = i - 12;
- void *slot;
- if (MPID_nem_ib_fl[ndx]) {
- slot = MPID_nem_ib_fl[ndx];
- if (MPID_nem_ib_myrank == 1) {
- //printf("stmalloc,reuse %p,%08x\n", slot, (int)_sz);
- }
- MPID_nem_ib_fl[ndx] = *((void **) MPID_nem_ib_fl[ndx]);
- }
- else {
- slot = mmap(0, 1 << i, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
- if (MPID_nem_ib_myrank == 1) {
- //printf("stmalloc,new %p,%08x\n", slot, (int)_sz);
- }
- }
- return slot;
-}
-
-static inline void MPID_nem_ib_stfree(void *ptr, size_t _sz)
-{
- if (MPID_nem_ib_myrank == 1) {
- //printf("stfree,%p,%08x\n", ptr, (int)sz);
- }
- int i = 0;
- size_t sz = _sz;
- do {
- i++;
- sz >>= 1;
- } while (sz > 0);
- if (i < 12) {
- MPIU_Free(ptr);
- goto fn_exit;
- }
- if (i > 30) {
- munmap(ptr, _sz);
- goto fn_exit;
- }
- int ndx = i - 12;
- *((void **) ptr) = MPID_nem_ib_fl[ndx];
- MPID_nem_ib_fl[ndx] = ptr;
- fn_exit:;
-}
-
-#endif /* IB_IMPL_H_INCLUDED */
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
deleted file mode 100644
index 9010a6b..0000000
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
+++ /dev/null
@@ -1,1105 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
-/*
- * (C) 2006 by Argonne National Laboratory.
- * (C) 2012 NEC Corporation
- * (C) 2014-2015 RIKEN AICS
- *
- * See COPYRIGHT in top-level directory.
- */
-
-#include "ib_impl.h"
-#ifdef USE_PMI2_API
-#include "pmi2.h"
-#else
-#include "pmi.h"
-#endif
-
-//#define MPID_NEM_IB_DEBUG_INIT
-#ifdef dprintf /* avoid redefinition with src/mpid/ch3/include/mpidimpl.h */
-#undef dprintf
-#endif
-#ifdef MPID_NEM_IB_DEBUG_INIT
-#define dprintf printf
-#else
-#define dprintf(...)
-#endif
-
-MPID_nem_netmod_funcs_t MPIDI_nem_ib_funcs = {
- MPID_nem_ib_init,
- MPID_nem_ib_finalize,
- MPID_nem_ib_poll,
- MPID_nem_ib_get_business_card,
- MPID_nem_ib_connect_to_root,
- MPID_nem_ib_vc_init,
- MPID_nem_ib_vc_destroy,
- MPID_nem_ib_vc_terminate,
- NULL, /*MPID_nem_ib_anysource_iprobe */
- NULL, /*MPID_nem_ib_anysource_improbe */
-};
-
-MPIDI_CH3_PktHandler_Fcn *MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_NUM_PKT_HANDLERS];
-
-static MPIDI_Comm_ops_t comm_ops = {
- /*NULL, */ MPID_nem_ib_recv_posted,
- /* recv_posted */
-
- NULL, /* send */
- NULL, /* rsend */
- NULL, /* ssend */
- NULL, /* isend */
- NULL, /* irsend */
- NULL, /* issend */
-
- NULL, /* send_init */
- NULL, /* bsend_init */
- NULL, /* rsend_init */
- NULL, /* ssend_init */
- NULL, /* startall */
-
- NULL, /* cancel_send */
- NULL, /* cancel_recv */
-
- NULL, /* probe */
- NULL, /* iprobe */
- NULL, /* improbe */
-};
-
-void *MPID_nem_ib_fl[18];
-int MPID_nem_ib_nranks;
-MPID_nem_ib_conn_ud_t *MPID_nem_ib_conn_ud;
-MPID_nem_ib_conn_t *MPID_nem_ib_conns;
-int MPID_nem_ib_conns_ref_count;
-//MPIDI_VC_t **MPID_nem_ib_pollingset;
-int MPID_nem_ib_conn_ud_fd;
-MPID_nem_ib_com_t *MPID_nem_ib_conn_ud_MPID_nem_ib_com;
-//int MPID_nem_ib_npollingset;
-int *MPID_nem_ib_scratch_pad_fds;
-int MPID_nem_ib_scratch_pad_fds_ref_count;
-MPID_nem_ib_com_t **MPID_nem_ib_scratch_pad_ibcoms;
-//char *MPID_nem_ib_recv_buf;
-int MPID_nem_ib_myrank;
-uint64_t MPID_nem_ib_tsc_poll;
-int MPID_nem_ib_ncqe;
-uint64_t MPID_nem_ib_progress_engine_vt;
-uint16_t MPID_nem_ib_remote_poll_shared;
-#ifdef MPID_NEM_IB_ONDEMAND
-uint16_t MPID_nem_ib_cm_ringbuf_head;
-uint16_t MPID_nem_ib_cm_ringbuf_tail;
-uint64_t MPID_nem_ib_cm_ringbuf_released[(MPID_NEM_IB_CM_NSEG + 63) / 64];
-MPID_nem_ib_cm_sendq_t MPID_nem_ib_cm_sendq = { NULL, NULL };
-MPID_nem_ib_cm_notify_sendq_t MPID_nem_ib_cm_notify_sendq = { NULL, NULL };
-
-int MPID_nem_ib_ncqe_scratch_pad_to_drain;
-#endif
-MPID_nem_ib_ringbuf_sendq_t MPID_nem_ib_ringbuf_sendq = { NULL, NULL };
-
-
-int MPID_nem_ib_ncqe_scratch_pad;
-int MPID_nem_ib_ncqe_to_drain;
-int MPID_nem_ib_ncqe_nces;
-MPID_nem_ib_lmtq_t MPID_nem_ib_lmtq = { NULL, NULL };
-MPID_nem_ib_lmtq_t MPID_nem_ib_lmt_orderq = { NULL, NULL };
-uint8_t MPID_nem_ib_lmt_tail_addr_cbf[MPID_nem_ib_cbf_nslot * MPID_nem_ib_cbf_bitsperslot /
- 8] = { 0 };
-static uint32_t MPID_nem_ib_rand_next = 1;
-MPID_nem_ib_vc_area *MPID_nem_ib_debug_current_vc_ib;
-uint64_t MPID_nem_ib_ringbuf_acquired[(MPID_NEM_IB_NRINGBUF + 63) / 64];
-uint64_t MPID_nem_ib_ringbuf_allocated[(MPID_NEM_IB_NRINGBUF + 63) / 64];
-MPID_nem_ib_ringbuf_t *MPID_nem_ib_ringbuf;
-
-uint8_t MPID_nem_ib_rand()
-{
- //return 0xaa;
- MPID_nem_ib_rand_next = MPID_nem_ib_rand_next * 1103515245 + 12345;
- return (MPID_nem_ib_rand_next / 65536) % 256;
-}
-
-uint64_t MPID_nem_ib_rdtsc()
-{
- uint64_t x;
- __asm__ __volatile__("rdtsc; shl $32, %%rdx; or %%rdx, %%rax":"=a"(x)::"%rdx", "memory"); /* rdtsc cannot be executed earlier than here */
- return x;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_kvs_put_binary
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int MPID_nem_ib_kvs_put_binary(int from, const char *postfix, const uint8_t * buf,
- int length)
-{
- int mpi_errno = MPI_SUCCESS;
- int pmi_errno;
- char *kvs_name;
- char key[256], val[256], str[256];
- int j;
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_KVS_PUT_BINARY);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_KVS_PUT_BINARY);
-
- mpi_errno = MPIDI_PG_GetConnKVSname(&kvs_name);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPIDI_PG_GetConnKVSname");
- //dprintf("kvs_put_binary,kvs_name=%s\n", kvs_name);
-
- sprintf(key, "bc/%d/%s", from, postfix);
- val[0] = 0;
- for (j = 0; j < length; j++) {
- sprintf(str, "%02x", buf[j]);
- strcat(val, str);
- }
- //dprintf("kvs_put_binary,rank=%d,from=%d,PMI_KVS_Put(%s, %s, %s)\n", MPID_nem_ib_myrank, from,
- //kvs_name, key, val);
- pmi_errno = PMI_KVS_Put(kvs_name, key, val);
- MPIU_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**PMI_KVS_Put");
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_KVS_PUT_BINARY);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_kvs_get_binary
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int MPID_nem_ib_kvs_get_binary(int from, const char *postfix, char *buf, int length)
-{
- int mpi_errno = MPI_SUCCESS;
- int pmi_errno;
- char *kvs_name;
- char key[256], val[256], str[256];
- int j;
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_KVS_GET_BINARY);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_KVS_GET_BINARY);
-
- mpi_errno = MPIDI_PG_GetConnKVSname(&kvs_name);
- //dprintf("kvs_get_binary,kvs_name=%s\n", kvs_name);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPIDI_PG_GetConnKVSname");
-
- sprintf(key, "bc/%d/%s", from, postfix);
- pmi_errno = PMI_KVS_Get(kvs_name, key, val, 256);
- //dprintf("kvs_put_binary,rank=%d,from=%d,PMI_KVS_Get(%s, %s, %s)\n", MPID_nem_ib_myrank, from,
- //kvs_name, key, val);
- MPIU_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**PMS_KVS_Get");
-
- dprintf("rank=%d,obtained val=%s\n", MPID_nem_ib_myrank, val);
- char *strp = val;
- for (j = 0; j < length; j++) {
- memcpy(str, strp, 2);
- str[2] = 0;
- buf[j] = strtol(str, NULL, 16);
- strp += 2;
- }
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_KVS_GET_BINARY);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#ifndef MPID_NEM_IB_ONDEMAND
-static int MPID_nem_ib_announce_network_addr(int my_rank, char **bc_val_p, int *val_max_sz_p);
-#endif
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_init
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_max_sz_p)
-{
- int mpi_errno = MPI_SUCCESS;
- int ibcom_errno = 0, pmi_errno;
- int i, j, k;
- int ib_port = 1;
-
- MPIU_CHKPMEM_DECL(6);
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_INIT);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_INIT);
-
- /* first make sure that our private fields in the vc fit into the area provided */
- MPIU_Assert(sizeof(MPID_nem_ib_vc_area) <= MPID_NEM_VC_NETMOD_AREA_LEN);
-
- MPID_nem_ib_nranks = pg_p->size;
- MPID_nem_ib_myrank = pg_rank;
- MPID_nem_ib_tsc_poll = MPID_nem_ib_rdtsc();
- MPID_nem_ib_ncqe = 0;
- MPID_nem_ib_ncqe_to_drain = 0;
- MPID_nem_ib_ncqe_nces = 0;
- MPID_nem_ib_ncqe_scratch_pad = 0;
- MPID_nem_ib_ncqe_scratch_pad_to_drain = 0;
- // MPID_nem_ib_npollingset = 0;
- MPID_nem_ib_progress_engine_vt = 0;
- MPID_nem_ib_remote_poll_shared = 0;
-#ifdef MPID_NEM_IB_ONDEMAND
- MPID_nem_ib_cm_ringbuf_head = 0;
- MPID_nem_ib_cm_ringbuf_tail = -1; /* it means slot 0 is not acquired */
- memset(MPID_nem_ib_cm_ringbuf_released, 0, (MPID_NEM_IB_CM_NSEG + 63) / 64);
-#endif
-
- /* no need to malloc scratch-pad when the number of rank is '1' */
- if (pg_p->size == 1) {
- goto fn_exit;
- }
-
- /* malloc scratch-pad fd */
- MPIU_CHKPMEM_MALLOC(MPID_nem_ib_scratch_pad_fds, int *, MPID_nem_ib_nranks * sizeof(int),
- mpi_errno, "connection table");
- memset(MPID_nem_ib_scratch_pad_fds, 0, MPID_nem_ib_nranks * sizeof(int));
-
- MPIU_CHKPMEM_MALLOC(MPID_nem_ib_scratch_pad_ibcoms, MPID_nem_ib_com_t **,
- MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_t *),
- mpi_errno, "connection table");
- memset(MPID_nem_ib_scratch_pad_ibcoms, 0, MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_t *));
-
- /* prepare scrath-pad QP and malloc scratch-pad */
- for (i = 0; i < MPID_nem_ib_nranks; i++) {
- if (i == MPID_nem_ib_myrank) {
- continue;
- }
- dprintf("init,MPID_nem_ib_myrank=%d,i=%d\n", MPID_nem_ib_myrank, i);
- ibcom_errno =
- MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_SCRATCH_PAD,
- &MPID_nem_ib_scratch_pad_fds[i]);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
- MPID_nem_ib_scratch_pad_fds_ref_count++;
-
- ibcom_errno =
- MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[i],
- &MPID_nem_ib_scratch_pad_ibcoms[i]);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_obtain_pointer");
-
-
- ibcom_errno = MPID_nem_ib_com_alloc(MPID_nem_ib_scratch_pad_fds[i],
-#ifdef MPID_NEM_IB_ONDEMAND
- MPID_NEM_IB_CM_OFF_CMD +
- MPID_NEM_IB_CM_NSEG * sizeof(MPID_nem_ib_cm_cmd_t) +
- sizeof(MPID_nem_ib_ringbuf_headtail_t)
-#else
- MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_qp_state_t)
-#endif
-);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_alloc");
- }
-#ifdef MPID_NEM_IB_ONDEMAND
- /* Release CAS word */
- *((uint64_t *) MPID_nem_ib_scratch_pad) = MPID_NEM_IB_CM_RELEASED;
-#endif
- /* Initialize head and tail pointer of shared ring buffer */
- MPID_nem_ib_ringbuf_headtail_t *headtail =
- (MPID_nem_ib_ringbuf_headtail_t *) ((uint8_t *) MPID_nem_ib_scratch_pad +
- MPID_NEM_IB_RINGBUF_OFF_HEAD);
- headtail->head = 0;
- headtail->tail = -1;
-
- /* put bc/me/sp/{gid,lid} put bc/me/sp/{qpn,rmem,rkey}/you */
- int nranks;
-
- uint32_t my_qpnum;
- uint16_t my_lid;
- union ibv_gid my_gid;
- void *my_rmem;
- int my_rkey;
-
- int remote_qpnum;
- uint16_t remote_lid;
- union ibv_gid remote_gid;
- void *remote_rmem;
- int remote_rkey;
-
- char *remote_rank_str;
- char *key_str;
-
- /* count maximum length of the string representation of remote_rank */
- for (i = 0, nranks = MPID_nem_ib_nranks; nranks > 0; nranks /= 10, i++) {
- }
- MPIU_CHKPMEM_MALLOC(remote_rank_str, char *, 1 + i + 1, mpi_errno, "connection table");
- MPIU_CHKPMEM_MALLOC(key_str, char *, strlen("sp/rmem") + 1 + i + 1, mpi_errno,
- "connection table");
-
- for (i = 0; i < MPID_nem_ib_nranks; i++) {
-
- if (i == 0) {
- ibcom_errno =
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_scratch_pad_fds[i],
- MPID_NEM_IB_COM_INFOKEY_PORT_LID, &my_lid,
- sizeof(uint16_t));
- dprintf("ib_init,scratch pad,lid=%04x\n", my_lid);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_get_info_conn");
-
- mpi_errno =
- MPID_nem_ib_kvs_put_binary(MPID_nem_ib_myrank, "sp/lid", (uint8_t *) & my_lid,
- sizeof(uint16_t));
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_kvs_put_binary");
-
- {
- dprintf("ib_init,scratch pad,put <%d/sp/lid/,%04x>\n", MPID_nem_ib_myrank,
- (int) my_lid);
- }
-
- ibcom_errno =
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_scratch_pad_fds[i],
- MPID_NEM_IB_COM_INFOKEY_PORT_GID, &my_gid,
- sizeof(union ibv_gid));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_get_info_conn");
-
- mpi_errno =
- MPID_nem_ib_kvs_put_binary(MPID_nem_ib_myrank, "sp/gid", (uint8_t *) & my_gid,
- sizeof(union ibv_gid));
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_kvs_put_binary");
-
- dprintf("ib_init,scratch pad,gid ");
- for (k = 0; k < 16; k++) {
- dprintf("%02x", (int) my_gid.raw[k]);
- }
- dprintf("\n");
- }
-
- /* put bc/me/sp/qpn/you */
- strcpy(key_str, "sp/qpn");
- sprintf(remote_rank_str, "/%x", i);
- strcat(key_str, remote_rank_str);
- ibcom_errno =
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_scratch_pad_fds[i],
- MPID_NEM_IB_COM_INFOKEY_QP_QPN, &my_qpnum,
- sizeof(uint32_t));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_get_info_conn");
- dprintf("ib_init,scratch pad,qpn=%08x\n", my_qpnum);
-
- mpi_errno =
- MPID_nem_ib_kvs_put_binary(MPID_nem_ib_myrank, key_str, (uint8_t *) & my_qpnum,
- sizeof(uint32_t));
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_kvs_put_binary");
- dprintf("ib_init,scratch pad,kvs put done\n");
-
- strcpy(key_str, "sp/rmem");
- sprintf(remote_rank_str, "/%x", i);
- strcat(key_str, remote_rank_str);
-
- ibcom_errno =
- MPID_nem_ib_com_get_info_mr(MPID_nem_ib_scratch_pad_fds[i],
- MPID_NEM_IB_COM_SCRATCH_PAD_TO,
- MPID_NEM_IB_COM_INFOKEY_MR_ADDR, &my_rmem, sizeof(void *));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_get_info_mr");
-
- dprintf("ib_init,scratch_pad,rmem=%p\n", my_rmem);
- mpi_errno =
- MPID_nem_ib_kvs_put_binary(MPID_nem_ib_myrank, key_str, (uint8_t *) & my_rmem,
- sizeof(void *));
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_kvs_put_binary");
-
- strcpy(key_str, "sp/rkey");
- sprintf(remote_rank_str, "/%x", i);
- strcat(key_str, remote_rank_str);
-
- ibcom_errno =
- MPID_nem_ib_com_get_info_mr(MPID_nem_ib_scratch_pad_fds[i],
- MPID_NEM_IB_COM_SCRATCH_PAD_TO,
- MPID_NEM_IB_COM_INFOKEY_MR_RKEY, &my_rkey, sizeof(int));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_get_info_mr");
- dprintf("ib_init,scratch_pad,rkey=%08x\n", my_rkey);
-
- mpi_errno =
- MPID_nem_ib_kvs_put_binary(MPID_nem_ib_myrank, key_str, (uint8_t *) & my_rkey,
- sizeof(uint32_t));
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_kvs_put_binary");
- }
-
- /* wait until key-value propagates among all ranks */
- pmi_errno = PMI_Barrier();
- MPIU_ERR_CHKANDJUMP(pmi_errno != PMI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**PMI_Barrier");
- dprintf("ib_init,put KVS;barrier;\n");
-
- /* make me-to-you scratch-pad QP RTS */
- for (i = 0; i < MPID_nem_ib_nranks; i++) {
- if (i != MPID_nem_ib_myrank) {
-
- mpi_errno =
- MPID_nem_ib_kvs_get_binary(i, "sp/gid", (char *) &remote_gid,
- sizeof(union ibv_gid));
- dprintf("ib_init,after kvs get\n");
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
-
- mpi_errno =
- MPID_nem_ib_kvs_get_binary(i, "sp/lid", (char *) &remote_lid, sizeof(uint16_t));
- dprintf("ib_init,after kvs get\n");
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
-
- strcpy(key_str, "sp/qpn");
- strcat(key_str, ""); /* "" or "lmt-put" */
- sprintf(remote_rank_str, "/%x", MPID_nem_ib_myrank);
- strcat(key_str, remote_rank_str);
- mpi_errno =
- MPID_nem_ib_kvs_get_binary(i, key_str, (char *) &remote_qpnum, sizeof(uint32_t));
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
- dprintf("ib_init,get KVS,remote_qpnum=%08x\n", remote_qpnum);
-
- ibcom_errno =
- MPID_nem_ib_com_rts(MPID_nem_ib_scratch_pad_fds[i], remote_qpnum, remote_lid,
- &remote_gid);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_rts");
-
- strcpy(key_str, "sp/rmem");
- sprintf(remote_rank_str, "/%x", MPID_nem_ib_myrank);
- strcat(key_str, remote_rank_str);
- mpi_errno =
- MPID_nem_ib_kvs_get_binary(i, key_str, (char *) &remote_rmem, sizeof(void *));
- dprintf("ib_init,after kvs get\n");
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
- dprintf("ib_init,get KVS,remote_rmem=%p\n", remote_rmem);
-
- strcpy(key_str, "sp/rkey");
- sprintf(remote_rank_str, "/%x", MPID_nem_ib_myrank);
- strcat(key_str, remote_rank_str);
- mpi_errno =
- MPID_nem_ib_kvs_get_binary(i, key_str, (char *) &remote_rkey, sizeof(uint32_t));
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
- dprintf("ib_init,get KVS,remote_rkey=%08x\n", remote_rkey);
-
- ibcom_errno =
- MPID_nem_ib_com_reg_mr_connect(MPID_nem_ib_scratch_pad_fds[i], remote_rmem,
- remote_rkey);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_reg_mr_connect");
- }
- }
-
- /* wait until you-to-me scratch-pad QP becomes RTR */
- pmi_errno = PMI_Barrier();
- MPIU_ERR_CHKANDJUMP(pmi_errno != PMI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**PMI_Barrier");
-
- MPIU_CHKPMEM_MALLOC(MPID_nem_ib_conns, MPID_nem_ib_conn_t *,
- MPID_nem_ib_nranks * sizeof(MPID_nem_ib_conn_t), mpi_errno,
- "connection table");
- memset(MPID_nem_ib_conns, 0, MPID_nem_ib_nranks * sizeof(MPID_nem_ib_conn_t));
-
- /* post receive request */
- for (i = 0; i < MPID_nem_ib_nranks; i++) {
- if (i != MPID_nem_ib_myrank) {
- for (j = 0; j < MPID_NEM_IB_COM_MAX_RQ_CAPACITY; j++) {
- MPID_nem_ib_com_scratch_pad_recv(MPID_nem_ib_scratch_pad_fds[i],
- sizeof(MPID_nem_ib_cm_notify_send_t));
- }
- }
- }
-
-#ifndef MPID_NEM_IB_ONDEMAND
- /* prepare eager-send QP */
- for (i = 0; i < MPID_nem_ib_nranks; i++) {
- ibcom_errno =
- MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC, &MPID_nem_ib_conns[i].fd);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
- MPID_nem_ib_conns_ref_count++;
- dprintf("init,fd=%d\n", MPID_nem_ib_conns[i].fd);
- }
-
- /* put bc/me/{gid,lid}, put bc/me/{qpn,rmem,rkey}/you */
- mpi_errno = MPID_nem_ib_announce_network_addr(pg_rank, bc_val_p, val_max_sz_p);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
-
- /* wait until key-value propagates among all ranks */
- pmi_errno = PMI_Barrier();
- MPIU_ERR_CHKANDJUMP(pmi_errno != PMI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**PMI_Barrier");
-
- /* make me-to-you eager-send QP RTS */
- for (i = 0; i < MPID_nem_ib_nranks; i++) {
- if (i != MPID_nem_ib_myrank) {
-
- mpi_errno =
- MPID_nem_ib_kvs_get_binary(i, MPID_NEM_IB_LID_KEY, (char *) &remote_lid,
- sizeof(uint16_t));
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
- mpi_errno =
- MPID_nem_ib_kvs_get_binary(i, MPID_NEM_IB_GID_KEY, (char *) &remote_gid,
- sizeof(union ibv_gid));
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
-
- strcpy(key_str, MPID_NEM_IB_RMEM_KEY);
- sprintf(remote_rank_str, "/%x", MPID_nem_ib_myrank);
- strcat(key_str, remote_rank_str);
- mpi_errno =
- MPID_nem_ib_kvs_get_binary(i, key_str, (char *) &remote_rmem, sizeof(void *));
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
-
- strcpy(key_str, MPID_NEM_IB_RKEY_KEY);
- sprintf(remote_rank_str, "/%x", MPID_nem_ib_myrank);
- strcat(key_str, remote_rank_str);
- mpi_errno =
- MPID_nem_ib_kvs_get_binary(i, key_str, (char *) &remote_rkey, sizeof(uint32_t));
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
-
- strcpy(key_str, MPID_NEM_IB_QPN_KEY);
- strcat(key_str, ""); /* "" or "lmt-put" */
- sprintf(remote_rank_str, "/%x", MPID_nem_ib_myrank);
- strcat(key_str, remote_rank_str);
- mpi_errno =
- MPID_nem_ib_kvs_get_binary(i, key_str, (char *) &remote_qpnum, sizeof(uint32_t));
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
- dprintf("remote_qpnum obtained=%08x\n", remote_qpnum);
-
- ibcom_errno =
- MPID_nem_ib_com_rts(MPID_nem_ib_conns[i].fd, remote_qpnum, remote_lid, &remote_gid);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_rts");
-
- /* report me-to-you eager-send QP becomes RTR */
- MPID_nem_ib_com_qp_state_t state = {.state = MPID_NEM_IB_COM_QP_STATE_RTR };
- ibcom_errno =
- MPID_nem_ib_com_put_scratch_pad(MPID_nem_ib_scratch_pad_fds[i],
- (uint64_t) MPID_nem_ib_scratch_pad_ibcoms[i],
- sizeof(MPID_nem_ib_com_qp_state_t) *
- MPID_nem_ib_myrank,
- sizeof(MPID_nem_ib_com_qp_state_t),
- (void *) &state);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_put_scratch_pad");
- MPID_nem_ib_ncqe_scratch_pad += 1;
-
- ibcom_errno =
- MPID_nem_ib_com_reg_mr_connect(MPID_nem_ib_conns[i].fd, remote_rmem, remote_rkey);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_reg_mr_connect");
- dprintf("ib_init,after mr_connect for me-to-you eager-send QP\n");
-
- }
- }
-#else /* define(MPID_NEM_IB_ONDEMAND) */
- /* We need to communicate with all other ranks in close sequence. */
- MPID_nem_ib_conns_ref_count = MPID_nem_ib_nranks - MPID_nem_mem_region.num_local;
-
- if (MPID_nem_ib_conns_ref_count == 0) {
- MPIU_Free(MPID_nem_ib_conns);
- }
-
- for (i = 0; i < MPID_nem_mem_region.num_local; i++) {
- if (MPID_nem_mem_region.local_procs[i] != MPID_nem_ib_myrank) {
- ibcom_errno =
- MPID_nem_ib_com_close(MPID_nem_ib_scratch_pad_fds
- [MPID_nem_mem_region.local_procs[i]]);
- if (--MPID_nem_ib_scratch_pad_fds_ref_count == 0) {
- MPIU_Free(MPID_nem_ib_scratch_pad_fds);
- MPIU_Free(MPID_nem_ib_scratch_pad_ibcoms);
- }
- }
- }
-#endif
-
- MPIU_Free(remote_rank_str);
- MPIU_Free(key_str);
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_INIT);
- return mpi_errno;
- fn_fail:
- MPIU_CHKPMEM_REAP();
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_get_business_card
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_get_business_card(int my_rank, char **bc_val_p, int *val_max_sz_p)
-{
- int mpi_errno = MPI_SUCCESS;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_GET_BUSINESS_CARD);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_GET_BUSINESS_CARD);
- dprintf("MPID_nem_ib_get_business_card,enter\n");
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_GET_BUSINESS_CARD);
- return mpi_errno;
-}
-
-#ifndef MPID_NEM_IB_ONDEMAND
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_announce_network_addr
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int MPID_nem_ib_announce_network_addr(int my_rank, char **bc_val_p, int *val_max_sz_p)
-{
- int mpi_errno = MPI_SUCCESS;
- int ibcom_errno;
- int i, j, nranks;
-
- uint32_t my_qpnum;
- uint16_t my_lid;
- union ibv_gid my_gid;
- void *my_rmem;
- int my_rkey;
- char *remote_rank_str; /* perl -e '$key_str .= $remote_rank;' */
- char *key_str;
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_ANNOUNCE_NETWORK_ADDR);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_ANNOUNCE_NETWORK_ADDR);
- MPIU_CHKLMEM_DECL(2); /* argument is the number of alloca */
-
- /* count maximum length of the string representation of remote_rank */
- for (i = 0, nranks = MPID_nem_ib_nranks; nranks > 0; nranks /= 10, i++) {
- }
- MPIU_CHKLMEM_MALLOC(remote_rank_str, char *, i + 1, mpi_errno, "key_str"); /* alloca */
- MPIU_CHKLMEM_MALLOC(key_str, char *, strlen(MPID_NEM_IB_QPN_KEY) + i + 1, mpi_errno, "key_str"); /* alloca */
-
- /* We have one local qp and remote qp for each rank-pair,
- * so a rank should perform
- * remote_qpn = kvs_get($remote_rank . "qpnum/" . $local_rank).
- * a memory area to read from and write to HCA,
- * and a memory area to read from HCA and write to DRAM is
- * associated with each connection, so a rank should perform
- * rkey = kvs_get($remote_rank . "rkey/" . $local_rank)
- * and raddr = kvs_get($remote_rank . "raddr/" . $local_rank). */
- for (i = 0; i < MPID_nem_ib_nranks; i++) {
-
- /* lid and gid are common for all remote-ranks */
- if (i == 0) {
- ibcom_errno =
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conns[i].fd,
- MPID_NEM_IB_COM_INFOKEY_PORT_LID, &my_lid,
- sizeof(uint16_t));
- dprintf("get_business_card,lid=%04x\n", my_lid);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_get_info_conn");
-
- mpi_errno =
- MPID_nem_ib_kvs_put_binary(MPID_nem_ib_myrank, MPID_NEM_IB_LID_KEY,
- (uint8_t *) & my_lid, sizeof(uint16_t));
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_kvs_put_binary");
-
- ibcom_errno =
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conns[i].fd,
- MPID_NEM_IB_COM_INFOKEY_PORT_GID, &my_gid,
- sizeof(union ibv_gid));
-
- dprintf("get_business_card,val_max_sz=%d\n", *val_max_sz_p);
- dprintf("get_business_card,sz=%ld,my_gid=", sizeof(union ibv_gid));
- for (j = 0; j < 16; j++) {
- dprintf("%02x", (int) my_gid.raw[j]);
- }
- dprintf("\n");
-
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_get_info_conn");
-
- mpi_errno =
- MPID_nem_ib_kvs_put_binary(MPID_nem_ib_myrank, MPID_NEM_IB_GID_KEY,
- (uint8_t *) & my_gid, sizeof(union ibv_gid));
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_kvs_put_binary");
- dprintf("get_business_card,val_max_sz=%d\n", *val_max_sz_p);
- }
-
- /* we use different RDMA-rbuf for different senders.
- * so announce like this:
- * <"0/qpn/0", 0xa0000>
- * <"0/qpn/1", 0xb0000>
- * <"0/qpn/2", 0xc0000>
- * <"0/qpn/3", 0xd0000>
- */
- strcpy(key_str, MPID_NEM_IB_QPN_KEY);
- sprintf(remote_rank_str, "/%x", i);
- strcat(key_str, remote_rank_str);
- ibcom_errno =
- MPID_nem_ib_com_get_info_conn(MPID_nem_ib_conns[i].fd, MPID_NEM_IB_COM_INFOKEY_QP_QPN,
- &my_qpnum, sizeof(uint32_t));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_get_info_conn");
-
- mpi_errno =
- MPID_nem_ib_kvs_put_binary(MPID_nem_ib_myrank, key_str, (uint8_t *) & my_qpnum,
- sizeof(uint32_t));
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_kvs_put_binary");
-
-
- strcpy(key_str, MPID_NEM_IB_RMEM_KEY);
- sprintf(remote_rank_str, "/%x", i);
- strcat(key_str, remote_rank_str);
-
- ibcom_errno =
- MPID_nem_ib_com_get_info_mr(MPID_nem_ib_conns[i].fd, MPID_NEM_IB_COM_RDMAWR_TO,
- MPID_NEM_IB_COM_INFOKEY_MR_ADDR, &my_rmem, sizeof(void *));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_get_info_mr");
-
- dprintf("rmem=%p\n", my_rmem);
- mpi_errno =
- MPID_nem_ib_kvs_put_binary(MPID_nem_ib_myrank, key_str, (uint8_t *) & my_rmem,
- sizeof(void *));
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_kvs_put_binary");
-
- strcpy(key_str, MPID_NEM_IB_RKEY_KEY);
- sprintf(remote_rank_str, "/%x", i);
- strcat(key_str, remote_rank_str);
-
- ibcom_errno =
- MPID_nem_ib_com_get_info_mr(MPID_nem_ib_conns[i].fd, MPID_NEM_IB_COM_RDMAWR_TO,
- MPID_NEM_IB_COM_INFOKEY_MR_RKEY, &my_rkey, sizeof(int));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_get_info_mr");
-
- mpi_errno =
- MPID_nem_ib_kvs_put_binary(MPID_nem_ib_myrank, key_str, (uint8_t *) & my_rkey,
- sizeof(uint32_t));
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_kvs_put_binary");
- }
-
- MPIU_CHKLMEM_FREEALL();
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_ANNOUNCE_NETWORK_ADDR);
- return mpi_errno;
- fn_fail:
- MPIU_CHKLMEM_FREEALL();
- goto fn_exit;
-}
-#endif
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_connect_to_root
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_connect_to_root(const char *business_card, MPIDI_VC_t * new_vc)
-{
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CONNECT_TO_ROOT);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CONNECT_TO_ROOT);
-
- dprintf("toroot,%d->%d", MPID_nem_ib_myrank, new_vc->pg_rank);
- /* not implemented */
-
- //fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CONNECT_TO_ROOT);
- return MPI_SUCCESS;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_vc_onconnect
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_vc_onconnect(MPIDI_VC_t * vc)
-{
- int mpi_errno = MPI_SUCCESS;
- int ibcom_errno;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_VC_ONCONNECT);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_VC_ONCONNECT);
-
- /* store pointer to MPID_nem_ib_com */
- ibcom_errno =
- MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[vc->pg_rank].fd, &VC_FIELD(vc, ibcom));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_VC_ONCONNECT);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_vc_init
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_vc_init(MPIDI_VC_t * vc)
-{
- MPIDI_CH3I_VC *vc_ch = VC_CH(vc);
- int mpi_errno = MPI_SUCCESS;
-
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_VC_INIT);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_VC_INIT);
-
- vc_ib->sc = &MPID_nem_ib_conns[vc->pg_rank];
-
- /* initialize sendq */
- vc_ib->sendq.head = NULL;
- vc_ib->sendq.tail = NULL;
-#ifdef MPID_NEM_IB_ONDEMAND
- VC_FIELD(vc, connection_state) = MPID_NEM_IB_CM_CLOSED;
- VC_FIELD(vc, connection_guard) = 0;
-#endif
- VC_FIELD(vc, vc_terminate_buf) = NULL;
-
- /* rank is sent as wr_id and used to obtain vc in poll */
- MPID_nem_ib_conns[vc->pg_rank].vc = vc;
-
-#ifndef MPID_NEM_IB_ONDEMAND
- MPID_nem_ib_vc_onconnect(vc);
-
- /* wait until you-to-me eager-send QP becomes RTR */
- MPID_nem_ib_com_t *MPID_nem_ib_com_scratch_pad;
- ibcom_errno =
- MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
- &MPID_nem_ib_com_scratch_pad);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
-
- int ntrial = 0;
- volatile MPID_nem_ib_com_qp_state_t *rstate = (MPID_nem_ib_com_qp_state_t *)
- ((uint8_t *) MPID_nem_ib_com_scratch_pad->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_TO]
- + vc->pg_rank * sizeof(MPID_nem_ib_com_qp_state_t));
- dprintf("ib_init,rstate=%p,*rstate=%08x\n", rstate, *((uint32_t *) rstate));
- while (rstate->state != MPID_NEM_IB_COM_QP_STATE_RTR) {
- __asm__ __volatile__("pause;":::"memory");
- if (++ntrial > 1024) {
- /* detect RDMA-write failure */
- ibcom_errno = MPID_nem_ib_drain_scq_scratch_pad();
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_drain_scq_scratch_pad");
- }
- }
- dprintf("ib_init,you-to-me eager-send QP is RTR\n");
-
- /* post MPID_NEM_IB_COM_MAX_SQ_CAPACITY of recv commands beforehand, replenish when retiring them in ib_poll */
- int i;
- for (i = 0; i < MPID_NEM_IB_COM_MAX_RQ_CAPACITY; i++) {
- //dprintf("irecv,%d->%d\n", MPID_nem_ib_myrank, vc->pg_rank);
- ibcom_errno =
- MPID_nem_ib_com_irecv(MPID_nem_ib_conns[vc->pg_rank].fd, (uint64_t) vc->pg_rank);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_irecv");
- }
-
-#endif
- MPIDI_CHANGE_VC_STATE(vc, ACTIVE);
-
- VC_FIELD(vc, pending_sends) = 0;
-
- //MPIU_Assert(sizeof(MPID_nem_ib_netmod_hdr_t) == 8); /* assumption in ib_ibcom.h */
- MPIU_Assert(sizeof(MPID_nem_ib_netmod_trailer_t) == 1); /* assumption in ib_ibcom.h */
-
- uint32_t sz;
-
- /* assumption in ib_poll.c, must be power of two */
- for (sz = MPID_NEM_IB_COM_RDMABUF_SZSEG; sz > 0; sz >>= 1) {
- if (sz != 1 && (sz & 1)) {
- MPIU_Assert(0);
- }
- }
-
- char *val;
- val = getenv("MP2_IBA_EAGER_THRESHOLD");
- vc->eager_max_msg_sz = val ? atoi(val) : MPID_NEM_IB_EAGER_MAX_MSG_SZ;
- vc->ready_eager_max_msg_sz = val ? atoi(val) : MPID_NEM_IB_EAGER_MAX_MSG_SZ;
- dprintf("ib_vc_init,vc->eager_max_msg_sz=%d\n", vc->eager_max_msg_sz);
-
- /* vc->rndvSend_fn is set in MPID_nem_vc_init (in src/mpid/ch3/channels/nemesis/src/mpid_nem_init.c) */
- ;
- vc->sendNoncontig_fn = MPID_nem_ib_SendNoncontig;
-
- vc->comm_ops = &comm_ops;
-
-
- /* register packet handler */
- vc_ch->pkt_handler = MPID_nem_ib_pkt_handler;
- vc_ch->num_pkt_handlers = MPIDI_NEM_IB_PKT_NUM_PKT_HANDLERS;
- MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_EAGER_SEND] = MPID_nem_ib_PktHandler_EagerSend;
- MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_RMA_LMT_RTS] = MPID_nem_ib_PktHandler_rma_lmt_rts;
- MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_LMT_GET_DONE] = MPID_nem_ib_pkt_GET_DONE_handler;
- MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_LMT_RTS] = MPID_nem_ib_pkt_RTS_handler;
- MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_REQ_SEQ_NUM] = MPID_nem_ib_PktHandler_req_seq_num;
- MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM] = MPID_nem_ib_PktHandler_reply_seq_num;
- MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_CHG_RDMABUF_OCC_NOTIFY_STATE] =
- MPID_nem_ib_PktHandler_change_rdmabuf_occupancy_notify_state;
- MPID_nem_ib_pkt_handler[MPIDI_NEM_IB_PKT_RMA_LMT_GET_DONE] = MPID_nem_ib_pkt_rma_lmt_getdone;
-
- /* register CH3 send/recv functions */
- vc_ch->iStartContigMsg = MPID_nem_ib_iStartContigMsg;
- vc_ch->iSendContig = MPID_nem_ib_iSendContig;
-
- /* register CH3--lmt send/recv functions */
- vc_ch->lmt_initiate_lmt = MPID_nem_ib_lmt_initiate_lmt;
- vc_ch->lmt_start_recv = MPID_nem_ib_lmt_start_recv;
- vc_ch->lmt_handle_cookie = MPID_nem_ib_lmt_handle_cookie;
- vc_ch->lmt_done_send = MPID_nem_ib_lmt_done_send;
- vc_ch->lmt_done_recv = MPID_nem_ib_lmt_done_recv;
- vc_ch->lmt_vc_terminated = MPID_nem_ib_lmt_vc_terminated;
- vc_ch->next = NULL;
- vc_ch->prev = NULL;
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_VC_INIT);
- return mpi_errno;
- //fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_vc_destroy
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_vc_destroy(MPIDI_VC_t * vc)
-{
- int mpi_errno = MPI_SUCCESS;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_VC_DESTROY);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_VC_DESTROY);
- /* currently do nothing */
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_VC_DESTROY);
- return mpi_errno;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_vc_terminate
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_vc_terminate(MPIDI_VC_t * vc)
-{
- dprintf("ib_vc_terminate,pg_rank=%d\n", vc->pg_rank);
- int mpi_errno = MPI_SUCCESS;
- int ibcom_errno;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_VC_TERMINATE);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_VC_TERMINATE);
-
- /* Check to make sure that it's OK to terminate the
- * connection without making sure that all sends have been sent */
- /* it is safe to only check command queue because
- * data transactions always proceed after confirming send by MPI_Wait
- * and control transactions always proceed after receiveing reply */
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
-
- /* store address of ringbuffer to clear in poll_eager */
- uint16_t remote_poll = 0;
- MPID_nem_ib_ringbuf_t *ringbuf;
- ringbuf = VC_FIELD(vc, ibcom->remote_ringbuf);
-
- switch (ringbuf->type) {
- case MPID_NEM_IB_RINGBUF_EXCLUSIVE:
- remote_poll = VC_FIELD(vc, ibcom->rsr_seq_num_poll);
- break;
- case MPID_NEM_IB_RINGBUF_SHARED:
- remote_poll = MPID_nem_ib_remote_poll_shared;
- break;
- default: /* FIXME */
- printf("unknown ringbuf->type\n");
- break;
- }
-
- /* Decrement because we increment this value in eager_poll. */
- remote_poll--;
-
- VC_FIELD(vc, vc_terminate_buf) =
- (uint8_t *) ringbuf->start +
- MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t) (remote_poll % ringbuf->nslot));
-
- dprintf
- ("vc_terminate,before,%d->%d,diff-rsr=%d,l diff-lsr=%d,sendq_empty=%d,ncqe=%d,pending_sends=%d\n",
- MPID_nem_ib_myrank, vc->pg_rank, MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
- vc_ib->
- ibcom->rsr_seq_num_tail_last_sent),
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
- MPID_nem_ib_sendq_empty(vc_ib->sendq), MPID_nem_ib_ncqe, VC_FIELD(vc, pending_sends));
-
-#ifdef MPID_NEM_IB_ONDEMAND
- MPID_nem_ib_cm_notify_send_req_t *req = MPIU_Malloc(sizeof(MPID_nem_ib_cm_notify_send_req_t));
- req->ibcom = MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank];
- req->my_rank = MPID_nem_ib_myrank;
- req->pg_rank = vc->pg_rank;
- MPID_nem_ib_cm_notify_sendq_enqueue(&MPID_nem_ib_cm_notify_sendq, req);
-#endif
-
- /* Empty sendq */
- while (!MPID_nem_ib_sendq_empty(vc_ib->sendq) ||
- VC_FIELD(vc, pending_sends) > 0 ||
- (MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->notify_outstanding_tx_empty !=
- NOTIFY_OUTSTANDING_TX_COMP)) {
-#ifdef MPID_NEM_IB_ONDEMAND
- MPID_nem_ib_cm_notify_progress(); /* progress cm_notify_sendq */
- MPID_nem_ib_cm_drain_rcq();
-#endif
- /* mimic ib_poll because vc_terminate might be called from ib_poll_eager */
- mpi_errno = MPID_nem_ib_send_progress(vc);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_progress");
- ibcom_errno = MPID_nem_ib_drain_scq(0);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_drain_scq");
-#ifdef MPID_NEM_IB_ONDEMAND
- ibcom_errno = MPID_nem_ib_cm_poll_syn();
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_poll_syn");
- ibcom_errno = MPID_nem_ib_cm_poll();
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_poll");
- ibcom_errno = MPID_nem_ib_cm_progress();
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_progress");
- ibcom_errno = MPID_nem_ib_cm_drain_scq();
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_scq");
-#endif
- ibcom_errno = MPID_nem_ib_ringbuf_progress();
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_ringbuf_progress");
-
- MPID_nem_ib_progress_engine_vt += 1; /* Progress virtual time */
- }
-
- dprintf("init,middle,%d->%d,r rdmaocc=%d,l rdmaocc=%d,sendq=%d,ncqe=%d,pending_sends=%d\n",
- MPID_nem_ib_myrank, vc->pg_rank,
- MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
- vc_ib->ibcom->rsr_seq_num_tail_last_sent),
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
- MPID_nem_ib_sendq_empty(vc_ib->sendq), MPID_nem_ib_ncqe, VC_FIELD(vc, pending_sends));
-
- dprintf("init,middle2,%d->%d,r rdmaocc=%d,l rdmaocc=%d,sendq=%d,ncqe=%d,pending_sends=%d\n",
- MPID_nem_ib_myrank, vc->pg_rank,
- MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
- vc_ib->ibcom->rsr_seq_num_tail_last_sent),
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
- MPID_nem_ib_sendq_empty(vc_ib->sendq), MPID_nem_ib_ncqe, VC_FIELD(vc, pending_sends));
-
- if (MPID_nem_ib_ncqe > 0 || VC_FIELD(vc, pending_sends) > 0) {
- usleep(1000);
- MPID_nem_ib_drain_scq(0);
- }
-
- dprintf("init,after ,%d->%d,r rdmaocc=%d,l rdmaocc=%d,sendq=%d,ncqe=%d,pending_sends=%d\n",
- MPID_nem_ib_myrank, vc->pg_rank,
- MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
- vc_ib->ibcom->rsr_seq_num_tail_last_sent),
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
- MPID_nem_ib_sendq_empty(vc_ib->sendq), MPID_nem_ib_ncqe, VC_FIELD(vc, pending_sends));
-
- /* drain scratch-pad scq */
-#ifdef MPID_NEM_IB_ONDEMAND
- ibcom_errno = MPID_nem_ib_cm_drain_scq();
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_scq");
- dprintf("init,scratch_pad,ncqe=%d,to_drain=%d\n", MPID_nem_ib_ncqe_scratch_pad,
- MPID_nem_ib_ncqe_scratch_pad_to_drain);
- dprintf("init,scratch_pad,ncom_scratch_pad=%d\n",
- MPID_nem_ib_scratch_pad_ibcoms[vc->pg_rank]->ncom_scratch_pad);
-#else
- ibcom_errno = MPID_nem_ib_drain_scq_scratch_pad();
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_drain_scq_scratch_pad");
-#endif
-
- mpi_errno = MPIDI_CH3U_Handle_connection(vc, MPIDI_VC_EVENT_TERMINATED);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
-
- dprintf("vc_terminate,exit\n");
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_VC_TERMINATE);
- return mpi_errno;
- fn_fail:
- MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "failure. mpi_errno = %d", mpi_errno));
- goto fn_exit;
-
-}
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
deleted file mode 100644
index 1235fdf..0000000
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
+++ /dev/null
@@ -1,540 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
-/*
- * (C) 2013 NEC Corporation
- * (C) 2014-2015 RIKEN AICS
- *
- * See COPYRIGHT in top-level directory.
- */
-
-#include "ib_impl.h"
-
-//#define MPID_NEM_IB_DEBUG_LMT
-#ifdef dprintf /* avoid redefinition with src/mpid/ch3/include/mpidimpl.h */
-#undef dprintf
-#endif
-#ifdef MPID_NEM_IB_DEBUG_LMT
-#define dprintf printf
-#else
-#define dprintf(...)
-#endif
-
-/* Get mode: sender sends RTS */
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_lmt_initiate_lmt
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_pkt,
- struct MPID_Request *req)
-{
- int mpi_errno = MPI_SUCCESS;
- int dt_contig;
- MPIDI_msg_sz_t data_sz;
- MPID_Datatype *dt_ptr;
- MPI_Aint dt_true_lb;
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT);
-
- dprintf("lmt_initiate_lmt,enter,%d->%d,req=%p\n", MPID_nem_ib_myrank, vc->pg_rank, req);
-
- /* obtain dt_true_lb */
- /* see MPIDI_Datatype_get_info(in, in, out, out, out, out) (in src/mpid/ch3/include/mpidimpl.h) */
- MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr,
- dt_true_lb);
-
- /* FIXME: who frees s_cookie_buf? */
- /* malloc memory area for cookie. auto variable is NG because isend does not copy payload */
- MPID_nem_ib_lmt_cookie_t *s_cookie_buf =
- (MPID_nem_ib_lmt_cookie_t *) MPIU_Malloc(sizeof(MPID_nem_ib_lmt_cookie_t));
-
- /* remember address to "free" when receiving DONE from receiver */
- req->ch.s_cookie = s_cookie_buf;
-
- /* see MPIDI_CH3_PktHandler_RndvClrToSend (in src/mpid/ch3/src/ch3u_rndv.c) */
- //assert(dt_true_lb == 0);
- void *write_from_buf;
- if (dt_contig) {
- write_from_buf = (void *) ((char *) req->dev.user_buf + dt_true_lb);
- }
- else {
- /* see MPIDI_CH3_EagerNoncontigSend (in ch3u_eager.c) */
- req->dev.segment_ptr = MPID_Segment_alloc();
- MPIU_ERR_CHKANDJUMP((req->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
- "**outofmemory");
-
- MPID_Segment_init(req->dev.user_buf, req->dev.user_count, req->dev.datatype,
- req->dev.segment_ptr, 0);
- req->dev.segment_first = 0;
- req->dev.segment_size = data_sz;
-
- MPIDI_msg_sz_t last;
- last = req->dev.segment_size; /* segment_size is byte offset */
- MPIU_Assert(last > 0);
- REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc((size_t) req->dev.segment_size);
- MPIU_ERR_CHKANDJUMP(!REQ_FIELD(req, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER,
- "**outofmemory");
- MPID_Segment_pack(req->dev.segment_ptr, req->dev.segment_first, &last,
- (char *) (REQ_FIELD(req, lmt_pack_buf)));
- MPIU_Assert(last == req->dev.segment_size);
- write_from_buf = REQ_FIELD(req, lmt_pack_buf);
- }
- dprintf
- ("lmt_initate_lmt,dt_contig=%d,write_from_buf=%p,req->dev.user_buf=%p,REQ_FIELD(req, lmt_pack_buf)=%p\n",
- dt_contig, write_from_buf, req->dev.user_buf, REQ_FIELD(req, lmt_pack_buf));
-
-#ifdef HAVE_LIBDCFA
-#else
- s_cookie_buf->addr = write_from_buf;
-#endif
- /* put sz, see MPID_nem_lmt_RndvSend (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c) */
- /* TODO remove sz field
- * pkt_RTS_handler (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c)
- * rreq->ch.lmt_data_sz = rts_pkt->data_sz; */
- //s_cookie_buf->sz = (uint32_t)((MPID_nem_pkt_lmt_rts_t*)rts_pkt)->data_sz;
-
- /* preserve and put tail, because tail magic is written on the tail of payload
- * because we don't want to add another SGE or RDMA command */
- MPIU_Assert(((MPID_nem_pkt_lmt_rts_t *) rts_pkt)->data_sz == data_sz);
- s_cookie_buf->tail = *((uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint8_t)));
- /* prepare magic */
- //*((uint32_t*)(write_from_buf + data_sz - sizeof(tailmagic_t))) = MPID_NEM_IB_COM_MAGIC;
-
- int post_num;
- uint32_t max_msg_sz;
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
-
- if (vc_ib->connection_state == MPID_NEM_IB_CM_ESTABLISHED) {
- MPID_nem_ib_com_get_info_conn(vc_ib->sc->fd, MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ,
- &max_msg_sz, sizeof(uint32_t));
- }
- else {
- /* If connection is not established, get max_msg_sz from the global value. */
- MPID_nem_ib_com_get_info_pattr(MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ, &max_msg_sz,
- sizeof(uint32_t));
- }
-
- /* Type of max_msg_sz is uint32_t. */
- post_num = (data_sz + (long) max_msg_sz - 1) / (long) max_msg_sz;
-
- s_cookie_buf->max_msg_sz = max_msg_sz;
- s_cookie_buf->seg_seq_num = 1;
- s_cookie_buf->seg_num = post_num;
-
- REQ_FIELD(req, buf.from) = write_from_buf;
- REQ_FIELD(req, data_sz) = data_sz;
- REQ_FIELD(req, seg_seq_num) = 1; // only send 1st-segment, even if there are some segments.
- REQ_FIELD(req, seg_num) = post_num;
- REQ_FIELD(req, max_msg_sz) = max_msg_sz;
-
- long length;
- if (post_num > 1) {
- length = max_msg_sz;
- }
- else {
- length = data_sz;
- }
- /* put IB rkey */
- struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
- MPID_nem_ib_com_reg_mr_fetch(write_from_buf, length, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
- MPIU_ERR_CHKANDJUMP(!mr_cache, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
- struct ibv_mr *mr = mr_cache->mr;
- REQ_FIELD(req, lmt_mr_cache) = (void *) mr_cache;
-#ifdef HAVE_LIBDCFA
- s_cookie_buf->addr = (void *) mr->host_addr;
- dprintf("lmt_initiate_lmt,s_cookie_buf->addr=%p\n", s_cookie_buf->addr);
-#endif
- s_cookie_buf->rkey = mr->rkey;
- dprintf("lmt_initiate_lmt,tail=%02x,mem-tail=%p,%02x,sz=%ld,raddr=%p,rkey=%08x\n",
- s_cookie_buf->tail, write_from_buf + data_sz - sizeof(uint8_t),
- *((uint8_t *) (write_from_buf + data_sz - sizeof(uint8_t))), data_sz,
- s_cookie_buf->addr, s_cookie_buf->rkey);
- /* send cookie. rts_pkt as the MPI-header, s_cookie_buf as the payload */
- MPID_nem_lmt_send_RTS(vc, (MPID_nem_pkt_lmt_rts_t *) rts_pkt, s_cookie_buf,
- sizeof(MPID_nem_ib_lmt_cookie_t));
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-/* essential lrecv part extracted for dequeueing and issue from sendq */
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_lmt_start_recv_core
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_lmt_start_recv_core(struct MPID_Request *req, void *raddr, uint32_t rkey, long len,
- void *write_to_buf, uint32_t max_msg_sz, int end)
-{
- int mpi_errno = MPI_SUCCESS;
- int ibcom_errno;
- struct MPIDI_VC *vc = req->ch.vc;
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- int i;
- int divide;
- int posted_num;
- int last;
- uint32_t r_max_msg_sz; /* responder's max_msg_sz */
- void *write_pos;
- void *addr;
- long data_sz;
- MPIDI_msg_sz_t rest_data_sz;
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);
-
- MPID_nem_ib_com_get_info_conn(vc_ib->sc->fd, MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ,
- &r_max_msg_sz, sizeof(uint32_t));
-
- divide = (len + r_max_msg_sz - 1) / r_max_msg_sz;
-
- write_pos = write_to_buf;
- posted_num = 0;
- last = MPID_NEM_IB_LMT_PART_OF_SEGMENT;
- rest_data_sz = len;
- addr = raddr;
-
- for (i = 0; i < divide; i++) {
- if (i == divide - 1)
- data_sz = len - i * r_max_msg_sz;
- else
- data_sz = r_max_msg_sz;
-
- if (i == divide - 1) {
- if (end)
- last = MPID_NEM_IB_LMT_LAST_PKT; /* last part of last segment packet */
- else
- last = MPID_NEM_IB_LMT_SEGMENT_LAST; /* last part of this segment */
-
- if (rest_data_sz < r_max_msg_sz)
- data_sz = rest_data_sz;
- }
-
- ibcom_errno =
- MPID_nem_ib_com_lrecv(vc_ib->sc->fd, (uint64_t) req, addr, data_sz, rkey,
- write_pos, last);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_lrecv");
-
- /* update position */
- write_pos = (void *) ((char *) write_pos + data_sz);
- addr = (void *) ((char *) addr + data_sz);
-
- /* update rest data size */
- rest_data_sz -= data_sz;
-
- /* count request number */
- posted_num++;
- }
-
- MPIU_Assert(rest_data_sz == 0);
- MPID_nem_ib_ncqe += posted_num;
- //dprintf("start_recv,ncqe=%d\n", MPID_nem_ib_ncqe);
- dprintf("lmt_start_recv_core,MPID_nem_ib_ncqe=%d\n", MPID_nem_ib_ncqe);
- dprintf
- ("lmt_start_recv_core,req=%p,sz=%ld,write_to_buf=%p,lmt_pack_buf=%p,user_buf=%p,raddr=%p,rkey=%08x,tail=%p=%02x\n",
- req, req->ch.lmt_data_sz, write_to_buf, REQ_FIELD(req, lmt_pack_buf), req->dev.user_buf,
- raddr, rkey, write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t),
- *((uint8_t *) (write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t))));
- //fflush(stdout);
-
-#ifdef MPID_NEM_IB_LMT_GET_CQE
- MPID_nem_ib_ncqe_to_drain += posted_num; /* use CQE instead of polling */
-#else
- /* drain_scq and ib_poll is not ordered, so both can decrement ref_count */
- MPIR_Request_add_ref(req);
-
- /* register to poll list in ib_poll() */
- /* don't use req->dev.next because it causes unknown problem */
- MPID_nem_ib_lmtq_enqueue(&MPID_nem_ib_lmtq, req);
- dprintf("lmt_start_recv_core,lmtq enqueue\n");
- //volatile uint8_t* tailmagic = (uint8_t*)((void*)req->dev.user_buf + req->ch.lmt_data_sz - sizeof(uint8_t));
- //dprintf("start_recv_core,cur_tail=%02x,lmt_receiver_tail=%02x\n", *tailmagic, REQ_FIELD(req, lmt_receiver_tail));
-#endif
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-/* Get protocol: (1) sender sends rts to receiver (2) receiver RDMA-reads (here)
- (3) receiver polls on end-flag (4) receiver sends done to sender
- caller: (in mpid_nem_lmt.c)
-*/
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_lmt_start_recv
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_lmt_start_recv(struct MPIDI_VC *vc, struct MPID_Request *req, MPID_IOV s_cookie)
-{
- int mpi_errno = MPI_SUCCESS;
- int dt_contig;
- MPIDI_msg_sz_t data_sz _UNUSED_;
- MPID_Datatype *dt_ptr;
- MPI_Aint dt_true_lb;
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_START_RECV);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_START_RECV);
-
- dprintf("lmt_start_recv,enter,%d<-%d,req=%p\n", MPID_nem_ib_myrank, vc->pg_rank, req);
-
- /* obtain dt_true_lb */
- /* see MPIDI_Datatype_get_info(in, in, out, out, out, out) (in src/mpid/ch3/include/mpidimpl.h) */
- MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr,
- dt_true_lb);
-
- MPID_nem_ib_lmt_cookie_t *s_cookie_buf = s_cookie.iov_base;
-
- /* stash vc for ib_poll */
- req->ch.vc = vc;
-
- void *write_to_buf;
- if (dt_contig) {
- write_to_buf = (void *) ((char *) req->dev.user_buf + dt_true_lb);
- }
- else {
- //REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc((size_t)req->ch.lmt_data_sz);
- REQ_FIELD(req, lmt_pack_buf) = MPID_nem_ib_stmalloc((size_t) req->ch.lmt_data_sz);
- MPIU_ERR_CHKANDJUMP(!REQ_FIELD(req, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER,
- "**outofmemory");
- write_to_buf = REQ_FIELD(req, lmt_pack_buf);
- }
-
- REQ_FIELD(req, buf.to) = write_to_buf;
-
-#ifdef MPID_NEM_IB_LMT_GET_CQE
-#else
- /* unmark magic */
- *((uint8_t *) (write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t))) = ~s_cookie_buf->tail; /* size in cookie was not set */
-#endif
- dprintf
- ("lmt_start_recv,dt_contig=%d,write_to_buf=%p,req->dev.user_buf=%p,REQ_FIELD(req, lmt_pack_buf)=%p,marked-tail=%02x,unmarked-tail=%02x\n",
- dt_contig, write_to_buf, req->dev.user_buf, REQ_FIELD(req, lmt_pack_buf),
- s_cookie_buf->tail, *((uint8_t *) (write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t))));
-
- /* stash tail for poll because do_cts in mpid_nem_lmt.c free s_cookie_buf just after this function */
- REQ_FIELD(req, lmt_tail) = s_cookie_buf->tail;
- dprintf("lmt_start_recv,mem-tail=%p,%02x\n",
- write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t),
- *((uint8_t *) (write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t))));
-
- //dprintf("lmt_start_recv,sendq_empty=%d,ncom=%d,ncqe=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY, MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
-
- int last = 1;
- long length = req->ch.lmt_data_sz;
-
- if (s_cookie_buf->seg_seq_num != s_cookie_buf->seg_num) {
- last = 0;
- length = s_cookie_buf->max_msg_sz;
- }
-
- REQ_FIELD(req, max_msg_sz) = s_cookie_buf->max_msg_sz; /* store initiator's max_msg_sz */
- REQ_FIELD(req, seg_num) = s_cookie_buf->seg_num; /* store number of segments */
-
- /* try to issue RDMA-read command */
- int slack = 1; /* slack for control packet bringing sequence number */
- if (MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack) {
- mpi_errno =
- MPID_nem_ib_lmt_start_recv_core(req, s_cookie_buf->addr, s_cookie_buf->rkey, length,
- write_to_buf, s_cookie_buf->max_msg_sz, last);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
- }
- else {
- /* enqueue command into send_queue */
- dprintf("lmt_start_recv, enqueuing,sendq_empty=%d,ncom=%d,ncqe=%d\n",
- MPID_nem_ib_sendq_empty(vc_ib->sendq),
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
-
- /* make raddr, (sz is in rreq->ch.lmt_data_sz), rkey, (user_buf is in req->dev.user_buf) survive enqueue, free cookie, dequeue */
- REQ_FIELD(req, lmt_raddr) = s_cookie_buf->addr;
- REQ_FIELD(req, lmt_rkey) = s_cookie_buf->rkey;
- REQ_FIELD(req, lmt_write_to_buf) = write_to_buf;
- REQ_FIELD(req, lmt_szsend) = length;
- REQ_FIELD(req, last) = last;
-
- MPID_nem_ib_sendq_enqueue(&vc_ib->sendq, req);
- }
-
-#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
- /* change remote notification policy of RDMA-write-to buf */
- //dprintf("lmt_start_recv,reply_seq_num,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
- MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, &vc_ib->ibcom->lsr_seq_num_tail);
- //dprintf("lmt_start_recv,reply_seq_num,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
-#endif
- //dprintf("lmt_start_recv,reply_seq_num,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
- /* try to send from sendq because at least one RDMA-write-to buffer has been released */
- //dprintf("lmt_start_recv,reply_seq_num,send_progress\n");
- if (!MPID_nem_ib_sendq_empty(vc_ib->sendq)) {
- dprintf("lmt_start_recv,ncom=%d,ncqe=%d,diff=%d\n",
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) < MPID_NEM_IB_COM_RDMABUF_NSEG);
- }
- if (!MPID_nem_ib_sendq_empty(vc_ib->sendq) && MPID_nem_ib_sendq_ready_to_send_head(vc_ib)) {
- dprintf("lmt_start_recv,send_progress\n");
- fflush(stdout);
- MPID_nem_ib_send_progress(vc);
- }
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_START_RECV);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-/* when cookie is received in the middle of the lmt */
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_lmt_handle_cookie
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_lmt_handle_cookie(struct MPIDI_VC *vc, struct MPID_Request *req, MPID_IOV cookie)
-{
- int mpi_errno = MPI_SUCCESS;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_HANDLE_COOKIE);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_HANDLE_COOKIE);
-
- dprintf("lmt_handle_cookie,enter\n");
-
- /* Nothing to do */
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_HANDLE_COOKIE);
- return mpi_errno;
- //fn_fail:
- goto fn_exit;
-}
-
-/* when sender receives DONE from receiver */
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_lmt_done_send
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_lmt_done_send(struct MPIDI_VC *vc, struct MPID_Request *req)
-{
- int mpi_errno = MPI_SUCCESS;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_DONE_SEND);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_DONE_SEND);
-
- dprintf("lmt_done_send,enter,%d<-%d,req=%p,REQ_FIELD(req, lmt_pack_buf)=%p\n",
- MPID_nem_ib_myrank, vc->pg_rank, req, REQ_FIELD(req, lmt_pack_buf));
-
-
- /* free memory area for cookie */
- if (!req->ch.s_cookie) {
- dprintf("lmt_done_send,enter,req->ch.s_cookie is zero");
- }
- MPIU_Free(req->ch.s_cookie);
- //dprintf("lmt_done_send,free cookie,%p\n", req->ch.s_cookie);
-
- /* free temporal buffer for eager-send non-contiguous data.
- * MPIDI_CH3U_Recvq_FDU_or_AEP (in mpid_isend.c) sets req->dev.datatype */
- int is_contig;
- MPID_Datatype_is_contig(req->dev.datatype, &is_contig);
- if (!is_contig && REQ_FIELD(req, lmt_pack_buf)) {
- dprintf("lmt_done_send,lmt-get,non-contiguous,free lmt_pack_buf\n");
- MPIU_Free(REQ_FIELD(req, lmt_pack_buf));
- }
-
- /* mark completion on sreq */
- MPIU_ERR_CHKANDJUMP(req->dev.OnDataAvail, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_lmt_done_send");
- dprintf("lmt_done_send,1,req=%p,pcc=%d\n", req, MPIDI_CH3I_progress_completion_count.v);
- MPIDI_CH3U_Request_complete(req);
- dprintf("lmt_done_send,complete,req=%p\n", req);
- dprintf("lmt_done_send,2,req=%p,pcc=%d\n", req, MPIDI_CH3I_progress_completion_count.v);
- //dprintf("lmt_done_send, mark completion on sreq\n");
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_DONE_SEND);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-/* lmt-put (1) sender sends done when finding cqe of put (2) packet-handler of DONE on receiver (3) here */
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_lmt_done_recv
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_lmt_done_recv(struct MPIDI_VC *vc, struct MPID_Request *rreq)
-{
- int mpi_errno = MPI_SUCCESS;
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV);
-
- dprintf("lmt_done_recv,enter,rreq=%p,head=%p\n", rreq, MPID_nem_ib_lmtq.head);
-
-
- int is_contig;
- MPID_Datatype_is_contig(rreq->dev.datatype, &is_contig);
- if (!is_contig) {
- dprintf("lmt_done_recv,copying noncontiguous data to user buffer\n");
-
- /* see MPIDI_CH3U_Request_unpack_uebuf (in /src/mpid/ch3/src/ch3u_request.c) */
- /* or MPIDI_CH3U_Receive_data_found (in src/mpid/ch3/src/ch3u_handle_recv_pkt.c) */
- MPIDI_msg_sz_t unpack_sz = rreq->ch.lmt_data_sz;
- MPID_Segment seg;
- MPI_Aint last;
-
- MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype, &seg, 0);
- last = unpack_sz;
- MPID_Segment_unpack(&seg, 0, &last, REQ_FIELD(rreq, lmt_pack_buf));
- if (last != unpack_sz) {
- /* --BEGIN ERROR HANDLING-- */
- /* received data was not entirely consumed by unpack()
- * because too few bytes remained to fill the next basic
- * datatype */
- MPIR_STATUS_SET_COUNT(rreq->status, last);
- rreq->status.MPI_ERROR =
- MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__,
- MPI_ERR_TYPE, "**MPID_nem_ib_lmt_done_recv", 0);
- /* --END ERROR HANDLING-- */
- }
-
- //MPIU_Free(REQ_FIELD(rreq, lmt_pack_buf));
- MPID_nem_ib_stfree(REQ_FIELD(rreq, lmt_pack_buf), (size_t) rreq->ch.lmt_data_sz);
- }
-
- dprintf("lmt_done_recv,1,req=%p,pcc=%d\n", rreq, MPIDI_CH3I_progress_completion_count.v);
- MPIDI_CH3U_Request_complete(rreq);
- dprintf("lmt_done_recv,complete,req=%p\n", rreq);
- dprintf("lmt_done_recv,2,pcc=%d\n", MPIDI_CH3I_progress_completion_count.v);
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV);
- return mpi_errno;
- //fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_lmt_vc_terminated
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_lmt_vc_terminated(struct MPIDI_VC *vc)
-{
- int mpi_errno = MPI_SUCCESS;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_VC_TERMINATED);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_VC_TERMINATED);
-
- dprintf("lmt_vc_terminated,enter\n");
-
- /* Nothing to do */
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_VC_TERMINATED);
- return mpi_errno;
- //fn_fail:
- goto fn_exit;
-}
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c
deleted file mode 100644
index e9a21c5..0000000
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c
+++ /dev/null
@@ -1,540 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
-/*
- * (C) 2014-2015 RIKEN AICS
- *
- * See COPYRIGHT in top-level directory.
- */
-
-#define _GNU_SOURCE 1
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#include <sys/syscall.h>
-#include <pthread.h>
-#include <malloc.h>
-#include "mpid_nem_impl.h"
-
-//#define __DEBUG__
-
-#ifdef __DEBUG__
-#define dprintf printf
-
-#define NUM_USED(addr, align, size) \
- (((size_t)addr & ~((size_t)align - 1)) == (size_t)addr) ? \
- ((size_t)align / size) : \
- (((size_t)addr & (align - 1)) / size)
-
-#else
-#define dprintf(...)
-#endif
-
-static void _local_malloc_initialize_hook(void);
-void *ib_malloc_hook(size_t size, const void *caller);
-void ib_free_hook(void *addr, const void *caller);
-void *ib_realloc_hook(void *addr, size_t size, const void *caller);
-
-#ifndef __MALLOC_HOOK_VOLATILE
-#define __MALLOC_HOOK_VOLATILE
-#endif
-void (*__MALLOC_HOOK_VOLATILE __malloc_initialize_hook) (void) = _local_malloc_initialize_hook;
-
-static pthread_mutex_t mutex;
-static int __initialized_malloc = 0;
-static int __tunnel_munmap = 0;
-
-#define POOL_MIN_POW (5)
-#define POOL_MAX_POW (14)
-#define PAGE_SIZE (1UL << 12)
-
-#define MMAPED_OFFSET_POW (8)
-#define MMAPED_OFFSET (1UL << MMAPED_OFFSET_POW) // 256byte
-
-#define ARRAY_SIZE (64) // x86_64
-
-#define DEFAULT_POOL_SIZE (1UL << 17) // 128Kbyte
-#define POOL_ALIGN_SIZE (DEFAULT_POOL_SIZE)
-
-#define do_segfault (*(unsigned int*)0 = 0) // segmentation fault
-
-static int use_ib_malloc = 0;
-
-static void ib_check_env(void)
-{
- char *target = NULL, *tmp_str = NULL;
-
- /* The order of comparison is the same as MPIR_T_cvar_init in mpich_cvars.c */
- tmp_str = getenv("MPICH_NEMESIS_NETMOD");
- if (tmp_str) {
- target = tmp_str;
- }
- tmp_str = getenv("MPIR_PARAM_NEMESIS_NETMOD");
- if (tmp_str) {
- target = tmp_str;
- }
- tmp_str = getenv("MPIR_CVAR_NEMESIS_NETMOD");
- if (tmp_str) {
- target = tmp_str;
- }
-
- /* If environment variable is set, then compare with it.
- * If environment variables are not set, then compare with the first element of netmod-list.
- */
- if ((target && !strncmp(target, "ib", MPID_NEM_MAX_NETMOD_STRING_LEN)) ||
- (!target && !strncmp(MPID_nem_netmod_strings[0], "ib", MPID_NEM_MAX_NETMOD_STRING_LEN))) {
- use_ib_malloc = 1;
- __malloc_hook = ib_malloc_hook;
- __free_hook = ib_free_hook;
- __realloc_hook = ib_realloc_hook;
- }
-}
-
-struct free_list {
- struct free_list *next;
- struct free_list *prev;
-};
-
-#define CHUNK (sizeof(struct free_list))
-
-static inline void list_init(struct free_list *head)
-{
- head->next = head;
- head->prev = head;
-}
-
-static inline void __list_add(struct free_list *new, struct free_list *prev, struct free_list *next)
-{
- next->prev = new;
- new->next = next;
- new->prev = prev;
- prev->next = new;
-}
-
-static inline void __list_del(struct free_list *prev, struct free_list *next)
-{
- next->prev = prev;
- prev->next = next;
-}
-
-static inline void list_add_head(struct free_list *new, struct free_list *head)
-{
- __list_add(new, head, head->next);
-}
-
-static inline void list_add_tail(struct free_list *new, struct free_list *head)
-{
- __list_add(new, head->prev, head);
-}
-
-static inline void list_del(struct free_list *list)
-{
- __list_del(list->prev, list->next);
-
- list->prev = NULL;
- list->next = NULL;
-}
-
-static inline int is_list_empty(struct free_list *list)
-{
- return (list->next == list) ? 1 : 0;
-}
-
-static struct free_list arena_flist[ARRAY_SIZE];
-
-struct pool_info {
- struct free_list list; /* 16byte (x86_64) */
- char *next_pos; /* 8byte (x86_64) */
- uint16_t size; /* 2byte */
- uint16_t num; /* 2byte */
- uint16_t free_num; /* 2byte */
- uint16_t pow; /* 2byte */
- uint16_t hole_num; /* 2byte */
- uint16_t num_per_page; /* 2byte */
- uint16_t count; /* 2byte */
-}; /* size of 'struct pool_info' must be smaller than MMAPED_OFFSET */
-
-#ifdef __x86_64__
-#define builtin_clz __builtin_clzl
-#define builtin_ctz __builtin_ctzl
-#else
-#define builtin_clz __builtin_clz
-#define builtin_ctz __builtin_ctz
-#endif
-
-/* Get a power of the argument */
-static int powoftwo(size_t val)
-{
- if (val <= (size_t) (1UL << POOL_MIN_POW))
- return POOL_MIN_POW;
-
- int shift_max;
-#if defined(__x86_64__)
- shift_max = 64;
-#else
- shift_max = 32;
-#endif
-
- /* If 'val' is power-of-two, we use 'ctz' */
-
- return (val & (val - 1UL)) ? (shift_max - builtin_clz(val)) : builtin_ctz(val);
-}
-
-static void *__alloc_mmap(size_t size, size_t align)
-{
- char *unaligned, *aligned;
- size_t misaligned;
- int ret;
-
- unaligned = mmap(0, size + align, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-
- if (unaligned == MAP_FAILED)
- return NULL;
-
- misaligned = (size_t) unaligned & ((size_t) align - 1);
- if (misaligned > 0) {
- size_t offset = align - misaligned;
- aligned = unaligned + offset;
-
- /* munmap : head */
- __tunnel_munmap = 1;
- ret = munmap(unaligned, offset);
- __tunnel_munmap = 0;
- if (ret)
- do_segfault;
- }
- else {
- aligned = unaligned;
- misaligned = align;
- }
-
- /* munmap : tail */
- __tunnel_munmap = 1;
- ret = munmap(aligned + size, misaligned);
- __tunnel_munmap = 0;
- if (ret)
- do_segfault;
-
- return (void *) aligned;
-}
-
-static void __init_pool_header_with_hole(struct pool_info *info, int i, int size)
-{
- info->size = 1 << i;
- info->hole_num = (MMAPED_OFFSET >> i) + 1;
- info->num_per_page = PAGE_SIZE >> i;
- info->num = size >> i;
- info->free_num = info->hole_num;
- info->count = info->hole_num;
- info->pow = i;
- info->next_pos = (char *) info + info->size * info->hole_num;
-}
-
-static void __init_pool_header(struct pool_info *info, int i, int size)
-{
- info->size = 1 << i;
- info->num = size >> i;
- info->free_num = 1;
- info->pow = i;
- info->next_pos = (char *) info + info->size;
-}
-
-static void _local_malloc_initialize_hook(void)
-{
- int i, j;
- char *aligned;
- size_t size;
- int count;
-
- pthread_mutex_init(&mutex, NULL);
-
- pthread_mutex_lock(&mutex);
-
- ib_check_env();
- if (!use_ib_malloc) {
- pthread_mutex_unlock(&mutex);
- return;
- }
-
- __initialized_malloc = 1;
-
- for (i = 0; i < ARRAY_SIZE; i++) {
- /* init list */
- list_init(&arena_flist[i]);
- }
-
- /* Allocate initial mempool
- *
- * We do not use 2^0, ..., 2^(POOL_MIN_POW - 1) byte.
- */
-
- /* First, allocate a initial area by one-time mmap() and split it */
- count = POOL_MAX_POW - POOL_MIN_POW + 1;
- size = (size_t) DEFAULT_POOL_SIZE; // default pool size is 128k
-
- aligned = (char *) __alloc_mmap(size * count, POOL_ALIGN_SIZE);
-
- if (aligned == NULL) {
- pthread_mutex_unlock(&mutex);
- return;
- }
-
- /* split allcated area */
- for (i = POOL_MIN_POW; i < POOL_MIN_POW + count; i++) {
- struct pool_info *info;
-
- info = (struct pool_info *) aligned;
-
- if (i <= MMAPED_OFFSET_POW) {
- __init_pool_header_with_hole(info, i, size);
-
- int elem = (DEFAULT_POOL_SIZE - (info->hole_num * info->size)) / (CHUNK + info->size);
- struct free_list *block_head = (struct free_list *) info->next_pos;
- for (j = 0; j < elem; j++) {
- if (((size_t) ((char *) block_head + CHUNK) & ((size_t) PAGE_SIZE - 1)) !=
- MMAPED_OFFSET) {
- list_add_tail(block_head, &arena_flist[i]);
- }
- block_head = (struct free_list *) ((char *) block_head + CHUNK + info->size);
- }
- }
- else {
- __init_pool_header(info, i, size);
- /* add list tail */
- list_add_tail(&(info->list), &arena_flist[i]);
- }
-
- aligned += size;
- }
-
- pthread_mutex_unlock(&mutex);
-}
-
-void *ib_malloc_hook(size_t size, const void *caller)
-{
- int i;
- int pow;
- char *ptr = NULL;
-
- if (!__initialized_malloc && __malloc_initialize_hook)
- __malloc_initialize_hook();
-
- pthread_mutex_lock(&mutex);
-
- pow = powoftwo(size);
-
- if (pow < 0 || pow >= ARRAY_SIZE) {
- pthread_mutex_unlock(&mutex);
- return NULL;
- }
-
- if (is_list_empty(&arena_flist[pow])) {
- char *tmp;
-
- if (pow > POOL_MAX_POW) {
- /* create memory area by mmap */
-
- tmp = (char *) __alloc_mmap(((size_t) 1 << pow) + PAGE_SIZE, PAGE_SIZE);
-
- if (tmp == NULL) {
- pthread_mutex_unlock(&mutex);
- return NULL;
- }
-
- *(int *) tmp = pow; //store 'power' for free()
-
- ptr = (char *) tmp + MMAPED_OFFSET;
-
- dprintf("malloc(%lu) [2^%d] ==> CREATE mmaped %p\n", size, pow, ptr);
- }
- else {
- /* create new pool */
- struct pool_info *info;
- size_t alloc_sz = DEFAULT_POOL_SIZE;
-
- tmp = (char *) __alloc_mmap(alloc_sz, POOL_ALIGN_SIZE);
-
- if (tmp == NULL) {
- pthread_mutex_unlock(&mutex);
- return NULL;
- }
-
- info = (struct pool_info *) tmp;
-
- if (pow <= MMAPED_OFFSET_POW) {
- __init_pool_header_with_hole(info, pow, alloc_sz);
-
- int elem =
- (DEFAULT_POOL_SIZE - (info->hole_num * info->size)) / (CHUNK + info->size);
- struct free_list *block_head = (struct free_list *) info->next_pos;
- for (i = 0; i < elem; i++) {
- if (((size_t) ((char *) block_head + CHUNK) & ((size_t) PAGE_SIZE - 1)) !=
- MMAPED_OFFSET) {
- list_add_tail(block_head, &arena_flist[pow]);
- }
- block_head = (struct free_list *) ((char *) block_head + CHUNK + info->size);
- }
-
- /* use head elem */
- struct free_list *head = (struct free_list *) (arena_flist[pow].next);
- ptr = (char *) head + CHUNK;
- dprintf("malloc(%lu) [2^%d] ==> USE pool %p\n", size, pow, ptr);
- list_del(head);
- }
- else {
- __init_pool_header(info, pow, alloc_sz);
- list_add_tail(&(info->list), &arena_flist[pow]);
-
- ptr = info->next_pos;
- info->next_pos += info->size;
-
- if (pow <= MMAPED_OFFSET_POW)
- info->count++;
-
- dprintf("malloc(%lu) [2^%d] ==> CREATE pool %p use = %lu\n", size, pow, ptr,
- NUM_USED(info->next_pos, POOL_ALIGN_SIZE, info->size));
- }
- }
- }
- else {
- if (pow > POOL_MAX_POW) {
- char *head = (char *) arena_flist[pow].next;
-
- list_del((struct free_list *) head);
-
- *(int *) head = pow; //store 'power' for free()
- ptr = (char *) head + MMAPED_OFFSET;
-
- dprintf("malloc(%lu) [2^%d] ==> USE mmaped %p\n", size, pow, ptr);
- }
- else if (pow > MMAPED_OFFSET_POW) {
- struct pool_info *info = (struct pool_info *) (arena_flist[pow].next);
-
- ptr = info->next_pos;
- info->next_pos += info->size;
-
- dprintf("malloc(%lu) [2^%d] ==> USE pool %p use = %lu\n", size, pow, ptr,
- NUM_USED(info->next_pos, POOL_ALIGN_SIZE, info->size));
-
- /* if 'info->nex_pos' is aligned, all blocks are used */
- if (((size_t) info->next_pos & ~(POOL_ALIGN_SIZE - 1)) == (size_t) info->next_pos) {
- list_del(&(info->list));
- }
- }
- else {
- char *info = (char *) (arena_flist[pow].next);
- ptr = (char *) info + CHUNK;
- dprintf("malloc(%lu) [2^%d] ==> USE pool %p\n", size, pow, ptr);
- list_del((struct free_list *) info);
- }
- }
-
- pthread_mutex_unlock(&mutex);
-
- return ptr;
-}
-
-static inline void free_core(void *addr)
-{
- pthread_mutex_lock(&mutex);
-
- if (((size_t) addr & ((size_t) PAGE_SIZE - 1)) == MMAPED_OFFSET) {
- char *head = (char *) addr - MMAPED_OFFSET;
- int power = (int) *(int *) head;
-
- dprintf("free(%p) --> free MMAPED [2^%d]\n", addr, power);
- list_add_tail((struct free_list *) head, &arena_flist[power]);
- }
- else {
- struct pool_info *info =
- (struct pool_info *) ((size_t) addr & ~((size_t) POOL_ALIGN_SIZE - 1));
-
- if (info->pow <= MMAPED_OFFSET_POW) {
- struct free_list *block_head = (struct free_list *) ((size_t) addr - CHUNK);
- list_add_head(block_head, &arena_flist[info->pow]);
- dprintf("free(%p) --> free BLOCK [2^%d]\n", addr, info->pow);
- }
- else {
- dprintf("free(%p) --> free POOL [2^%d] %lu / %u / %u (use / free / max)\n",
- addr, info->pow,
- NUM_USED(info->next_pos, POOL_ALIGN_SIZE, info->size),
- info->free_num + 1, info->num);
-
- info->free_num++;
- if (info->free_num == info->num) {
- /* intialize for reuse */
- info->free_num = 1;
- info->next_pos = (char *) info + info->size;
-
- list_add_tail(&(info->list), &arena_flist[info->pow]);
-
- dprintf(" POOL [2^%d] ALL FREED -> add list [%p]\n", info->pow,
- &arena_flist[info->pow]);
- }
- }
- }
-
- pthread_mutex_unlock(&mutex);
-}
-
-void ib_free_hook(void *addr, const void *caller)
-{
- if (addr) {
- free_core(addr);
- addr = NULL;
- }
-}
-
-void *ib_realloc_hook(void *addr, size_t size, const void *caller)
-{
- void *tmp;
-
- dprintf("realloc(%p, %lu)\n", addr, size);
-
- tmp = ib_malloc_hook(size, NULL);
-
- if (addr != NULL) {
- int old_pow, new_pow, power;
-
- new_pow = powoftwo(size);
-
- /* get power of 'addr' area */
- if (((size_t) addr & ((size_t) PAGE_SIZE - 1)) == MMAPED_OFFSET) {
- char *head = (char *) addr - MMAPED_OFFSET;
- old_pow = (int) *(int *) head;
- }
- else {
- struct pool_info *info =
- (struct pool_info *) ((size_t) addr & ~((size_t) POOL_ALIGN_SIZE - 1));
- old_pow = info->pow;
- }
-
- if (old_pow < new_pow)
- power = old_pow; /* expand */
- else
- power = new_pow; /* shrink */
-
- memcpy((char *) tmp, (char *) addr, (size_t) 1 << power);
-
- free_core(addr);
- }
-
- addr = tmp;
-
- return tmp;
-}
-
-int munmap(void *addr, size_t length)
-{
- if (!use_ib_malloc || __tunnel_munmap) {
- dprintf("munmap(%p, 0x%lx)\n", addr, length);
-
- return syscall(__NR_munmap, addr, length);
- }
- else {
- /* do nothing */
- }
-
- return 0;
-}
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
deleted file mode 100644
index d3cab00..0000000
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
+++ /dev/null
@@ -1,3176 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
-/*
- * (C) 2012 NEC Corporation
- * (C) 2014 RIKEN AICS
- * (C) 2014-2015 RIKEN AICS
- *
- * See COPYRIGHT in top-level directory.
- */
-
-#include "ib_impl.h"
-#include "mpidrma.h"
-
-//#define MPID_NEM_IB_DEBUG_POLL
-#ifdef dprintf /* avoid redefinition with src/mpid/ch3/include/mpidimpl.h */
-#undef dprintf
-#endif
-#ifdef MPID_NEM_IB_DEBUG_POLL
-#define dprintf printf
-#else
-#define dprintf(...)
-#endif
-
-static int entered_drain_scq = 0;
-
-#define MPID_NEM_IB_SEND_PROGRESS_POLLINGSET { \
- do { \
- int n; \
- for (n = 0; n < MPID_NEM_IB_NRINGBUF; n++) { \
- if (((MPID_nem_ib_ringbuf_allocated[n / 64] >> (n & 63)) & 1) == 0) { \
- continue; \
- } \
- mpi_errno = MPID_nem_ib_poll_eager(&MPID_nem_ib_ringbuf[n]); /*FIXME: perform send_progress for all sendqs */ \
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_poll_eager"); \
- } \
- } while (0); \
-}
-
-#define MPID_NEM_IB_CHECK_AND_SEND_PROGRESS \
- do { \
- if (!MPID_nem_ib_sendq_empty(vc_ib->sendq) && MPID_nem_ib_sendq_ready_to_send_head(vc_ib)) { \
- MPID_nem_ib_send_progress(vc); \
- } \
- } while (0)
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_drain_scq
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_drain_scq(int dont_call_progress)
-{
-
- int mpi_errno = MPI_SUCCESS;
- int result;
- int i;
- struct ibv_wc cqe[MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN];
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_DRAIN_SCQ);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_DRAIN_SCQ);
-
- /* prevent a call path drain_scq -> send_progress -> drain_scq */
- if (entered_drain_scq) {
- dprintf("drain_scq,re-enter\n");
- goto fn_exit;
- }
- entered_drain_scq = 1;
-
-#ifdef MPID_NEM_IB_ONDEMAND
- /* drain_scq is called after poll_eager calls vc_terminate
- * or nobody created QP */
- if (!MPID_nem_ib_rc_shared_scq) {
- dprintf("drain_scq,CQ is null\n");
- goto fn_exit;
- }
-#endif
-
- result =
- ibv_poll_cq(MPID_nem_ib_rc_shared_scq, /*3 */ MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN, &cqe[0]);
-
- MPIU_ERR_CHKANDJUMP(result < 0, mpi_errno, MPI_ERR_OTHER, "**netmod,ib,ibv_poll_cq");
-
- if (result > 0) {
- dprintf("drain_scq,result=%d\n", result);
- }
- for (i = 0; i < result; i++) {
- dprintf("drain_scq,i=%d\n", i);
-
- MPID_Request *req;
- MPID_Request_kind_t kind;
- int req_type, msg_type;
-
- /* Obtain sreq */
- //req = (MPID_Request *) cqe[i].wr_id;
- MPID_nem_ib_rc_send_request *req_wrap = (MPID_nem_ib_rc_send_request *) cqe[i].wr_id;
- req = (MPID_Request *) req_wrap->wr_id;
-
- /* decrement reference counter of mr_cache_entry registered by ib_com_isend or ib_com_lrecv */
- struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
- (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) req_wrap->mr_cache;
- if (mr_cache) {
- MPID_nem_ib_com_reg_mr_release(mr_cache);
- }
-
- kind = req->kind;
- req_type = MPIDI_Request_get_type(req);
- msg_type = MPIDI_Request_get_msg_type(req);
-
- dprintf("drain_scq,req=%p,req->ref_count=%d,cc_ptr=%d\n", req, req->ref_count,
- *req->cc_ptr);
- if (req->ref_count <= 0) {
- printf("%d\n", *(int *) 0);
- }
-
-#ifdef HAVE_LIBDCFA
- if (cqe[i].status != IBV_WC_SUCCESS) {
- printf("drain_scq,kind=%d,req_type=%d,msg_type=%d,cqe.status=%08x\n", kind, req_type,
- msg_type, cqe[i].status);
- }
-#else
- if (cqe[i].status != IBV_WC_SUCCESS) {
- printf
- ("drain_scq,kind=%d,req_type=%d,msg_type=%d,comm=%p,cqe.status=%08x,%s,sseq_num=%d\n",
- kind, req_type, msg_type, req->comm, cqe[i].status,
- ibv_wc_status_str(cqe[i].status), VC_FIELD(req->ch.vc, ibcom->sseq_num));
- }
-#endif
- MPID_NEM_IB_ERR_FATAL(cqe[i].status != IBV_WC_SUCCESS, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_drain_scq");
-
- /*
- * packets generated by MPIDI_CH3_iStartMsgv has req_type of RECV
- * lmt_initiate_lmt, lmt_send_put_done
- */
- if (
- //req_type == MPIDI_REQUEST_TYPE_SEND
- (req_type == MPIDI_REQUEST_TYPE_SEND || req_type == MPIDI_REQUEST_TYPE_RSEND ||
- req_type == MPIDI_REQUEST_TYPE_RECV || req_type == MPIDI_REQUEST_TYPE_SSEND ||
- req_type == MPIDI_REQUEST_TYPE_GET_ACCUM_RESP)
- && msg_type == MPIDI_REQUEST_EAGER_MSG) {
- dprintf("drain_scq,send/recv,eager,req_type=%d,,comm=%p,opcode=%d\n", req_type,
- req->comm, cqe[i].opcode);
-
- MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
- dprintf("drain_scq,MPIDI_REQUEST_EAGER_MSG,%d->%d,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_myrank, req->ch.vc->pg_rank, MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail)); /* moved before MPID_Request_release because this references req->ch.vc */
-
- /* free temporal buffer for eager-send non-contiguous data.
- * MPIDI_Request_create_sreq (in mpid_isend.c) sets req->dev.datatype
- * control message has a req_type of MPIDI_REQUEST_TYPE_RECV and
- * msg_type of MPIDI_REQUEST_EAGER_MSG because
- * control message send follows
- * MPIDI_CH3_iStartMsg/v-->MPID_nem_ib_iStartContigMsg-->MPID_nem_ib_iSendContig
- * and MPID_nem_ib_iSendContig set req->dev.state to zero.
- * see MPID_Request_create (in src/mpid/ch3/src/ch3u_request.c)
- * eager-short message has req->comm of zero
- */
- if (req_type == MPIDI_REQUEST_TYPE_SEND && req->comm) {
- /* exclude control messages by requiring MPIDI_REQUEST_TYPE_SEND
- * exclude eager-short by requiring req->comm != 0 */
- if (REQ_FIELD(req, lmt_pack_buf)) {
- dprintf("drain_scq,eager-send,non-contiguous,free lmt_pack_buf=%p\n",
- REQ_FIELD(req, lmt_pack_buf));
- MPIU_Free(REQ_FIELD(req, lmt_pack_buf));
- }
- }
-
- /* As for request by PKT_PUT, both req->type and req->comm are not set.
- * If receiver's data type is derived-type, req->dev.datatype_ptr is set.
- */
- if ((*req->cc_ptr == 1) && (req_type == 0) && !req->comm) {
- if (req->dev.datatype_ptr && (req->dev.segment_size > 0) &&
- REQ_FIELD(req, lmt_pack_buf)) {
- MPIU_Free(REQ_FIELD(req, lmt_pack_buf));
- }
- }
-
- /* decrement the number of entries in IB command queue */
- vc_ib->ibcom->ncom -= 1;
- MPID_nem_ib_ncqe -= 1;
- MPID_nem_ib_rdmawr_from_free(REQ_FIELD(req, buf_from), REQ_FIELD(req, buf_from_sz));
- dprintf("drain_scq,afree=%p,sz=%d\n", REQ_FIELD(req, buf_from),
- REQ_FIELD(req, buf_from_sz));
-
- dprintf("drain_scq,eager-send,ncqe=%d\n", MPID_nem_ib_ncqe);
- MPIU_Assert(req->ref_count >= 1 && req->ref_count <= 3);
-
- /* ref_count is decremented in drain_scq and wait */
- if (*req->cc_ptr > 0) {
- dprintf("drain_scq,MPID_nem_ib_ncqe_nces=%d,cc_ptr=%d,pending_sends=%d\n",
- MPID_nem_ib_ncqe_nces, *req->cc_ptr, VC_FIELD(req->ch.vc, pending_sends));
- MPID_nem_ib_ncqe_nces -= 1;
-
- int (*reqFn) (MPIDI_VC_t *, MPID_Request *, int *);
-
- (VC_FIELD(req->ch.vc, pending_sends)) -= 1;
-
- reqFn = req->dev.OnDataAvail;
- if (*req->cc_ptr == 2 && reqFn == MPIDI_CH3_ReqHandler_ReqOpsComplete) {
- MPIDI_CH3U_Request_complete(req);
- }
- /* as in the template */
- else if (!reqFn) {
- MPIDI_CH3U_Request_complete(req);
- dprintf("drain_scq,complete,req=%p\n", req);
- MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
- //dprintf("drain_scq,complete,req=%p,pcc incremented to %d\n", req,
- //MPIDI_CH3I_progress_completion_count.v);
- }
- else {
- dprintf("drain_scq,reqFn isn't zero\n");
- MPIDI_VC_t *vc = req->ch.vc;
- int complete = 0;
- mpi_errno = reqFn(vc, req, &complete);
- if (mpi_errno)
- MPIU_ERR_POP(mpi_errno);
- /* not-completed case is not implemented */
- MPIU_Assert(complete == TRUE);
- }
- }
- else {
- MPID_Request_release(req);
- dprintf("drain_scq,relese,req=%p\n", req);
- }
- /* try to send from sendq */
- //dprintf("ib_poll,SCQ,!lmt,send_progress\n");
- if (!MPID_nem_ib_sendq_empty(vc_ib->sendq)) {
- dprintf("drain_scq,eager-send,ncom=%d,ncqe=%d,diff=%d\n",
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) <
- vc_ib->ibcom->local_ringbuf_nslot);
-
- MPID_Request *sreq = MPID_nem_ib_sendq_head(vc_ib->sendq);
- int msg_type_sreq = MPIDI_Request_get_msg_type(sreq);
-
- if (sreq->kind == MPID_REQUEST_SEND && msg_type_sreq == MPIDI_REQUEST_EAGER_MSG) {
- dprintf("drain_scq,eager-send,head is eager-send\n");
- }
- else if (sreq->kind == MPID_REQUEST_RECV && msg_type_sreq == MPIDI_REQUEST_RNDV_MSG) {
- dprintf("drain_scq,eager-send,head is lmt RDMA-read\n");
- }
- else if (sreq->kind == MPID_REQUEST_SEND && msg_type_sreq == MPIDI_REQUEST_RNDV_MSG) {
- dprintf("drain_scq,eager-send,head is lmt RDMA-write\n");
- }
- }
- /* call MPID_nem_ib_send_progress for all VCs in polling-set
- * instead of VC which releases CQ, command
- * when releasing them
- * because commands for VC-A are blocked by the command
- * for VC-B and waiting in the sendq
- */
- dprintf("drain_scq,eager-send,send_progress\n");
- //MPID_NEM_IB_SEND_PROGRESS_POLLINGSET;
-
- dprintf("drain_scq,eager-send,next\n");
-
- MPIU_Free(req_wrap);
- }
- else if (req_type == MPIDI_REQUEST_TYPE_GET_RESP && msg_type == MPIDI_REQUEST_EAGER_MSG) {
- dprintf("drain_scq,GET_RESP,eager,req_type=%d,,comm=%p,opcode=%d\n", req_type,
- req->comm, cqe[i].opcode);
-
- MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
- dprintf("drain_scq,MPIDI_REQUEST_EAGER_MSG,%d->%d,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_myrank, req->ch.vc->pg_rank, MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail)); /* moved before MPID_Request_release because this references req->ch.vc */
-
- /* decrement the number of entries in IB command queue */
- vc_ib->ibcom->ncom -= 1;
- MPID_nem_ib_ncqe -= 1;
- MPID_nem_ib_rdmawr_from_free(REQ_FIELD(req, buf_from), REQ_FIELD(req, buf_from_sz));
-
- /* this request may be from Noncontig */
- if ((*req->cc_ptr == 1) && req->dev.datatype_ptr && (req->dev.segment_size > 0) &&
- REQ_FIELD(req, lmt_pack_buf)) {
- MPIU_Free(REQ_FIELD(req, lmt_pack_buf));
- }
-
- dprintf("drain_scq,GET_RESP,ncqe=%d\n", MPID_nem_ib_ncqe);
- MPIU_Assert(req->ref_count == 1 || req->ref_count == 2);
-
- /* ref_count is decremented in drain_scq and wait */
- dprintf("drain_scq,MPID_nem_ib_ncqe_nces=%d,cc_ptr=%d,pending_sends=%d\n",
- MPID_nem_ib_ncqe_nces, *req->cc_ptr, VC_FIELD(req->ch.vc, pending_sends));
- MPID_nem_ib_ncqe_nces -= 1;
-
- int (*reqFn) (MPIDI_VC_t *, MPID_Request *, int *);
-
- (VC_FIELD(req->ch.vc, pending_sends)) -= 1;
-
- reqFn = req->dev.OnDataAvail;
- if (*req->cc_ptr == 2 && reqFn == MPIDI_CH3_ReqHandler_GetSendComplete) {
- MPIDI_CH3U_Request_complete(req);
- }
- /* as in the template */
- else if (!reqFn) {
- MPIDI_CH3U_Request_complete(req);
- dprintf("drain_scq,complete,req=%p\n", req);
- MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
- //dprintf("drain_scq,complete,req=%p,pcc incremented to %d\n", req,
- //MPIDI_CH3I_progress_completion_count.v);
- }
- else {
- dprintf("drain_scq,reqFn isn't zero\n");
- dprintf("drain_scq,GET_RESP,before dev.OnDataAvail,ref_count=%d\n", req->ref_count);
- MPIDI_VC_t *vc = req->ch.vc;
- int complete = 0;
- mpi_errno = reqFn(vc, req, &complete);
- if (mpi_errno)
- MPIU_ERR_POP(mpi_errno);
- /* not-completed case is not implemented */
- MPIU_Assert(complete == TRUE);
- }
-
- //MPID_NEM_IB_SEND_PROGRESS_POLLINGSET;
-
- dprintf("drain_scq,GET_RESP,next\n");
-
- MPIU_Free(req_wrap);
- }
- else if (req_type == MPIDI_REQUEST_TYPE_RECV && msg_type == MPIDI_REQUEST_RNDV_MSG &&
- cqe[i].opcode == IBV_WC_RDMA_READ) {
- /* lmt get */
- /* the case for lmt-put-done or lmt-put where
- * (1) sender finds end-flag won't change (2) sender sends RTS to receiver
- * (3) receiver gets (4) here
- * is distinguished by cqe[i].opcode
- */
- dprintf("drain_scq,recv,rndv,rdma-read,kind=%d,opcode=%d\n", kind, cqe[i].opcode);
-
-
- MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
-#if defined(MPID_NEM_IB_LMT_GET_CQE)
-
- /* end of packet */
- if (req_wrap->mf == MPID_NEM_IB_LMT_LAST_PKT) {
- /* unpack non-contiguous dt */
- int is_contig;
- MPID_Datatype_is_contig(req->dev.datatype, &is_contig);
- if (!is_contig) {
- dprintf("drain_scq,lmt,GET_CQE,unpack noncontiguous data to user buffer\n");
-
- /* see MPIDI_CH3U_Request_unpack_uebuf (in /src/mpid/ch3/src/ch3u_request.c) */
- /* or MPIDI_CH3U_Receive_data_found (in src/mpid/ch3/src/ch3u_handle_recv_pkt.c) */
- MPIDI_msg_sz_t unpack_sz = req->ch.lmt_data_sz;
- MPID_Segment seg;
- MPI_Aint last;
-
- MPID_Segment_init(req->dev.user_buf, req->dev.user_count, req->dev.datatype,
- &seg, 0);
- last = unpack_sz;
- MPID_Segment_unpack(&seg, 0, &last, REQ_FIELD(req, lmt_pack_buf));
- if (last != unpack_sz) {
- /* --BEGIN ERROR HANDLING-- */
- /* received data was not entirely consumed by unpack()
- * because too few bytes remained to fill the next basic
- * datatype */
- MPIR_STATUS_SET_COUNT(req->status, last);
- req->status.MPI_ERROR =
- MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME,
- __LINE__, MPI_ERR_TYPE, "**MPID_nem_ib_poll", 0);
- /* --END ERROR HANDLING-- */
- }
- dprintf("drain_scq,lmt,GET_CQE,ref_count=%d,lmt_pack_buf=%p\n", req->ref_count,
- REQ_FIELD(req, lmt_pack_buf));
- MPID_nem_ib_stfree(REQ_FIELD(req, lmt_pack_buf), (size_t) req->ch.lmt_data_sz);
- }
- dprintf("drain_scq,lmt,GET_CQE,lmt_send_GET_DONE,rsr_seq_num_tail=%d\n",
- vc_ib->ibcom->rsr_seq_num_tail);
-
- /* send done to sender. vc is stashed in MPID_nem_ib_lmt_start_recv (in ib_lmt.c) */
- MPID_nem_ib_lmt_send_GET_DONE(req->ch.vc, req);
- }
- else if (req_wrap->mf == MPID_NEM_IB_LMT_SEGMENT_LAST) {
- MPID_nem_ib_lmt_send_GET_DONE(req->ch.vc, req);
- }
-#endif
- /* unmark "lmt is going on" */
-
- //dprintf("ib_poll,SCQ,lmt,%d->%d,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_myrank, req->ch.vc->pg_rank, MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail)); /* moved before MPID_Request_release because this references req->ch.vc */
-
- /* decrement the number of entries in IB command queue */
- vc_ib->ibcom->ncom -= 1;
- MPID_nem_ib_ncqe -= 1;
- dprintf("drain_scq,rdma-read,ncqe=%d\n", MPID_nem_ib_ncqe);
-
-#ifdef MPID_NEM_IB_LMT_GET_CQE
- if (req_wrap->mf == MPID_NEM_IB_LMT_LAST_PKT) {
- dprintf("drain_scq,GET_CQE,Request_complete\n");
- /* mark completion on rreq */
- MPIDI_CH3U_Request_complete(req);
- dprintf("drain_scq,complete,req=%p\n", req);
- }
-#else /* GET, and !GET_CQE */
-
- int is_contig;
- MPID_Datatype_is_contig(req->dev.datatype, &is_contig);
- if (!is_contig) {
- //if (req->ref_count == 1) {
- dprintf("drain_scq,GET&&!GET_CQE,ref_count=%d,lmt_pack_buf=%p\n", req->ref_count,
- REQ_FIELD(req, lmt_pack_buf));
- /* debug, polling waits forever when freeing here. */
- //free(REQ_FIELD(req, lmt_pack_buf));
- //MPID_nem_ib_stfree(REQ_FIELD(req, lmt_pack_buf), (size_t)req->ch.lmt_data_sz);
- //dprintf("drain_scq,lmt,insert to free-list=%p\n", MPID_nem_ib_fl);
- //} else {
- //dprintf("drain_scq,GET&&!GET_CQE,ref_count=%d,lmt_pack_buf=%p\n", req->ref_count, REQ_FIELD(req, lmt_pack_buf));
- //}
- }
-
- /* lmt_start_recv increments ref_count
- * drain_scq and ib_poll is not ordered, so both can decrement ref_count */
- MPID_Request_release(req);
- dprintf("drain_scq,relese,req=%p\n", req);
-#endif
- MPIU_Free(req_wrap);
-
- /* try to send from sendq */
- if (!MPID_nem_ib_sendq_empty(vc_ib->sendq)) {
- dprintf("drain_scq,GET,ncom=%d,ncqe=%d,diff=%d\n",
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) <
- vc_ib->ibcom->local_ringbuf_nslot);
- MPID_Request *sreq = MPID_nem_ib_sendq_head(vc_ib->sendq);
- int msg_type_sreq = MPIDI_Request_get_msg_type(sreq);
-
- if (sreq->kind == MPID_REQUEST_SEND && msg_type_sreq == MPIDI_REQUEST_EAGER_MSG) {
- dprintf("drain_scq,eager-send,head is eager-send\n");
- }
- else if (sreq->kind == MPID_REQUEST_RECV && msg_type_sreq == MPIDI_REQUEST_RNDV_MSG) {
- dprintf("drain_scq,eager-send,head is lmt\n");
- }
- }
- //if (!MPID_nem_ib_sendq_empty(vc_ib->sendq) && MPID_nem_ib_sendq_ready_to_send_head(vc_ib)) {
- //MPID_NEM_IB_SEND_PROGRESS_POLLINGSET
- //}
- }
- else if (req_type == MPIDI_REQUEST_TYPE_SEND && msg_type == MPIDI_REQUEST_RNDV_MSG &&
- cqe[i].opcode == IBV_WC_RDMA_READ) {
- MPID_nem_ib_vc_area *vc_ib = VC_IB(req->ch.vc);
-
- if (req_wrap->mf == MPID_NEM_IB_LMT_LAST_PKT) {
- MPID_nem_handle_pkt(req->ch.vc, (char *) REQ_FIELD(req, lmt_pack_buf),
- (MPIDI_msg_sz_t) (sizeof(MPIDI_CH3_Pkt_t) +
- req->ch.lmt_data_sz));
-
- MPID_nem_ib_lmt_send_PKT_LMT_DONE(req->ch.vc, req);
-
- MPIU_Free(REQ_FIELD(req, lmt_pack_buf));
- MPIDI_CH3U_Request_complete(req);
- }
- else if (req_wrap->mf == MPID_NEM_IB_LMT_SEGMENT_LAST) {
- MPID_nem_ib_lmt_send_PKT_LMT_DONE(req->ch.vc, req);
- }
-
- /* decrement the number of entries in IB command queue */
- vc_ib->ibcom->ncom -= 1;
- MPID_nem_ib_ncqe -= 1;
-
- MPIU_Free(req_wrap);
- }
- else {
- printf("drain_scq,unknown kind=%d,req_type=%d,msg_type=%d\n", kind, req_type, msg_type);
- assert(0);
- MPIU_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_drain_scq");
- MPIU_Free(req_wrap);
- }
- }
- if (!dont_call_progress) {
- MPID_NEM_IB_SEND_PROGRESS_POLLINGSET;
- }
- fn_exit:
- entered_drain_scq = 0;
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_DRAIN_SCQ);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-/* bottom part of MPID_nem_handle_pkt()
- *
- * Copies the buflen bytes at buf into the receive IOV of req, starting at
- * entry req->dev.iov_offset and covering req->dev.iov_count entries.
- * Each time every remaining IOV entry has been filled, req->dev.OnDataAvail
- * is called, or the request is completed directly when no handler is set.
- * The outer loop repeats while input bytes remain and the request has not
- * been reported complete.
- *
- * Returns mpi_errno: MPI_SUCCESS, or the error handed back by OnDataAvail
- * (MPIU_ERR_POP presumably jumps to fn_fail -- macro not visible here).
- */
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_handle_pkt_bh
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_handle_pkt_bh(MPIDI_VC_t * vc, MPID_Request * req, char *buf, MPIDI_msg_sz_t buflen)
-{
- int mpi_errno = MPI_SUCCESS;
- int complete = 0;
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_HANDLE_PKT_BH);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_HANDLE_PKT_BH);
-
- while (buflen && !complete) {
- MPID_IOV *iov;
- int n_iov;
- iov = &req->dev.iov[req->dev.iov_offset];
- n_iov = req->dev.iov_count;
-
- while (n_iov && buflen >= iov->MPID_IOV_LEN) { /* fill whole IOV entries while enough input remains */
- size_t iov_len = iov->MPID_IOV_LEN;
- MPIU_Memcpy(iov->MPID_IOV_BUF, buf, iov_len);
-
- buflen -= iov_len;
- buf += iov_len;
- --n_iov;
- ++iov;
- }
-
- if (n_iov) { /* input is shorter than the current entry: the IOV is not full yet */
- if (buflen > 0) {
- MPIU_Memcpy(iov->MPID_IOV_BUF, buf, buflen);
- iov->MPID_IOV_BUF = (void *) ((char *) iov->MPID_IOV_BUF + buflen); /* advance the entry past the bytes just copied */
- iov->MPID_IOV_LEN -= buflen;
- buflen = 0;
- }
-
- req->dev.iov_offset = iov - req->dev.iov; /* store the resume position in the request */
- req->dev.iov_count = n_iov;
- }
- else { /* every IOV entry has been filled */
- int (*reqFn) (MPIDI_VC_t *, MPID_Request *, int *);
-
- reqFn = req->dev.OnDataAvail;
- if (!reqFn) {
- MPIDI_CH3U_Request_complete(req);
- complete = TRUE;
- }
- else {
- mpi_errno = reqFn(vc, req, &complete);
- if (mpi_errno)
- MPIU_ERR_POP(mpi_errno);
- }
-
- if (!complete) { /* restart from IOV entry 0; presumably the handler reloaded the IOV -- the assert checks it is non-empty */
- req->dev.iov_offset = 0;
- MPIU_Assert(req->dev.iov_count > 0 &&
- req->dev.iov[req->dev.iov_offset].MPID_IOV_LEN > 0);
- }
- }
- }
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_HANDLE_PKT_BH);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_drain_scq_scratch_pad
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_drain_scq_scratch_pad()
-{
- /* Polls the shared scratch-pad send completion queue once, fetching at most
- * MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN entries.  For each entry the wr_id is
- * interpreted as the MPID_nem_ib_com_t that posted the work request; its
- * ncom_scratch_pad counter and the global MPID_nem_ib_ncqe_scratch_pad
- * counter are both decremented.  A negative ibv_poll_cq result or an entry
- * whose status is not IBV_WC_SUCCESS is reported through
- * MPIU_ERR_CHKANDJUMP (presumably sets mpi_errno and jumps to fn_fail --
- * macro not visible here).  Returns mpi_errno.
- */
- int mpi_errno = MPI_SUCCESS;
- int result;
- int i;
- struct ibv_wc cqe[MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN];
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_DRAIN_SCQ_SCRATCH_PAD);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_DRAIN_SCQ_SCRATCH_PAD);
-
- /* drain_scq_scratch_pad is called after poll_eager calls vc_terminate,
- * so the completion queue pointer may already be NULL; nothing to drain then */
- if (!MPID_nem_ib_rc_shared_scq_scratch_pad) {
- dprintf("drain_scq_scratch_pad,CQ is null\n");
- goto fn_exit;
- }
-
- result =
- ibv_poll_cq(MPID_nem_ib_rc_shared_scq_scratch_pad, MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN,
- &cqe[0]);
- MPIU_ERR_CHKANDJUMP(result < 0, mpi_errno, MPI_ERR_OTHER, "**netmod,ib,ibv_poll_cq");
-
- if (result > 0) {
- dprintf("drain_scq_scratch_pad,found,result=%d\n", result);
- }
- for (i = 0; i < result; i++) { /* result is the number of entries written to cqe[] */
-
-#ifdef HAVE_LIBDCFA
- if (cqe[i].status != IBV_WC_SUCCESS) { /* this build variant logs the numeric status only */
- dprintf("drain_scq_scratch_pad,status=%08x\n", cqe[i].status);
- }
-#else
- if (cqe[i].status != IBV_WC_SUCCESS) {
- dprintf("drain_scq_scratch_pad,status=%08x,%s\n", cqe[i].status,
- ibv_wc_status_str(cqe[i].status));
- }
-#endif
- MPIU_ERR_CHKANDJUMP(cqe[i].status != IBV_WC_SUCCESS, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_drain_scq_scratch_pad");
-
- MPID_nem_ib_com_t *ibcom_scratch_pad = (MPID_nem_ib_com_t *) cqe[i].wr_id; /* wr_id carries the pointer of the posting ibcom */
- dprintf("drain_scq_scratch_pad,ibcom_scratch_pad=%p\n", ibcom_scratch_pad);
- ibcom_scratch_pad->ncom_scratch_pad -= 1;
- MPID_nem_ib_ncqe_scratch_pad -= 1;
- }
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_DRAIN_SCQ_SCRATCH_PAD);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_poll_eager
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_poll_eager(MPID_nem_ib_ringbuf_t * ringbuf)
-{
- /* Polls one slot of an eager ring buffer and dispatches the packet found there.
- *
- * The slot is selected by the poll sequence number (*remote_poll % ringbuf->nslot).
- * If the slot's head flag is zero the function returns at once.  Otherwise it
- * spins until the trailer's tail flag equals MPID_NEM_IB_COM_MAGIC, prefetches
- * the payload, advances the poll sequence number and passes the packet to
- * MPID_nem_handle_pkt.  Afterwards, unless the VC has become inactive on a
- * buffer other than vc_terminate_buf, it releases the receive buffer, updates
- * lsr_seq_num_tail from the header when the RELINDEX bit is set, and clears the
- * head flag.  When the VC is inactive and buf is vc_terminate_buf, the ring
- * buffer, the connection and the scratch-pad resources of that peer are freed.
- *
- * ringbuf: ring buffer to poll; its type selects which poll counter and VC are used.
- * Returns mpi_errno: MPI_SUCCESS, or an error from MPID_nem_handle_pkt or
- * from one of the teardown calls.
- */
- int mpi_errno = MPI_SUCCESS;
- int ibcom_errno;
- struct MPIDI_VC *vc = NULL;
- MPID_nem_ib_vc_area *vc_ib;
- //int result;
- //struct ibv_wc cqe[MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN];
- //uint64_t tscs, tsce;
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_POLL_EAGER);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_POLL_EAGER);
-
- //MPID_nem_ib_tsc_poll = MPID_nem_ib_rdtsc();
-
- uint16_t *remote_poll = NULL;
- switch (ringbuf->type) {
- case MPID_NEM_IB_RINGBUF_EXCLUSIVE:
- remote_poll = &VC_FIELD(ringbuf->vc, ibcom->rsr_seq_num_poll);
- break;
- case MPID_NEM_IB_RINGBUF_SHARED:
- remote_poll = &MPID_nem_ib_remote_poll_shared;
- break;
- default:
- printf("unknown ringbuf->type\n"); /* NOTE(review): remote_poll stays NULL here and is dereferenced just below */
- }
-
- void *buf =
- (uint8_t *) ringbuf->start +
- MPID_NEM_IB_COM_RDMABUF_SZSEG * ((uint16_t) (*remote_poll % ringbuf->nslot));
- volatile uint64_t *head_flag = MPID_NEM_IB_NETMOD_HDR_HEAD_FLAG_PTR(buf);
- if (*head_flag == 0) { /* nothing has arrived in this slot */
- goto fn_exit;
- }
- dprintf("ib_poll_eager,remote_poll=%d,buf=%p,sz=%d\n", *remote_poll, buf,
- MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf));
-
- dprintf("ib_poll_eager,eager-send,found\n");
- fflush(stdout);
-
- //MPIU_ERR_CHKANDJUMP1(cqe.status != IBV_WC_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_poll_cq", "**MPID_nem_ib_com_poll_cq %s", MPID_nem_ib_com_strerror(ibcom_errno));
-
- int off_pow2_aligned;
- MPID_NEM_IB_OFF_POW2_ALIGNED(MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf)); /* presumably assigns off_pow2_aligned -- macro not visible here */
- volatile MPID_nem_ib_netmod_trailer_t *netmod_trailer =
- (MPID_nem_ib_netmod_trailer_t *) ((uint8_t *) buf + off_pow2_aligned);
- dprintf("poll,off_pow2_aligned=%d,netmod_trailer=%p,sz=%d\n", off_pow2_aligned, netmod_trailer,
- MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf));
- //int k = 0;
- //tsce = MPID_nem_ib_rdtsc(); printf("9,%ld\n", tsce - tscs); // 55 for 512-byte
- //tscs = MPID_nem_ib_rdtsc();
- //#define MPID_NEM_IB_TLBPREF_POLL 20
-#ifdef MPID_NEM_IB_TLBPREF_POLL
- int tlb_pref_ahd = (uint64_t) tailmagic + 4096 * MPID_NEM_IB_TLBPREF_POLL - (uint64_t) buf;
-#endif
- /* spin until the trailer's tail flag shows the magic value */
- while (netmod_trailer->tail_flag != MPID_NEM_IB_COM_MAGIC) {
-#ifdef MPID_NEM_IB_TLBPREF_POLL
- __asm__ __volatile__
- ("movq %0, %%rsi;" "movq 0(%%rsi), %%rax;"::"r"(buf + tlb_pref_ahd):"%rsi", "%rax");
- tlb_pref_ahd = (tlb_pref_ahd + 4096 * 20) % MPID_NEM_IB_COM_RDMABUF_SZ;
-#endif
- }
- //tsce = MPID_nem_ib_rdtsc(); printf("0,%ld\n", tsce - tscs); // 20-60 for 512-byte
- //tscs = MPID_nem_ib_rdtsc();
- //dprintf("magic wait=%d\n", k);
-
-
- /* this reduces memcpy in MPIDI_CH3U_Receive_data_found */
- /* MPIDI_CH3_PktHandler_EagerSend (in ch3u_eager.c)
- * MPIDI_CH3U_Receive_data_found (in ch3u_handle_recv_pkt.c)
- * MPIU_Memcpy((char*)(rreq->dev.user_buf) + dt_true_lb, buf, data_sz);
- * 600 cycle for 512B!!! --> 284 cycle with prefetch
- */
-
- /* prefetch the message in steps of 256 bytes (four 64-byte offsets per step) */
- void *rsi;
- for (rsi = (void *) buf; rsi < (void *) ((uint8_t *) buf + MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf));
- rsi = (uint8_t *) rsi + 64 * 4) {
-#ifdef __MIC__
- __asm__ __volatile__
- ("movq %0, %%rsi;"
- "vprefetch0 0x00(%%rsi);"
- "vprefetch0 0x40(%%rsi);" "vprefetch0 0x80(%%rsi);" "vprefetch0 0xc0(%%rsi);"::"r"(rsi)
- :"%rsi");
-#else
- __asm__ __volatile__
- ("movq %0, %%rsi;"
- "prefetchnta 0x00(%%rsi);"
- "prefetchnta 0x40(%%rsi);"
- "prefetchnta 0x80(%%rsi);" "prefetchnta 0xc0(%%rsi);"::"r"(rsi)
- :"%rsi");
-#endif
- }
-
- /* Increment here because handle_pkt of CLOSE calls poll_eager recursively */
- (*remote_poll) += 1;
- dprintf("ib_poll,inc,remote_poll=%d\n", *remote_poll);
-
- /* VC is stored in the packet for shared ring buffer */
- switch (ringbuf->type) {
- case MPID_NEM_IB_RINGBUF_EXCLUSIVE:
- vc = ringbuf->vc;
- break;
- case MPID_NEM_IB_RINGBUF_SHARED:
- vc = MPID_NEM_IB_NETMOD_HDR_VC_GET(buf);
- break;
- default:
- printf("unknown ringbuf->type\n"); /* NOTE(review): vc stays NULL here and is used below */
- }
- vc_ib = VC_IB(vc);
- dprintf("poll_eager,vc=%p\n", vc);
-
- /* Save it because handle_pkt frees buf when the packet is MPIDI_CH3_PKT_CLOSE */
- ssize_t sz_pkt = MPID_NEM_IB_NETMOD_HDR_SIZEOF_GET(buf);
- MPIDI_CH3_Pkt_eager_send_t *pkt = (MPIDI_CH3_Pkt_eager_send_t *) ((uint8_t *) buf + sz_pkt);
- dprintf("pkt=%p,sizeof=%ld\n", pkt, sz_pkt);
- MPIU_Assert(MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) >= sz_pkt + sizeof(MPIDI_CH3_Pkt_t));
- dprintf
- ("handle_pkt,before,%d<-%d,id=%d,pkt->type=%d,pcc=%d,MPIDI_CH3_PKT_END_ALL=%d,pkt=%p,subtype=%d\n",
- MPID_nem_ib_myrank, vc->pg_rank, *remote_poll, pkt->type,
- MPIDI_CH3I_progress_completion_count.v, MPIDI_CH3_PKT_END_ALL, pkt,
- ((MPID_nem_pkt_netmod_t *) pkt)->subtype);
- /* see MPIDI_CH3_PktHandler_EagerSend (in src/mpid/ch3/src/ch3u_eager.c) */
- mpi_errno =
- MPID_nem_handle_pkt(vc, (char *) ((uint8_t *) buf + sz_pkt),
- (MPIDI_msg_sz_t) (MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) - sz_pkt));
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
- //tsce = MPID_nem_ib_rdtsc(); printf("0,%ld\n", tsce - tscs); // 512-byte, 900 cyc (1100 w/o prefetch)
-
- /* Update occupation status of remote SR (send request) queue */
- /* this includes local RDMA-wr-to buf occupation
- * because MPID_nem_handle_pkt releases RDMA-wr-to buf by copying data out */
- /* responder releases resource and then embed largest sequence number into MPI message bound to initiator */
- if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
- (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf)) {
- dprintf
- ("handle_pkt,after,%d<-%d,id=%d,pkt->type=%d,eagershort=%d,close=%d,rts=%d,piggy-backed-eagersend=%d\n",
- MPID_nem_ib_myrank, vc->pg_rank, *remote_poll, pkt->type,
- MPIDI_CH3_PKT_EAGERSHORT_SEND, MPIDI_CH3_PKT_CLOSE, MPIDI_NEM_PKT_LMT_RTS,
- MPIDI_NEM_IB_PKT_EAGER_SEND);
- }
-
- int notify_rate;
- if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
- (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf)) {
- /* the rate is written through the pointer to the local declared above;
- * NOTE(review): ibcom_errno is not checked after this call */
- ibcom_errno =
- MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(MPID_nem_ib_conns[vc->pg_rank].fd,
- &notify_rate);
- dprintf("poll_eager,sendq=%d,ncom=%d,ncqe=%d,ldiff=%d(%d-%d),rate=%d\n",
- MPID_nem_ib_sendq_empty(vc_ib->sendq),
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
- vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail, notify_rate);
- }
-
- if (ringbuf->type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
- dprintf("poll_eager,rdiff=%d(%d-%d)\n",
- MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
- vc_ib->ibcom->rsr_seq_num_tail_last_sent),
- vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent);
- }
-
- //dprintf("ib_poll,current pcc=%d\n", MPIDI_CH3I_progress_completion_count.v);
-
- /* Don't forget to put lmt-cookie types here!! */
- if (1) { /* always taken; the else branch below is currently unreachable */
- /* lmt cookie messages or control message other than eager-short */
-
- /* eager-send with zero-length data is released here
- * because there is no way to trace the RDMA-write-to buffer addr
- * because rreq->dev.tmpbuf is set to zero in ch3_eager.c
- */
- if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
- (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf)) {
- dprintf("poll_eager,released,type=%d,MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM=%d\n", pkt->type,
- MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM);
- MPID_nem_ib_recv_buf_released(vc,
- (void *) ((uint8_t *) buf +
- sz_pkt + sizeof(MPIDI_CH3_Pkt_t)));
- }
- }
- else {
- if (MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf) == sz_pkt + sizeof(MPIDI_CH3_Pkt_t)) {
- if (pkt->type == MPIDI_CH3_PKT_EAGERSHORT_SEND
- //|| pkt->type == MPIDI_CH3_PKT_GET
-) {
- }
- else {
- printf("ib_poll,unknown pkt->type=%d\n", pkt->type);
- assert(0);
- MPIU_ERR_INTERNALANDJUMP(mpi_errno, "MPI header only but not released");
- }
- }
- }
-
- if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
- (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf)) {
- dprintf("ib_poll,hdr_ringbuf_type=%d\n", MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf));
-
- /* the header carries a release index: adopt it as the local tail */
- if (MPID_NEM_IB_NETMOD_HDR_RINGBUF_TYPE_GET(buf) & MPID_NEM_IB_RINGBUF_RELINDEX) {
- vc_ib->ibcom->lsr_seq_num_tail = MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf);
- dprintf("ib_poll,local_tail is updated to %d\n",
- MPID_NEM_IB_NETMOD_HDR_RELINDEX_GET(buf));
- }
- }
-
- /* Clear flag */
- if ((vc->state != MPIDI_VC_STATE_INACTIVE) ||
- (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf))
- MPID_NEM_IB_NETMOD_HDR_HEAD_FLAG_SET(buf, 0);
-
- /* the VC went inactive and this was its stored terminate buffer: tear everything down */
- if (vc->state == MPIDI_VC_STATE_INACTIVE && vc_ib->vc_terminate_buf == buf) {
- /* clear stored data */
- vc_ib->vc_terminate_buf = NULL;
-
- /* Destroy ring-buffer */
- ibcom_errno = MPID_nem_ib_ringbuf_free(vc);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_free");
-
- /* Check connection status stored in VC when on-demand connection is used */
- dprintf("vc_terminate,%d->%d,close\n", MPID_nem_ib_myrank, vc->pg_rank);
- ibcom_errno = MPID_nem_ib_com_close(vc_ib->sc->fd);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_close");
-
- /* Destroy the connection array once its last reference is dropped */
- MPIU_Assert(MPID_nem_ib_conns_ref_count > 0);
- if (--MPID_nem_ib_conns_ref_count == 0) {
- MPIU_Free(MPID_nem_ib_conns);
- }
-
- /* TODO don't create them for shared memory vc */
-
- /* Destroy scratch-pad */
- ibcom_errno = MPID_nem_ib_com_free(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
-#ifdef MPID_NEM_IB_ONDEMAND
- MPID_NEM_IB_CM_OFF_CMD +
- MPID_NEM_IB_CM_NSEG * sizeof(MPID_nem_ib_cm_cmd_t) +
- sizeof(MPID_nem_ib_ringbuf_headtail_t)
-#else
- MPID_nem_ib_nranks * sizeof(MPID_nem_ib_com_qp_state_t)
-#endif
-);
-
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_free");
-
- /* Destroy scratch-pad QP */
- ibcom_errno = MPID_nem_ib_com_close(MPID_nem_ib_scratch_pad_fds[vc->pg_rank]);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_close");
-
- /* Destroy array of scratch-pad QPs */
- MPIU_Assert(MPID_nem_ib_scratch_pad_fds_ref_count > 0);
- if (--MPID_nem_ib_scratch_pad_fds_ref_count == 0) {
- MPIU_Free(MPID_nem_ib_scratch_pad_fds);
- MPIU_Free(MPID_nem_ib_scratch_pad_ibcoms);
- }
- }
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_POLL_EAGER);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_poll
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_poll(int in_blocking_poll)
-{
-
- int mpi_errno = MPI_SUCCESS;
- int ibcom_errno;
- uint32_t i;
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_POLL);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_POLL);
-
- unsigned int progress_completion_count_old = MPIDI_CH3I_progress_completion_count.v;
-
- /* poll lmt */
- /* when receiver side sends CTS to sender side
- * sender receives CTS and give up sending RTS
- * sender initiates RDMA-write,
- * sender sends RTS of the next epoch,
- * to detect the end of RDMA-write first and DP the entry for CTS,
- * you should perform lmt-poll first, next eager-poll
- */
- MPID_Request *rreq, *prev_rreq;
- rreq = MPID_nem_ib_lmtq_head(MPID_nem_ib_lmtq);
- if (rreq) {
-#if defined (MPID_NEM_IB_TIMER_WAIT_IB_POLL)
- if (in_blocking_poll) {
- tsc[0] = MPI_rdtsc();
- }
-#endif
- // dprintf("ib_poll,poll lmtq\n");
- prev_rreq = NULL;
- do {
- /* Obtain cookie. pkt_RTS_handler memcpy it (in mpid_nem_lmt.c) */
- /* MPID_IOV_BUF is macro, converted into iov_base (in src/include/mpiiov.h) */
- /* do not use s_cookie_buf because do_cts frees it */
- //MPID_nem_ib_lmt_cookie_t* s_cookie_buf = (MPID_nem_ib_lmt_cookie_t*)rreq->ch.lmt_tmp_cookie.iov_base;
-
- /* Wait for completion of DMA */
- /* do not use s_cookie_buf->sz because do_cts frees it */
- volatile void *write_to_buf;
- int is_contig;
- MPID_Datatype_is_contig(rreq->dev.datatype, &is_contig);
- if (is_contig) {
- write_to_buf =
- (void *) ((char *) rreq->dev.user_buf /*+ REQ_FIELD(req, lmt_dt_true_lb) */);
- }
- else {
- write_to_buf = REQ_FIELD(rreq, lmt_pack_buf);
- }
-
- //assert(REQ_FIELD(rreq, lmt_dt_true_lb) == 0);
- volatile uint8_t *tailmagic =
- (uint8_t *) ((uint8_t *) write_to_buf /*+ REQ_FIELD(rreq, lmt_dt_true_lb) */ +
- rreq->ch.lmt_data_sz - sizeof(uint8_t));
-
- if (*tailmagic != REQ_FIELD(rreq, lmt_tail)) {
- goto next;
- }
- dprintf("ib_poll,sz=%ld,old tail=%02x,new tail=%02x\n", rreq->ch.lmt_data_sz,
- REQ_FIELD(rreq, lmt_tail), *tailmagic);
-
- dprintf
- ("ib_poll,lmt found,%d<-%d,req=%p,ref_count=%d,is_contig=%d,write_to_buf=%p,lmt_pack_buf=%p,user_buf=%p,tail=%p\n",
- MPID_nem_ib_myrank, rreq->ch.vc->pg_rank, rreq, rreq->ref_count, is_contig,
- write_to_buf, REQ_FIELD(rreq, lmt_pack_buf), rreq->dev.user_buf, tailmagic);
-
- /* unpack non-contiguous dt */
- if (!is_contig) {
- dprintf("ib_poll,copying noncontiguous data to user buffer\n");
-
- /* see MPIDI_CH3U_Request_unpack_uebuf (in /src/mpid/ch3/src/ch3u_request.c) */
- /* or MPIDI_CH3U_Receive_data_found (in src/mpid/ch3/src/ch3u_handle_recv_pkt.c) */
- MPIDI_msg_sz_t unpack_sz = rreq->ch.lmt_data_sz;
- MPID_Segment seg;
- MPI_Aint last;
-
- MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype,
- &seg, 0);
- last = unpack_sz;
- MPID_Segment_unpack(&seg, 0, &last, REQ_FIELD(rreq, lmt_pack_buf));
- if (last != unpack_sz) {
- /* --BEGIN ERROR HANDLING-- */
- /* received data was not entirely consumed by unpack()
- * because too few bytes remained to fill the next basic
- * datatype */
- MPIR_STATUS_SET_COUNT(rreq->status, last);
- rreq->status.MPI_ERROR =
- MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__,
- MPI_ERR_TYPE, "**MPID_nem_ib_poll", 0);
- /* --END ERROR HANDLING-- */
- }
- dprintf("ib_poll,lmt,ref_count=%d,lmt_pack_buf=%p\n", rreq->ref_count,
- REQ_FIELD(rreq, lmt_pack_buf));
- MPID_nem_ib_stfree(REQ_FIELD(rreq, lmt_pack_buf), (size_t) rreq->ch.lmt_data_sz);
- }
-
- /* send done to sender. vc is stashed in MPID_nem_ib_lmt_start_recv (in ib_lmt.c) */
-#ifdef MPID_NEM_IB_DEBUG_POLL
- MPID_nem_ib_vc_area *vc_ib = VC_IB(rreq->ch.vc);
-#endif
- dprintf("ib_poll,GET,lmt_send_GET_DONE,rsr_seq_num_tail=%d\n",
- vc_ib->ibcom->rsr_seq_num_tail);
- MPID_nem_ib_lmt_send_GET_DONE(rreq->ch.vc, rreq);
- dprintf("ib_poll,prev_rreq=%p,rreq->lmt_next=%p\n", prev_rreq,
- MPID_nem_ib_lmtq_next(rreq));
-
- /* unlink rreq */
- if (prev_rreq != NULL) {
- MPID_nem_ib_lmtq_next(prev_rreq) = MPID_nem_ib_lmtq_next(rreq);
- }
- else {
- MPID_nem_ib_lmtq_head(MPID_nem_ib_lmtq) = MPID_nem_ib_lmtq_next(rreq);
- }
- if (MPID_nem_ib_lmtq_next(rreq) == NULL) {
- MPID_nem_ib_lmtq.tail = prev_rreq;
- }
-
- /* save rreq->dev.next (and rreq) because decrementing reference-counter might free rreq */
- MPID_Request *tmp_rreq = rreq;
- rreq = MPID_nem_ib_lmtq_next(rreq);
-
- /* decrement completion-counter */
- dprintf("ib_poll,%d<-%d,", MPID_nem_ib_myrank, tmp_rreq->ch.vc->pg_rank);
- int incomplete;
- MPIDI_CH3U_Request_decrement_cc(tmp_rreq, &incomplete);
- dprintf("lmt,complete,tmp_rreq=%p,rreq->ref_count=%d,comm=%p\n", tmp_rreq,
- tmp_rreq->ref_count, tmp_rreq->comm);
-
- if (!incomplete) {
- MPIDI_CH3_Progress_signal_completion();
- }
-
- /* lmt_start_recv increments ref_count
- * drain_scq and ib_poll is not ordered, so both can decrement ref_count */
- /* ref_count is decremented
- * get-lmt: ib_poll, drain_scq, wait
- * put-lmt: ib_poll, wait */
- MPID_Request_release(tmp_rreq);
- dprintf("ib_poll,relese,req=%p\n", tmp_rreq);
- dprintf("ib_poll,lmt,after release,tmp_rreq=%p,rreq->ref_count=%d,comm=%p\n",
- tmp_rreq, tmp_rreq->ref_count, tmp_rreq->comm);
-
-
- goto next_unlinked;
- next:
- prev_rreq = rreq;
- rreq = MPID_nem_ib_lmtq_next(rreq);
- next_unlinked:;
- } while (rreq);
-#if defined (MPID_NEM_IB_TIMER_WAIT_IB_POLL)
- if (in_blocking_poll) {
- stsc[0] += MPI_rdtsc() - tsc[0];
- }
-#endif
- }
-
-#if defined (MPID_NEM_IB_TIMER_WAIT_IB_POLL)
- if (in_blocking_poll) {
- tsc[1] = MPI_rdtsc();
- }
-#endif
- int ncom_almost_full = 0;
-
- /* [MPID_NEM_IB_NRINGBUF-1] stores shared ring buffer */
- for (i = 0; i < MPID_NEM_IB_NRINGBUF; i++) {
- if ((((MPID_nem_ib_ringbuf_allocated[i / 64] >> (i & 63)) & 1) == 0) ||
- !MPID_nem_ib_ringbuf) {
- //dprintf("poll,cont\n");
- continue;
- }
- //tscs = MPID_nem_ib_rdtsc();
- //dprintf("poll,kicking progress engine for %d\n", i);
- mpi_errno = MPID_nem_ib_poll_eager(&MPID_nem_ib_ringbuf[i]);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_poll_eager");
-
- /* MPID_nem_ib_ringbuf may be freed in poll_eager, when we received CLOSE-packet. */
- if (!MPID_nem_ib_ringbuf) {
- dprintf("MPID_nem_ib_ringbuf is freed\n");
- continue;
- }
-
- /* without this, command in sendq doesn't have a chance
- * to perform send_progress
- * when send and progress_send call drain_scq asking it
- * for not performing send_progress and make the CQ empty */
- if (MPID_nem_ib_ringbuf[i].type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
- mpi_errno = MPID_nem_ib_send_progress(MPID_nem_ib_ringbuf[i].vc);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_send_progress");
-
- ncom_almost_full |=
- (VC_FIELD(MPID_nem_ib_ringbuf[i].vc, ibcom->ncom) >=
- MPID_NEM_IB_COM_MAX_SQ_HEIGHT_DRAIN);
- }
- }
-#if defined (MPID_NEM_IB_TIMER_WAIT_IB_POLL)
- if (in_blocking_poll) {
- stsc[1] += MPI_rdtsc() - tsc[1];
- }
-#endif
-
- // lazy feching of completion queue entry because it causes cache-miss
-#if defined (MPID_NEM_IB_LMT_GET_CQE)
- if (MPID_nem_ib_ncqe_to_drain > 0 || MPID_nem_ib_ncqe_nces > 0 ||
- MPID_nem_ib_ncqe >= MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN || ncom_almost_full)
-#endif
-#if !defined (MPID_NEM_IB_LMT_GET_CQE)
- if (/*(in_blocking_poll && result == 0) || */ MPID_nem_ib_ncqe_nces >
- 0 || MPID_nem_ib_ncqe >= MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN || ncom_almost_full)
-#endif
- {
-#if defined (MPID_NEM_IB_TIMER_WAIT_IB_POLL)
- if (in_blocking_poll) {
- tsc[0] = MPI_rdtsc();
- }
-#endif
- //dprintf("ib_poll,calling drain_scq\n");
- ibcom_errno = MPID_nem_ib_drain_scq(0);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_drain_scq");
-#if defined (MPID_NEM_IB_TIMER_WAIT_IB_POLL)
- if (in_blocking_poll) {
- stsc[0] += MPI_rdtsc() - tsc[0];
- }
-#endif
- }
- /* aggressively perform drain_scq */
- ibcom_errno = MPID_nem_ib_drain_scq(0);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_drain_scq");
-
-#ifdef MPID_NEM_IB_ONDEMAND
- /* process incoming connection request */
- MPID_nem_ib_cm_poll_syn();
- MPID_nem_ib_cm_poll();
- //dprintf("ib_poll,MPID_nem_ib_ncqe_scratch_pad_to_drain=%d\n",
- //MPID_nem_ib_ncqe_scratch_pad_to_drain);
- /* process outgoing conncetion request */
- if (MPID_nem_ib_ncqe_scratch_pad_to_drain > 0 ||
- MPID_nem_ib_ncqe_scratch_pad >= MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN) {
- ibcom_errno = MPID_nem_ib_cm_drain_scq();
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_scq");
- }
-
- /* Kick progress engine because time elapsed and it'd fire a event in the send queue */
- MPID_nem_ib_cm_progress();
-#endif
- MPID_nem_ib_ringbuf_progress();
- MPID_nem_ib_progress_engine_vt += 1; /* Progress virtual time */
-
- /* if polling on eager-send and lmt would repeat frequently, perform "pause" to yield instruction issue bandwitdh to other logical-core */
- if (in_blocking_poll && progress_completion_count_old == MPIDI_CH3I_progress_completion_count.v) {
- __asm__ __volatile__("pause;":::"memory");
- }
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_POLL);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
- /* new rreq is obtained in MPID_Irecv in mpid_irecv.c,
- * so we associate rreq with a receive request and ibv_post_recv it
- * so that we can obtain rreq by ibv_poll_cq
- */
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_recv_posted
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_recv_posted(struct MPIDI_VC *vc, struct MPID_Request *req)
-{
-
- int mpi_errno = MPI_SUCCESS;
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RECV_POSTED);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RECV_POSTED);
- dprintf("recv_posted,enter,%d->%d,req=%p\n", MPID_nem_ib_myrank, vc->pg_rank, req);
-#ifdef MPID_NEM_IB_ONDEMAND
- if (vc_ib->connection_state != MPID_NEM_IB_CM_ESTABLISHED) {
- goto fn_exit;
- }
-#endif
-
- MPIDI_msg_sz_t data_sz;
- int dt_contig _UNUSED_;
- MPI_Aint dt_true_lb _UNUSED_;
- MPID_Datatype *dt_ptr;
- MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype,
- dt_contig, data_sz, dt_ptr, dt_true_lb);
- /* poll when rreq is for lmt */
- /* anticipating received message finds maching request in the posted-queue */
- if (data_sz + sizeof(MPIDI_CH3_Pkt_eager_send_t) > vc->eager_max_msg_sz) {
- //if (MPID_nem_ib_tsc_poll - MPID_nem_ib_rdtsc() > MPID_NEM_IB_POLL_PERIOD_RECV_POSTED) {
- if (VC_FIELD(vc, ibcom->remote_ringbuf)) {
- mpi_errno = MPID_nem_ib_poll_eager(VC_FIELD(vc, ibcom->remote_ringbuf));
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
- }
- }
-
- else {
- /* anticipating received message finds maching request in the posted-queue */
- if (VC_FIELD(vc, ibcom->remote_ringbuf)) {
- mpi_errno = MPID_nem_ib_poll_eager(VC_FIELD(vc, ibcom->remote_ringbuf));
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
- }
- }
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RECV_POSTED);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-/* (1) packet-handler memcpy RDMA-write-to buf data to MPI user-buffer when matching request is found in posted-queue
- (2) MPI_Irecv memcpy RDMA-write-to buf data to MPI user-buffer when matching request is found in unexpected-queue
- the latter case can't be dealt with when call this after poll-found and packet-handler
- (packet-handler memcpy RDMA-write-to buf to another buffer when
- matching request is not found in posted-queue, so calling this after poll-found and packet-handler
- suffices in original MPICH implementation)
-*/
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_recv_buf_released
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_recv_buf_released(struct MPIDI_VC *vc, void *user_data)
-{
- int mpi_errno = MPI_SUCCESS;
- int ibcom_errno;
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RECV_BUF_RELEASED);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RECV_BUF_RELEASED);
- dprintf("recv_buf_released,%d<-%d,user_data=%p\n", MPID_nem_ib_myrank, vc->pg_rank, user_data);
-
- /* Clear all possible tail flag slots */
- /* tail flag is located at MPID_NEM_IB_COM_INLINE_DATA boundary and variable length entails multiple prospective locations for the future use */
- /* see MPIDI_CH3_PktHandler_EagerShortSend (in src/mpid/ch3/src/ch3u_eager.c */
- /* eager-send with zero-length data is released in poll
- * because there is no way to trace the RDMA-write-to buffer addr
- * because rreq->dev.tmpbuf is set to zero in ch3_eager.c
- */
- if (user_data == NULL) {
- goto fn_exit;
- }
-
- if ((void *) MPID_nem_ib_rdmawr_to_alloc_start > user_data &&
- user_data >= (void *) (MPID_nem_ib_rdmawr_to_alloc_start +
- MPID_NEM_IB_COM_RDMABUF_SZ * MPID_NEM_IB_NRINGBUF)) {
- MPID_nem_ib_segv;
- }
- unsigned long mod =
- (unsigned long) ((uint8_t *) user_data -
- (uint8_t *) vc_ib->ibcom->remote_ringbuf->start) &
- (MPID_NEM_IB_COM_RDMABUF_SZSEG - 1);
- void *buf = (void *) ((uint8_t *) user_data - mod);
- //dprintf("recv_buf_released,clearing,buf=%p\n", buf);
- int off_pow2_aligned;
- MPID_NEM_IB_OFF_POW2_ALIGNED(MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf));
- //dprintf("recv_buf_released,sz=%d,pow2=%d\n", MPID_NEM_IB_NETMOD_HDR_SZ_GET(buf), off_pow2_aligned);
- uint32_t offset;
- for (offset = 15;;
- offset =
- (((offset + 1) << 1) - 1) > MPID_NEM_IB_MAX_OFF_POW2_ALIGNED ?
- MPID_NEM_IB_MAX_OFF_POW2_ALIGNED : (((offset + 1) << 1) - 1)) {
- MPID_nem_ib_netmod_trailer_t *netmod_trailer =
- (MPID_nem_ib_netmod_trailer_t *) ((uint8_t *) buf + offset);
- if (MPID_nem_ib_rdmawr_to_alloc_start > (uint8_t *) netmod_trailer &&
- (uint8_t *) netmod_trailer >=
- MPID_nem_ib_rdmawr_to_alloc_start + MPID_NEM_IB_COM_RDMABUF_SZ * MPID_NEM_IB_NRINGBUF) {
- MPID_nem_ib_segv;
- }
- netmod_trailer->tail_flag = 0;
- if (offset == off_pow2_aligned) {
- break;
- }
- }
-
- /* mark that one eager-send RDMA-write-to buffer has been released */
- uint16_t index_slot =
- (unsigned long) ((uint8_t *) user_data -
- (uint8_t *) vc_ib->ibcom->remote_ringbuf->start) /
- MPID_NEM_IB_COM_RDMABUF_SZSEG;
- MPIU_Assert(index_slot < (uint16_t) (vc_ib->ibcom->remote_ringbuf->nslot));
- dprintf("released,user_data=%p,mem=%p,sub=%08lx,index_slot=%d\n",
- user_data, vc_ib->ibcom->remote_ringbuf->start,
- (unsigned long) user_data -
- (unsigned long) vc_ib->ibcom->remote_ringbuf->start, index_slot);
- dprintf("released,index_slot=%d,released=%016lx\n", index_slot,
- vc_ib->ibcom->remote_ringbuf->remote_released[index_slot / 64]);
- vc_ib->ibcom->remote_ringbuf->remote_released[index_slot / 64] |= (1ULL << (index_slot & 63));
- dprintf("released,after bitset,%016lx\n",
- vc_ib->ibcom->remote_ringbuf->remote_released[index_slot / 64]);
- // int index_tail = (vc_ib->ibcom->rsr_seq_num_tail + 1) & (vc_ib->ibcom->local_ringbuf_nslot-1);
- MPID_nem_ib_ringbuf_headtail_t *headtail =
- (MPID_nem_ib_ringbuf_headtail_t *) ((uint8_t *) MPID_nem_ib_scratch_pad +
- MPID_NEM_IB_RINGBUF_OFF_HEAD);
- uint16_t index_tail = vc_ib->ibcom->remote_ringbuf->type == MPID_NEM_IB_RINGBUF_EXCLUSIVE ?
- ((uint16_t) (vc_ib->ibcom->rsr_seq_num_tail + 1) % vc_ib->ibcom->remote_ringbuf-> nslot) :
- ((uint16_t) (headtail->tail + 1) % vc_ib->ibcom->remote_ringbuf->nslot);
- dprintf("released,index_tail=%d\n", index_tail);
- dprintf("released,%016lx\n", vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64]);
- if (1 || (index_tail & 7) || MPID_nem_ib_diff16(index_slot, index_tail) >= vc_ib->ibcom->remote_ringbuf->nslot - 8) { /* avoid wrap-around */
- while (1) {
- if (((vc_ib->ibcom-> remote_ringbuf->remote_released[index_tail / 64] >> (index_tail & 63)) & 1) == 1) {
- if (vc_ib->ibcom->remote_ringbuf->type == MPID_NEM_IB_RINGBUF_EXCLUSIVE) {
- vc_ib->ibcom->rsr_seq_num_tail += 1;
- dprintf("exclusive ringbuf,remote_tail,incremented to %d\n",
- vc_ib->ibcom->rsr_seq_num_tail);
- }
- else {
- headtail->tail += 1;
- dprintf("shared ringbuf,tail,incremented to %d,head=%ld\n",
- headtail->tail, headtail->head);
- }
- vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64] &=
- ~(1ULL << (index_tail & 63));
- index_tail = (uint16_t) (index_tail + 1) % vc_ib->ibcom->remote_ringbuf->nslot;
- }
- else {
- break;
- }
- }
- }
- else {
- if (((vc_ib->ibcom->remote_ringbuf->remote_released[index_tail /
- 64] >> (index_tail & 63)) & 0xff) ==
- 0xff) {
- vc_ib->ibcom->rsr_seq_num_tail += 8;
- vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64] &=
- ~(0xffULL << (index_tail & 63));
- //dprintf("released[index_tail/64]=%016lx\n", vc_ib->ibcom->remote_ringbuf->remote_released[index_tail / 64]);
- }
- }
-
- //dprintf("recv_buf_released,%d->%d,rsr_seq_num_tail=%d,rsr_seq_num_tail_last_sent=%d\n", MPID_nem_ib_myrank, vc->pg_rank, vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent);
-
- int notify_rate;
- ibcom_errno =
- MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(MPID_nem_ib_conns
- [vc->pg_rank].fd, &notify_rate);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get");
- /* if you missed the chance to make eager-send message piggy-back it */
- if (vc_ib->ibcom->remote_ringbuf->type ==
- MPID_NEM_IB_RINGBUF_EXCLUSIVE &&
- MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
- vc_ib->ibcom->rsr_seq_num_tail_last_sent) >
- MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_RATE_DELAY_MULTIPLIER(notify_rate)
- //|| MPID_nem_ib_diff16(lsr_seq_num_head, vc_ib->ibcom->lsr_seq_num_tail_last_sent) == vc_ib->ibcom->local_ringbuf_nslot
-) {
- MPID_Request *sreq;
- sreq = MPID_nem_ib_sendq_head(vc_ib->sendq);
- if (sreq) {
- int msg_type = MPIDI_Request_get_msg_type(sreq);
- if (msg_type == MPIDI_REQUEST_EAGER_MSG && /* guard for the following pointer dereference */
- ((MPIDI_CH3_Pkt_t
- *) sreq->dev.iov[0].MPID_IOV_BUF)->type ==
- MPIDI_NEM_PKT_NETMOD
- &&
- ((MPID_nem_pkt_netmod_t *) sreq->dev.iov[0].MPID_IOV_BUF)->subtype ==
- MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM) {
- goto skip;
- }
- }
- //printf("recv_buf_released,sending reply_seq_num,diff=%d,rate=%d,id=%d\n", MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent), notify_rate + (notify_rate>>1), vc_ib->ibcom->sseq_num);
- MPID_nem_ib_send_reply_seq_num(vc);
- skip:;
- }
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RECV_BUF_RELEASED);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-/* packet handler for wrapper packet of MPIDI_CH3_PKT_EAGER_SEND */
-/* see MPIDI_CH3_PktHandler_EagerSend (in src/mpid/ch3/src/ch3u_eager.c) */
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_PktHandler_EagerSend
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_PktHandler_EagerSend(MPIDI_VC_t * vc,
- MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * buflen /* out */ ,
- MPID_Request ** rreqp /* out */)
-{
- MPID_nem_ib_pkt_prefix_t *netmod_pkt = (MPID_nem_ib_pkt_prefix_t *) pkt;
- MPIDI_CH3_Pkt_eager_send_t *ch3_pkt =
- (MPIDI_CH3_Pkt_eager_send_t *) ((uint8_t *) pkt + sizeof(MPID_nem_ib_pkt_prefix_t));
- int mpi_errno = MPI_SUCCESS;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_EAGERSEND);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_EAGERSEND);
- dprintf("ib_pkthandler_eagersend,tag=%d\n", ch3_pkt->match.parts.tag);
- /* Check the assumption on sizeof(MPIDI_CH3_Pkt_t).
- * It is utilized to point the payload location in MPIDI_CH3_PktHandler_EagerSend
- * (src/mpid/ch3/src/ch3u_eager.c) that must be larger than sizeof(MPID_nem_ib_pkt_eager_send_t) */
- //if (sizeof(MPID_nem_ib_pkt_eager_send_t) > sizeof(MPIDI_CH3_Pkt_t)) {
- //MPIU_ERR_SETFATALANDJUMP(mpi_errno, MPI_ERR_INTERN, "**sizeof(MPIDI_CH3_Pkt_t)");
- //}
- /* Update occupation status of local SR (send request) queue */
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- dprintf
- ("MPID_nem_ib_PktHandler_EagerSend,lsr_seq_num_tail=%d,netmod_pkt->seq_num_tail=%d\n",
- vc_ib->ibcom->lsr_seq_num_tail, netmod_pkt->seq_num_tail);
- vc_ib->ibcom->lsr_seq_num_tail = netmod_pkt->seq_num_tail;
- dprintf
- ("MPID_nem_ib_PktHandler_EagerSend,lsr_seq_num_tail updated to %d\n",
- vc_ib->ibcom->lsr_seq_num_tail);
-#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
- /* change remote notification policy of RDMA-write-to buf */
- dprintf("pkthandler,eagersend,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
- MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, lsr_seq_num_tail);
- dprintf("pkthandler,eagersend,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
-#endif
- dprintf
- ("pkthandler,eagersend,sendq_empty=%d,ncom=%d,rdmabuf_occ=%d\n",
- MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
- /* try to send from sendq because at least one RDMA-write-to buffer has been released */
- /* calling drain_scq from progress_send derpives of chance
- * for ib_poll to drain sendq using ncqe
- * however transfers events to
- * (not to reply_seq_num because it's regulated by the rate)
- * fire on ib_poll using nces (e.g. MPI_Put) so we need to perform
- * progress_send for all of VCs using nces in ib_poll. */
- dprintf("pkthandler,eagersend,send_progress\n");
- fflush(stdout);
- MPID_NEM_IB_CHECK_AND_SEND_PROGRESS;
- /* fall back to the original handler */
- /* we don't need to worry about the difference caused by embedding seq_num
- * because size of MPI-header of MPIDI_CH3_PKT_EAGER_SEND equals to sizeof(MPIDI_CH3_Pkt_t)
- * see MPID_nem_ib_iSendContig
- */
- //ch3_pkt->type = MPIDI_CH3_PKT_EAGER_SEND;
- dprintf("ib_poll.c,before PktHandler_EagerSend,buflen=%ld\n", *buflen);
- MPIDI_msg_sz_t ch3_buflen = *buflen - sizeof(MPID_nem_ib_pkt_prefix_t);
- mpi_errno = MPIDI_CH3_PktHandler_EagerSend(vc, (MPIDI_CH3_Pkt_t *) ch3_pkt, &ch3_buflen, rreqp);
- dprintf("ib_poll.c,after PktHandler_EagerSend,buflen=%ld\n", ch3_buflen);
- *buflen = ch3_buflen + sizeof(MPID_nem_ib_pkt_prefix_t);
- dprintf("ib_poll.c,after addition,buflen=%ld\n", *buflen);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_PKTHANDLER_EAGERSEND);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_PktHandler_rma_lmt_rts
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_PktHandler_rma_lmt_rts(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen /* out */ ,
- MPID_Request ** rreqp /* out */)
-{
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- int mpi_errno = MPI_SUCCESS;
- MPID_Request *req = NULL;
- MPID_nem_ib_pkt_lmt_rts_t *rts_pkt = (MPID_nem_ib_pkt_lmt_rts_t *) pkt;
-
- void *write_to_buf;
- void *addr;
- uint32_t rkey;
- long length;
- int last;
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_RMA_LMT_RTS);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_RMA_LMT_RTS);
-
- if (rts_pkt->seg_seq_num == 1) {
- // receive a packet for first segment
- MPIDI_CH3_Pkt_t *pkt_hdr = (MPIDI_CH3_Pkt_t *) ((uint8_t *) pkt + sizeof(MPIDI_CH3_Pkt_t));
- MPID_nem_ib_rma_lmt_cookie_t *s_cookie_buf =
- (MPID_nem_ib_rma_lmt_cookie_t *) ((uint8_t *) pkt + sizeof(MPIDI_CH3_Pkt_t) +
- sizeof(MPIDI_CH3_Pkt_t));
-
- req = MPID_Request_create();
- MPIU_Object_set_ref(req, 1); /* decrement only in drain_scq ? */
-
- req->ch.lmt_data_sz = s_cookie_buf->len;
- req->ch.lmt_req_id = s_cookie_buf->sender_req_id;
-
- REQ_FIELD(req, max_msg_sz) = s_cookie_buf->max_msg_sz;
-
- REQ_FIELD(req, lmt_pack_buf) =
- MPIU_Malloc(sizeof(MPIDI_CH3_Pkt_t) + (size_t) req->ch.lmt_data_sz);
- MPIU_ERR_CHKANDJUMP(!REQ_FIELD(req, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER,
- "**outofmemory");
-
- memcpy(REQ_FIELD(req, lmt_pack_buf), pkt_hdr, sizeof(MPIDI_CH3_Pkt_t));
- REQ_FIELD(req, seg_num) = s_cookie_buf->seg_num; /* store number of segments */
-
- addr = s_cookie_buf->addr;
- rkey = s_cookie_buf->rkey;
-
- REQ_FIELD(req, lmt_tail) = s_cookie_buf->tail;
- }
- else {
- MPID_Request_get_ptr(rts_pkt->req_id, req);
- addr = rts_pkt->addr;
- rkey = rts_pkt->rkey;
- }
-
- if (rts_pkt->seg_seq_num == REQ_FIELD(req, seg_num)) {
- last = 1;
- length =
- req->ch.lmt_data_sz - (long) (rts_pkt->seg_seq_num - 1) * REQ_FIELD(req, max_msg_sz);
- }
- else {
- last = 0;
- length = REQ_FIELD(req, max_msg_sz);
- }
-
- /* RDMA READ buffer address */
- write_to_buf =
- (void *) ((char *) REQ_FIELD(req, lmt_pack_buf) + sizeof(MPIDI_CH3_Pkt_t) +
- (rts_pkt->seg_seq_num - 1) * REQ_FIELD(req, max_msg_sz));
-
- /* stash vc for ib_poll */
- req->ch.vc = vc;
-
- MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_SEND); // Set dummy type for ib_drain_scq
-
- /* try to issue RDMA-read command */
- int slack = 1; /* slack for control packet bringing sequence number */
- if (MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack) {
- MPIDI_Request_set_msg_type(req, MPIDI_REQUEST_RNDV_MSG);
- mpi_errno =
- MPID_nem_ib_lmt_start_recv_core(req, addr, rkey, length, write_to_buf,
- REQ_FIELD(req, max_msg_sz), last);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
- }
- else {
- /* enqueue command into send_queue */
- dprintf("lmt_start_recv, enqueuing,sendq_empty=%d,ncom=%d,ncqe=%d\n",
- MPID_nem_ib_sendq_empty(vc_ib->sendq),
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
-
- /* make raddr, (sz is in rreq->ch.lmt_data_sz), rkey, (user_buf is in req->dev.user_buf) survive enqueue, free cookie, dequeue */
- REQ_FIELD(req, lmt_raddr) = addr;
- REQ_FIELD(req, lmt_rkey) = rkey;
- REQ_FIELD(req, lmt_write_to_buf) = write_to_buf;
- REQ_FIELD(req, lmt_szsend) = length;
- REQ_FIELD(req, last) = last; /* not support segmentation */
-
- /* set for send_progress */
- MPIDI_Request_set_msg_type(req, MPIDI_REQUEST_RNDV_MSG);
- req->kind = MPID_REQUEST_RECV;
-
- MPID_nem_ib_sendq_enqueue(&vc_ib->sendq, req);
- }
-
- if (rts_pkt->seg_seq_num == 1) {
- /* prefix + header + data */
- *buflen =
- sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_CH3_Pkt_t) +
- sizeof(MPID_nem_ib_rma_lmt_cookie_t);
- }
- else {
- *buflen = sizeof(MPIDI_CH3_Pkt_t);
- }
- *rreqp = NULL;
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_PKTHANDLER_RMA_LMT_RTS);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-/* MPI_Isend set req-type to MPIDI_REQUEST_TYPE_RECV */
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_pkt_GET_DONE_handler
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_pkt_GET_DONE_handler(MPIDI_VC_t * vc,
- MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
-{
- int mpi_errno = MPI_SUCCESS;
- MPID_nem_ib_pkt_lmt_get_done_t *const done_pkt = (MPID_nem_ib_pkt_lmt_get_done_t *) pkt;
- MPID_Request *req;
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKT_GET_DONE_HANDLER);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKT_GET_DONE_HANDLER);
- dprintf("get_done_handler,enter\n");
- *buflen = sizeof(MPIDI_CH3_Pkt_t);
- MPID_Request_get_ptr(done_pkt->req_id, req);
- MPIU_THREAD_CS_ENTER(LMT,);
- switch (MPIDI_Request_get_type(req)) {
- /* MPIDI_Request_set_type is not performed when
- * MPID_Isend --> FDU_or_AEP --> recv_posted --> ib_poll --> PUTCTS packet-handler */
- case MPIDI_REQUEST_TYPE_RECV:
- MPIU_ERR_INTERNALANDJUMP(mpi_errno, "unexpected request type");
- break;
- case MPIDI_REQUEST_TYPE_SEND:
- case MPIDI_REQUEST_TYPE_RSEND:
- case MPIDI_REQUEST_TYPE_SSEND:
- case MPIDI_REQUEST_TYPE_BSEND:
- /* decrement reference counter of mr_cache_entry */
- MPID_nem_ib_com_reg_mr_release(REQ_FIELD(req, lmt_mr_cache));
-
- /* try to send from sendq because at least one RDMA-write-to buffer has been released */
- //dprintf("lmt_start_recv,reply_seq_num,send_progress\n");
- if (!MPID_nem_ib_sendq_empty(vc_ib->sendq)) {
- dprintf("get_done_handler,ncom=%d,ncqe=%d,diff=%d(%d-%d)\n",
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) <
- vc_ib->ibcom->local_ringbuf_nslot, vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail);
- }
- dprintf("get_done_handler,send_progress\n");
- fflush(stdout);
-
- if (REQ_FIELD(req, seg_seq_num) == REQ_FIELD(req, seg_num)) {
- /* last packet of segments */
- MPID_NEM_IB_CHECK_AND_SEND_PROGRESS;
- mpi_errno = vc->ch.lmt_done_send(vc, req);
- if (mpi_errno)
- MPIU_ERR_POP(mpi_errno);
- }
- else {
- /* Send RTS for next segment */
- REQ_FIELD(req, seg_seq_num) += 1; /* next segment number */
- int next_seg_seq_num = REQ_FIELD(req, seg_seq_num);
-
- uint32_t length;
- if (next_seg_seq_num == REQ_FIELD(req, seg_num))
- length = REQ_FIELD(req, data_sz) - (long) (next_seg_seq_num - 1) * REQ_FIELD(req, max_msg_sz); //length of last segment
- else
- length = REQ_FIELD(req, max_msg_sz);
-
- void *addr =
- (void *) ((char *) REQ_FIELD(req, buf.from) +
- (long) (next_seg_seq_num - 1) * REQ_FIELD(req, max_msg_sz));
- struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
- MPID_nem_ib_com_reg_mr_fetch(addr, length, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
- MPIU_ERR_CHKANDJUMP(!mr_cache, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_reg_mr_fetch");
- struct ibv_mr *mr = mr_cache->mr;
- /* store new cache entry */
- REQ_FIELD(req, lmt_mr_cache) = (void *) mr_cache;
-
-#ifdef HAVE_LIBDCFA
- void *_addr = mr->host_addr;
-#else
- void *_addr = addr;
-#endif
- MPID_nem_ib_lmt_send_RTS(MPIDI_NEM_IB_PKT_LMT_RTS, vc, done_pkt->receiver_req_id, _addr,
- mr->rkey, next_seg_seq_num);
- }
- break;
- default:
- MPIU_ERR_INTERNALANDJUMP(mpi_errno, "unexpected request type");
- break;
- }
-
- *rreqp = NULL;
- fn_exit:
- MPIU_THREAD_CS_EXIT(LMT,);
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_PKT_GET_DONE_HANDLER);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_pkt_RTS_handler
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_pkt_RTS_handler(MPIDI_VC_t * vc,
- MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
-{
- int mpi_errno = MPI_SUCCESS;
- MPID_nem_ib_pkt_lmt_rts_t *const rts_pkt = (MPID_nem_ib_pkt_lmt_rts_t *) pkt;
- MPID_Request *req;
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- dprintf("ib_pkt_RTS_handler,enter\n");
- *buflen = sizeof(MPIDI_CH3_Pkt_t);
- MPID_Request_get_ptr(rts_pkt->req_id, req);
- MPIU_THREAD_CS_ENTER(LMT,);
-
- void *write_to_buf =
- (void *) ((char *) REQ_FIELD(req, buf.to) +
- (long) (rts_pkt->seg_seq_num - 1) * REQ_FIELD(req, max_msg_sz));
-
- int last;
- long length;
-
- /* last segment */
- if (rts_pkt->seg_seq_num == REQ_FIELD(req, seg_num)) {
- last = 1;
- length =
- req->ch.lmt_data_sz - (long) (rts_pkt->seg_seq_num - 1) * REQ_FIELD(req, max_msg_sz);
- }
- else {
- last = 0;
- length = REQ_FIELD(req, max_msg_sz);
- }
- /* try to issue RDMA-read command */
- int slack = 1; /* slack for control packet bringing sequence number */
- if (MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack) {
- mpi_errno =
- MPID_nem_ib_lmt_start_recv_core(req, rts_pkt->addr, rts_pkt->rkey, length,
- write_to_buf, REQ_FIELD(req, max_msg_sz), last);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
- }
- else {
- /* enqueue command into send_queue */
- dprintf("ib_pkt_RTS_handler, enqueuing,sendq_empty=%d,ncom=%d,ncqe=%d\n",
- MPID_nem_ib_sendq_empty(vc_ib->sendq),
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY);
-
- /* make raddr, (sz is in rreq->ch.lmt_data_sz), rkey, (user_buf is in req->dev.user_buf) survive enqueue, free cookie, dequeue */
- REQ_FIELD(req, lmt_raddr) = rts_pkt->addr;
- REQ_FIELD(req, lmt_rkey) = rts_pkt->rkey;
- REQ_FIELD(req, lmt_write_to_buf) = write_to_buf;
- REQ_FIELD(req, lmt_szsend) = length;
- REQ_FIELD(req, last) = last;
-
- MPID_nem_ib_sendq_enqueue(&vc_ib->sendq, req);
- }
-
- *rreqp = NULL;
- fn_exit:
- MPIU_THREAD_CS_EXIT(LMT,);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_PktHandler_req_seq_num
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_PktHandler_req_seq_num(MPIDI_VC_t * vc,
- MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
-{
- int mpi_errno = MPI_SUCCESS;
- MPID_nem_ib_pkt_req_seq_num_t *const req_pkt = (MPID_nem_ib_pkt_req_seq_num_t *) pkt;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REQ_SEQ_NUM);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REQ_SEQ_NUM);
- /* mark as all of the message is read */
- *buflen = sizeof(MPIDI_CH3_Pkt_t);
- /* mark as I don't need continuation read request */
- *rreqp = NULL;
- /* update occupancy info of SR */
- /* request piggy-backs seq_num although it's requesting responder's seq_num */
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- vc_ib->ibcom->lsr_seq_num_tail = req_pkt->seq_num_tail;
- dprintf
- ("PktHandler_req_seq_num,sendq=%d,ncom=%d,ncqe=%d,diff=%d(%d-%d)\n",
- MPID_nem_ib_sendq_empty(vc_ib->sendq),
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) <
- vc_ib->ibcom->local_ringbuf_nslot, vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail);
- /* send reply */
- dprintf("PktHandler_req_seq_num,sending reply_seq_num,id=%d\n", vc_ib->ibcom->sseq_num);
- MPID_nem_ib_send_reply_seq_num(vc);
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REQ_SEQ_NUM);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_PktHandler_reply_seq_num
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_PktHandler_reply_seq_num(MPIDI_VC_t * vc,
- MPIDI_CH3_Pkt_t * pkt,
- MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
-{
- int mpi_errno = MPI_SUCCESS;
- MPID_nem_ib_pkt_reply_seq_num_t *const reply_pkt = (MPID_nem_ib_pkt_reply_seq_num_t *) pkt;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REPLY_SEQ_NUM);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REPLY_SEQ_NUM);
- /* mark as all of the message is consumed */
- *buflen = sizeof(MPIDI_CH3_Pkt_t);
- /* mark as I don't need continuation read request */
- *rreqp = NULL;
- /* update occupancy info of RDMA-write-buf */
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- dprintf
- ("pkthandler,reply_seq_num,old lsr_seq_num=%d,reply_pkt->seq_num_tail=%d\n",
- vc_ib->ibcom->lsr_seq_num_tail, reply_pkt->seq_num_tail);
- vc_ib->ibcom->lsr_seq_num_tail = reply_pkt->seq_num_tail;
- //dprintf("pkthandler,reply_seq_num,new lsr_seq_num=%d\n", vc_ib->ibcom->lsr_seq_num_tail);
-#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
- /* change remote notification policy of RDMA-write-to buf */
- //dprintf("pkthandler,reply_seq_num,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
- MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, &(vc_ib->ibcom->lsr_seq_num_tail));
- //dprintf("pkthandler,reply_seq_num,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
-#endif
- //dprintf("pkthandler,reply_seq_num,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
- /* try to send from sendq because at least one RDMA-write-to buffer has been released */
- //dprintf("pkthandler,reply_seq_num,send_progress\n");
- dprintf("pkthandler,reply_seq_num,send_progress\n");
- MPID_NEM_IB_CHECK_AND_SEND_PROGRESS;
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_PKTHANDLER_REPLY_SEQ_NUM);
- return mpi_errno;
- //fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_PktHandler_change_rdmabuf_occupancy_notify_state
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_PktHandler_change_rdmabuf_occupancy_notify_state
- (MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt, MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp) {
- int mpi_errno = MPI_SUCCESS;
- int ibcom_errno;
- MPID_nem_ib_pkt_change_rdmabuf_occupancy_notify_state_t *const reply_pkt =
- (MPID_nem_ib_pkt_change_rdmabuf_occupancy_notify_state_t *) pkt;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_PKTHANDLER_CHANGE_RDMABUF_OCCUPANCY_NOTIFY_STATE);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_PKTHANDLER_CHANGE_RDMABUF_OCCUPANCY_NOTIFY_STATE);
- /* mark as all of the message is read */
- *buflen = sizeof(MPIDI_CH3_Pkt_t);
- /* mark as I don't need continuation read request */
- *rreqp = NULL;
- /* update occupancy info of SR */
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- dprintf("pkthandler,change notify state,old lstate=%d,pkt->state=%d\n",
- vc_ib->ibcom->rdmabuf_occupancy_notify_lstate, reply_pkt->state);
- int *rdmabuf_occupancy_notify_lstate;
- ibcom_errno =
- MPID_nem_ib_com_rdmabuf_occupancy_notify_lstate_get(vc_ib->sc->fd,
- &rdmabuf_occupancy_notify_lstate);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_rdmabuf_occupancy_notify_lstate_get");
- *rdmabuf_occupancy_notify_lstate = reply_pkt->state;
- dprintf("pkthandler,change notify state,new lstate=%d\n",
- vc_ib->ibcom->rdmabuf_occupancy_notify_lstate);
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_PKTHANDLER_CHANGE_RDMABUF_OCCUPANCY_NOTIFY_STATE);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_pkt_rma_lmt_getdone
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-/*
- * Packet handler for the "get done" notification of an RMA long-message
- * transfer (LMT).
- *
- * The memory-region cache entry recorded in the request is released first.
- * If the segment just finished was the last one (seg_seq_num == seg_num), the
- * cookie and, for the request kinds tested below, the pack buffer are freed
- * and the request is completed.  Otherwise the next segment is registered and
- * announced to the peer with another RTS.
- *
- * vc     - connection used to send the follow-up RTS
- * pkt    - received packet, interpreted as MPID_nem_ib_pkt_lmt_get_done_t
- * buflen - out: set to sizeof(MPIDI_CH3_Pkt_t)
- * rreqp  - out: set to NULL on every path, including failure
- *
- * Returns MPI_SUCCESS, the result of the request's OnDataAvail handler, or
- * an MPI error code when the memory registration of the next segment fails.
- */
-int MPID_nem_ib_pkt_rma_lmt_getdone(MPIDI_VC_t * vc,
-                                    MPIDI_CH3_Pkt_t * pkt,
-                                    MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
-{
-    int mpi_errno = MPI_SUCCESS;
-    MPID_nem_ib_pkt_lmt_get_done_t *const done_pkt = (MPID_nem_ib_pkt_lmt_get_done_t *) pkt;
-    MPID_Request *req;
-
-    *buflen = sizeof(MPIDI_CH3_Pkt_t);
-    /* Define the output before anything can fail, so that the fn_fail path
-     * never hands an unset pointer back to the caller. */
-    *rreqp = NULL;
-    MPID_Request_get_ptr(done_pkt->req_id, req);
-
-    MPIU_THREAD_CS_ENTER(LMT,);
-
-    /* decrement reference counter of mr_cache_entry */
-    MPID_nem_ib_com_reg_mr_release(REQ_FIELD(req, lmt_mr_cache));
-
-    if (REQ_FIELD(req, seg_seq_num) == REQ_FIELD(req, seg_num)) {
-        /* Last segment: tear the transfer down and complete the request. */
-        int req_type = MPIDI_Request_get_type(req);
-
-        /* free memory area for cookie */
-        if (!req->ch.s_cookie) {
-            dprintf("lmt_done_send,enter,req->ch.s_cookie is zero");
-        }
-        MPIU_Free(req->ch.s_cookie);
-
-        if ((req_type == 0 && !req->comm) || (req_type == MPIDI_REQUEST_TYPE_GET_RESP)) {
-            if ((*req->cc_ptr == 1) && req->dev.datatype_ptr && (req->dev.segment_size > 0) &&
-                REQ_FIELD(req, lmt_pack_buf)) {
-                MPIU_Free(REQ_FIELD(req, lmt_pack_buf));
-            }
-        }
-
-        int (*reqFn) (MPIDI_VC_t *, MPID_Request *, int *);
-        reqFn = req->dev.OnDataAvail;
-
-        /* Only these two completion handlers are invoked directly, and only
-         * when this is the last outstanding completion; everything else goes
-         * through the generic completion routine. */
-        if (*req->cc_ptr == 1 &&
-            (reqFn == MPIDI_CH3_ReqHandler_ReqOpsComplete
-             || reqFn == MPIDI_CH3_ReqHandler_GetSendComplete)) {
-            MPIDI_VC_t *_vc = req->ch.vc;
-            int complete = 0;
-            mpi_errno = reqFn(_vc, req, &complete);
-        }
-        else {
-            MPIDI_CH3U_Request_complete(req);
-        }
-    }
-    else {
-        REQ_FIELD(req, seg_seq_num) += 1;       /* next segment number */
-        int next_seg_seq_num = REQ_FIELD(req, seg_seq_num);
-
-        uint32_t length;
-        if (next_seg_seq_num == REQ_FIELD(req, seg_num))
-            /* length of last segment */
-            length = REQ_FIELD(req, data_sz) - (long) (next_seg_seq_num - 1) * REQ_FIELD(req, max_msg_sz);
-        else
-            length = REQ_FIELD(req, max_msg_sz);
-
-        /* Segment numbers are 1-based: segment k starts (k - 1) * max_msg_sz
-         * bytes into the source buffer. */
-        void *addr =
-            (void *) ((char *) REQ_FIELD(req, buf.from) +
-                      (long) (next_seg_seq_num - 1) * REQ_FIELD(req, max_msg_sz));
-        struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
-            MPID_nem_ib_com_reg_mr_fetch(addr, length, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
-        MPIU_ERR_CHKANDJUMP(!mr_cache, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
-        struct ibv_mr *mr = mr_cache->mr;
-        /* store new cache entry */
-        REQ_FIELD(req, lmt_mr_cache) = (void *) mr_cache;
-
-#ifdef HAVE_LIBDCFA
-        void *_addr = mr->host_addr;
-#else
-        void *_addr = addr;
-#endif
-        MPID_nem_ib_lmt_send_RTS(MPIDI_NEM_IB_PKT_RMA_LMT_RTS, vc, done_pkt->receiver_req_id, _addr,
-                                 mr->rkey, next_seg_seq_num);
-    }
-
-  fn_exit:
-    MPIU_THREAD_CS_EXIT(LMT,);
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-#ifdef MPID_NEM_IB_ONDEMAND
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_drain_scq
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-/*
- * Drain the send completion queue shared by the scratch-pad connections.
- *
- * Up to MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN completions are polled in one
- * call.  The wr_id of every completion points at a "shadow" record whose
- * first member is the command type; the switch below dispatches on it,
- * advances the connection-management or ring-buffer protocol, updates the
- * outstanding-work counters and frees the shadow record.  When at least one
- * completion was processed the CM and ring-buffer progress engines are kicked.
- *
- * Several cases free or release the request a shadow record points to.  The
- * counters that live behind that request are therefore reached through a
- * pointer captured before the free, or the free is postponed until the last
- * access, so that freed memory is never dereferenced.
- *
- * Returns MPI_SUCCESS, or an MPI error code when polling fails, a completion
- * carries an error status, an allocation fails or the command type is unknown.
- */
-int MPID_nem_ib_cm_drain_scq()
-{
-
-    int mpi_errno = MPI_SUCCESS;
-    int result;
-    int i;
-    struct ibv_wc cqe[MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN];
-    MPID_nem_ib_cm_cmd_shadow_t *shadow_cm;
-    MPID_nem_ib_ringbuf_cmd_shadow_t *shadow_ringbuf;
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_DRAIN_SCQ);
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_DRAIN_SCQ);
-    //dprintf("cm_drain_scq,enter\n");
-    /* cm_drain_scq is called after poll_eager calls vc_terminate */
-    if (!MPID_nem_ib_rc_shared_scq_scratch_pad) {
-        dprintf("cm_drain_scq,CQ is null\n");
-        goto fn_exit;
-    }
-
-    result =
-        ibv_poll_cq(MPID_nem_ib_rc_shared_scq_scratch_pad,
-                    MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN, &cqe[0]);
-    MPIU_ERR_CHKANDJUMP(result < 0, mpi_errno, MPI_ERR_OTHER, "**netmod,ib,ibv_poll_cq");
-    if (result > 0) {
-        dprintf("cm_drain_scq,found,result=%d\n", result);
-    }
-    for (i = 0; i < result; i++) {
-
-        dprintf("cm_drain_scq,wr_id=%p\n", (void *) cqe[i].wr_id);
-#ifdef HAVE_LIBDCFA
-        if (cqe[i].status != IBV_WC_SUCCESS) {
-            dprintf("cm_drain_scq,status=%08x\n", cqe[i].status);
-            MPID_nem_ib_segv;
-        }
-#else
-        if (cqe[i].status != IBV_WC_SUCCESS) {
-            dprintf("cm_drain_scq,status=%08x,%s\n", cqe[i].status,
-                    ibv_wc_status_str(cqe[i].status));
-            MPID_nem_ib_segv;
-        }
-#endif
-        MPIU_ERR_CHKANDJUMP(cqe[i].status != IBV_WC_SUCCESS, mpi_errno,
-                            MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_scq");
-        /* The command type is read through the wr_id pointer itself, i.e. it
-         * is the leading member of both kinds of shadow record. */
-        MPID_nem_ib_cm_ringbuf_cmd_type_t *type =
-            (MPID_nem_ib_cm_ringbuf_cmd_type_t *) cqe[i].wr_id;
-        switch (*type) {
-        case MPID_NEM_IB_CM_CAS:{
-            shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
-            /* Captured now because one branch below frees the request
-             * before the send-queue counter is decremented at the tail. */
-            MPID_nem_ib_com_t *cas_ibcom = shadow_cm->req->ibcom;
-            dprintf("cm_drain_scq,cm_cas,req=%p,responder_rank=%d\n",
-                    shadow_cm->req, shadow_cm->req->responder_rank);
-            /* Check if CAS have succeeded */
-            uint64_t *cas_retval = (uint64_t *) shadow_cm->buf_from;
-            if (*cas_retval == MPID_NEM_IB_CM_RELEASED) {
-                /* CAS succeeded, so write command */
-
-                dprintf("cm_drain_scq,cm_cas,succeeded\n");
-                if (is_conn_established(shadow_cm->req->responder_rank)) {
-                    /* Explicitly release CAS word because
-                     * ConnectX-3 doesn't support safe CAS with PCI device and CPU */
-                    MPID_nem_ib_cm_cas_release(MPID_nem_ib_conns
-                                               [shadow_cm->req->responder_rank].vc);
-
-                    shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
-                    dprintf("cm_drain_scq,cm_cas,established is true,%d->%d,tx=%d\n",
-                            MPID_nem_ib_myrank, shadow_cm->req->responder_rank,
-                            shadow_cm->req->ibcom->outstanding_connection_tx);
-                    /* Let the guard down to let the following connection request go. */
-                    VC_FIELD(MPID_nem_ib_conns[shadow_cm->req->responder_rank].vc,
-                             connection_guard) = 0;
-                    /* free memory : req->ref_count is 3, so call MPIU_Free() directly */
-                    //MPID_nem_ib_cm_request_release(shadow_cm->req);
-                    MPIU_Free(shadow_cm->req);
-                }
-                else {
-                    /* Increment receiving transaction counter. Initiator receives SYNACK and ACK2 */
-                    shadow_cm->req->ibcom->incoming_connection_tx += 2;
-                    dprintf("cm_drain_scq,cas succeeded,sending syn,%d->%d,connection_tx=%d\n",
-                            MPID_nem_ib_myrank, shadow_cm->req->responder_rank,
-                            shadow_cm->req->ibcom->outstanding_connection_tx);
-                    shadow_cm->req->state = MPID_NEM_IB_CM_SYN;
-                    /* Send the SYN right away only if the CQ, the send queue
-                     * and the CM ring buffer all have room; otherwise queue it. */
-                    if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
-                        shadow_cm->req->ibcom->ncom_scratch_pad <
-                        MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
-                        MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
-                                           MPID_nem_ib_cm_ringbuf_tail) < MPID_NEM_IB_CM_NSEG) {
-
-                        MPID_nem_ib_cm_cmd_syn_t *cmd =
-                            (MPID_nem_ib_cm_cmd_syn_t *) shadow_cm->req->
-                            ibcom->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
-                        MPID_NEM_IB_CM_COMPOSE_SYN(cmd, shadow_cm->req);
-                        cmd->responder_ringbuf_index =
-                            shadow_cm->req->responder_ringbuf_index =
-                            MPID_nem_ib_cm_ringbuf_head;
-                        dprintf("cm_drain_scq,giving ringbuf_index=%d\n",
-                                cmd->responder_ringbuf_index);
-                        MPID_nem_ib_cm_ringbuf_head++;
-                        cmd->initiator_rank = MPID_nem_ib_myrank;
-                        MPID_nem_ib_cm_cmd_shadow_t *shadow_syn =
-                            (MPID_nem_ib_cm_cmd_shadow_t *)
-                            MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
-                        /* Fail cleanly instead of writing through NULL. */
-                        MPIU_ERR_CHKANDJUMP(!shadow_syn, mpi_errno, MPI_ERR_OTHER, "**malloc");
-                        shadow_syn->type = shadow_cm->req->state;
-                        shadow_syn->req = shadow_cm->req;
-                        dprintf("shadow_syn=%p,shadow_syn->req=%p\n", shadow_syn,
-                                shadow_syn->req);
-                        mpi_errno =
-                            MPID_nem_ib_cm_cmd_core(shadow_cm->req->responder_rank, shadow_syn,
-                                                    (void *) cmd,
-                                                    sizeof(MPID_nem_ib_cm_cmd_syn_t),
-                                                    1 /* syn:1 */ , 0);
-                        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
-                                            "**MPID_nem_ib_cm_send_core");
-                    }
-                    else {
-                        MPID_NEM_IB_CM_COMPOSE_SYN((MPID_nem_ib_cm_cmd_syn_t *) &
-                                                   (shadow_cm->req->cmd), shadow_cm->req);
-                        MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, shadow_cm->req);
-                        dprintf("cm_drain_scq,enqueue syn,%d->%d\n",
-                                MPID_nem_ib_myrank, shadow_cm->req->responder_rank);
-                    }
-                }
-            }
-            else {
-                if (is_conn_established(shadow_cm->req->responder_rank)) {
-                    /* CAS is failed, and connection is already established */
-
-                    dprintf("cm_drain_scq,cm_cas,connection is already established\n");
-                    MPID_nem_ib_ncqe_scratch_pad_to_drain -= 1;
-                    shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
-                    shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
-                    dprintf("cm_drain_scq,cm_cas,cas failed,established is true,%d->%d,tx=%d\n",
-                            MPID_nem_ib_myrank, shadow_cm->req->responder_rank,
-                            shadow_cm->req->ibcom->outstanding_connection_tx);
-                    MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-                    /* Let the guard down to let the following connection request go. */
-                    VC_FIELD(MPID_nem_ib_conns[shadow_cm->req->responder_rank].vc,
-                             connection_guard) = 0;
-                    /* free memory : req->ref_count is 2, so call MPIU_Free() directly */
-                    //MPID_nem_ib_cm_request_release(shadow_cm->req);
-                    MPIU_Free(shadow_cm->req);
-                    MPIU_Free(shadow_cm);
-                    break;
-                }
-
-                /* Exponential back-off: double the delay, starting from 1. */
-                shadow_cm->req->retry_backoff =
-                    shadow_cm->req->retry_backoff ? (shadow_cm->req->retry_backoff << 1) : 1;
-                shadow_cm->req->retry_decided = MPID_nem_ib_progress_engine_vt;     /* Schedule retry */
-                MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, shadow_cm->req);
-                dprintf
-                    ("cm_drain_scq,cm_cas,cas failed,%d->%d,retval=%016lx,decided=%ld,backoff=%ld\n",
-                     MPID_nem_ib_myrank, shadow_cm->req->responder_rank, *cas_retval,
-                     shadow_cm->req->retry_decided, shadow_cm->req->retry_backoff);
-            }
-            MPID_nem_ib_ncqe_scratch_pad_to_drain -= 1;
-            /* shadow_cm->req may already be freed here; use the captured pointer. */
-            cas_ibcom->ncom_scratch_pad -= 1;
-            MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-            MPIU_Free(shadow_cm);
-            break;
-        }
-        case MPID_NEM_IB_CM_CAS_RELEASE:{
-            shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
-            /* Captured now because the request is released in the success
-             * branch, which may drop its last reference. */
-            MPID_nem_ib_com_t *release_ibcom = shadow_cm->req->ibcom;
-            dprintf("cm_drain_scq,cm_cas_release,req=%p,responder_rank=%d\n",
-                    shadow_cm->req, shadow_cm->req->responder_rank);
-            /* Check if CAS have succeeded */
-            uint64_t *cas_retval = (uint64_t *) shadow_cm->buf_from;
-            if (*cas_retval == MPID_nem_ib_myrank) {
-                /* CAS succeeded */
-                dprintf("cm_drain_scq,cm_cas_release,cas succeeded,%d->%d,retval=%016lx\n",
-                        MPID_nem_ib_myrank, shadow_cm->req->responder_rank, *cas_retval);
-                shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
-                MPID_nem_ib_cm_request_release(shadow_cm->req);
-            }
-            else {
-
-                shadow_cm->req->retry_backoff =
-                    shadow_cm->req->retry_backoff ? (shadow_cm->req->retry_backoff << 1) : 1;
-                shadow_cm->req->retry_decided = MPID_nem_ib_progress_engine_vt;     /* Schedule retry */
-                MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, shadow_cm->req);
-                dprintf
-                    ("cm_drain_scq,cm_cas_release,cas failed,%d->%d,retval=%016lx,decided=%ld,backoff=%ld\n",
-                     MPID_nem_ib_myrank, shadow_cm->req->responder_rank, *cas_retval,
-                     shadow_cm->req->retry_decided, shadow_cm->req->retry_backoff);
-            }
-
-            release_ibcom->ncom_scratch_pad -= 1;
-            MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-            MPIU_Free(shadow_cm);
-            break;
-        }
-        case MPID_NEM_IB_CM_SYN:
-            dprintf("cm_drain_scq,syn sent\n");
-            shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
-            shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
-            dprintf("cm_drain_scq,syn sent,%d->%d,connection_tx=%d\n",
-                    MPID_nem_ib_myrank, shadow_cm->req->responder_rank,
-                    shadow_cm->req->ibcom->outstanding_connection_tx);
-            dprintf("cm_drain_scq,syn,buf_from=%p,sz=%d\n", shadow_cm->buf_from,
-                    shadow_cm->buf_from_sz);
-            MPID_nem_ib_cm_request_release(shadow_cm->req);
-            MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-            MPIU_Free(shadow_cm);
-            break;
-        case MPID_NEM_IB_CM_CAS_RELEASE2:
-            dprintf("cm_drain_scq,release2 sent\n");
-            shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
-            shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
-            shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
-            dprintf("cm_drain_scq,cas_release2 sent,%d->%d,connection_tx=%d\n",
-                    MPID_nem_ib_myrank, shadow_cm->req->responder_rank,
-                    shadow_cm->req->ibcom->outstanding_connection_tx);
-            dprintf("cm_drain_scq,syn,buf_from=%p,sz=%d\n", shadow_cm->buf_from,
-                    shadow_cm->buf_from_sz);
-            MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-            /* free memory : req->ref_count is 2, so call MPIU_Free() directly */
-            //MPID_nem_ib_cm_request_release(shadow_cm->req);
-            MPIU_Free(shadow_cm->req);
-            MPIU_Free(shadow_cm);
-            break;
-        case MPID_NEM_IB_CM_SYNACK:
-            shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
-            dprintf("cm_drain_scq,synack sent,req=%p,initiator_rank=%d\n", shadow_cm->req,
-                    shadow_cm->req->initiator_rank);
-            shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
-            dprintf("cm_drain_scq,synack sent,%d->%d,tx=%d\n",
-                    MPID_nem_ib_myrank, shadow_cm->req->initiator_rank,
-                    shadow_cm->req->ibcom->outstanding_connection_tx);
-            dprintf("cm_drain_scq,synack,buf_from=%p,sz=%d\n", shadow_cm->buf_from,
-                    shadow_cm->buf_from_sz);
-            MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-            MPIU_Free(shadow_cm);
-            break;
-        case MPID_NEM_IB_CM_ACK1:
-            dprintf("cm_drain_scq,ack1 sent\n");
-            shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
-            shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
-            shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
-            dprintf("cm_drain_scq,ack1,%d->%d,connection_tx=%d\n",
-                    MPID_nem_ib_myrank, shadow_cm->req->responder_rank,
-                    shadow_cm->req->ibcom->outstanding_connection_tx);
-            dprintf("cm_drain_scq,ack1,buf_from=%p,sz=%d\n", shadow_cm->buf_from,
-                    shadow_cm->buf_from_sz);
-            MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-            /* Finalize protocol because there is no referer in cm_drain_scq and sendq.
-             * Note that there might be one in cm_poll. */
-            MPID_nem_ib_cm_request_release(shadow_cm->req);
-            MPIU_Free(shadow_cm);
-            break;
-        case MPID_NEM_IB_CM_ACK2:
-            shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
-            dprintf("cm_drain_scq,ack2 sent,req=%p,initiator_rank=%p=%d\n",
-                    shadow_cm->req, &shadow_cm->req->initiator_rank,
-                    shadow_cm->req->initiator_rank);
-            shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
-            shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
-            dprintf("cm_drain_scq,ack2,%d->%d,tx=%d\n",
-                    MPID_nem_ib_myrank, shadow_cm->req->initiator_rank,
-                    shadow_cm->req->ibcom->outstanding_connection_tx);
-            dprintf("cm_drain_scq,ack2,buf_from=%p,sz=%d\n", shadow_cm->buf_from,
-                    shadow_cm->buf_from_sz);
-            MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-            /* Let the guard down to let the following connection request go. */
-            VC_FIELD(MPID_nem_ib_conns[shadow_cm->req->initiator_rank].vc, connection_guard) = 0;
-            /* Finalize protocol because there is no referer in cm_drain_scq, sendq
-             * and cm_poll because cm_poll sent ACK2. */
-            MPID_nem_ib_cm_request_release(shadow_cm->req);
-            MPIU_Free(shadow_cm);
-            break;
-        case MPID_NEM_IB_CM_ALREADY_ESTABLISHED:
-        case MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING:
-            /* These cases mean the end of CM-op, so we do the almost same operation as ack2 */
-            shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
-            dprintf
-                ("cm_drain_scq,established or connecting sent,req=%p,initiator_rank=%p=%d\n",
-                 shadow_cm->req, &shadow_cm->req->initiator_rank, shadow_cm->req->initiator_rank);
-            shadow_cm->req->ibcom->ncom_scratch_pad -= 1;
-            shadow_cm->req->ibcom->outstanding_connection_tx -= 1;
-            dprintf("cm_drain_scq,established or connecting sent,%d->%d,connection_tx=%d,type=%d\n",
-                    MPID_nem_ib_myrank, shadow_cm->req->initiator_rank,
-                    shadow_cm->req->ibcom->outstanding_connection_tx, *type);
-            shadow_cm->req->ibcom->incoming_connection_tx -= 1;
-            MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-            /* Let the guard down to let the following connection request go. */
-            VC_FIELD(MPID_nem_ib_conns[shadow_cm->req->initiator_rank].vc, connection_guard) = 0;
-            /* Finalize protocol because there is no referer in cm_drain_scq, sendq
-             * and cm_poll because cm_poll sent ACK2. */
-            MPID_nem_ib_cm_request_release(shadow_cm->req);
-            MPIU_Free(shadow_cm);
-            break;
-        case MPID_NEM_IB_RINGBUF_ASK_FETCH:
-            shadow_ringbuf = (MPID_nem_ib_ringbuf_cmd_shadow_t *) cqe[i].wr_id;
-            memcpy(&shadow_ringbuf->req->fetched,
-                   shadow_ringbuf->buf_from, sizeof(MPID_nem_ib_ringbuf_headtail_t));
-            dprintf("cm_drain_scq,ask_fetch sent,%d->%d,req=%p,fetched->head=%ld,tail=%d\n",
-                    MPID_nem_ib_myrank, shadow_ringbuf->req->vc->pg_rank,
-                    shadow_ringbuf->req, shadow_ringbuf->req->fetched.head,
-                    shadow_ringbuf->req->fetched.tail);
-            /* Proceed to cas */
-            MPID_nem_ib_ringbuf_ask_cas(shadow_ringbuf->req->vc, shadow_ringbuf->req);
-            MPID_nem_ib_ncqe_scratch_pad_to_drain -= 1;
-            shadow_ringbuf->req->ibcom->ncom_scratch_pad -= 1;
-            MPID_nem_ib_rdmawr_from_free(shadow_ringbuf->buf_from, shadow_ringbuf->buf_from_sz);
-            MPIU_Free(shadow_ringbuf);
-            break;
-        case MPID_NEM_IB_RINGBUF_ASK_CAS:{
-            /* Set when the ask finished and its request must be freed; the
-             * free is postponed until the request was used for the last time. */
-            int ask_completed = 0;
-            shadow_ringbuf = (MPID_nem_ib_ringbuf_cmd_shadow_t *) cqe[i].wr_id;
-            /* Check if CAS have succeeded */
-            MPID_nem_ib_ringbuf_headtail_t *cas_retval =
-                (MPID_nem_ib_ringbuf_headtail_t *) shadow_ringbuf->buf_from;
-            dprintf
-                ("cm_drain_scq,ask_cas sent,req=%p,fetched.head=%lx,retval=%lx\n",
-                 shadow_ringbuf->req, shadow_ringbuf->req->fetched.head, cas_retval->head);
-            if (cas_retval->head == shadow_ringbuf->req->fetched.head) {
-                /* CAS succeeded */
-                dprintf
-                    ("cm_drain_scq,ask_cas,cas succeeded,%d->%d,local_head=%d,local_tail=%d,nslot=%d\n",
-                     MPID_nem_ib_myrank, shadow_ringbuf->req->vc->pg_rank,
-                     VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num),
-                     VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail),
-                     VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot));
-                if (MPID_nem_ib_diff16
-                    (VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num),
-                     VC_FIELD(shadow_ringbuf->req->vc,
-                              ibcom->lsr_seq_num_tail)) >=
-                    VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot)) {
-                    dprintf("cm_drain_scq,ask_cas,refill fast path\n");
-                    /* Refill now when we don't have any slots */
-                    VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num) =
-                        (uint16_t) shadow_ringbuf->req->fetched.head;
-                    /* Move tail pointer to indicate only one slot is available to us */
-                    VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail) = (uint16_t)
-                        (VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num) -
-                         VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot) + 1);
-                    dprintf
-                        ("cm_drain_scq,ask_cas,after refill,local_head=%d,local_tail=%d,nslot=%d\n",
-                         VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num),
-                         VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail),
-                         VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot));
-                }
-                else {
-                    dprintf("cm_drain_scq,ask_cas,refill slow path\n");
-                    /* Enqueue slots to avoid overwriting the slots when we have some slots.
-                     * This happens when two or more asks succeeded before
-                     * the first queued send is issued. */
-                    MPID_nem_ib_ringbuf_sector_t *sector = (MPID_nem_ib_ringbuf_sector_t *)
-                        MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_sector_t));
-                    MPIU_ERR_CHKANDJUMP(!sector, mpi_errno, MPI_ERR_OTHER, "**malloc");
-                    sector->type = MPID_NEM_IB_RINGBUF_SHARED;
-                    sector->start =
-                        VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_start);
-                    sector->nslot =
-                        VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot);
-                    sector->head = (uint16_t) shadow_ringbuf->req->fetched.head;
-                    sector->tail =
-                        sector->head - VC_FIELD(shadow_ringbuf->req->vc,
-                                                ibcom->local_ringbuf_nslot) + 1;
-                    MPID_nem_ib_ringbuf_sectorq_enqueue(&VC_FIELD
-                                                        (shadow_ringbuf->req->vc,
-                                                         ibcom->sectorq), sector);
-                }
-                /* Let the guard down so that the following ask-fetch can be issued */
-                VC_FIELD(shadow_ringbuf->req->vc, ibcom->ask_guard) = 0;
-                /* Kick progress engine */
-                dprintf
-                    ("cm_drain_scq,call send_progress for %d,ncom=%d,ncqe=%d,local_head=%d,local_tail=%d,nslot=%d\n",
-                     shadow_ringbuf->req->vc->pg_rank, VC_FIELD(shadow_ringbuf->req->vc,
-                                                                ibcom->ncom),
-                     MPID_nem_ib_ncqe, VC_FIELD(shadow_ringbuf->req->vc, ibcom->sseq_num),
-                     VC_FIELD(shadow_ringbuf->req->vc, ibcom->lsr_seq_num_tail),
-                     VC_FIELD(shadow_ringbuf->req->vc, ibcom->local_ringbuf_nslot));
-                MPID_nem_ib_send_progress(shadow_ringbuf->req->vc);
-                ask_completed = 1;
-            }
-            else {
-                /* CAS failed */
-                dprintf("ask-cas,failed\n");
-                MPID_nem_ib_segv;
-                /* Let the guard down so that this ask-fetch can be issued in ringbuf_progress */
-                VC_FIELD(shadow_ringbuf->req->vc, ibcom->ask_guard) = 0;
-                /* Retry from fetch */
-                shadow_ringbuf->req->state = MPID_NEM_IB_RINGBUF_ASK_FETCH;
-                /* Schedule retry */
-                dprintf("cm_drain_scq,retval=%08lx,backoff=%ld\n",
-                        cas_retval->head, shadow_ringbuf->req->retry_backoff);
-                MPID_NEM_IB_RINGBUF_UPDATE_BACKOFF(shadow_ringbuf->req->retry_backoff);
-                shadow_ringbuf->req->retry_decided = MPID_nem_ib_progress_engine_vt;
-                /* Make the ask-fetch in order */
-                MPID_nem_ib_ringbuf_sendq_enqueue_at_head(&MPID_nem_ib_ringbuf_sendq,
-                                                          shadow_ringbuf->req);
-                dprintf("cm_drain_scq,ask_cas,cas failed,decided=%ld,backoff=%ld\n",
-                        shadow_ringbuf->req->retry_decided, shadow_ringbuf->req->retry_backoff);
-            }
-            MPID_nem_ib_ncqe_scratch_pad_to_drain -= 1;
-            shadow_ringbuf->req->ibcom->ncom_scratch_pad -= 1;
-            MPID_nem_ib_rdmawr_from_free(shadow_ringbuf->buf_from, shadow_ringbuf->buf_from_sz);
-            if (ask_completed) {
-                /* Last use of the request was the counter update above. */
-                MPIU_Free(shadow_ringbuf->req);
-            }
-            MPIU_Free(shadow_ringbuf);
-            break;
-        }
-        case MPID_NEM_IB_NOTIFY_OUTSTANDING_TX_EMPTY:
-            shadow_cm = (MPID_nem_ib_cm_cmd_shadow_t *) cqe[i].wr_id;
-            shadow_cm->req->ibcom->notify_outstanding_tx_empty |= NOTIFY_OUTSTANDING_TX_SCQ;
-            MPID_nem_ib_rdmawr_from_free(shadow_cm->buf_from, shadow_cm->buf_from_sz);
-            MPIU_Free(shadow_cm->req);
-            MPIU_Free(shadow_cm);
-            break;
-        default:
-            printf("unknown type=%d\n", *type);
-            MPIU_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_scq");
-            break;
-        }
-        MPID_nem_ib_ncqe_scratch_pad -= 1;
-    }
-    /* The number of CQE is reduced or a slot of the ringbuf is released, so kick progress engine */
-    if (result > 0) {
-        MPID_nem_ib_cm_progress();
-        MPID_nem_ib_ringbuf_progress();
-    }
-
-  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_DRAIN_SCQ);
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_drain_rcq
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-/*
- * Drain the receive completion queue shared by the scratch-pad connections.
- *
- * Up to MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN completions are polled in one
- * call.  The only command handled is MPID_NEM_IB_NOTIFY_OUTSTANDING_TX_EMPTY:
- * the receive buffer is returned, the RCQ bit is recorded in the sender's
- * notify_outstanding_tx_empty flags and a new receive is posted for that peer.
- *
- * FUNCNAME/FCNAME are defined here like for the other functions of this
- * file; without them this function would inherit the names of whichever
- * function precedes it.
- * NOTE(review): presumably the error macros report FCNAME - confirm.
- *
- * Returns MPI_SUCCESS, or an MPI error code when polling fails, a completion
- * carries an error status, the peer's ibcom cannot be looked up or the
- * command type is unknown.
- */
-int MPID_nem_ib_cm_drain_rcq(void)
-{
-    int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
-    int result;
-    int i;
-    struct ibv_wc cqe[MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN];
-    MPID_nem_ib_cm_notify_send_t *shadow_cm;
-
-    if (!MPID_nem_ib_rc_shared_rcq_scratch_pad) {
-        dprintf("cm_drain_rcq,CQ is null\n");
-        goto fn_exit;
-    }
-
-    result =
-        ibv_poll_cq(MPID_nem_ib_rc_shared_rcq_scratch_pad, MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN,
-                    &cqe[0]);
-    MPIU_ERR_CHKANDJUMP(result < 0, mpi_errno, MPI_ERR_OTHER, "**netmod,ib,ibv_poll_cq");
-
-    if (result > 0) {
-        dprintf("cm_drain_rcq,found,result=%d\n", result);
-    }
-    for (i = 0; i < result; i++) {
-
-        dprintf("cm_drain_rcq,wr_id=%p\n", (void *) cqe[i].wr_id);
-
-#ifdef HAVE_LIBDCFA
-        if (cqe[i].status != IBV_WC_SUCCESS) {
-            dprintf("cm_drain_rcq,status=%08x\n", cqe[i].status);
-            MPID_nem_ib_segv;
-        }
-#else
-        if (cqe[i].status != IBV_WC_SUCCESS) {
-            dprintf("cm_drain_rcq,status=%08x,%s\n", cqe[i].status,
-                    ibv_wc_status_str(cqe[i].status));
-            MPID_nem_ib_segv;
-        }
-#endif
-        MPIU_ERR_CHKANDJUMP(cqe[i].status != IBV_WC_SUCCESS, mpi_errno, MPI_ERR_OTHER,
-                            "**MPID_nem_ib_cm_drain_rcq");
-
-        /* The command type is read through the wr_id pointer itself. */
-        MPID_nem_ib_cm_cmd_type_t *type = (MPID_nem_ib_cm_cmd_type_t *) cqe[i].wr_id;
-        switch (*type) {
-        case MPID_NEM_IB_NOTIFY_OUTSTANDING_TX_EMPTY:{
-            int initiator_rank;
-            MPID_nem_ib_com_t *ibcom;
-
-            dprintf("cm_drain_rcq,notify_outstanding_tx_empty\n");
-            shadow_cm = (MPID_nem_ib_cm_notify_send_t *) cqe[i].wr_id;
-            /* Copy the rank out before the buffer is handed back. */
-            initiator_rank = shadow_cm->initiator_rank;
-
-            MPID_nem_ib_rdmawr_from_free(shadow_cm, sizeof(MPID_nem_ib_cm_notify_send_t));
-
-            /* Do not touch ibcom unless the lookup succeeded; it is
-             * uninitialized otherwise. */
-            ibcom_errno =
-                MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[initiator_rank],
-                                               &ibcom);
-            MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
-                                "**MPID_nem_ib_com_obtain_pointer");
-            ibcom->notify_outstanding_tx_empty |= NOTIFY_OUTSTANDING_TX_RCQ;
-            MPID_nem_ib_com_scratch_pad_recv(MPID_nem_ib_scratch_pad_fds[initiator_rank],
-                                             sizeof(MPID_nem_ib_cm_notify_send_t));
-        }
-            break;
-        default:
-            printf("unknown type=%d\n", *type);
-            MPIU_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_drain_rcq");
-            break;
-        }
-    }
-
-  fn_exit:
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_poll_syn
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-/*
- * Poll the SYN slot of the local scratch pad for an incoming connection
- * command and answer it.
- *
- * Nothing is done when the scratch pad is gone, when the CAS word reads
- * "released", or when the head flag of the slot is still zero.  For a SYN a
- * request is allocated and answered with one of:
- *   - ALREADY_ESTABLISHED     if the connection to the initiator exists,
- *   - RESPONDER_IS_CONNECTING if this rank is the higher one and has its own
- *                             connection transaction outstanding,
- *   - SYNACK                  otherwise, after opening the local QP and
- *                             allocating the ring buffer if not done yet.
- * The answer is sent immediately when the completion queue, the send queue
- * and the CM ring buffer have room, and queued on the CM send queue otherwise.
- * Finally the head and tail flags of the slot are cleared.
- *
- * NOTE(review): the error paths taken after the request was allocated do not
- * free it - confirm whether that matters to callers.
- *
- * Returns MPI_SUCCESS, or an MPI error code on allocation failure, when a
- * helper reports an error, or for an unknown command.
- */
-int MPID_nem_ib_cm_poll_syn()
-{
-    int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
-    int ib_port = 1;
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_POLL_SYN);
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_POLL_SYN);
-    /* scratch pad is freed after receiving CLOSE */
-    if (!MPID_nem_ib_scratch_pad) {
-        dprintf("cm_poll_syn,MPID_nem_ib_scratch_pad is zero\n");
-        goto fn_exit;
-    }
-
-    /* Make the following store instruction onto the CAS word switch
-     * the value from "acquired" to "released" by
-     * waiting until modification on CAS word by a PCIe device
-     * propagated to the cache tag. */
-    volatile uint64_t *cas_word = (uint64_t *) (MPID_nem_ib_scratch_pad);
-    if (*cas_word == MPID_NEM_IB_CM_RELEASED) {
-        goto fn_exit;
-    }
-
-    /* Memory layout is (CAS-word:SYN#0:SYN#1:...:SYN#N:CMD#0:CMD#1:...CMD#M) */
-    void *slot = (MPID_nem_ib_scratch_pad + MPID_NEM_IB_CM_OFF_SYN +
-                  sizeof(MPID_nem_ib_cm_cmd_t) * (0 % MPID_NEM_IB_CM_NSEG));
-    volatile uint8_t *head_flag = (uint8_t *) slot;
-    if (*head_flag == MPID_NEM_IB_CM_HEAD_FLAG_ZERO) {
-        goto fn_exit;
-    }   /* Incoming message hasn't arrived */
-
-    volatile MPID_nem_ib_cm_cmd_syn_t *syn_tail_flag = (MPID_nem_ib_cm_cmd_syn_t *) slot;
-
-    switch (*head_flag) {
-    case MPID_NEM_IB_CM_SYN:{
-        int is_synack = 0;
-        /* Spin until the tail flag shows the magic value, i.e. the whole
-         * command has been written into the slot. */
-        while (syn_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
-            /* __asm__ __volatile__("pause;":::"memory"); */
-        }
-
-        MPID_nem_ib_cm_cmd_syn_t *syn = (MPID_nem_ib_cm_cmd_syn_t *) slot;
-        dprintf("cm_poll_syn,syn detected!,%d->%d,ringbuf_index given=%d\n",
-                syn->initiator_rank, MPID_nem_ib_myrank, syn->responder_ringbuf_index);
-        MPID_nem_ib_cm_req_t *req = MPIU_Malloc(sizeof(MPID_nem_ib_cm_req_t));
-        MPIU_ERR_CHKANDJUMP(!req, mpi_errno, MPI_ERR_OTHER, "**malloc");
-        req->ref_count = 1;     /* Released when draining SCQ of ACK2 */
-        req->ringbuf_index = syn->responder_ringbuf_index;
-        req->initiator_rank = syn->initiator_rank;
-        req->responder_rank = MPID_nem_ib_myrank;
-        ibcom_errno =
-            MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds
-                                           [req->initiator_rank], &req->ibcom);
-        MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
-                            "**MPID_nem_ib_com_obtain_pointer");
-        if (is_conn_established(syn->initiator_rank)) {
-            dprintf("cm_poll_syn,established is true,%d->%d,connection_tx=%d\n",
-                    syn->initiator_rank, MPID_nem_ib_myrank,
-                    req->ibcom->outstanding_connection_tx);
-            req->state = MPID_NEM_IB_CM_ALREADY_ESTABLISHED;
-        }
-        else if ((MPID_nem_ib_myrank > syn->initiator_rank) &&
-                 (req->ibcom->outstanding_connection_tx > 0)) {
-            dprintf("cm_poll_syn,connection_tx>0,%d->%d,connection_tx=%d\n",
-                    syn->initiator_rank, MPID_nem_ib_myrank,
-                    req->ibcom->outstanding_connection_tx);
-            req->state = MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING;
-        }
-        else {
-            /* Skip QP creation on race condition */
-            if (!
-                (VC_FIELD
-                 (MPID_nem_ib_conns[syn->initiator_rank].vc,
-                  connection_state) & MPID_NEM_IB_CM_LOCAL_QP_RESET)) {
-                ibcom_errno =
-                    MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC,
-                                         &MPID_nem_ib_conns[syn->initiator_rank].fd);
-                MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
-                                    "**MPID_nem_ib_com_open");
-                /* store pointer to MPID_nem_ib_com */
-                dprintf("cm_poll_syn,initiator fd=%d\n",
-                        MPID_nem_ib_conns[syn->initiator_rank].fd);
-                ibcom_errno =
-                    MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[syn->initiator_rank].fd,
-                                                   &VC_FIELD(MPID_nem_ib_conns
-                                                             [syn->initiator_rank].vc, ibcom));
-                MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
-                                    "**MPID_nem_ib_com_obtain_pointer");
-                /* Allocate RDMA-write-to ring-buf for remote */
-                mpi_errno =
-                    MPID_nem_ib_ringbuf_alloc(MPID_nem_ib_conns[syn->initiator_rank].vc);
-                MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
-                                    "**MPID_nem_ib_ringbuf_alloc");
-                /* Record state transition for race condition detection */
-                VC_FIELD(MPID_nem_ib_conns[syn->initiator_rank].vc,
-                         connection_state) |= MPID_NEM_IB_CM_LOCAL_QP_RESET;
-            }
-
-            req->state = MPID_NEM_IB_CM_SYNACK;
-            is_synack = 1;
-        }
-
-        /* Increment transaction counter here because this path is executed only once */
-        req->ibcom->outstanding_connection_tx += 1;
-        /* Increment receiving transaction counter.
-         * In the case of SYNACK, Responder receives ack1
-         * In the case of ALREADY_ESTABLISHED or RESPONDER_IS_CONNECTING,
-         * decrement in cm_drain_scq.
-         */
-        req->ibcom->incoming_connection_tx += 1;
-        if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
-            req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
-            MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
-                               MPID_nem_ib_cm_ringbuf_tail) < MPID_NEM_IB_CM_NSEG) {
-
-            MPID_nem_ib_cm_cmd_synack_t *cmd =
-                (MPID_nem_ib_cm_cmd_synack_t *) req->
-                ibcom->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
-            if (is_synack) {
-                dprintf("cm_poll_syn,sending synack,%d->%d[%d],connection_tx=%d\n",
-                        MPID_nem_ib_myrank, syn->initiator_rank, req->ringbuf_index,
-                        req->ibcom->outstanding_connection_tx);
-                MPID_NEM_IB_CM_COMPOSE_SYNACK(cmd, req, syn->initiator_req);
-                dprintf
-                    ("cm_poll_syn,composing synack,responder_req=%p,cmd->rmem=%p,rkey=%08x,ringbuf_nslot=%d,remote_vc=%p\n",
-                     cmd->responder_req, cmd->rmem, cmd->rkey, cmd->ringbuf_nslot,
-                     cmd->remote_vc);
-                cmd->initiator_ringbuf_index = req->initiator_ringbuf_index =
-                    MPID_nem_ib_cm_ringbuf_head;
-                dprintf("cm_poll_syn,giving ringbuf_index=%d\n", cmd->initiator_ringbuf_index);
-                MPID_nem_ib_cm_ringbuf_head++;
-            }
-            else {
-                dprintf
-                    ("cm_poll_syn,sending established or connecting,%d->%d[%d],connection_tx=%d,state=%d\n",
-                     MPID_nem_ib_myrank, syn->initiator_rank, req->ringbuf_index,
-                     req->ibcom->outstanding_connection_tx, req->state);
-                MPID_NEM_IB_CM_COMPOSE_END_CM(cmd, req, syn->initiator_req, req->state);
-            }
-            MPID_nem_ib_cm_cmd_shadow_t *shadow = (MPID_nem_ib_cm_cmd_shadow_t *)
-                MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
-            /* Fail cleanly instead of writing through NULL. */
-            MPIU_ERR_CHKANDJUMP(!shadow, mpi_errno, MPI_ERR_OTHER, "**malloc");
-            shadow->type = req->state;
-            shadow->req = req;
-            dprintf("cm_poll_syn,shadow=%p,shadow->req=%p\n", shadow, shadow->req);
-            mpi_errno =
-                MPID_nem_ib_cm_cmd_core(req->initiator_rank, shadow, (void *) cmd,
-                                        sizeof(MPID_nem_ib_cm_cmd_synack_t), 0,
-                                        req->ringbuf_index);
-            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
-                                "**MPID_nem_ib_cm_send_core");
-        }
-        else {
-            dprintf("cm_poll_syn,enqueue,ncqe=%d,ncom=%d,head=%d,tail=%d\n",
-                    MPID_nem_ib_ncqe_scratch_pad, req->ibcom->ncom_scratch_pad,
-                    MPID_nem_ib_cm_ringbuf_head, MPID_nem_ib_cm_ringbuf_tail);
-            if (is_synack) {
-                dprintf("cm_poll_syn,queueing syn,%d->%d,connection_tx=%d\n",
-                        MPID_nem_ib_myrank, syn->initiator_rank,
-                        req->ibcom->outstanding_connection_tx);
-                MPID_NEM_IB_CM_COMPOSE_SYNACK((MPID_nem_ib_cm_cmd_synack_t *) &
-                                              (req->cmd), req, syn->initiator_req);
-            }
-            else {
-                dprintf
-                    ("cm_poll_syn,queueing established or connecting,%d->%d,connection_tx=%d,state=%d\n",
-                     MPID_nem_ib_myrank, syn->initiator_rank,
-                     req->ibcom->outstanding_connection_tx, req->state);
-                MPID_NEM_IB_CM_COMPOSE_END_CM((MPID_nem_ib_cm_cmd_synack_t *) & (req->cmd), req,
-                                              syn->initiator_req, req->state);
-            }
-            MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
-        }
-    }
-        /* Share the slot clean-up with the CAS_RELEASE2 case below. */
-        goto common_tail;
-    case MPID_NEM_IB_CM_CAS_RELEASE2:{
-        MPID_nem_ib_segv;
-        /* Initiator requests to release CAS word.
-         * Because connection is already established.
-         * In this case, responder may already have performed vc_terminate. */
-
-        while (syn_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
-            /* __asm__ __volatile__("pause;":::"memory"); */
-        }
-
-#ifdef MPID_NEM_IB_DEBUG_POLL
-        MPID_nem_ib_cm_cmd_syn_t *syn = (MPID_nem_ib_cm_cmd_syn_t *) slot;
-#endif
-        dprintf("cm_poll_syn,release2 detected,%d->%d\n",
-                syn->initiator_rank, MPID_nem_ib_myrank);
-    }
-
-      common_tail:
-
-        /* Clear head-flag */
-        *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO;
-
-        /* Clear tail-flag */
-        syn_tail_flag->tail_flag.tail_flag = 0;
-
-        /* Release CAS word.
-         * Note that the following store instruction switches the value from "acquired" to "released"
-         * because the load instruction above made the cache tag for the CAS word
-         * reflect the switch of the value from "released" to "acquired".
-         * We want to prevent the case where the store instruction switches the value from
-         * "released" to "released" then a write command from a PCI device arrives
-         * and switches the value from "released" to "acquired")
-         */
-        //*cas_word = MPID_NEM_IB_CM_RELEASED;
-        dprintf("cm_poll_syn,exit,%d,cas_word,%p,%lx\n", MPID_nem_ib_myrank, cas_word, *cas_word);
-        break;
-    default:
-        printf("unknown connection command\n");
-        MPIU_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_poll");
-        break;
-    }
-
-  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_POLL_SYN);
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_release
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-/*
- * Mark slot `index` of the connection-management ring buffer as released and
- * advance the ring-buffer tail over every consecutive released slot.
- *
- * Released slots are tracked in the bitmap MPID_nem_ib_cm_ringbuf_released,
- * one bit per slot and 64 slots per word.  Slots may be released out of
- * order; the tail only moves while the slot right after it has its bit set,
- * and each bit is cleared as the tail passes it.  If the tail moved, the CM
- * progress engine is kicked because a slot became available.
- *
- * index - sequence number of the slot to release; reduced modulo
- *         MPID_NEM_IB_CM_NSEG before use
- *
- * Returns MPI_SUCCESS; this function has no failure path.
- */
-int MPID_nem_ib_cm_release(uint16_t index)
-{
-    int mpi_errno = MPI_SUCCESS;
-    int old_ringbuf_tail = MPID_nem_ib_cm_ringbuf_tail;
-    uint16_t index_slot = index % MPID_NEM_IB_CM_NSEG;
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_RELEASE);
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_RELEASE);
-
-    /* Record the release of this slot. */
-    MPID_nem_ib_cm_ringbuf_released[index_slot / 64] |= (1ULL << (index_slot & 63));
-
-    /* Slot that immediately follows the current tail. */
-    uint16_t index_tail = ((uint16_t) (MPID_nem_ib_cm_ringbuf_tail + 1) % MPID_NEM_IB_CM_NSEG);
-
-    /* Consume released slots in order until the first one still in use. */
-    while (((MPID_nem_ib_cm_ringbuf_released[index_tail / 64] >> (index_tail & 63)) & 1) == 1) {
-        MPID_nem_ib_cm_ringbuf_tail++;
-        MPID_nem_ib_cm_ringbuf_released[index_tail / 64] &= ~(1ULL << (index_tail & 63));
-        dprintf("MPID_nem_ib_cm_ringbuf_tail,incremented to %d\n", MPID_nem_ib_cm_ringbuf_tail);
-        index_tail = (uint16_t) (index_tail + 1) % MPID_NEM_IB_CM_NSEG;
-    }
-
-    /* A slot of the ringbuf is released, so kick progress engine */
-    if (MPID_nem_ib_cm_ringbuf_tail != old_ringbuf_tail) {
-        MPID_nem_ib_cm_progress();
-    }
-
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_RELEASE);
-    return mpi_errno;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_poll
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_cm_poll()
-{
- int mpi_errno = MPI_SUCCESS;
- int ibcom_errno;
- uint16_t i;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_POLL);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_POLL);
- /* scratch pad is freed after receiving CLOSE */
- if (!MPID_nem_ib_scratch_pad) {
- dprintf("cm_poll,MPID_nem_ib_scratch_pad is zero\n");
- goto fn_exit;
- }
-
- /* Wrap-around tolerant by using "!=" */
- for (i = MPID_nem_ib_cm_ringbuf_tail + 1; i != MPID_nem_ib_cm_ringbuf_head; i++) {
-
- /* Memory layout is (CAS-word:SYN#0:SYN#1:...:SYN#N:CMD#0:CMD#1:...CMD#M) */
- void *slot = (MPID_nem_ib_scratch_pad + MPID_NEM_IB_CM_OFF_CMD +
- sizeof(MPID_nem_ib_cm_cmd_t) * ((uint16_t) (i % MPID_NEM_IB_CM_NSEG)));
- volatile uint8_t *head_flag = (uint8_t *) slot;
- if (*head_flag == MPID_NEM_IB_CM_HEAD_FLAG_ZERO) {
- continue;
- } /* Incoming message hasn't arrived */
-
- switch (*head_flag) {
- case MPID_NEM_IB_CM_SYNACK:{
- volatile MPID_nem_ib_cm_cmd_synack_t *synack_tail_flag =
- (MPID_nem_ib_cm_cmd_synack_t *) slot;
- while (synack_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
- /* __asm__ __volatile__("pause;":::"memory"); */
- }
-
- MPID_nem_ib_cm_cmd_synack_t *synack = (MPID_nem_ib_cm_cmd_synack_t *) slot;
- MPID_nem_ib_cm_req_t *req = (MPID_nem_ib_cm_req_t *) synack->initiator_req;
- req->ringbuf_index = synack->initiator_ringbuf_index;
- req->ibcom->incoming_connection_tx -= 1; /* SYNACK */
- dprintf
- ("cm_poll,synack detected!,%d->%d[%d],responder_req=%p,ringbuf_index=%d,tx=%d\n",
- req->responder_rank, MPID_nem_ib_myrank, i,
- synack->responder_req, synack->initiator_ringbuf_index,
- req->ibcom->outstanding_connection_tx);
- /* Deduct it from the packet */
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc,
- connection_state) |= MPID_NEM_IB_CM_REMOTE_QP_RESET;
- /* Skip QP state transition on race condition */
- if (!
- (VC_FIELD
- (MPID_nem_ib_conns[req->responder_rank].vc,
- connection_state) & MPID_NEM_IB_CM_LOCAL_QP_RTS)) {
- ibcom_errno =
- MPID_nem_ib_com_rts(MPID_nem_ib_conns[req->responder_rank].fd,
- synack->qpnum, synack->lid, &(synack->gid));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_rts");
- /* Connect ring buffer */
- ibcom_errno =
- MPID_nem_ib_com_connect_ringbuf(MPID_nem_ib_conns
- [req->responder_rank].fd,
- synack->ringbuf_type, synack->rmem,
- synack->rkey, synack->ringbuf_nslot,
- synack->remote_vc, 1);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_connect_ringbuf");
- dprintf("connect_ringbuf,%d-%d=%d\n",
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc,
- ibcom->sseq_num),
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc,
- ibcom->lsr_seq_num_tail),
- MPID_nem_ib_diff16(VC_FIELD
- (MPID_nem_ib_conns[req->responder_rank].vc,
- ibcom->sseq_num),
- VC_FIELD(MPID_nem_ib_conns
- [req->responder_rank].vc,
- ibcom->lsr_seq_num_tail))
-);
- /* Record state transition for race condition detection */
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc,
- connection_state) |= MPID_NEM_IB_CM_LOCAL_QP_RTS;
- }
-
- req->state = MPID_NEM_IB_CM_ACK1;
- if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
- req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY) {
-
- dprintf("cm_poll,sending ack1,req=%p,ringbuf_index=%d\n", req,
- req->ringbuf_index);
- MPID_nem_ib_cm_cmd_ack1_t *cmd =
- (MPID_nem_ib_cm_cmd_ack1_t *) req->
- ibcom->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
- MPID_NEM_IB_CM_COMPOSE_ACK1(cmd, req, synack->responder_req);
- dprintf
- ("cm_poll,composing ack1,cmd->responder_req=%p,cmd->rmem=%p,rkey=%08x,ringbuf_nslot=%d,remote_vc=%p\n",
- cmd->responder_req, cmd->rmem, cmd->rkey, cmd->ringbuf_nslot,
- cmd->remote_vc);
- MPID_nem_ib_cm_cmd_shadow_t *shadow = (MPID_nem_ib_cm_cmd_shadow_t *)
- MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
- shadow->type = req->state;
- shadow->req = req;
- dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
- mpi_errno =
- MPID_nem_ib_cm_cmd_core(req->responder_rank, shadow, (void *) cmd,
- sizeof(MPID_nem_ib_cm_cmd_ack1_t), 0,
- req->ringbuf_index);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_send_core");
- }
- else {
- MPID_NEM_IB_CM_COMPOSE_ACK1((MPID_nem_ib_cm_cmd_ack1_t *) &
- (req->cmd), req, synack->responder_req);
- MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
- }
-
- *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
- /* Clear all possible tail-flag slots */
- MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
-
- /* Explicitly release CAS word because
- * ConnectX-3 doesn't support safe CAS with PCI device and CPU */
- MPID_nem_ib_cm_cas_release(MPID_nem_ib_conns[req->responder_rank].vc);
- break;
- }
- case MPID_NEM_IB_CM_ALREADY_ESTABLISHED:
- case MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING:
- {
- volatile MPID_nem_ib_cm_cmd_synack_t *synack_tail_flag =
- (MPID_nem_ib_cm_cmd_synack_t *) slot;
- while (synack_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
- /* __asm__ __volatile__("pause;":::"memory"); */
- }
-
- MPID_nem_ib_cm_cmd_synack_t *synack = (MPID_nem_ib_cm_cmd_synack_t *) slot;
- MPID_nem_ib_cm_req_t *req = (MPID_nem_ib_cm_req_t *) synack->initiator_req;
- /* These mean the end of CM-op, so decrement here. */
- req->ibcom->outstanding_connection_tx -= 1;
- req->ibcom->incoming_connection_tx -= 2;
- dprintf
- ("cm_poll,established or connecting detected!,%d->%d[%d],responder_req=%p,ringbuf_index=%d,tx=%d\n",
- req->responder_rank, MPID_nem_ib_myrank, i,
- synack->responder_req, synack->initiator_ringbuf_index,
- req->ibcom->outstanding_connection_tx);
- /* cm_release calls cm_progress, so we have to clear scratch_pad here. */
- *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
- /* Clear all possible tail-flag slots */
- MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
- /* The initiator release the slot for responder */
- MPID_nem_ib_cm_release(req->responder_ringbuf_index);
- /* Kick ask-send commands waiting for connection */
- MPID_nem_ib_ringbuf_progress();
- /* Kick send commands waiting for connection.
- * This might be a dupe when running-ahead transaction kicked it when receiving ACK1. */
- dprintf("cm_poll,kick progress engine for %d\n", req->responder_rank);
- MPID_nem_ib_send_progress(MPID_nem_ib_conns[req->responder_rank].vc);
- /* Let the following connection request go */
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_guard) = 0;
- /* Call cm_request_release twice.
- * If ref_count == 2, the memory of request is released here.
- * If ref_count == 3, the memory of request will be released on draining SCQ of SYN. */
- MPID_nem_ib_cm_request_release(req);
- MPID_nem_ib_cm_request_release(req);
-
- /* Explicitly release CAS word because
- * ConnectX-3 doesn't support safe CAS with PCI device and CPU */
- MPID_nem_ib_cm_cas_release(MPID_nem_ib_conns[req->responder_rank].vc);
- break;
- }
- case MPID_NEM_IB_CM_ACK1:{
- volatile MPID_nem_ib_cm_cmd_ack1_t *ack1_tail_flag =
- (MPID_nem_ib_cm_cmd_ack1_t *) slot;
- while (ack1_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
- /* __asm__ __volatile__("pause;":::"memory"); */
- }
-
- MPID_nem_ib_cm_cmd_ack1_t *ack1 = (MPID_nem_ib_cm_cmd_ack1_t *) slot;
- MPID_nem_ib_cm_req_t *req = (MPID_nem_ib_cm_req_t *) ack1->responder_req;
- req->ibcom->incoming_connection_tx -= 1; /* ACK1 */
- dprintf("cm_poll,ack1 detected!,%d->%d[%d],responder_req=%p,tx=%d\n",
- req->initiator_rank, MPID_nem_ib_myrank, i,
- ack1->responder_req, req->ibcom->outstanding_connection_tx);
- /* Deduct it from the packet */
- VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc,
- connection_state) |=
- (MPID_NEM_IB_CM_REMOTE_QP_RESET | MPID_NEM_IB_CM_REMOTE_QP_RTS);
- /* Skip QP createion on race condition */
- if (!
- (VC_FIELD
- (MPID_nem_ib_conns[req->initiator_rank].vc,
- connection_state) & MPID_NEM_IB_CM_LOCAL_QP_RTS)) {
- ibcom_errno =
- MPID_nem_ib_com_rts(MPID_nem_ib_conns[req->initiator_rank].fd,
- ack1->qpnum, ack1->lid, &(ack1->gid));
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_rts");
- /* Connect ring buffer */
- ibcom_errno =
- MPID_nem_ib_com_connect_ringbuf(MPID_nem_ib_conns
- [req->initiator_rank].fd,
- ack1->ringbuf_type, ack1->rmem,
- ack1->rkey, ack1->ringbuf_nslot,
- ack1->remote_vc, 1);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_connect_ringbuf");
- dprintf("connect_ringbuf,%d-%d=%d\n",
- VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc,
- ibcom->sseq_num),
- VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc,
- ibcom->lsr_seq_num_tail),
- MPID_nem_ib_diff16(VC_FIELD
- (MPID_nem_ib_conns[req->initiator_rank].vc,
- ibcom->sseq_num),
- VC_FIELD(MPID_nem_ib_conns
- [req->initiator_rank].vc,
- ibcom->lsr_seq_num_tail))
-);
- MPID_nem_ib_vc_onconnect(MPID_nem_ib_conns[req->initiator_rank].vc);
- /* Record state transition for race condition detection */
- VC_FIELD(MPID_nem_ib_conns[req->initiator_rank].vc,
- connection_state) |= MPID_NEM_IB_CM_LOCAL_QP_RTS;
- }
-
- req->state = MPID_NEM_IB_CM_ACK2;
- if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
- req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY) {
-
- dprintf
- ("cm_poll,sending ack2,req=%p,ringbuf_index=%d,initiator_rank=%d,tx=%d\n",
- req, req->ringbuf_index, req->initiator_rank,
- req->ibcom->outstanding_connection_tx);
- MPID_nem_ib_cm_cmd_ack2_t *cmd =
- (MPID_nem_ib_cm_cmd_ack2_t *) req->
- ibcom->icom_mem[MPID_NEM_IB_COM_SCRATCH_PAD_FROM];
- MPID_NEM_IB_CM_COMPOSE_ACK2(cmd, ack1->initiator_req);
- MPID_nem_ib_cm_cmd_shadow_t *shadow = (MPID_nem_ib_cm_cmd_shadow_t *)
- MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
- shadow->type = req->state;
- shadow->req = req;
- dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
- mpi_errno =
- MPID_nem_ib_cm_cmd_core(req->initiator_rank, shadow, (void *) cmd,
- sizeof(MPID_nem_ib_cm_cmd_ack2_t), 0,
- req->ringbuf_index);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_send_core");
- }
- else {
- MPID_NEM_IB_CM_COMPOSE_ACK2((MPID_nem_ib_cm_cmd_ack2_t *) &
- (req->cmd), ack1->initiator_req);
- MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
- }
-
- /* cm_release calls cm_progress, so we have to clear scratch_pad here. */
- *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
- /* Clear all possible tail-flag slots */
- MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
- /* The responder release the slot for initiator */
- MPID_nem_ib_cm_release(req->initiator_ringbuf_index);
- /* Kick ask-send commands waiting for connection */
- MPID_nem_ib_ringbuf_progress();
- /* Kick send commands waiting for connection.
- * This might be a dupe when running-ahead transaction kicked it when receiving ACK2. */
- dprintf("cm_poll,kick progress engine for %d\n", req->initiator_rank);
- MPID_nem_ib_send_progress(MPID_nem_ib_conns[req->initiator_rank].vc);
- }
- //goto common_tail;
- break;
- case MPID_NEM_IB_CM_ACK2:{
- volatile MPID_nem_ib_cm_cmd_ack2_t *ack2_tail_flag =
- (MPID_nem_ib_cm_cmd_ack2_t *) slot;
- while (ack2_tail_flag->tail_flag.tail_flag != MPID_NEM_IB_COM_MAGIC) {
- /* __asm__ __volatile__("pause;":::"memory"); */
- }
- MPID_nem_ib_cm_cmd_ack2_t *ack2 = (MPID_nem_ib_cm_cmd_ack2_t *) slot;
- MPID_nem_ib_cm_req_t *req = (MPID_nem_ib_cm_req_t *) ack2->initiator_req;
- dprintf("cm_poll,ack2 detected!,%d->%d[%d],connection_tx=%d\n",
- req->responder_rank, MPID_nem_ib_myrank, i,
- req->ibcom->outstanding_connection_tx);
- req->ibcom->incoming_connection_tx -= 1; /* ACK2 */
- /* Deduct it from the packet */
- if (!
- (VC_FIELD
- (MPID_nem_ib_conns[req->responder_rank].vc,
- connection_state) & MPID_NEM_IB_CM_REMOTE_QP_RTS)) {
- MPID_nem_ib_vc_onconnect(MPID_nem_ib_conns[req->responder_rank].vc);
- /* Record state transition for race condition detection */
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc,
- connection_state) |= MPID_NEM_IB_CM_REMOTE_QP_RTS;
- }
-
- /* cm_release calls cm_progress, so we have to clear scratch_pad here. */
- *head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
- /* Clear all possible tail-flag slots */
- MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
- /* The initiator release the slot for responder */
- MPID_nem_ib_cm_release(req->responder_ringbuf_index);
- /* Acquire ring-buffer slot now that it's connected if requested so */
- if (req->ask_on_connect &&
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc,
- ibcom->local_ringbuf_type) == MPID_NEM_IB_RINGBUF_SHARED) {
- dprintf("cm_poll,ack2,ask on connect\n");
- mpi_errno =
- MPID_nem_ib_ringbuf_ask_fetch(MPID_nem_ib_conns[req->responder_rank].vc);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_ringbuf_ask_fetch");
- }
-
- /* Kick ask-send commands waiting for connection */
- MPID_nem_ib_ringbuf_progress();
- /* Kick send commands waiting for connection.
- * This might be a dupe when running-ahead transaction kicked it when receiving ACK1. */
- dprintf("cm_poll,kick progress engine for %d\n", req->responder_rank);
- MPID_nem_ib_send_progress(MPID_nem_ib_conns[req->responder_rank].vc);
- /* Let the following connection request go */
- VC_FIELD(MPID_nem_ib_conns[req->responder_rank].vc, connection_guard) = 0;
- /* Finalize protocol because there is no referer in cm_poll and sendq.
- * Note that there might be one which sent ACK1 in cm_drain_scq. */
- MPID_nem_ib_cm_request_release(req);
- }
- //common_tail:
- //*head_flag = MPID_NEM_IB_CM_HEAD_FLAG_ZERO; /* Clear head-flag */
- ///* Clear all possible tail-flag slots */
- //MPID_NEM_IB_CM_CLEAR_TAIL_FLAGS(slot);
- break;
- default:
- printf("unknown connection command\n");
- MPIU_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_poll");
- break;
- }
- }
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_POLL);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#endif
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_ringbuf_alloc
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_ringbuf_alloc(MPIDI_VC_t * vc)
-{
- int mpi_errno = MPI_SUCCESS;
- int i;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_ALLOC);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_ALLOC);
- if (!MPID_nem_ib_ringbuf) {
- MPID_nem_ib_ringbuf = MPIU_Calloc(1, sizeof(MPID_nem_ib_ringbuf_t) * MPID_NEM_IB_NRINGBUF);
- MPIU_ERR_CHKANDJUMP(!MPID_nem_ib_ringbuf, mpi_errno, MPI_ERR_OTHER, "**malloc");
- }
-
- int found = 0;
- /* [MPID_NEM_IB_NRINGBUF-1] holds shared ring buffer */
- for (i = 0; i < MPID_NEM_IB_NRINGBUF - 1; i++) {
- if (((MPID_nem_ib_ringbuf_acquired[i / 64] >> (i & 63)) & 1) == 0) {
- found = 1;
- break;
- }
- }
-
- if (found) {
- MPID_nem_ib_ringbuf_acquired[i / 64] |= (1ULL << (i & 63));
- if (!MPID_nem_ib_ringbuf[i].start) {
- MPID_nem_ib_ringbuf[i].type = MPID_NEM_IB_RINGBUF_EXCLUSIVE;
- MPID_nem_ib_ringbuf[i].start = MPID_nem_ib_rdmawr_to_alloc(MPID_NEM_IB_RINGBUF_NSLOT);
- MPIU_ERR_CHKANDJUMP(!MPID_nem_ib_ringbuf[i].start, mpi_errno,
- MPI_ERR_OTHER, "**MPID_nem_ib_rdma_to_alloc");
- MPID_nem_ib_ringbuf[i].nslot = MPID_NEM_IB_RINGBUF_NSLOT;
- memset(MPID_nem_ib_ringbuf[i].remote_released, 0,
- (MPID_NEM_IB_RINGBUF_NSLOT + 63) / 64);
- MPID_nem_ib_ringbuf_allocated[i / 64] |= (1ULL << (i & 63));
- }
- VC_FIELD(vc, ibcom->remote_ringbuf) = &MPID_nem_ib_ringbuf[i];
- dprintf("ringbuf_alloc,start=%p\n", MPID_nem_ib_ringbuf[i].start);
- VC_FIELD(vc, ibcom->rsr_seq_num_poll) = 0;
- VC_FIELD(vc, ibcom->rsr_seq_num_tail) = -1;
- VC_FIELD(vc, ibcom->rsr_seq_num_tail_last_sent) = -1;
- MPID_nem_ib_ringbuf[i].vc = vc;
- dprintf
- ("ringbuf_alloc,i=%d,pg_rank=%d,ibcom=%p,ibcom->remote_ringbuf=%p\n",
- i, vc->pg_rank, VC_FIELD(vc, ibcom), VC_FIELD(vc, ibcom->remote_ringbuf));
- }
- else {
- if (!MPID_nem_ib_ringbuf[i].start) {
- MPID_nem_ib_ringbuf[i].type = MPID_NEM_IB_RINGBUF_SHARED;
- MPID_nem_ib_ringbuf[i].start = MPID_nem_ib_rdmawr_to_alloc(MPID_NEM_IB_RINGBUF_NSLOT);
- MPIU_ERR_CHKANDJUMP(!MPID_nem_ib_ringbuf[i].start, mpi_errno,
- MPI_ERR_OTHER, "**MPID_nem_ib_rdma_to_alloc");
- MPID_nem_ib_ringbuf[i].nslot = MPID_NEM_IB_RINGBUF_NSLOT;
- memset(MPID_nem_ib_ringbuf[i].remote_released, 0,
- (MPID_NEM_IB_RINGBUF_NSLOT + 63) / 64);
- MPID_nem_ib_ringbuf_allocated[i / 64] |= (1ULL << (i & 63));
- }
- MPID_nem_ib_ringbuf[i].ref_count++;
- VC_FIELD(vc, ibcom->remote_ringbuf) = &MPID_nem_ib_ringbuf[i];
- dprintf("ringbuf_alloc,not found\n");
- }
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_ALLOC);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_ringbuf_free
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_ringbuf_free(MPIDI_VC_t * vc)
-{
- int mpi_errno = MPI_SUCCESS;
- int i;
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_FREE);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_FREE);
- /* No ring-buffer is allocated */
- if (!VC_FIELD(vc, ibcom->remote_ringbuf)) {
- goto fn_exit;
- }
-
- int index =
- ((uint8_t *) VC_FIELD(vc, ibcom->remote_ringbuf) -
- (uint8_t *) & MPID_nem_ib_ringbuf[0]) / sizeof(MPID_nem_ib_ringbuf_t);
- dprintf("ringbuf_free,index=%d\n", index);
- switch (VC_FIELD(vc, ibcom->remote_ringbuf)->type) {
- case MPID_NEM_IB_RINGBUF_EXCLUSIVE:
- dprintf("ringbuf_free,start=%p\n", VC_FIELD(vc, ibcom->remote_ringbuf)->start);
- MPID_nem_ib_rdmawr_to_free(VC_FIELD(vc, ibcom->remote_ringbuf)->start,
- MPID_NEM_IB_RINGBUF_NSLOT);
- VC_FIELD(vc, ibcom->remote_ringbuf)->start = NULL; /* initialize for re-allocate */
- MPID_nem_ib_ringbuf_allocated[index / 64] &= ~(1ULL << (index & 63));
- VC_FIELD(vc, ibcom->remote_ringbuf) = NULL;
- MPID_nem_ib_ringbuf_acquired[index / 64] &= ~(1ULL << (index & 63));
- dprintf("ringbuf_free,exclucsive,allocated=%0lx\n",
- MPID_nem_ib_ringbuf_allocated[index / 64]);
- break;
- case MPID_NEM_IB_RINGBUF_SHARED:
- dprintf("ringbuf_free,shared,ref_count=%d\n",
- VC_FIELD(vc, ibcom->remote_ringbuf)->ref_count);
- MPIU_Assert(VC_FIELD(vc, ibcom->remote_ringbuf)->ref_count > 0);
- if (--VC_FIELD(vc, ibcom->remote_ringbuf)->ref_count == 0) {
- MPID_nem_ib_rdmawr_to_free(VC_FIELD(vc, ibcom->remote_ringbuf)->start,
- MPID_NEM_IB_RINGBUF_NSLOT);
- VC_FIELD(vc, ibcom->remote_ringbuf)->start = NULL; /* initialize for re-allocate */
- MPID_nem_ib_ringbuf_allocated[index / 64] &= ~(1ULL << (index & 63));
- dprintf("ringbuf_free,shared,allocated=%0lx\n",
- MPID_nem_ib_ringbuf_allocated[index / 64]);
- }
- VC_FIELD(vc, ibcom->remote_ringbuf) = NULL;
- default:
- printf("unknown ring-buffer type\n");
- MPIU_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_free");
- break;
- }
-
- int found = 0;
- for (i = 0; i < (MPID_NEM_IB_NRINGBUF + 63) / 64; i++) {
- if (MPID_nem_ib_ringbuf_allocated[i] != 0) {
- found = 1;
- break;
- }
- }
-
- if (!found) {
- MPIU_Free(MPID_nem_ib_ringbuf);
- MPID_nem_ib_ringbuf = NULL;
- }
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_FREE);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
deleted file mode 100644
index 78457eb..0000000
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
+++ /dev/null
@@ -1,386 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
-/*
- * (C) 2001-2009 University of Tokyo
- * (C) 2012 NEC Corporation
- * (C) 2012 University of Tokyo
- * (C) 2014-2015 RIKEN AICS
- *
- * See COPYRIGHT in top-level directory.
- */
-
-#include <unistd.h>
-#include <stdlib.h>
-#include "ib_ibcom.h"
-
-//#define MPID_NEM_IB_DEBUG_REG_MR
-#ifdef MPID_NEM_IB_DEBUG_REG_MR
-#define dprintf printf
-#else
-#define dprintf(...)
-#endif
-
-/* cache size of ibv_reg_mr */
-#define MPID_NEM_IB_COM_REG_MR_NLINE 4096
-#define MPID_NEM_IB_COM_REG_MR_NWAY 1024
-
-#define MPID_NEM_IB_COM_REG_MR_SZPAGE 4096
-#define MPID_NEM_IB_COM_REG_MR_LOGSZPAGE 12
-
-static int ref_count;
-
-/* Allocator using reference count at the head of
- aligned memory area */
-
-#define MPID_NEM_IB_NIALLOCID 32
-typedef struct {
- char *next;
-} free_list_t;
-
-#define MPID_NEM_IB_SZARENA 4096
-#define MPID_NEM_IB_CLUSTER_SIZE (MPID_NEM_IB_SZARENA/sz)
-#define MPID_NEM_IB_ROUNDUP64(addr, align) ((addr + align - 1) & ~((unsigned long)align - 1))
-#define MPID_NEM_IB_NCLUST_SLAB 1
-#define MPID_NEM_IB_COM_AALLOC_ID_MRCACHE 0
-
-static inline void *aalloc(size_t sz, int id)
-{
-#if 1 /* debug */
- return MPIU_Malloc(sz);
-#else
- char *p = free_list_front[id];
- if ((unsigned long) p & (MPID_NEM_IB_SZARENA - 1)) {
- free_list_front[id] += sz;
- return p;
- }
- else {
- char *q, r;
- if (arena_flist[id]) {
- q = arena_flist[id];
- arena_flist[id] = ((free_list_t *) arena_flist[id])->next;
- }
- else {
- q = mmap(NULL,
- MPID_NEM_IB_ROUNDUP64(MPID_NEM_IB_SZARENA * MPID_NEM_IB_NCLUST_SLAB, 4096),
- PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-#if MPID_NEM_IB_NCLUST_SLAB > 1
- arena_flist[id] = q + MPID_NEM_IB_SZARENA;
- for (p = arena_flist[id]; p < q + (MPID_NEM_IB_NCLUST_SLAB - 1) * MPID_NEM_IB_SZARENA;
- p += MPID_NEM_IB_SZARENA) {
- ((free_list_t *) p)->next = p + MPID_NEM_IB_SZARENA;
- }
- ((free_list_t *) p)->next = 0;
-#endif
- }
- *((int *) q) = MPID_NEM_IB_CLUSTER_SIZE - 1;
- // dprintf("q=%llx\n", q);
- q += sz + (MPID_NEM_IB_SZARENA % sz);
- free_list_front[id] = q + sz;
- return q;
- }
-#endif
-}
-
-static inline void afree(const void *p, int id)
-{
-#if 1 /* debug */
- return MPIU_Free((void *) p);
-#else
- p = (void *) ((unsigned long) p & ~(MPID_NEM_IB_SZARENA - 1));
- if (!(--(*((int *) p)))) {
- ((free_list_t *) p)->next = arena_flist[id];
- arena_flist[id] = (char *) p;
- }
-#endif
-}
-
-static struct MPID_nem_ib_com_reg_mr_listnode_t MPID_nem_ib_com_reg_mr_global_cache;
-static struct MPID_nem_ib_com_reg_mr_listnode_t
- MPID_nem_ib_com_reg_mr_cache[MPID_NEM_IB_COM_REG_MR_NLINE];
-
-static inline int MPID_nem_ib_com_hash_func(char *addr)
-{
- unsigned int v = (unsigned int) (unsigned long) addr;
- //v = v >> MPID_NEM_IB_COM_REG_MR_LOGSZPAGE; /* assume it is page aligned */
- v = v & (MPID_NEM_IB_COM_REG_MR_NLINE - 1);
- return (int) v;
-}
-
-static void MPID_nem_ib_com_reg_mr_insert(struct MPID_nem_ib_com_reg_mr_listnode_t *c,
- struct MPID_nem_ib_com_reg_mr_listnode_t *e)
-{
- struct MPID_nem_ib_com_reg_mr_listnode_t *next;
- struct MPID_nem_ib_com_reg_mr_listnode_t *prev;
- prev = c;
- next = prev->lru_next;
- e->lru_next = next;
- e->lru_prev = prev;
- next->lru_prev = e;
- prev->lru_next = e;
-}
-
-static void MPID_nem_ib_com_reg_mr_unlink(struct MPID_nem_ib_com_reg_mr_listnode_t *e)
-{
- struct MPID_nem_ib_com_reg_mr_listnode_t *next, *prev;
- next = e->lru_next;
- prev = e->lru_prev;
- next->lru_prev = prev;
- prev->lru_next = next;
-}
-
-static inline void __lru_queue_display()
-{
- struct MPID_nem_ib_com_reg_mr_cache_entry_t *p;
- int i = 0;
- for (i = 0; i < MPID_NEM_IB_COM_REG_MR_NLINE; i++) {
- dprintf("---- hash %d\n", i);
- for (p = (struct MPID_nem_ib_com_reg_mr_cache_entry_t *)
- MPID_nem_ib_com_reg_mr_cache[i].lru_next;
- p != (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) &MPID_nem_ib_com_reg_mr_cache[i];
- p = (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) p->lru_next) {
- if (p && p->addr) {
- dprintf("-------- p=%p,addr=%p,len=%ld,refc=%d,lru_next=%p\n", p, p->addr, p->len,
- p->refc, p->lru_next);
- }
- else {
- dprintf("-------- p=%p,lru_next=%p\n", p, p->lru_next);
- }
- }
- }
-}
-
-void *MPID_nem_ib_com_reg_mr_fetch(void *addr, long len,
- enum ibv_access_flags additional_flags, int mode)
-{
- int ibcom_errno;
- int key;
- struct MPID_nem_ib_com_reg_mr_cache_entry_t *e;
- static unsigned long long num_global_cache = 0ULL;
-
- /* we can't change addr because ibv_post_send assumes mr->host_addr (output of this function)
- * must have an exact mirror value of addr (input of this function) */
- void *addr_aligned = addr;
- long len_aligned = len;
-
- key = MPID_nem_ib_com_hash_func(addr);
-
- dprintf("[MrCache] addr=%p, len=%ld\n", addr, len);
- dprintf("[MrCache] aligned addr=%p, len=%ld\n", addr_aligned, len_aligned);
-
- //__lru_queue_display();
- int way = 0;
- for (e =
- (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) MPID_nem_ib_com_reg_mr_cache[key].lru_next;
- e != (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) &MPID_nem_ib_com_reg_mr_cache[key];
- e = (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) e->lru_next, way++) {
- //dprintf("e=%p, e->hash_next=%p\n", e, e->lru_next);
-
- if (e->addr <= addr_aligned &&
- (uint8_t *) addr_aligned + len_aligned <= (uint8_t *) e->addr + e->len) {
- //dprintf
- //("MPID_nem_ib_com_reg_mr_fetch,hit,entry addr=%p,len=%d,mr addr=%p,len=%ld,requested addr=%p,len=%d\n",
- //e->addr, e->len, e->mr->addr, e->mr->length, addr, len);
- goto hit;
- }
- }
-
- // miss
-
- e = aalloc(sizeof(struct MPID_nem_ib_com_reg_mr_cache_entry_t),
- MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
- /* reference counter is used when evicting entry */
- e->refc = 1;
-
- dprintf("MPID_nem_ib_com_reg_mr_fetch,miss,addr=%p,len=%ld\n", addr_aligned, len_aligned);
- /* register memory */
- ibcom_errno = MPID_nem_ib_com_reg_mr(addr_aligned, len_aligned, &e->mr, additional_flags);
- if (ibcom_errno != 0) {
- /* ib_com_reg_mr returns the errno of ibv_reg_mr */
- if (ibcom_errno == ENOMEM) {
- /* deregister memory region unused and re-register new one */
- struct MPID_nem_ib_com_reg_mr_listnode_t *ptr;
- struct MPID_nem_ib_com_reg_mr_cache_entry_t *victim;
- unsigned long long dereg_total = 0;
- int reg_success = 0;
- for (ptr = MPID_nem_ib_com_reg_mr_global_cache.lru_prev;
- ptr !=
- (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_global_cache;)
- {
- victim = list_entry(ptr, struct MPID_nem_ib_com_reg_mr_cache_entry_t, g_lru);
- ptr = ptr->lru_prev;
- /* 'refc == 0' means this cache_entry is not used */
- if (victim && victim->addr && (victim->refc == 0)) {
- MPID_nem_ib_com_reg_mr_unlink((struct MPID_nem_ib_com_reg_mr_listnode_t *)
- victim);
- MPID_nem_ib_com_reg_mr_unlink(&(victim->g_lru));
-
- ibcom_errno = MPID_nem_ib_com_dereg_mr(victim->mr);
- if (ibcom_errno) {
- printf("mrcache,MPID_nem_ib_com_dereg_mr\n");
- afree(e, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
- goto fn_fail;
- }
- dereg_total += (unsigned long long) victim->len;
- afree(victim, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
- num_global_cache--;
-
- /* end loop if the total length released exceeds the requested */
- if (dereg_total >= len_aligned) {
- dprintf("ib_com_reg_mr_fetch,dereg=%llu,len=%ld\n", dereg_total,
- len_aligned);
- /* re-registraion */
- ibcom_errno =
- MPID_nem_ib_com_reg_mr(addr_aligned, len_aligned, &e->mr,
- additional_flags);
- if (ibcom_errno == 0) {
- /* ibv_reg_mr success */
- reg_success = 1;
- break;
- }
- }
- }
- }
-
- if (reg_success == 0) {
- fprintf(stderr, "mrcache,MPID_nem_ib_com_reg_mr,failed\n");
- afree(e, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
- goto fn_fail;
- }
- }
- else {
- /* errno is not ENOMEM */
- fprintf(stderr, "mrcache,MPID_nem_ib_com_reg_mr,errno=%d\n", ibcom_errno);
- afree(e, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
- goto fn_fail;
- }
- }
- e->addr = addr_aligned;
- e->len = len_aligned;
-
- //dprintf("MPID_nem_ib_com_reg_mr_fetch,fill,e=%p,key=%d,mr=%p,mr addr=%p,len=%ld,lkey=%08x,rkey=%08x\n", e,
- //key, e->mr, e->mr->addr, e->mr->length, e->mr->lkey, e->mr->rkey);
-
- /* register to cache */
- MPID_nem_ib_com_reg_mr_insert(&MPID_nem_ib_com_reg_mr_cache[key],
- (struct MPID_nem_ib_com_reg_mr_listnode_t *) e);
- if (mode != MPID_NEM_IB_COM_REG_MR_STICKY) {
- /* register to global-cache */
- num_global_cache++;
- MPID_nem_ib_com_reg_mr_insert(&MPID_nem_ib_com_reg_mr_global_cache, &(e->g_lru));
- }
-
- //__lru_queue_display();
-
- goto fn_exit;
-
- hit:
-
- /* reference counter is used when evicting entry */
- e->refc++;
-
- /* move to head of the list */
- if (e !=
- (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) MPID_nem_ib_com_reg_mr_cache[key].lru_next)
- {
- MPID_nem_ib_com_reg_mr_unlink((struct MPID_nem_ib_com_reg_mr_listnode_t *) e);
- MPID_nem_ib_com_reg_mr_insert(&MPID_nem_ib_com_reg_mr_cache[key],
- (struct MPID_nem_ib_com_reg_mr_listnode_t *) e);
- }
-
- if (mode != MPID_NEM_IB_COM_REG_MR_STICKY) {
- /* move to head of the list in global-cache */
- MPID_nem_ib_com_reg_mr_unlink(&(e->g_lru));
- MPID_nem_ib_com_reg_mr_insert(&MPID_nem_ib_com_reg_mr_global_cache, &(e->g_lru));
- }
- //dprintf("[MrCache] reuse e=%p,key=%d,mr=%p,refc=%d,addr=%p,len=%ld,lkey=%08x,rkey=%08x\n", e,
- //key, e->mr, e->refc, e->mr->addr, e->mr->length, e->mr->lkey, e->mr->rkey);
-
- //__lru_queue_display();
-
- fn_exit:
- if (mode == MPID_NEM_IB_COM_REG_MR_STICKY)
- return e->mr;
- else
- return e;
- fn_fail:
- goto fn_exit;
-}
-
-void MPID_nem_ib_com_reg_mr_release(struct MPID_nem_ib_com_reg_mr_cache_entry_t *entry)
-{
- entry->refc--;
- MPIU_Assert(ref_count >= 0);
-}
-
-int MPID_nem_ib_com_register_cache_init()
-{
- int ibcom_errno = 0;
- int i;
-
- ref_count++;
- dprintf("cache_init,ref_count=%d\n", ref_count);
-
- if (ref_count == 1) {
- /* Using the address to the start node to express the end of the list
- * instead of using NULL */
- for (i = 0; i < MPID_NEM_IB_COM_REG_MR_NLINE; i++) {
- MPID_nem_ib_com_reg_mr_cache[i].lru_next =
- (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_cache[i];
- MPID_nem_ib_com_reg_mr_cache[i].lru_prev =
- (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_cache[i];
- }
- MPID_nem_ib_com_reg_mr_global_cache.lru_next =
- (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_global_cache;
- MPID_nem_ib_com_reg_mr_global_cache.lru_prev =
- (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_global_cache;
-
- dprintf("[MrCache] cache initializes %d entries\n", MPID_NEM_IB_COM_REG_MR_NLINE);
- }
-
- fn_exit:
- return ibcom_errno;
- //fn_fail:
- goto fn_exit;
-}
-
-int MPID_nem_ib_com_register_cache_release()
-{
- int ibcom_errno = 0;
- int ib_errno;
- struct MPID_nem_ib_com_reg_mr_cache_entry_t *p;
- int i = 0, cnt = 0;
-
- dprintf("cache_release,ref_count=%d\n", ref_count);
-
- MPIU_Assert(ref_count > 0);
- if (--ref_count > 0) {
- goto fn_exit;
- }
-
- for (i = 0; i < MPID_NEM_IB_COM_REG_MR_NLINE; i++) {
- for (p = (struct MPID_nem_ib_com_reg_mr_cache_entry_t *)
- MPID_nem_ib_com_reg_mr_cache[i].lru_next;
- p !=
- (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) &MPID_nem_ib_com_reg_mr_cache[i];) {
- if (p && p->addr) {
- ib_errno = MPID_nem_ib_com_dereg_mr(p->mr);
- MPID_NEM_IB_COM_ERR_CHKANDJUMP(ib_errno, -1, printf("MPID_nem_ib_com_dereg_mr"));
- struct MPID_nem_ib_com_reg_mr_cache_entry_t *p_old = p;
- p = (struct MPID_nem_ib_com_reg_mr_cache_entry_t *) p->lru_next;
- afree(p_old, MPID_NEM_IB_COM_AALLOC_ID_MRCACHE);
- cnt++;
- }
- }
- MPID_nem_ib_com_reg_mr_cache[i].lru_next =
- (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_cache[i];
- MPID_nem_ib_com_reg_mr_cache[i].lru_prev =
- (struct MPID_nem_ib_com_reg_mr_listnode_t *) &MPID_nem_ib_com_reg_mr_cache[i];
- }
-
- //__lru_queue_display();
-
- dprintf("[MrCache] cache destroyed %d entries\n", cnt);
- fn_exit:
- return ibcom_errno;
- fn_fail:
- goto fn_exit;
-}
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
deleted file mode 100644
index 701cc19..0000000
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
+++ /dev/null
@@ -1,1910 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
-/*
- * (C) 2012 NEC Corporation
- * (C) 2014 RIKEN AICS
- * (C) 2014-2015 RIKEN AICS
- *
- * See COPYRIGHT in top-level directory.
- */
-
-#include "ib_impl.h"
-
-//#define MPID_NEM_IB_DEBUG_SEND
-#ifdef dprintf /* avoid redefinition with src/mpid/ch3/include/mpidimpl.h */
-#undef dprintf
-#endif
-#ifdef MPID_NEM_IB_DEBUG_SEND
-#define dprintf printf
-#else
-#define dprintf(...)
-#endif
-
-static int entered_send_progress = 0;
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_iSendContig_core
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int MPID_nem_ib_iSendContig_core(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
- MPIDI_msg_sz_t hdr_sz, void *data, MPIDI_msg_sz_t data_sz)
-{
- int mpi_errno = MPI_SUCCESS;
- int ibcom_errno;
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- MPID_nem_ib_pkt_lmt_rts_t pkt_rts;
- void *prefix;
- int sz_prefix;
- void *s_data;
- long s_data_sz;
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_ISENDCONTIG_CORE);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_ISENDCONTIG_CORE);
-
- /* piggy-back SR occupancy info might copy and modify given header */
-
- //dprintf("isendcontig,rsr_seq_num_tail=%d,rsr_seq_num_tail_last_sent=%d\n", vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent);
-
- int notify_rate;
- ibcom_errno = MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(vc_ib->sc->fd, &notify_rate);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_sq_occupancy_notify_rate_get");
-
- /* send RDMA-write-to buffer occupancy information */
- /* embed SR occupancy information and remember the last one sent */
- if (MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent)
- > notify_rate) {
- prefix = NULL;
- sz_prefix = 0;
- }
- else {
- prefix = NULL;
- sz_prefix = 0;
- }
-
- s_data = data;
- s_data_sz = data_sz;
-
- /* If request length is too long, create LMT packet */
- if (MPID_NEM_IB_NETMOD_HDR_SIZEOF(vc_ib->ibcom->local_ringbuf_type)
- + sizeof(MPIDI_CH3_Pkt_t) + data_sz
- > MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_netmod_trailer_t)) {
- pkt_rts.type = MPIDI_NEM_PKT_NETMOD;
-
- pkt_rts.subtype = MPIDI_NEM_IB_PKT_RMA_LMT_RTS;
-
- void *write_from_buf = data;
-
- uint32_t max_msg_sz;
- MPID_nem_ib_com_get_info_conn(vc_ib->sc->fd, MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ,
- &max_msg_sz, sizeof(uint32_t));
-
- MPID_nem_ib_rma_lmt_cookie_t *s_cookie_buf =
- (MPID_nem_ib_rma_lmt_cookie_t *) MPIU_Malloc(sizeof(MPID_nem_ib_rma_lmt_cookie_t));
-
- sreq->ch.s_cookie = s_cookie_buf;
-
- s_cookie_buf->tail =
- *((uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint8_t)));
-
- int post_num = (data_sz + (long) max_msg_sz - 1) / (long) max_msg_sz;
- long length;
- if (post_num > 1) {
- length = max_msg_sz;
- }
- else {
- length = data_sz;
- }
-
- /* put IB rkey */
- struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
- MPID_nem_ib_com_reg_mr_fetch(write_from_buf, length, 0,
- MPID_NEM_IB_COM_REG_MR_GLOBAL);
- MPIU_ERR_CHKANDJUMP(!mr_cache, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
- struct ibv_mr *mr = mr_cache->mr;
- REQ_FIELD(sreq, lmt_mr_cache) = (void *) mr_cache;
-#ifdef HAVE_LIBDCFA
- s_cookie_buf->addr = (void *) mr->host_addr;
-#else
- s_cookie_buf->addr = write_from_buf;
-#endif
- s_cookie_buf->rkey = mr->rkey;
- s_cookie_buf->len = data_sz;
- s_cookie_buf->sender_req_id = sreq->handle;
- s_cookie_buf->max_msg_sz = max_msg_sz;
- s_cookie_buf->seg_seq_num = 1;
- s_cookie_buf->seg_num = post_num;
-
- pkt_rts.seg_seq_num = 1;
-
- REQ_FIELD(sreq, buf.from) = write_from_buf;
- REQ_FIELD(sreq, data_sz) = data_sz;
- REQ_FIELD(sreq, seg_seq_num) = 1; // only send 1st-segment, even if there are some segments.
- REQ_FIELD(sreq, seg_num) = post_num;
- REQ_FIELD(sreq, max_msg_sz) = max_msg_sz;
-
- /* set for ib_com_isend */
- prefix = (void *) &pkt_rts;
- sz_prefix = sizeof(MPIDI_CH3_Pkt_t);
- s_data = (void *) s_cookie_buf;
- s_data_sz = sizeof(MPID_nem_ib_rma_lmt_cookie_t);
-
- /* Release Request, when sender receives DONE packet. */
- int incomplete;
- MPIDI_CH3U_Request_increment_cc(sreq, &incomplete); // decrement in drain_scq and pkt_rma_lmt_getdone
- }
-
- REQ_FIELD(sreq, lmt_pack_buf) = NULL;
-
- /* packet handlers including MPIDI_CH3_PktHandler_EagerSend and MPID_nem_handle_pkt assume this */
- hdr_sz = sizeof(MPIDI_CH3_Pkt_t);
-
- /* send myrank as wr_id so that receiver can find vc using MPID_nem_ib_conns in poll */
- /* packet handler of MPIDI_CH3_PKT_EAGER_SEND uses sizeof(MPIDI_CH3_Pkt_t), so ignoring hdr_sz */
-
- /* MPIDI_CH3_ReqHandler_GetSendRespComplete, drain_scq decrement it */
- if (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_GET_RESP) {
- // MPIR_Request_add_ref(sreq);
- //printf("isendcontig_core,MPIDI_CH3_PKT_GET_RESP,ref_count=%d\n", sreq->ref_count);
- }
-
- /* increment cc because PktHandler_EagerSyncAck, ssend.c, drain_scq decrement it */
- if (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_EAGER_SYNC_SEND) {
- //MPIR_Request_add_ref(sreq);
- }
- if (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_GET) {
- //printf("isendcontig_core,MPIDI_CH3_PKT_GET,ref_count=%d\n", sreq->ref_count);
- }
- if (hdr && ((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_ACCUMULATE) {
- dprintf("isendcontig_core,MPIDI_CH3_PKT_ACCUMULATE,ref_count=%d\n", sreq->ref_count);
- }
-
-#ifdef MPID_NEM_IB_DEBUG_SEND
- int msg_type = MPIDI_Request_get_msg_type(sreq);
-#endif
-
- dprintf
- ("isendcontig_core,sreq=%p,prefix=%p,sz_prefix=%d,hdr=%p,sz_hdr=%ld,data=%p,sz_data=%d,remote_ringbuf->type=%d\n",
- sreq, prefix, sz_prefix, hdr, hdr_sz, data, (int) data_sz,
- vc_ib->ibcom->remote_ringbuf->type);
-
- if (sizeof(MPIDI_CH3_Pkt_t) != hdr_sz) {
- printf("type=%d,subtype=%d\n", ((MPID_nem_pkt_netmod_t *) hdr)->type,
- ((MPID_nem_pkt_netmod_t *) hdr)->subtype);
- }
-
- int copied;
- ibcom_errno =
- MPID_nem_ib_com_isend(vc_ib->sc->fd,
- (uint64_t) sreq,
- prefix, sz_prefix,
- hdr, hdr_sz,
- s_data, (int) s_data_sz,
- &copied,
- vc_ib->ibcom->local_ringbuf_type, vc_ib->ibcom->remote_ringbuf->type,
- &REQ_FIELD(sreq, buf_from), &REQ_FIELD(sreq, buf_from_sz));
- MPIU_ERR_CHKFATALANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_isend");
- MPID_nem_ib_ncqe += 1;
- //dprintf("isendcontig_core,ncqe=%d\n", MPID_nem_ib_ncqe);
- dprintf("isendcontig_core,isend,kind=%d,msg_type=%d,copied=%d\n", sreq->kind, msg_type, copied); /*suspicious lines,(if1,on,on,off,if0) works */
-
-#ifdef __MIC__
- __asm__ __volatile__
- ("movq %0, %%rsi;"
- "vprefetch0 0x00(%%rsi);"
- "vprefetch0 0x40(%%rsi);"
- "vprefetch0 0x80(%%rsi);"
- "vprefetch0 0xc0(%%rsi);"::"r"((uint64_t) data + 4 * data_sz):"%rsi");
-#else
- __asm__ __volatile__
- ("movq %0, %%rsi;"
- "prefetchnta 0x00(%%rsi);"
- "prefetchnta 0x40(%%rsi);"
- "prefetchnta 0x80(%%rsi);"
- "prefetchnta 0xc0(%%rsi);"::"r"((uint64_t) data + 4 * data_sz):"%rsi");
-#endif
-
- MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "ib_send, fd=%d", vc_ib->sc->fd));
- vc_ib->pending_sends += 1;
- sreq->ch.vc = vc; /* used in poll */
-
- /* calling drain_scq from progress_send deprives of chance
- * for ib_poll to drain-sendq using ncqe
- * however transfers events to
- * (not to reply_seq_num because it's regulated by the rate)
- * fire on ib_poll using nces
- * (make SCQ full once, then put one command in sendq,
- * then send-->drain-scq to reduce CQE level under the threashold)
- * so we need to perform
- * progress_send for all of VCs using nces in ib_poll
- * (if we have drain-sendq in ib_poll, this isn't needed. */
-
- MPID_nem_ib_ncqe_nces += 1; /* it has different meaning, counting non-copied eager-send */
-
-#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
- //dprintf("isendcontig,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
- int *notify_rstate;
- ibcom_errno =
- MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get(vc_ib->sc->fd, &notify_rstate);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get");
-
- dprintf("isendcontig,head=%d,tail=%d,hw=%d\n", vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail, MPID_NEM_IB_COM_RDMABUF_HIGH_WATER_MARK);
- /* if the number of slots in RMDA-write-to buffer have hit the high water-mark */
- if (*notify_rstate == MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_LW &&
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) >
- MPID_NEM_IB_COM_RDMABUF_HIGH_WATER_MARK) {
- dprintf("changing notify_rstate,id=%d\n", vc_ib->ibcom->sseq_num);
- /* remember remote notifying policy so that local can know when to change remote policy back to LW */
- *notify_rstate = MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_HW;
- /* change remote notifying policy of RDMA-write-to buf occupancy info */
- MPID_nem_ib_send_change_rdmabuf_occupancy_notify_state(vc,
- MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_HW);
- }
- //dprintf("isendcontig_core,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
-#endif
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_ISENDCONTIG_CORE);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_iSendContig
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
- MPIDI_msg_sz_t hdr_sz, void *data, MPIDI_msg_sz_t data_sz)
-{
- int mpi_errno = MPI_SUCCESS;
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_ISENDCONTIG);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_ISENDCONTIG);
-
- MPIU_Assert(hdr_sz <= sizeof(MPIDI_CH3_Pkt_t));
- MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "ib_iSendContig");
- MPIDI_DBG_Print_packet((MPIDI_CH3_Pkt_t *) hdr);
-
- if (vc_ib->connection_state == MPID_NEM_IB_CM_CLOSED) {
- if (vc_ib->connection_guard == 0) {
- vc_ib->connection_guard = 1;
- /* connected=no,ringbuf-type=shared,slot-available=no,
- * going-to-be-enqueued=yes case */
- MPID_nem_ib_cm_cas(vc, 0); /* Call ask_fetch just after it's connected */
- }
-
- }
- if (vc_ib->connection_state != MPID_NEM_IB_CM_ESTABLISHED) {
- /* connected=closed/transit,ringbuf-type=shared,slot-available=no,
- * going-to-be-enqueued=yes case */
- REQ_FIELD(sreq, ask) = 0; /* We can't ask because ring-buffer type is not determined yet. */
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_ask_fetch");
- }
- else {
- /* connected=established,ringbuf-type=shared,slot-available=no,
- * going-to-be-enqueued=yes case */
- if (vc_ib->ibcom->local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED &&
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) >=
- vc_ib->ibcom->local_ringbuf_nslot) {
- dprintf("isendcontig,RINGBUF_SHARED and full,asking\n");
- mpi_errno = MPID_nem_ib_ringbuf_ask_fetch(vc);
- REQ_FIELD(sreq, ask) = 1;
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_ringbuf_ask_fetch");
- }
- else {
- REQ_FIELD(sreq, ask) = 0;
- }
- }
-
- /* set it for drain_scq */
- MPIDI_Request_set_msg_type(sreq, MPIDI_REQUEST_EAGER_MSG);
-
-#ifdef MPID_NEM_IB_ONDEMAND
- if (vc_ib->connection_state != MPID_NEM_IB_CM_ESTABLISHED) {
- goto enqueue;
- }
-#endif
-
- dprintf
- ("isendcontig,%d->%d,req=%p,type=%d,subtype=%d,data_sz=%ld,ldiff=%d(%d-%d),rdiff=%d(%d-%d)\n",
- MPID_nem_ib_myrank, vc->pg_rank, sreq, ((MPIDI_CH3_Pkt_t *) hdr)->type,
- ((MPID_nem_pkt_netmod_t *) hdr)->subtype, data_sz,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
- vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail,
- MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
- vc_ib->ibcom->rsr_seq_num_tail_last_sent),
- vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent);
- dprintf("isendcontig,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n",
- MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
- /* if IB command overflow-queue is empty AND local IB command queue isn't full AND remote RDMA-write-to buf isn't getting overrun */
- MPIDI_CH3_Pkt_t *ch3_hdr = (MPIDI_CH3_Pkt_t *) hdr;
- MPID_nem_pkt_netmod_t *prefix = (MPID_nem_pkt_netmod_t *) hdr;
- /* reserve one slot for control packet bringing sequence number
- * to avoid dead-lock */
- int slack = ((ch3_hdr->type != MPIDI_NEM_PKT_NETMOD ||
- prefix->subtype != MPIDI_NEM_IB_PKT_REQ_SEQ_NUM) &&
- (ch3_hdr->type != MPIDI_NEM_PKT_NETMOD ||
- prefix->subtype != MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM) &&
- (ch3_hdr->type != MPIDI_NEM_PKT_NETMOD ||
- prefix->subtype != MPIDI_NEM_IB_PKT_LMT_GET_DONE) &&
- ch3_hdr->type != MPIDI_NEM_PKT_LMT_RTS &&
- ch3_hdr->type != MPIDI_NEM_PKT_LMT_CTS) ? MPID_NEM_IB_COM_AMT_SLACK : 0;
- /* make control packet bringing sequence number go ahead of
- * queued packets to avoid dead-lock */
- int goahead =
- (ch3_hdr->type == MPIDI_NEM_PKT_NETMOD && prefix->subtype == MPIDI_NEM_IB_PKT_REQ_SEQ_NUM)
- || (ch3_hdr->type == MPIDI_NEM_PKT_NETMOD &&
- prefix->subtype == MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM) ||
- (ch3_hdr->type == MPIDI_NEM_PKT_NETMOD && prefix->subtype == MPIDI_NEM_IB_PKT_LMT_GET_DONE)
- ? 1 : 0;
- dprintf("isendcontig,slack=%d,goahead=%d\n", slack, goahead);
-
- if ((goahead || MPID_nem_ib_sendq_empty(vc_ib->sendq)) &&
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack &&
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) <
- vc_ib->ibcom->local_ringbuf_nslot - slack) {
-
- mpi_errno = MPID_nem_ib_iSendContig_core(vc, sreq, hdr, hdr_sz, data, data_sz);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
-
- }
- else {
- /* enqueue command into send_queue */
- dprintf
- ("isendcontig,enqueuing,goahead=%d,sendq=%d,ncom=%d,ncqe=%d,ldiff=%d(%d-%d),slack=%d\n",
- goahead, MPID_nem_ib_sendq_empty(vc_ib->sendq),
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack,
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail),
- vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail, slack);
-
- /* this location because the above message refers undefined */
- enqueue:
-
- /* store required info. see MPIDI_CH3_iSendv in src/mpid/ch3/channels/nemesis/src/ch3_isendv.c */
- sreq->dev.pending_pkt = *(MPIDI_CH3_Pkt_t *) hdr;
- sreq->dev.iov[0].MPID_IOV_BUF = (char *) &sreq->dev.pending_pkt;
- sreq->dev.iov[0].MPID_IOV_LEN = hdr_sz;
- sreq->dev.iov[1].MPID_IOV_BUF = (char *) data;
- sreq->dev.iov[1].MPID_IOV_LEN = data_sz;
-
- sreq->dev.iov_count = 2;
- sreq->dev.iov_offset = 0;
- sreq->ch.noncontig = FALSE; /* used in send_progress */
- sreq->ch.vc = vc;
-
- if (data_sz > 0) {
- dprintf
- ("isendcontig,hdr=%p,hdr_sz=%ld,data=%p,data_sz=%ld,*(sreq->dev.iov[1].MPID_IOV_BUF)=%08x,sz=%ld,sz=%ld\n",
- hdr, hdr_sz, data, data_sz, *((uint32_t *) sreq->dev.iov[1].MPID_IOV_BUF),
- sizeof(sreq->dev.pending_pkt), sizeof(MPIDI_CH3_Pkt_t));
- }
-
- /* enqueue control message telling tail position of ring buffer for eager-send
- * at the head of software MPI command queue. We explain the reason. Consider this case.
- * rank-0 performs 64 eager-sends and 48 of them are enqueued.
- * rank-1 consumes 2 of them and send the control message.
- * rank-0 drains 2 commands from the command queue.
- * ...
- * rank-0 finds that head of ring buffer for receiving messages from rank-1 is
- * growing by the control message from rank-1 and try to send the control message,
- * but the command is queued at the tail.
- * rank-1 stops sending the control message to rank-1 because the ring buffer is full
- * rank-0 stops draining command queue.
- */
- dprintf("isendcontig,enqueuing,type=%d,\n", ((MPIDI_CH3_Pkt_t *) hdr)->type);
-
- if (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_NEM_PKT_NETMOD &&
- ((MPID_nem_pkt_netmod_t *) hdr)->subtype == MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM) {
- dprintf("isendcontig,REPLY_SEQ_NUM,enqueue_at_head\n");
- MPID_nem_ib_sendq_enqueue_at_head(&vc_ib->sendq, sreq);
- }
- else {
- MPID_nem_ib_sendq_enqueue(&vc_ib->sendq, sreq);
- }
- /* we don't need to perform send_progress() here because
- * the events where RDMA-write-to buffer release is detected or release of IB command queue id detected happens
- * only after ib_poll is called. it's different than the case where write(2) is used */
- }
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_ISENDCONTIG);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_iStartContigMsg
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_iStartContigMsg(MPIDI_VC_t * vc, void *hdr, MPIDI_msg_sz_t hdr_sz,
- void *data, MPIDI_msg_sz_t data_sz, MPID_Request ** sreq_ptr)
-{
- MPID_Request *sreq = NULL;
- int mpi_errno = MPI_SUCCESS;
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_ISTARTCONTIGMSG);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_ISTARTCONTIGMSG);
- MPIU_Assert(hdr_sz <= sizeof(MPIDI_CH3_Pkt_t));
- MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "ib_iStartContigMsg");
- MPIDI_DBG_Print_packet((MPIDI_CH3_Pkt_t *) hdr);
-
- /* FIXME: avoid creating a request when not queued */
-
- if (hdr && ((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_GET) {
- //printf("istarctontig,MPIDI_CH3_PKT_GET,ref_count=%d\n", sreq->ref_count);
- /* sreq here is released by drain_scq, caller
- * request in MPIDI_CH3I_Recv_rma_msg is
- * released by PKT_GET_RESP, MPIDI_CH3I_RMAListComplete */
- }
-
- //tscs = MPID_nem_ib_rdtsc();
- sreq = MPID_Request_create();
- MPIU_Assert(sreq != NULL);
- MPIU_Object_set_ref(sreq, 2);
- sreq->kind = MPID_REQUEST_SEND;
- sreq->dev.OnDataAvail = 0;
- //tsce = MPID_nem_ib_rdtsc(); printf("rc,%ld\n", tsce - tscs); // 124.15 cycles
-
- mpi_errno = MPID_nem_ib_iSendContig(vc, sreq, hdr, hdr_sz, data, data_sz);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
-
- fn_exit:
- *sreq_ptr = sreq;
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_ISTARTCONTIGMSG);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_SendNoncontig_core
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int MPID_nem_ib_SendNoncontig_core(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
- MPIDI_msg_sz_t hdr_sz)
-{
- int mpi_errno = MPI_SUCCESS;
- int ibcom_errno;
- MPIDI_msg_sz_t last;
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
-
- void *prefix;
- int prefix_sz;
- void *data;
- long data_sz;
- MPID_nem_ib_pkt_lmt_rts_t pkt_rts;
-
- prefix = NULL;
- prefix_sz = 0;
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_SENDNONCONTIG_CORE);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_SENDNONCONTIG_CORE);
-
- last = sreq->dev.segment_size; /* segment_size is byte offset */
- if (last > 0) {
- data_sz = sreq->dev.segment_size - sreq->dev.segment_first;
- REQ_FIELD(sreq, lmt_pack_buf) = MPIU_Malloc((size_t) data_sz);
- MPIU_ERR_CHKANDJUMP(!REQ_FIELD(sreq, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER,
- "**outofmemory");
- MPID_Segment_pack(sreq->dev.segment_ptr, sreq->dev.segment_first, &last,
- (char *) REQ_FIELD(sreq, lmt_pack_buf));
- MPIU_Assert(last == sreq->dev.segment_size);
- }
- else {
- REQ_FIELD(sreq, lmt_pack_buf) = NULL;
- data_sz = 0;
- }
-
- data = (void *) REQ_FIELD(sreq, lmt_pack_buf);
-
- /* If request length is too long, create LMT packet */
- if (MPID_NEM_IB_NETMOD_HDR_SIZEOF(vc_ib->ibcom->local_ringbuf_type)
- + sizeof(MPIDI_CH3_Pkt_t) + data_sz
- > MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_netmod_trailer_t)) {
- pkt_rts.type = MPIDI_NEM_PKT_NETMOD;
-
- pkt_rts.subtype = MPIDI_NEM_IB_PKT_RMA_LMT_RTS;
-
- void *write_from_buf = REQ_FIELD(sreq, lmt_pack_buf);
-
- uint32_t max_msg_sz;
- MPID_nem_ib_com_get_info_conn(vc_ib->sc->fd, MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ,
- &max_msg_sz, sizeof(uint32_t));
-
- MPID_nem_ib_rma_lmt_cookie_t *s_cookie_buf =
- (MPID_nem_ib_rma_lmt_cookie_t *) MPIU_Malloc(sizeof(MPID_nem_ib_rma_lmt_cookie_t));
-
- sreq->ch.s_cookie = s_cookie_buf;
-
- s_cookie_buf->tail =
- *((uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint8_t)));
-
- int post_num = (data_sz + (long) max_msg_sz - 1) / (long) max_msg_sz;
- long length;
- if (post_num > 1) {
- length = max_msg_sz;
- }
- else {
- length = data_sz;
- }
-
- /* put IB rkey */
- struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
- MPID_nem_ib_com_reg_mr_fetch(write_from_buf, length, 0,
- MPID_NEM_IB_COM_REG_MR_GLOBAL);
- MPIU_ERR_CHKANDJUMP(!mr_cache, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
- struct ibv_mr *mr = mr_cache->mr;
- REQ_FIELD(sreq, lmt_mr_cache) = (void *) mr_cache;
-#ifdef HAVE_LIBDCFA
- s_cookie_buf->addr = (void *) mr->host_addr;
-#else
- s_cookie_buf->addr = write_from_buf;
-#endif
- s_cookie_buf->rkey = mr->rkey;
- s_cookie_buf->len = data_sz;
- s_cookie_buf->sender_req_id = sreq->handle;
- s_cookie_buf->max_msg_sz = max_msg_sz;
- s_cookie_buf->seg_seq_num = 1;
- s_cookie_buf->seg_num = post_num;
-
- pkt_rts.seg_seq_num = 1;
-
- REQ_FIELD(sreq, buf.from) = write_from_buf;
- REQ_FIELD(sreq, data_sz) = data_sz;
- REQ_FIELD(sreq, seg_seq_num) = 1; // only send 1st-segment, even if there are some segments.
- REQ_FIELD(sreq, seg_num) = post_num;
- REQ_FIELD(sreq, max_msg_sz) = max_msg_sz;
-
- /* set for ib_com_isend */
- prefix = (void *) &pkt_rts;
- prefix_sz = sizeof(MPIDI_CH3_Pkt_t);
- data = (void *) s_cookie_buf;
- data_sz = sizeof(MPID_nem_ib_rma_lmt_cookie_t);
-
- /* Release Request, when sender receives DONE packet. */
- int incomplete;
- MPIDI_CH3U_Request_increment_cc(sreq, &incomplete); // decrement in drain_scq and pkt_rma_lmt_getdone
- }
-
- /* packet handlers assume this */
- hdr_sz = sizeof(MPIDI_CH3_Pkt_t);
-
- /* increment cc because PktHandler_EagerSyncAck, ssend.c, drain_scq decrement it */
- if (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_EAGER_SYNC_SEND) {
- //MPIR_Request_add_ref(sreq);
- }
-
- if (sizeof(MPIDI_CH3_Pkt_t) != hdr_sz) {
- printf("type=%d,subtype=%d\n", ((MPID_nem_pkt_netmod_t *) hdr)->type,
- ((MPID_nem_pkt_netmod_t *) hdr)->subtype);
- }
-
- int copied;
- dprintf("sendnoncontig_core,isend,%d->%d,seq_num=%d,remote_ringbuf->type=%d\n",
- MPID_nem_ib_myrank, vc->pg_rank, vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->remote_ringbuf->type);
-
- ibcom_errno =
- MPID_nem_ib_com_isend(vc_ib->sc->fd,
- (uint64_t) sreq,
- prefix, prefix_sz,
- hdr, hdr_sz,
- data, data_sz,
- &copied,
- vc_ib->ibcom->local_ringbuf_type, vc_ib->ibcom->remote_ringbuf->type,
- &REQ_FIELD(sreq, buf_from), &REQ_FIELD(sreq, buf_from_sz));
- MPIU_ERR_CHKANDJUMP(ibcom_errno != 0, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_isend");
- MPID_nem_ib_ncqe += 1;
- dprintf("sendnoncontig_core,ncqe=%d\n", MPID_nem_ib_ncqe);
-
- vc_ib->pending_sends += 1;
- sreq->ch.vc = vc; /* used in poll */
-
- MPID_nem_ib_ncqe_nces += 1; /* it has different meaning, counting non-copied eager-short */
-
-#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
- //dprintf("isendcontig,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
- int *notify_rstate;
- ibcom_errno =
- MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get(vc_ib->sc->fd, &notify_rstate);
- MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get");
-
- dprintf("isendcontig,head=%d,tail=%d,hw=%d\n", vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail, MPID_NEM_IB_COM_RDMABUF_HIGH_WATER_MARK);
- /* if the number of slots in RMDA-write-to buffer have hit the high water-mark */
- if (*notify_rstate == MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_LW &&
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) >
- MPID_NEM_IB_COM_RDMABUF_HIGH_WATER_MARK) {
- dprintf("changing notify_rstate,id=%d\n", vc_ib->ibcom->sseq_num);
- /* remember remote notifying policy so that local can know when to change remote policy back to LW */
- *notify_rstate = MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_HW;
- /* change remote notifying policy of RDMA-write-to buf occupancy info */
- MPID_nem_ib_send_change_rdmabuf_occupancy_notify_state(vc,
- MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_HW);
- }
- //dprintf("isendcontig_core,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
-#endif
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_SENDNONCONTIG_CORE);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_SendNoncontig
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
- MPIDI_msg_sz_t hdr_sz)
-{
- int mpi_errno = MPI_SUCCESS;
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_SENDNONCONTIG);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_SENDNONCONTIG);
- MPIU_Assert(hdr_sz <= sizeof(MPIDI_CH3_Pkt_t));
- MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "ib_SendNoncontig");
-
- if (vc_ib->connection_state == MPID_NEM_IB_CM_CLOSED) {
- if (vc_ib->connection_guard == 0) {
- vc_ib->connection_guard = 1;
- /* connected=closed,ringbuf-type=shared,slot-available=no,
- * going-to-be-enqueued=yes case */
- MPID_nem_ib_cm_cas(vc, 0); /* Call ask_fetch just after it's connected */
- }
- }
- if (vc_ib->connection_state != MPID_NEM_IB_CM_ESTABLISHED) {
- /* connected=closed/transit,ringbuf-type=shared,slot-available=no,
- * going-to-be-enqueued=yes case */
- REQ_FIELD(sreq, ask) = 0;
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_ask_fetch");
- }
- else {
- /* connected=established,ringbuf-type=shared,slot-available=no,
- * going-to-be-enqueued=yes case */
- if (vc_ib->ibcom->local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED &&
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) >=
- vc_ib->ibcom->local_ringbuf_nslot) {
- dprintf("sendnoncontig,RINGBUF_SHARED and full,asking\n");
- REQ_FIELD(sreq, ask) = 1;
- mpi_errno = MPID_nem_ib_ringbuf_ask_fetch(vc);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_ringbuf_ask_fetch");
- }
- else {
- REQ_FIELD(sreq, ask) = 0;
- }
- }
-
- dprintf("sendnoncontig,%d->%d,sendq_empty=%d,ncom=%d,rdmabuf_occ=%d\n", MPID_nem_ib_myrank,
- vc->pg_rank, MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
- /* set it for drain_scq */
- MPIDI_Request_set_msg_type(sreq, MPIDI_REQUEST_EAGER_MSG);
-
- /* if IB command overflow-queue is empty AND local IB command queue isn't full AND remote RDMA-write-to buf isn't getting overrun */
- /* set it for drain_scq */
- int slack = MPID_NEM_IB_COM_AMT_SLACK; /* slack for control packet bringing sequence number */
-
- if (
-#ifdef MPID_NEM_IB_ONDEMAND
- vc_ib->connection_state == MPID_NEM_IB_CM_ESTABLISHED &&
-#endif
- MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
- vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
- MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack &&
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) <
- vc_ib->ibcom->local_ringbuf_nslot - slack) {
-
- mpi_errno = MPID_nem_ib_SendNoncontig_core(vc, sreq, hdr, hdr_sz);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
-
- }
- else {
- /* enqueue command into send_queue */
- dprintf("sendnoncontig, enqueuing");
- //enqueue:
- /* store required info. see MPIDI_CH3_iSendv in src/mpid/ch3/channels/nemesis/src/ch3_isendv.c */
- sreq->dev.pending_pkt = *(MPIDI_CH3_Pkt_t *) hdr;
- sreq->dev.iov[0].MPID_IOV_BUF = (char *) &sreq->dev.pending_pkt;
- sreq->dev.iov[0].MPID_IOV_LEN = hdr_sz;
-
- sreq->dev.iov_count = 1;
- sreq->dev.iov_offset = 0;
- sreq->ch.noncontig = TRUE;
- sreq->ch.vc = vc;
-
- MPID_nem_ib_sendq_enqueue(&vc_ib->sendq, sreq);
- }
-
- fn_exit:
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_SENDNONCONTIG);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-/* see MPIDI_CH3I_Shm_send_progress (in src/mpid/ch3/channels/nemesis/src/ch3_progress.c) */
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_send_progress
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_send_progress(MPIDI_VC_t * vc)
-{
- int mpi_errno = MPI_SUCCESS;
- MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
- MPID_Request *sreq, *prev_sreq;
- int msg_type;
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_SEND_PROGRESS);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_SEND_PROGRESS);
-
- //dprintf("send_progress,enter\n");
-
-#ifdef MPID_NEM_IB_ONDEMAND
- if (vc_ib->connection_state != MPID_NEM_IB_CM_ESTABLISHED) {
- //dprintf("send_progress,connection_state=%08x\n", vc_ib->connection_state);
- goto fn_exit;
- }
-#endif
-
- /* prevent a call path send_progress -> drain_scq -> send_progress */
- if (entered_send_progress) {
- goto fn_exit;
- }
- entered_send_progress = 1;
-
- sreq = MPID_nem_ib_sendq_head(vc_ib->sendq);
- if (sreq) {
- prev_sreq = NULL;
- do {
- msg_type = MPIDI_Request_get_msg_type(sreq);
-
- MPIDI_CH3_Pkt_t *ch3_hdr = (MPIDI_CH3_Pkt_t *) sreq->dev.iov[0].MPID_IOV_BUF;
- MPID_nem_pkt_netmod_t *netmod_pkt =
- (MPID_nem_pkt_netmod_t *) sreq->dev.iov[0].MPID_IOV_BUF;
- int slack = (msg_type == MPIDI_REQUEST_EAGER_MSG) ? /* guard from RDMA-read or RDMA-write */
- (((ch3_hdr->type != MPIDI_NEM_PKT_NETMOD ||
- netmod_pkt->subtype != MPIDI_NEM_IB_PKT_REQ_SEQ_NUM) &&
- (ch3_hdr->type != MPIDI_NEM_PKT_NETMOD ||
- netmod_pkt->subtype != MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM) &&
- (ch3_hdr->type != MPIDI_NEM_PKT_NETMOD ||
- netmod_pkt->subtype != MPIDI_NEM_IB_PKT_LMT_GET_DONE) &&
- ch3_hdr->type != MPIDI_NEM_PKT_LMT_RTS &&
- ch3_hdr->type !=
- MPIDI_NEM_PKT_LMT_CTS) ? MPID_NEM_IB_COM_AMT_SLACK : 0) :
- MPID_NEM_IB_COM_AMT_SLACK;
-
- /* Temporary fix until removing slack */
- if (vc_ib->ibcom->local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED) {
- slack = 0;
- }
-
- /* Refill slots from queue
- * We don't need refill code in sendcontig because
- * there is an order where (1) send, (2) it's queued, (3) then ask obtains slots,
- * (4) then we can refill them here. */
-
- if (vc_ib->ibcom->local_ringbuf_type == MPID_NEM_IB_RINGBUF_SHARED &&
- (msg_type == MPIDI_REQUEST_EAGER_MSG &&
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) >=
- vc_ib->ibcom->local_ringbuf_nslot)) {
- /* Prevent RDMA-read for rendezvous protocol from issuing ask */
-
- if (!REQ_FIELD(sreq, ask)) { /* First packet after connection hasn't asked slot */
- /* Transitioning from exclusive to shared and need to issue ask.
- * This case is detected because exclusive entries in the queue are deleted
- * and deprived of slots of exclusive and the last state is set to
- * shared when deciding a transition from exclusive to shared
- * and an issued or queued ask must be in the queue or ringbuf_sendq
- * when staying shared. */
- dprintf("send_progress,call ask_fetch,%d->%d\n",
- MPID_nem_ib_myrank, vc->pg_rank);
- mpi_errno = MPID_nem_ib_ringbuf_ask_fetch(vc);
- REQ_FIELD(sreq, ask) = 1;
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_ringbuf_ask_fetch");
- }
- else if (!MPID_nem_ib_ringbuf_sectorq_empty(vc_ib->ibcom->sectorq)) {
- /* Staying shared or transitioning from shared to exclusive.
- * We need to consume acquires slots in the latter case.
- * Transitioning from shared to exclusive is achieved by
- * finding an exlusive entry. */
- MPID_nem_ib_ringbuf_sector_t *sector =
- MPID_nem_ib_ringbuf_sectorq_head(vc_ib->ibcom->sectorq);
-
- vc_ib->ibcom->local_ringbuf_type = sector->type;
- vc_ib->ibcom->local_ringbuf_start = sector->start;
- vc_ib->ibcom->local_ringbuf_nslot = sector->nslot;
- vc_ib->ibcom->sseq_num = sector->head;
- vc_ib->ibcom->lsr_seq_num_tail = sector->tail;
-
- MPID_nem_ib_ringbuf_sectorq_dequeue(&vc_ib->ibcom->sectorq, &sector);
- MPIU_Free(sector);
-
- dprintf
- ("send_progress,refill,next type=%d,start=%p,local_head=%d,local_tail=%d\n",
- vc_ib->ibcom->local_ringbuf_type, vc_ib->ibcom->local_ringbuf_start,
- vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail);
- }
- }
-
- if (vc_ib->ibcom->ncom >= MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack ||
- MPID_nem_ib_ncqe >= MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack ||
- (msg_type == MPIDI_REQUEST_EAGER_MSG &&
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail) >=
- vc_ib->ibcom->local_ringbuf_nslot - slack)) {
- /* Exit when full because this reduces the search cost.
- * Note that RDMA-read for rendezvous protocol can be issued even
- * when no ring-buffer slot is available. */
- goto fn_exit;
- }
-
- if (vc_ib != MPID_nem_ib_debug_current_vc_ib) {
- dprintf("send_progress,vc_ib != MPID_nem_ib_debug_current_vc_ib\n");
- }
- dprintf("send_progress,req=%p,kind=%d,msg_type=%d\n", sreq, sreq->kind, msg_type);
- if (msg_type == MPIDI_REQUEST_EAGER_MSG) {
- dprintf("send_progress,ch3_hdr->type=%d\n", ch3_hdr->type);
- }
- dprintf("send_progress,%d->%d,rdiff=%d(%d-%d),ldiff=%d(%d-%d),slack=%d\n",
- MPID_nem_ib_myrank, sreq->ch.vc->pg_rank,
- MPID_nem_ib_diff16(vc_ib->ibcom->rsr_seq_num_tail,
- vc_ib->ibcom->rsr_seq_num_tail_last_sent),
- vc_ib->ibcom->rsr_seq_num_tail, vc_ib->ibcom->rsr_seq_num_tail_last_sent,
- MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
- vc_ib->ibcom->lsr_seq_num_tail),
- vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail, slack);
- if (sreq->kind == MPID_REQUEST_SEND && msg_type == MPIDI_REQUEST_EAGER_MSG) {
- if (!sreq->ch.noncontig) {
- dprintf
- ("send_progress,contig,ch3_hdr->type=%d,sseq_num=%d,MPIDI_NEM_PKT_LMT_RTS=%d,MPIDI_NEM_IB_PKT_LMT_GET_DONE=%d\n",
- ch3_hdr->type, vc_ib->ibcom->sseq_num, MPIDI_NEM_PKT_LMT_RTS,
- MPIDI_NEM_IB_PKT_LMT_GET_DONE);
- if (sreq->dev.iov[1].MPID_IOV_LEN > 0) {
- dprintf
- ("send_progress,send,contig,sreq->dev.iov[1].MPID_IOV_BUF)=%p,*(sreq->dev.iov[1].MPID_IOV_BUF)=%08x\n",
- sreq->dev.iov[1].MPID_IOV_BUF,
- *((uint32_t *) sreq->dev.iov[1].MPID_IOV_BUF));
- }
- MPIU_Assert(sreq->dev.iov_count > 0);
-
- switch (ch3_hdr->type) {
- /* send current rsr_seq_num_tail because message from target to initiator
- * might have happened while being queued */
- case MPIDI_NEM_PKT_LMT_RTS:{
- dprintf("send_progress,MPIDI_NEM_PKT_LMT_RTS,rsr_seq_num_tail=%d\n",
- vc_ib->ibcom->rsr_seq_num_tail);
- break;
- }
-
- case MPIDI_NEM_PKT_LMT_CTS:{
- dprintf("send_progress,MPIDI_NEM_PKT_LMT_CTS,rsr_seq_num_tail=%d\n",
- vc_ib->ibcom->rsr_seq_num_tail);
- break;
- }
-
- default:;
- }
-
- if (ch3_hdr->type == MPIDI_NEM_PKT_NETMOD) {
- switch (netmod_pkt->subtype) {
- /* send current rsr_seq_num_tail because message from target to initiator
- * might have happened while being queued */
- case MPIDI_NEM_IB_PKT_LMT_GET_DONE:{
- break;
- }
- case MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM:{
- MPID_nem_ib_pkt_reply_seq_num_t *_pkt =
- (MPID_nem_ib_pkt_reply_seq_num_t *) sreq->dev.
- iov[0].MPID_IOV_BUF;
- dprintf
- ("send_progress,MPIDI_NEM_IB_PKT_REPLY_SEQ_NUM,rsr_seq_num_tail=%d\n",
- vc_ib->ibcom->rsr_seq_num_tail);
- /* embed SR occupancy information */
- _pkt->seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail;
- /* remember the last one sent */
- vc_ib->ibcom->rsr_seq_num_tail_last_sent =
- vc_ib->ibcom->rsr_seq_num_tail;
- break;
- }
-
- default:;
- }
- }
-
-
- mpi_errno =
- MPID_nem_ib_iSendContig_core(sreq->ch.vc, sreq,
- sreq->dev.iov[0].MPID_IOV_BUF,
- sreq->dev.iov[0].MPID_IOV_LEN,
- sreq->dev.iov[1].MPID_IOV_BUF,
- sreq->dev.iov[1].MPID_IOV_LEN);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
- }
- else {
- dprintf("send_progress,send,noncontig\n");
- mpi_errno =
- MPID_nem_ib_SendNoncontig_core(sreq->ch.vc, sreq,
- sreq->dev.iov[0].MPID_IOV_BUF,
- sreq->dev.iov[0].MPID_IOV_LEN);
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
- }
- }
- else if (sreq->kind == MPID_REQUEST_RECV && msg_type == MPIDI_REQUEST_RNDV_MSG) {
-
- dprintf("send_progress,kick lmt_start_recv_core,prev=%p,next=%p\n", prev_sreq,
- MPID_nem_ib_sendq_next(sreq));
- mpi_errno =
- MPID_nem_ib_lmt_start_recv_core(sreq, REQ_FIELD(sreq, lmt_raddr),
- REQ_FIELD(sreq, lmt_rkey), REQ_FIELD(sreq,
- lmt_szsend),
- REQ_FIELD(sreq, lmt_write_to_buf),
- REQ_FIELD(sreq, max_msg_sz), REQ_FIELD(sreq,
- last));
- if (mpi_errno) {
- MPIU_ERR_POP(mpi_errno);
- }
- }
- else if (sreq->kind == MPID_REQUEST_SEND && msg_type == MPIDI_REQUEST_RNDV_MSG) {
- }
- else {
- dprintf("send_progress,unknown sreq=%p,sreq->kind=%d,msg_type=%d\n", sreq,
- sreq->kind, msg_type);
- assert(0);
- MPIU_ERR_INTERNALANDJUMP(mpi_errno, "send_progress,unknown type");
- }
-
-
- /* unlink sreq */
- if (prev_sreq != NULL) {
- MPID_nem_ib_sendq_next(prev_sreq) = MPID_nem_ib_sendq_next(sreq);
- }
- else {
- MPID_nem_ib_sendq_head(vc_ib->sendq) = MPID_nem_ib_sendq_next(sreq);
- }
- if (MPID_nem_ib_sendq_next(sreq) == NULL) {
- vc_ib->sendq.tail = prev_sreq;
- }
-
- /* save sreq->dev.next (and sreq) because decrementing reference-counter might free sreq */
- //MPID_Request *tmp_sreq = sreq;
- sreq = MPID_nem_ib_sendq_next(sreq);
- goto next_unlinked;
- //next:
- prev_sreq = sreq;
- sreq = MPID_nem_ib_sendq_next(sreq);
- next_unlinked:;
- if (!sreq) {
- dprintf("send_progress,sendq has got empty!\n");
- }
- } while (sreq);
- }
-
- //dprintf("send_progress,exit,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
-
- fn_exit:
- entered_send_progress = 0;
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_SEND_PROGRESS);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#ifdef MPID_NEM_IB_ONDEMAND
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_progress
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-/* Walk MPID_nem_ib_cm_sendq from its head and try to issue each queued
- * connection-management request according to sreq->state.
- *
- * A request stays in the queue (goto next) when the send-queue / completion-queue
- * capacity check fails, when its retry back-off has not elapsed (CAS), or when no
- * command ring-buffer slot is free (SYN, SYNACK). Otherwise a freshly allocated
- * shadow carrying the state and the request is handed to the matching *_core
- * function and the request is unlinked from the queue.
- *
- * Returns MPI_SUCCESS, or an MPI error code if a *_core call fails or the state
- * is unknown. */
-int MPID_nem_ib_cm_progress()
-{
- int mpi_errno = MPI_SUCCESS;
- MPID_nem_ib_cm_req_t *sreq, *prev_sreq;
- MPID_nem_ib_cm_cmd_shadow_t *shadow;
- /* Set by the CAS and SYN cases when the connection already exists; the request
- * is then destroyed after unlinking instead of being issued. */
- int is_established = 0;
-
- MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_PROGRESS);
- MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_PROGRESS);
-
- //dprintf("cm_send_progress,enter\n");
-
- sreq = MPID_nem_ib_cm_sendq_head(MPID_nem_ib_cm_sendq);
- if (sreq) {
- prev_sreq = NULL;
- do {
-
- /* Leave the request queued while the send queue or the scratch-pad CQ is full */
- if (sreq->ibcom->ncom >= MPID_NEM_IB_COM_MAX_SQ_CAPACITY ||
- MPID_nem_ib_ncqe_scratch_pad >= MPID_NEM_IB_COM_MAX_CQ_CAPACITY) {
- goto next;
- }
-
- /* NOTE(review): none of the MPIU_Malloc results for shadow below are checked
- * before being dereferenced. */
- switch (sreq->state) {
- case MPID_NEM_IB_CM_CAS:
- if (is_conn_established(sreq->responder_rank)) {
- dprintf("cm_progress,cm_cas,connection is already established\n");
- is_established = 1;
- break;
- }
-
- /* This comparison is OK if the diff is within 63-bit range */
- if (MPID_nem_ib_diff63(MPID_nem_ib_progress_engine_vt, sreq->retry_decided) <
- sreq->retry_backoff) {
- goto next;
- }
- dprintf
- ("cm_progress,retry CAS,responder_rank=%d,req=%p,decided=%ld,vt=%ld,backoff=%ld\n",
- sreq->responder_rank, sreq, sreq->retry_decided,
- MPID_nem_ib_progress_engine_vt, sreq->retry_backoff);
- shadow = (MPID_nem_ib_cm_cmd_shadow_t *)
- MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
- shadow->type = sreq->state;
- shadow->req = sreq;
- mpi_errno = MPID_nem_ib_cm_cas_core(sreq->responder_rank, shadow);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_connect_cas_core");
- break;
- case MPID_NEM_IB_CM_CAS_RELEASE:
- dprintf
- ("cm_progress,retry CAS_RELEASE,responder_rank=%d,req=%p,decided=%ld,vt=%ld,backoff=%ld\n",
- sreq->responder_rank, sreq, sreq->retry_decided,
- MPID_nem_ib_progress_engine_vt, sreq->retry_backoff);
- shadow = (MPID_nem_ib_cm_cmd_shadow_t *)
- MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
- shadow->type = sreq->state;
- shadow->req = sreq;
- mpi_errno = MPID_nem_ib_cm_cas_release_core(sreq->responder_rank, shadow);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_cas_release_core");
- break;
- case MPID_NEM_IB_CM_SYN:
- if (is_conn_established(sreq->responder_rank)) {
- /* Explicitly release CAS word because
- * ConnectX-3 doesn't support safe CAS with PCI device and CPU */
- MPID_nem_ib_cm_cas_release(MPID_nem_ib_conns[sreq->responder_rank].vc);
- dprintf("cm_progress,syn,established is true,%d->%d,connection_tx=%d\n",
- MPID_nem_ib_myrank, sreq->responder_rank,
- sreq->ibcom->outstanding_connection_tx);
- is_established = 1;
- break;
- }
-
- /* The initiator acquires a slot for the responder when sending syn */
- if (MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
- MPID_nem_ib_cm_ringbuf_tail) >= MPID_NEM_IB_CM_NSEG) {
- goto next;
- }
- ((MPID_nem_ib_cm_cmd_syn_t *) & sreq->cmd)->responder_ringbuf_index =
- MPID_nem_ib_cm_ringbuf_head;
- sreq->responder_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
- ((MPID_nem_ib_cm_cmd_syn_t *) & sreq->cmd)->initiator_rank = MPID_nem_ib_myrank;
-
- MPID_nem_ib_cm_ringbuf_head++;
- shadow = (MPID_nem_ib_cm_cmd_shadow_t *)
- MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
- shadow->type = sreq->state;
- shadow->req = sreq;
- dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
- mpi_errno =
- MPID_nem_ib_cm_cmd_core(sreq->responder_rank, shadow,
- (void *) (&sreq->cmd),
- sizeof(MPID_nem_ib_cm_cmd_synack_t), 1 /* syn:1 */ , 0);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_send_core");
- break;
- case MPID_NEM_IB_CM_CAS_RELEASE2:
- dprintf("cm_progress,sending cas_release2,%d->%d\n", MPID_nem_ib_myrank,
- sreq->responder_rank);
-
- ((MPID_nem_ib_cm_cmd_syn_t *) & sreq->cmd)->initiator_rank = MPID_nem_ib_myrank;
-
- shadow = (MPID_nem_ib_cm_cmd_shadow_t *)
- MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
- shadow->type = sreq->state;
- shadow->req = sreq;
- dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
- mpi_errno =
- MPID_nem_ib_cm_cmd_core(sreq->responder_rank, shadow,
- (void *) (&sreq->cmd),
- sizeof(MPID_nem_ib_cm_cmd_synack_t), 1 /* syn:1 */ , 0);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_send_core");
- break;
- case MPID_NEM_IB_CM_SYNACK:
- /* The responder acquires a slot for the initiator when sending synack */
- if (MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
- MPID_nem_ib_cm_ringbuf_tail) >= MPID_NEM_IB_CM_NSEG) {
- goto next;
- }
- ((MPID_nem_ib_cm_cmd_synack_t *) & sreq->cmd)->initiator_ringbuf_index =
- MPID_nem_ib_cm_ringbuf_head;
- sreq->initiator_ringbuf_index = MPID_nem_ib_cm_ringbuf_head;
- MPID_nem_ib_cm_ringbuf_head++;
- shadow = (MPID_nem_ib_cm_cmd_shadow_t *)
- MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
- shadow->type = sreq->state;
- shadow->req = sreq;
- dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
- mpi_errno =
- MPID_nem_ib_cm_cmd_core(sreq->initiator_rank, shadow,
- (void *) (&sreq->cmd),
- sizeof(MPID_nem_ib_cm_cmd_synack_t), 0,
- sreq->ringbuf_index);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_send_core");
- break;
- case MPID_NEM_IB_CM_ACK1:
- shadow = (MPID_nem_ib_cm_cmd_shadow_t *)
- MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
- shadow->type = sreq->state;
- shadow->req = sreq;
- dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
- mpi_errno =
- MPID_nem_ib_cm_cmd_core(sreq->responder_rank, shadow,
- (void *) (&sreq->cmd),
- sizeof(MPID_nem_ib_cm_cmd_ack1_t), 0,
- sreq->ringbuf_index);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_send_core");
- break;
- case MPID_NEM_IB_CM_ACK2:
- shadow = (MPID_nem_ib_cm_cmd_shadow_t *)
- MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
- shadow->type = sreq->state;
- shadow->req = sreq;
- dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
- mpi_errno =
- MPID_nem_ib_cm_cmd_core(sreq->initiator_rank, shadow,
- (void *) (&sreq->cmd),
- sizeof(MPID_nem_ib_cm_cmd_ack2_t), 0,
- sreq->ringbuf_index);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_send_core");
- break;
- case MPID_NEM_IB_CM_ALREADY_ESTABLISHED:
- case MPID_NEM_IB_CM_RESPONDER_IS_CONNECTING:
- shadow = (MPID_nem_ib_cm_cmd_shadow_t *)
- MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
- shadow->type = sreq->state;
- shadow->req = sreq;
- dprintf("shadow=%p,shadow->req=%p\n", shadow, shadow->req);
- mpi_errno =
- MPID_nem_ib_cm_cmd_core(sreq->initiator_rank, shadow,
- (void *) (&sreq->cmd),
- sizeof(MPID_nem_ib_cm_cmd_synack_t), 0,
- sreq->ringbuf_index);
- MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
- "**MPID_nem_ib_cm_send_core");
- break;
- default:
- dprintf("cm_progress,unknown state=%d\n", sreq->state);
- assert(0);
- MPIU_ERR_INTERNALANDJUMP(mpi_errno, "cm_progress,unknown state");
- }
-
- /* unlink sreq */
- if (prev_sreq != NULL) {
- MPID_nem_ib_cm_sendq_next(prev_sreq) = MPID_nem_ib_cm_sendq_next(sreq);
- }
- else {
- MPID_nem_ib_cm_sendq_head(MPID_nem_ib_cm_sendq) = MPID_nem_ib_cm_sendq_next(sreq);
- }
- if (MPID_nem_ib_cm_sendq_next(sreq) == NULL) {
- MPID_nem_ib_cm_sendq.tail = prev_sreq;
- }
-
- /* save sreq->dev.next (and sreq) because decrementing reference-counter might free sreq */
- MPID_nem_ib_cm_req_t *tmp_sreq = sreq;
- sreq = MPID_nem_ib_cm_sendq_next(sreq);
-
- if (is_established) {
- dprintf("cm_progress,destroy connect-op\n");
-
- /* don't connect */
- tmp_sreq->ibcom->outstanding_connection_tx -= 1;
-
- /* Let the guard down to let the following connection request go. */
- VC_FIELD(MPID_nem_ib_conns[tmp_sreq->responder_rank].vc, connection_guard) = 0;
-
- /* free memory : req->ref_count is 2, so call MPIU_Free() directly */
-// MPID_nem_ib_cm_request_release(tmp_sreq);
- MPIU_Free(tmp_sreq);
-
- is_established = 0;
- /* NOTE(review): this break leaves the enclosing do-while, so requests still
- * queued behind this one are not visited until the next call - confirm
- * that this is intended. */
- break;
- }
- goto next_unlinked;
- next:
- /* Request stays queued: it becomes the predecessor of the next candidate */
- prev_sreq = sreq;
- sreq = MPID_nem_ib_cm_sendq_next(sreq);
- next_unlinked:;
- } while (sreq);
- }
-
- fn_exit:
- /* NOTE(review): this function never sets entered_send_progress; clearing it
- * here looks copied from the exit path of send_progress - confirm. */
- entered_send_progress = 0;
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_PROGRESS);
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_cas_core
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-/* Post one compare-and-swap on the scratch pad of `rank` in an attempt to acquire
- * its communication-manager port: the word at offset 0 is compared with
- * MPID_NEM_IB_CM_RELEASED and swapped with this process's rank.
- *
- * rank   - process whose scratch pad is targeted
- * shadow - caller-allocated shadow, passed (cast to uint64_t) to the scratch-pad
- *          call; its buf_from / buf_from_sz fields are handed over as outputs
- *
- * Returns MPI_SUCCESS, or an MPI error code when the connection object cannot be
- * obtained or the CAS cannot be posted. The CAS result itself is not examined
- * here; MPID_nem_ib_ncqe_scratch_pad_to_drain is bumped so that it gets polled. */
-int MPID_nem_ib_cm_cas_core(int rank, MPID_nem_ib_cm_cmd_shadow_t * shadow)
-{
-    int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
-
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_CAS_CORE);
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_CAS_CORE);
-
-    MPID_nem_ib_com_t *conp;
-    ibcom_errno = MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[rank], &conp);
-    /* Report the call that actually failed (same key MPID_nem_ib_cm_cas uses) */
-    MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
-    dprintf("cm_cas_core,%d->%d,conp=%p,remote_addr=%lx\n",
-            MPID_nem_ib_myrank, rank, conp,
-            (unsigned long) conp->icom_rmem[MPID_NEM_IB_COM_SCRATCH_PAD_TO] + 0);
-
-    /* Compare-and-swap rank to acquire communication manager port:
-     * compare with MPID_NEM_IB_CM_RELEASED, swap in my rank */
-    ibcom_errno =
-        MPID_nem_ib_com_cas_scratch_pad(MPID_nem_ib_scratch_pad_fds[rank],
-                                        (uint64_t) shadow,
-                                        0,
-                                        MPID_NEM_IB_CM_RELEASED,
-                                        MPID_nem_ib_myrank,
-                                        &shadow->buf_from, &shadow->buf_from_sz);
-    MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_cas_scratch_pad");
-    MPID_nem_ib_ncqe_scratch_pad += 1;
-
-    /* Direct poll to drain CQ to check CAS result */
-    MPID_nem_ib_ncqe_scratch_pad_to_drain += 1;
-    dprintf("ringbuf_cm_cas_core,scratch_pad_to_drain=%d\n", MPID_nem_ib_ncqe_scratch_pad_to_drain);
-
-  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_CAS_CORE);
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_cas
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-/* Start a connection attempt towards vc->pg_rank by creating a connection request
- * in state MPID_NEM_IB_CM_CAS and either posting the CAS right away or queueing the
- * request on MPID_nem_ib_cm_sendq for a later retry.
- *
- * vc             - virtual connection of the responder
- * ask_on_connect - stored unchanged in the request
- *
- * Returns MPI_SUCCESS, or an MPI error code on allocation failure, when the
- * connection object cannot be obtained, or when posting the CAS fails. */
-int MPID_nem_ib_cm_cas(MPIDI_VC_t * vc, uint32_t ask_on_connect)
-{
-    int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
-
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_CAS);
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_CAS);
-
-    dprintf("cm_cas,enter\n");
-
-    /* Prepare request structure for enqueued case */
-    MPID_nem_ib_cm_req_t *req = MPIU_Malloc(sizeof(MPID_nem_ib_cm_req_t));
-    MPIU_ERR_CHKANDJUMP(!req, mpi_errno, MPI_ERR_OTHER, "**malloc");
-    dprintf("req=%p\n", req);
-    req->state = MPID_NEM_IB_CM_CAS;
-    req->ref_count = 3; /* Released on receiving ACK2 and draining SCQ of SYN and ACK1 */
-    req->retry_backoff = 0;
-    req->initiator_rank = MPID_nem_ib_myrank;
-    req->responder_rank = vc->pg_rank;
-    req->ask_on_connect = ask_on_connect;
-    ibcom_errno =
-        MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[vc->pg_rank], &req->ibcom);
-    MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
-    dprintf("req->ibcom=%p\n", req->ibcom);
-
-    /* Increment transaction counter here because cm_cas is called only once
-     * (cm_cas_core might be called more than once when retrying) */
-    req->ibcom->outstanding_connection_tx += 1;
-    dprintf("cm_cas,%d->%d,connection_tx=%d\n", MPID_nem_ib_myrank, vc->pg_rank,
-            req->ibcom->outstanding_connection_tx);
-
-    /* Acquire remote scratch pad */
-    if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
-        req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
-        MPID_nem_ib_diff16(MPID_nem_ib_cm_ringbuf_head,
-                           MPID_nem_ib_cm_ringbuf_tail) < MPID_NEM_IB_CM_NSEG) {
-        MPID_nem_ib_cm_cmd_shadow_t *shadow =
-            (MPID_nem_ib_cm_cmd_shadow_t *) MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
-        /* Do not dereference a failed allocation */
-        MPIU_ERR_CHKANDJUMP(!shadow, mpi_errno, MPI_ERR_OTHER, "**malloc");
-        shadow->type = req->state;
-        shadow->req = req;
-
-        mpi_errno = MPID_nem_ib_cm_cas_core(req->responder_rank, shadow);
-        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_cas");
-    }
-    else {
-        /* No capacity now: remember when the retry was scheduled and queue the request */
-        dprintf("cm_cas,enqueue\n");
-        req->retry_decided = MPID_nem_ib_progress_engine_vt;
-        MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
-    }
-
-  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_CAS);
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_cas_release_core
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-/* Post one compare-and-swap on the scratch pad of `rank` that releases the
- * communication-manager port: the word at offset 0 is compared with this
- * process's rank and swapped with MPID_NEM_IB_CM_RELEASED.
- *
- * rank   - process whose scratch pad is targeted
- * shadow - caller-allocated shadow, passed (cast to uint64_t) to the scratch-pad
- *          call; its buf_from / buf_from_sz fields are handed over as outputs
- *
- * Returns MPI_SUCCESS, or an MPI error code when the connection object cannot be
- * obtained or the CAS cannot be posted. */
-int MPID_nem_ib_cm_cas_release_core(int rank, MPID_nem_ib_cm_cmd_shadow_t * shadow)
-{
-    int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
-
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_CAS_RELEASE_CORE);
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_CAS_RELEASE_CORE);
-
-    MPID_nem_ib_com_t *conp;
-    ibcom_errno = MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[rank], &conp);
-    /* Report the call that actually failed (same key MPID_nem_ib_cm_cas_release uses) */
-    MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
-    dprintf("cm_cas_release_core,%d->%d,conp=%p,remote_addr=%lx\n",
-            MPID_nem_ib_myrank, rank, conp,
-            (unsigned long) conp->icom_rmem[MPID_NEM_IB_COM_SCRATCH_PAD_TO] + 0);
-
-    /* Compare-and-swap to release communication manager port:
-     * compare with my rank, swap in MPID_NEM_IB_CM_RELEASED */
-    ibcom_errno =
-        MPID_nem_ib_com_cas_scratch_pad(MPID_nem_ib_scratch_pad_fds[rank],
-                                        (uint64_t) shadow,
-                                        0,
-                                        MPID_nem_ib_myrank,
-                                        MPID_NEM_IB_CM_RELEASED,
-                                        &shadow->buf_from, &shadow->buf_from_sz);
-    MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_cas_scratch_pad");
-    MPID_nem_ib_ncqe_scratch_pad += 1;
-
-  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_CAS_RELEASE_CORE);
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_cas_release
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-/* Release the CAS word held on vc->pg_rank by creating a request in state
- * MPID_NEM_IB_CM_CAS_RELEASE and either posting the releasing CAS right away or
- * queueing the request on MPID_nem_ib_cm_sendq.
- *
- * vc - virtual connection of the process whose CAS word is released
- *
- * Returns MPI_SUCCESS, or an MPI error code on allocation failure, when the
- * connection object cannot be obtained, or when posting the CAS fails. */
-int MPID_nem_ib_cm_cas_release(MPIDI_VC_t * vc)
-{
-    int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
-
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_CAS_RELEASE);
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_CAS_RELEASE);
-
-    dprintf("cm_cas_release,enter\n");
-
-    /* Prepare request structure for enqueued case */
-    MPID_nem_ib_cm_req_t *req = MPIU_Malloc(sizeof(MPID_nem_ib_cm_req_t));
-    MPIU_ERR_CHKANDJUMP(!req, mpi_errno, MPI_ERR_OTHER, "**malloc");
-    dprintf("req=%p\n", req);
-    req->state = MPID_NEM_IB_CM_CAS_RELEASE;
-    req->ref_count = 1; /* Released on draining SCQ */
-    req->retry_backoff = 0;
-    req->initiator_rank = MPID_nem_ib_myrank;
-    req->responder_rank = vc->pg_rank;
-    ibcom_errno =
-        MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[vc->pg_rank], &req->ibcom);
-    MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
-    dprintf("req->ibcom=%p\n", req->ibcom);
-
-    /* Increment transaction counter here because cm_cas_release is called only once
-     * (cm_cas_release_core might be called more than once when retrying) */
-    req->ibcom->outstanding_connection_tx += 1;
-    dprintf("cm_cas_release,%d->%d,connection_tx=%d\n", MPID_nem_ib_myrank, vc->pg_rank,
-            req->ibcom->outstanding_connection_tx);
-
-    /* Release remote scratch pad */
-    if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
-        req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY) {
-        MPID_nem_ib_cm_cmd_shadow_t *shadow =
-            (MPID_nem_ib_cm_cmd_shadow_t *) MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
-        /* Do not dereference a failed allocation */
-        MPIU_ERR_CHKANDJUMP(!shadow, mpi_errno, MPI_ERR_OTHER, "**malloc");
-        shadow->type = req->state;
-        shadow->req = req;
-
-        mpi_errno = MPID_nem_ib_cm_cas_release_core(req->responder_rank, shadow);
-        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_cm_cas_release");
-    }
-    else {
-        /* No capacity now: remember when the retry was scheduled and queue the request */
-        dprintf("cm_cas_release,enqueue\n");
-        req->retry_decided = MPID_nem_ib_progress_engine_vt;
-        MPID_nem_ib_cm_sendq_enqueue(&MPID_nem_ib_cm_sendq, req);
-    }
-
-  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_CAS_RELEASE);
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-/* Write one connection-management command into the scratch pad of `rank`.
- * A non-zero `syn` selects the SYN offset; otherwise the command goes to the
- * command slot selected by `ringbuf_index`. When sending SYN, the local QP is
- * additionally opened and the ring buffer allocated, unless that has already
- * been done for this connection. */
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_cmd_core
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_cm_cmd_core(int rank, MPID_nem_ib_cm_cmd_shadow_t * shadow, void *buf,
-                            MPIDI_msg_sz_t sz, uint32_t syn, uint16_t ringbuf_index)
-{
-    int mpi_errno = MPI_SUCCESS;
-    int rc;
-    int ib_port = 1;
-
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_CM_CMD_CORE);
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_CM_CMD_CORE);
-
-    dprintf("cm_cmd_core,enter,syn=%d\n", syn);
-
-    /* Point the request at the scratch-pad connection object, then put the command */
-    shadow->req->ibcom = MPID_nem_ib_scratch_pad_ibcoms[rank];
-    rc = MPID_nem_ib_com_put_scratch_pad(MPID_nem_ib_scratch_pad_fds[rank],
-                                         (uint64_t) shadow,
-                                         syn ? MPID_NEM_IB_CM_OFF_SYN :
-                                         MPID_NEM_IB_CM_OFF_CMD +
-                                         sizeof(MPID_nem_ib_cm_cmd_t) *
-                                         ((uint16_t) (ringbuf_index % MPID_NEM_IB_CM_NSEG)),
-                                         sz, buf, &(shadow->buf_from), &(shadow->buf_from_sz));
-    MPIU_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_put_scratch_pad");
-    MPID_nem_ib_ncqe_scratch_pad++;
-
-    /* Everything below applies to SYN only */
-    if (!syn) {
-        goto fn_exit;
-    }
-
-    /* Skip QP creation on race condition */
-    if (VC_FIELD(MPID_nem_ib_conns[rank].vc, connection_state) & MPID_NEM_IB_CM_LOCAL_QP_RESET) {
-        goto fn_exit;
-    }
-
-    /* Prepare QP (RESET). Attempting to overlap it with preparing QP (RESET) on the responder side */
-    rc = MPID_nem_ib_com_open(ib_port, MPID_NEM_IB_COM_OPEN_RC, &MPID_nem_ib_conns[rank].fd);
-    MPIU_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_open");
-
-    VC_FIELD(MPID_nem_ib_conns[rank].vc, connection_state) |= MPID_NEM_IB_CM_LOCAL_QP_RESET;
-
-    /* Store pointer to MPID_nem_ib_com */
-    rc = MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_conns[rank].fd,
-                                        &VC_FIELD(MPID_nem_ib_conns[rank].vc, ibcom));
-    MPIU_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
-
-    /* Allocate RDMA-write-to ring-buf for remote */
-    mpi_errno = MPID_nem_ib_ringbuf_alloc(MPID_nem_ib_conns[rank].vc);
-    MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ring_alloc");
-
-  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_CM_CMD_CORE);
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_notify_send
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-/* Write a MPID_NEM_IB_NOTIFY_OUTSTANDING_TX_EMPTY notification, carrying `myrank`
- * as the initiator rank, into the scratch pad of `pg_rank`.
- *
- * pg_rank - process that receives the notification
- * myrank  - rank stored in the notification as initiator_rank
- *
- * Returns MPI_SUCCESS, or an MPI error code on allocation failure or when the
- * scratch-pad write cannot be posted. Allocations made before a failed allocation
- * are released; once the write has been attempted nothing is freed here.
- * NOTE(review): ownership of shadow/req/buf_from after a failed write is not
- * visible from this function - confirm who releases them. */
-int MPID_nem_ib_cm_notify_send(int pg_rank, int myrank)
-{
-    int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
-    MPID_nem_ib_cm_cmd_shadow_t *shadow;
-    MPID_nem_ib_cm_req_t *req;
-    MPID_nem_ib_cm_notify_send_t *buf_from;
-
-    shadow = (MPID_nem_ib_cm_cmd_shadow_t *) MPIU_Malloc(sizeof(MPID_nem_ib_cm_cmd_shadow_t));
-    MPIU_ERR_CHKANDJUMP(!shadow, mpi_errno, MPI_ERR_OTHER, "**malloc");
-
-    req = MPIU_Malloc(sizeof(MPID_nem_ib_cm_req_t));
-    if (!req) {
-        MPIU_Free(shadow);
-    }
-    MPIU_ERR_CHKANDJUMP(!req, mpi_errno, MPI_ERR_OTHER, "**malloc");
-
-    /* Allocated last so that an earlier failure leaves nothing behind in the
-     * RDMA-write source pool */
-    buf_from = (MPID_nem_ib_cm_notify_send_t *)
-        MPID_nem_ib_rdmawr_from_alloc(sizeof(MPID_nem_ib_cm_notify_send_t));
-    if (!buf_from) {
-        MPIU_Free(req);
-        MPIU_Free(shadow);
-    }
-    MPIU_ERR_CHKANDJUMP(!buf_from, mpi_errno, MPI_ERR_OTHER, "**malloc");
-
-    shadow->type = MPID_NEM_IB_NOTIFY_OUTSTANDING_TX_EMPTY;
-
-    buf_from->type = MPID_NEM_IB_NOTIFY_OUTSTANDING_TX_EMPTY;
-    buf_from->initiator_rank = myrank;
-    shadow->req = req;
-    shadow->buf_from = (void *) buf_from;
-    shadow->buf_from_sz = sizeof(MPID_nem_ib_cm_notify_send_t);
-
-    /* Only the ibcom field of the request is filled in here */
-    shadow->req->ibcom = MPID_nem_ib_scratch_pad_ibcoms[pg_rank];
-
-    ibcom_errno =
-        MPID_nem_ib_com_wr_scratch_pad(MPID_nem_ib_scratch_pad_fds[pg_rank],
-                                       (uint64_t) shadow, shadow->buf_from, shadow->buf_from_sz);
-
-    MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_wr_scratch_pad");
-
-  fn_exit:
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_cm_notify_progress
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-/* Scan MPID_nem_ib_cm_notify_sendq and send the pending notification of every
- * entry whose connection has no outstanding connection transaction. Entries that
- * were sent are unlinked and freed; the others stay queued in their order.
- *
- * Returns MPI_SUCCESS, or an MPI error code if sending a notification fails. */
-int MPID_nem_ib_cm_notify_progress(void)
-{
-    int mpi_errno = MPI_SUCCESS;
-    int send_errno;
-    MPID_nem_ib_cm_notify_send_req_t *cur, *kept, *following;
-
-    kept = NULL;        /* last entry left in the queue in front of cur */
-    cur = MPID_nem_ib_cm_notify_sendq_head(MPID_nem_ib_cm_notify_sendq);
-    while (cur) {
-        if (cur->ibcom->outstanding_connection_tx != 0) {
-            /* Transactions still in flight: keep the entry and move on */
-            kept = cur;
-            cur = MPID_nem_ib_cm_notify_sendq_next(cur);
-            continue;
-        }
-
-        send_errno = MPID_nem_ib_cm_notify_send(cur->pg_rank, cur->my_rank);
-        MPIU_ERR_CHKANDJUMP(send_errno, mpi_errno, MPI_ERR_OTHER,
-                            "**MPID_nem_ib_cm_notify_send");
-
-        /* unlink cur */
-        following = MPID_nem_ib_cm_notify_sendq_next(cur);
-        if (kept == NULL) {
-            MPID_nem_ib_cm_notify_sendq_head(MPID_nem_ib_cm_notify_sendq) = following;
-        }
-        else {
-            MPID_nem_ib_cm_notify_sendq_next(kept) = following;
-        }
-        if (following == NULL) {
-            MPID_nem_ib_cm_notify_sendq.tail = kept;
-        }
-
-        MPIU_Free(cur);
-        cur = following;
-    }
-
-  fn_exit:
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
-#endif /* MPID_NEM_IB_ONDEMAND */
-
-/* Fetch (RDMA-read) the head pointer of the shared ring buffer of vc->pg_rank.
- * `sz` bytes are read from offset MPID_NEM_IB_RINGBUF_OFF_HEAD of its scratch pad;
- * `shadow` is passed along (cast to uint64_t) and its buf_from / buf_from_sz
- * fields are handed over as outputs. */
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_ringbuf_ask_fetch_core
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPID_nem_ib_ringbuf_ask_fetch_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t * shadow,
-                                       MPIDI_msg_sz_t sz)
-{
-    int mpi_errno = MPI_SUCCESS;
-    int rc;
-
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_FETCH_CORE);
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_FETCH_CORE);
-
-    dprintf("ringbuf_ask_fetch_core,req=%p\n", shadow->req);
-
-    rc = MPID_nem_ib_com_get_scratch_pad(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
-                                         (uint64_t) shadow,
-                                         MPID_NEM_IB_RINGBUF_OFF_HEAD,
-                                         sz, &shadow->buf_from, &shadow->buf_from_sz);
-    MPIU_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_get_scratch_pad");
-
-    /* One more completion is pending on the scratch-pad CQ, and it has to be
-     * polled directly so that the following CAS can be issued */
-    MPID_nem_ib_ncqe_scratch_pad_to_drain++;
-    MPID_nem_ib_ncqe_scratch_pad++;
-
-  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_FETCH_CORE);
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_ringbuf_ask_fetch
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-/* Begin asking for a slot of the shared ring buffer of vc->pg_rank by creating a
- * request in state MPID_NEM_IB_RINGBUF_ASK_FETCH. The fetch of the head pointer is
- * posted immediately when the connection is established, the scratch-pad queues
- * have capacity and no other ask from this process holds the guard; otherwise the
- * request is appended to MPID_nem_ib_ringbuf_sendq.
- *
- * vc - virtual connection of the process owning the ring buffer
- *
- * Returns MPI_SUCCESS, or an MPI error code on allocation failure, when the
- * connection object cannot be obtained, or when posting the fetch fails. */
-int MPID_nem_ib_ringbuf_ask_fetch(MPIDI_VC_t * vc)
-{
-    int mpi_errno = MPI_SUCCESS;
-    int ibcom_errno;
-
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_FETCH);
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_FETCH);
-
-    dprintf("ringbuf_ask_fetch,enter\n");
-
-    /* Prepare state of ask-send */
-    MPID_nem_ib_ringbuf_req_t *req = MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_req_t));
-    MPIU_ERR_CHKANDJUMP(!req, mpi_errno, MPI_ERR_OTHER, "**malloc");
-    dprintf("ask_fetch,req=%p\n", req);
-    req->state = MPID_NEM_IB_RINGBUF_ASK_FETCH;
-    req->retry_backoff = 0;
-    req->vc = vc;
-    ibcom_errno =
-        MPID_nem_ib_com_obtain_pointer(MPID_nem_ib_scratch_pad_fds[vc->pg_rank], &req->ibcom);
-    MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_obtain_pointer");
-
-    dprintf("ask_fetch,connection=%08x,ncqe=%d,ncom=%d,guard=%d\n",
-            VC_FIELD(vc, connection_state),
-            MPID_nem_ib_ncqe_scratch_pad,
-            req->ibcom->ncom_scratch_pad, VC_FIELD(vc, ibcom->ask_guard)
-);
-
-    /* Acquire remote scratch pad */
-    if (VC_FIELD(vc, connection_state) == MPID_NEM_IB_CM_ESTABLISHED &&
-        MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
-        req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY &&
-        !VC_FIELD(vc, ibcom->ask_guard)) {
-
-        /* Allocate before raising the guard so that an allocation failure
-         * cannot leave the guard up */
-        MPID_nem_ib_ringbuf_cmd_shadow_t *shadow = (MPID_nem_ib_ringbuf_cmd_shadow_t *)
-            MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_cmd_shadow_t));
-        MPIU_ERR_CHKANDJUMP(!shadow, mpi_errno, MPI_ERR_OTHER, "**malloc");
-
-        /* Let the guard up here to prevent CAS conflicts between consecutive asks
-         * from the same process */
-        VC_FIELD(vc, ibcom->ask_guard) = 1;
-
-        shadow->type = req->state;
-        shadow->req = req;
-
-        mpi_errno =
-            MPID_nem_ib_ringbuf_ask_fetch_core(req->vc, shadow,
-                                               sizeof(MPID_nem_ib_ringbuf_headtail_t));
-        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_ringbuf_ask_fetch");
-    }
-    else {
-        dprintf("ask_fetch,enqueue,req=%p\n", req);
-        MPID_nem_ib_ringbuf_sendq_enqueue(&MPID_nem_ib_ringbuf_sendq, req);
-    }
-
-  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_FETCH);
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_ringbuf_ask_cas_core
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-/* Post the compare-and-swap that advances the head pointer of the shared ring
- * buffer of vc->pg_rank from `head` to `head + 1`, then drop the ask guard of the
- * connection. `shadow` is passed along (cast to uint64_t) and its buf_from /
- * buf_from_sz fields are handed over as outputs.
- *
- * Returns MPI_SUCCESS, or an MPI error code if the CAS cannot be posted (the guard
- * is left untouched in that case). */
-int MPID_nem_ib_ringbuf_ask_cas_core(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_cmd_shadow_t * shadow,
-                                     uint64_t head)
-{
-    int mpi_errno = MPI_SUCCESS;
-    int rc;
-
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_CAS_CORE);
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_CAS_CORE);
-
-    dprintf("ringbuf_ask_cas_core,req=%p,head=%ld\n", shadow->req, head);
-
-    /* Compare-and-swap to increment head pointer */
-    rc = MPID_nem_ib_com_cas_scratch_pad(MPID_nem_ib_scratch_pad_fds[vc->pg_rank],
-                                         (uint64_t) shadow,
-                                         MPID_NEM_IB_RINGBUF_OFF_HEAD,
-                                         head, head + 1, &shadow->buf_from, &shadow->buf_from_sz);
-    MPIU_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_cas_scratch_pad");
-    MPID_nem_ib_ncqe_scratch_pad++;
-
-    /* Direct poll to drain CQ to check CAS result */
-    MPID_nem_ib_ncqe_scratch_pad_to_drain++;
-    dprintf("ringbuf_ask_cas_core,scratch_pad_to_drain=%d\n",
-            MPID_nem_ib_ncqe_scratch_pad_to_drain);
-
-    /* Let the guard down here to overlap CAS with a fetch of the following request
-     * when CAS fails, out-of-order acquire may happen, but it's OK */
-    VC_FIELD(vc, ibcom->ask_guard) = 0;
-
-  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_CAS_CORE);
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_ringbuf_ask_cas
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-/* Second step of asking for a shared ring-buffer slot: given the head/tail pair
- * already stored in req->fetched, try to acquire one slot with a CAS on the head.
- *
- * - Queues have capacity and the fetched head/tail show a free slot: the request
- *   moves to MPID_NEM_IB_RINGBUF_ASK_CAS and the CAS is posted.
- * - Queues have capacity but the ring buffer is full: the request is put back at
- *   the head of MPID_nem_ib_ringbuf_sendq.
- * - Queues are full: the request is appended to MPID_nem_ib_ringbuf_sendq.
- *
- * vc  - virtual connection of the process owning the ring buffer
- * req - ask request holding the fetched head and tail
- *
- * Returns MPI_SUCCESS, or an MPI error code on allocation failure or when posting
- * the CAS fails. */
-int MPID_nem_ib_ringbuf_ask_cas(MPIDI_VC_t * vc, MPID_nem_ib_ringbuf_req_t * req)
-{
-    int mpi_errno = MPI_SUCCESS;
-
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_CAS);
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_CAS);
-
-    dprintf("ask_cas,ncqe=%d,ncom=%d,head=%ld,tail=%d,diff=%d,nslot=%d\n",
-            MPID_nem_ib_ncqe_scratch_pad,
-            req->ibcom->ncom_scratch_pad,
-            req->fetched.head, req->fetched.tail,
-            MPID_nem_ib_diff16(req->fetched.head, req->fetched.tail),
-            VC_FIELD(vc, ibcom->local_ringbuf_nslot)
-);
-
-    /* Acquire one slot of the shared ring buffer */
-    if (MPID_nem_ib_ncqe_scratch_pad < MPID_NEM_IB_COM_MAX_CQ_CAPACITY &&
-        req->ibcom->ncom_scratch_pad < MPID_NEM_IB_COM_MAX_SQ_CAPACITY) {
-
-        if (MPID_nem_ib_diff16(req->fetched.head, req->fetched.tail) <
-            VC_FIELD(vc, ibcom->local_ringbuf_nslot)) {
-
-            dprintf("ask_cas,core\n");
-            /* Allocate first so that a failure leaves the request state untouched */
-            MPID_nem_ib_ringbuf_cmd_shadow_t *shadow = (MPID_nem_ib_ringbuf_cmd_shadow_t *)
-                MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_cmd_shadow_t));
-            MPIU_ERR_CHKANDJUMP(!shadow, mpi_errno, MPI_ERR_OTHER, "**malloc");
-            req->state = MPID_NEM_IB_RINGBUF_ASK_CAS;
-            shadow->type = req->state;
-            shadow->req = req;
-            mpi_errno = MPID_nem_ib_ringbuf_ask_cas_core(vc, shadow, (uint64_t) req->fetched.head);
-            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
-                                "**MPID_nem_ib_ringbuf_ask_cas");
-        }
-        else {
-            dprintf("ask_cas,ringbuf full,enqueue\n");
-            /* Ring-buffer is full */
-
-            /* Let the guard down so that this ask-fetch can be issued in ringbuf_progress */
-            /* NOTE(review): no statement here actually lowers ask_guard - confirm
-             * whether it is lowered elsewhere on this path. */
-
-            /* Retry from fetch */
-
-            /* Schedule retry */
-            req->retry_decided = MPID_nem_ib_progress_engine_vt;
-            req->retry_backoff = 0;
-
-            /* Make the ask-fetch in order */
-            MPID_nem_ib_ringbuf_sendq_enqueue_at_head(&MPID_nem_ib_ringbuf_sendq, req);
-        }
-    }
-    else {
-        dprintf("ask_cas,ncqe or ncom full,enqueue\n");
-        req->retry_decided = MPID_nem_ib_progress_engine_vt;
-        req->retry_backoff = 0;
-        MPID_nem_ib_ringbuf_sendq_enqueue(&MPID_nem_ib_ringbuf_sendq, req);
-    }
-
-  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_ASK_CAS);
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPID_nem_ib_ringbuf_progress
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-/*
- * MPID_nem_ib_ringbuf_progress - make one pass over MPID_nem_ib_ringbuf_sendq
- * and issue every queued ring-buffer request that can be issued right now.
- *
- * For each queued request:
- *   - it is left in the queue when its connection is not established, when
- *     its send queue or the scratch-pad completion queue is at capacity, when
- *     an ASK_FETCH retry back-off has not elapsed yet, or when the ask guard
- *     of its VC is still raised;
- *   - an ASK_CAS request is handed to MPID_nem_ib_ringbuf_ask_cas_core();
- *   - an ASK_FETCH request is handed to MPID_nem_ib_ringbuf_ask_fetch_core()
- *     if the VC uses the shared ring buffer, and silently dropped from the
- *     queue otherwise;
- *   - a request that was issued or dropped is unlinked from the queue.
- *
- * Returns MPI_SUCCESS, or an MPI error code when a command shadow cannot be
- * allocated, a core call fails, or a request has an unknown state.
- */
-int MPID_nem_ib_ringbuf_progress()
-{
-    int mpi_errno = MPI_SUCCESS;
-    MPID_nem_ib_ringbuf_req_t *sreq, *prev_sreq;
-    MPID_nem_ib_ringbuf_cmd_shadow_t *shadow;
-
-    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_RINGBUF_PROGRESS);
-    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_RINGBUF_PROGRESS);
-
-    //dprintf("ringbuf_send_progress,enter\n");
-
-    sreq = MPID_nem_ib_ringbuf_sendq_head(MPID_nem_ib_ringbuf_sendq);
-    if (sreq) {
-        /* prev_sreq trails sreq so that sreq can be unlinked in place */
-        prev_sreq = NULL;
-        do {
-            /* Not issuable now: keep it queued and look at the next one.
-             * NOTE(review): this tests ibcom->ncom, while
-             * MPID_nem_ib_ringbuf_ask_cas() tests ibcom->ncom_scratch_pad
-             * for the same kind of request -- confirm which one is intended. */
-            if (VC_FIELD(sreq->vc, connection_state) != MPID_NEM_IB_CM_ESTABLISHED ||
-                sreq->ibcom->ncom >= MPID_NEM_IB_COM_MAX_SQ_CAPACITY ||
-                MPID_nem_ib_ncqe_scratch_pad >= MPID_NEM_IB_COM_MAX_CQ_CAPACITY) {
-                goto next;
-            }
-
-            switch (sreq->state) {
-            case MPID_NEM_IB_RINGBUF_ASK_CAS:
-                dprintf("ringbuf_progress,ask_cas,req=%p\n", sreq);
-                shadow = (MPID_nem_ib_ringbuf_cmd_shadow_t *)
-                    MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_cmd_shadow_t));
-                /* Do not dereference a failed allocation; report it instead. */
-                MPIU_ERR_CHKANDJUMP(shadow == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
-                shadow->type = sreq->state;
-                shadow->req = sreq;
-                mpi_errno =
-                    MPID_nem_ib_ringbuf_ask_cas_core(sreq->vc, shadow,
-                                                     (uint64_t) sreq->fetched.head);
-                MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
-                                    "**MPID_nem_ib_ringbuf_connect_cas_core");
-                break;
-            case MPID_NEM_IB_RINGBUF_ASK_FETCH:
-                /* Honor the back-off: wait until enough progress-engine ticks
-                 * have passed since the retry was scheduled */
-                if (MPID_nem_ib_diff63(MPID_nem_ib_progress_engine_vt, sreq->retry_decided) <
-                    sreq->retry_backoff) {
-                    dprintf("ringbuf_progress,vt=%ld,retry_decided=%ld,diff=%ld,backoff=%ld\n",
-                            MPID_nem_ib_progress_engine_vt, sreq->retry_decided,
-                            MPID_nem_ib_diff63(MPID_nem_ib_progress_engine_vt, sreq->retry_decided),
-                            sreq->retry_backoff);
-                    goto next;
-                }
-                //dprintf("ringbuf_progress,ask_fetch,decided=%ld,vt=%ld,backoff=%ld\n",
-                //sreq->retry_decided, MPID_nem_ib_progress_engine_vt, sreq->retry_backoff);
-
-                /* Enqueued speculatively, so discard if not needed. */
-                if (VC_FIELD(sreq->vc, ibcom->local_ringbuf_type) == MPID_NEM_IB_RINGBUF_SHARED) {
-                    if (VC_FIELD(sreq->vc, ibcom->ask_guard)) {
-                        /* Another ask is in flight on this VC; try again later */
-                        goto next;
-                    }
-                    /* Allocate before raising the guard so that an allocation
-                     * failure cannot leave the guard up with nothing in flight */
-                    shadow = (MPID_nem_ib_ringbuf_cmd_shadow_t *)
-                        MPIU_Malloc(sizeof(MPID_nem_ib_ringbuf_cmd_shadow_t));
-                    MPIU_ERR_CHKANDJUMP(shadow == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
-                    dprintf("ringbuf_progress,ask_fetch,req=%p\n", sreq);
-                    VC_FIELD(sreq->vc, ibcom->ask_guard) = 1;
-                    shadow->type = sreq->state;
-                    shadow->req = sreq;
-                    mpi_errno =
-                        MPID_nem_ib_ringbuf_ask_fetch_core(sreq->vc, shadow,
-                                                           sizeof(MPID_nem_ib_ringbuf_headtail_t));
-                    MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER,
-                                        "**MPID_nem_ib_ringbuf_send_core");
-                }
-                break;
-            default:
-                dprintf("ringbuf_progress,unknown state=%d\n", sreq->state);
-                assert(0);
-                MPIU_ERR_INTERNALANDJUMP(mpi_errno, "ringbuf_progress,unknown state");
-            }
-
-            /* unlink sreq */
-            if (prev_sreq != NULL) {
-                MPID_nem_ib_ringbuf_sendq_next(prev_sreq) = MPID_nem_ib_ringbuf_sendq_next(sreq);
-            }
-            else {
-                MPID_nem_ib_ringbuf_sendq_head(MPID_nem_ib_ringbuf_sendq) =
-                    MPID_nem_ib_ringbuf_sendq_next(sreq);
-            }
-            if (MPID_nem_ib_ringbuf_sendq_next(sreq) == NULL) {
-                /* sreq was the last element, so its predecessor becomes the tail */
-                MPID_nem_ib_ringbuf_sendq.tail = prev_sreq;
-            }
-
-            /* Step to the successor; prev_sreq stays put because sreq is no
-             * longer in the queue */
-            sreq = MPID_nem_ib_ringbuf_sendq_next(sreq);
-
-            goto next_unlinked;
-          next:
-            /* sreq stays queued: it becomes the predecessor of the next one */
-            prev_sreq = sreq;
-            sreq = MPID_nem_ib_ringbuf_sendq_next(sreq);
-          next_unlinked:;
-        } while (sreq);
-    }
-
-  fn_exit:
-    /* NOTE(review): entered_send_progress is defined outside this function
-     * and is cleared on every exit path here -- confirm that resetting it
-     * from the ring-buffer progress routine is intended. */
-    entered_send_progress = 0;
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_RINGBUF_PROGRESS);
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/subconfigure.m4 b/src/mpid/ch3/channels/nemesis/netmod/ib/subconfigure.m4
deleted file mode 100644
index 505746f..0000000
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/subconfigure.m4
+++ /dev/null
@@ -1,46 +0,0 @@
-[#] start of __file__
-dnl MPICH_SUBCFG_AFTER=src/mpid/ch3/channels/nemesis
-
-dnl Prerequisite stage for the ch3:nemesis:ib netmod.
-dnl When BUILD_CH3_NEMESIS is true and "ib" appears in $nemesis_networks,
-dnl build_nemesis_netmod_ib is set to yes; the Automake conditional
-dnl BUILD_NEMESIS_NETMOD_IB is then defined from that variable.
-dnl The getpagesize probe runs whether or not the netmod is selected.
-AC_DEFUN([PAC_SUBCFG_PREREQ_]PAC_SUBCFG_AUTO_SUFFIX,[
- AM_COND_IF([BUILD_CH3_NEMESIS],[
- for net in $nemesis_networks ; do
- AS_CASE([$net],[ib],[build_nemesis_netmod_ib=yes])
- done
- ])
- AM_CONDITIONAL([BUILD_NEMESIS_NETMOD_IB],[test "X$build_nemesis_netmod_ib" = "Xyes"])
-
- # check if getpagesize is available
- AC_CHECK_FUNCS(getpagesize)
-])dnl
-
-dnl Body stage, active only when BUILD_NEMESIS_NETMOD_IB is true.
-dnl Looks for dcfa.h with libdcfa first and adds -ldcfa to WRAPPER_LIBS when
-dnl found; otherwise looks for infiniband/verbs.h with libibverbs and adds
-dnl -libverbs; configure aborts when neither provides ibv_open_device.
-dnl Each probe is wrapped in PAC_PUSH_FLAG and PAC_POP_FLAG on LIBS,
-dnl presumably so the probe does not leave the library in LIBS -- confirm.
-dnl Finally defines the netmod version, release date and ENABLE_COMM_OVERRIDES.
-AC_DEFUN([PAC_SUBCFG_BODY_]PAC_SUBCFG_AUTO_SUFFIX,[
-AM_COND_IF([BUILD_NEMESIS_NETMOD_IB],[
- AC_MSG_NOTICE([RUNNING CONFIGURE FOR ch3:nemesis:ib])
-
- PAC_SET_HEADER_LIB_PATH(ib)
- PAC_PUSH_FLAG(LIBS)
- PAC_CHECK_HEADER_LIB(dcfa.h,dcfa,ibv_open_device,dcfa_found=yes,dcfa_found=no)
- PAC_POP_FLAG(LIBS)
- if test "${dcfa_found}" = "yes" ; then
- AC_MSG_NOTICE([libdcfa is going to be linked.])
- PAC_APPEND_FLAG([-ldcfa],[WRAPPER_LIBS])
- else
- PAC_PUSH_FLAG(LIBS)
- PAC_CHECK_HEADER_LIB([infiniband/verbs.h],ibverbs,ibv_open_device,ibverbs_found=yes,ibverbs_found=no)
- PAC_POP_FLAG(LIBS)
- if test "${ibverbs_found}" = "yes" ; then
- AC_MSG_NOTICE([libibverbs is going to be linked.])
- PAC_APPEND_FLAG([-libverbs],[WRAPPER_LIBS])
- else
- AC_MSG_ERROR([Internal error: neither ibverbs nor dcfa was found])
- fi
- fi
-
- AC_DEFINE([MPID_NEM_IB_VERSION], ["0.9.0"], [Version of the IB netmod])
- AC_DEFINE([MPID_NEM_IB_RELEASE_DATE], ["2013-11-18"], [Release date of the IB netmod])
- AC_DEFINE([ENABLE_COMM_OVERRIDES], 1, [define to add per-vc function pointers to override send and recv functions, registered in ib_init.c])
-# AC_DEFINE([ENABLE_RNDV_WAIT_TIMER], 1, [make MPI_Wtime returns wait time. Wait time is elapsed time from MPIDI_CH3_Progress_start to MPIDI_CH3_Progress_end])
-])dnl end AM_COND_IF(BUILD_NEMESIS_NETMOD_IB,...)
-])dnl end _BODY
-
-[#] end of __file__
-----------------------------------------------------------------------
Summary of changes:
CHANGES | 3 -
README.vin | 39 -
src/mpid/ch3/channels/nemesis/netmod/Makefile.mk | 1 -
.../ch3/channels/nemesis/netmod/ib/Makefile.mk | 24 -
.../channels/nemesis/netmod/ib/cross_values.txt | 16 -
.../ch3/channels/nemesis/netmod/ib/errnames.txt | 69 -
.../ch3/channels/nemesis/netmod/ib/ib_finalize.c | 38 -
src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c | 2548 ----------------
src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h | 785 -----
src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h | 1061 -------
src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c | 1105 -------
src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c | 540 ----
.../ch3/channels/nemesis/netmod/ib/ib_malloc.c | 540 ----
src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c | 3176 --------------------
.../ch3/channels/nemesis/netmod/ib/ib_reg_mr.c | 386 ---
src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c | 1910 ------------
.../ch3/channels/nemesis/netmod/ib/subconfigure.m4 | 46 -
17 files changed, 0 insertions(+), 12287 deletions(-)
delete mode 100644 src/mpid/ch3/channels/nemesis/netmod/ib/Makefile.mk
delete mode 100644 src/mpid/ch3/channels/nemesis/netmod/ib/cross_values.txt
delete mode 100644 src/mpid/ch3/channels/nemesis/netmod/ib/errnames.txt
delete mode 100644 src/mpid/ch3/channels/nemesis/netmod/ib/ib_finalize.c
delete mode 100644 src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.c
delete mode 100644 src/mpid/ch3/channels/nemesis/netmod/ib/ib_ibcom.h
delete mode 100644 src/mpid/ch3/channels/nemesis/netmod/ib/ib_impl.h
delete mode 100644 src/mpid/ch3/channels/nemesis/netmod/ib/ib_init.c
delete mode 100644 src/mpid/ch3/channels/nemesis/netmod/ib/ib_lmt.c
delete mode 100644 src/mpid/ch3/channels/nemesis/netmod/ib/ib_malloc.c
delete mode 100644 src/mpid/ch3/channels/nemesis/netmod/ib/ib_poll.c
delete mode 100644 src/mpid/ch3/channels/nemesis/netmod/ib/ib_reg_mr.c
delete mode 100644 src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
delete mode 100644 src/mpid/ch3/channels/nemesis/netmod/ib/subconfigure.m4
hooks/post-receive
--
MPICH primary repository
More information about the commits mailing list