[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.1.2-85-g95cfe8a

Service Account noreply at mpich.org
Fri Aug 8 19:41:06 CDT 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  95cfe8a33eca95eb722a6216f997c6d7306ded98 (commit)
       via  cf8ec4e69b6a675e2602e8f84d56ae283c823ddd (commit)
      from  8b73bd97d382c6ac8bd8d5158a475bfe6b78a063 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/95cfe8a33eca95eb722a6216f997c6d7306ded98

commit 95cfe8a33eca95eb722a6216f997c6d7306ded98
Author: Igor Ivanov <Igor.Ivanov at itseez.com>
Date:   Fri Aug 8 16:19:32 2014 +0300

    netmod/mxm: Update MXM
    
    List of changes:
    - Adapted cancel recv to meet current code;
    - Changed active messaging callback flag;
    - Added the ability to configure the MXM netmod using
      MPICH_NETMOD_MXM_BULK_CONNECT and MPICH_NETMOD_MXM_BULK_DISCONNECT;
    - Improved debug output;
    
    Signed-off-by: Igor Ivanov <Igor.Ivanov at itseez.com>
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_cancel.c b/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_cancel.c
index 6144ace..df49239 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_cancel.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_cancel.c
@@ -66,7 +66,10 @@ int MPID_nem_mxm_cancel_recv(MPIDI_VC_t * vc, MPID_Request * req)
     _dbg_mxm_out_req(req);
 
   fn_exit:
-    return mpi_errno;
+    /* This function returns zero in case request is canceled
+     * and nonzero otherwise
+     */
+    return (!MPIR_STATUS_GET_CANCEL_BIT(req->status));
   fn_fail:ATTRIBUTE((unused))
         goto fn_exit;
 }
diff --git a/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_init.c b/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_init.c
index bc6d6d4..da5409a 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_init.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_init.c
@@ -67,6 +67,7 @@ static int _mxm_connect(MPID_nem_mxm_ep_t * ep, const char *business_card,
 static int _mxm_disconnect(MPID_nem_mxm_ep_t * ep);
 static int _mxm_add_comm(MPID_Comm * comm, void *param);
 static int _mxm_del_comm(MPID_Comm * comm, void *param);
+static int _mxm_conf(void);
 
 
 #undef FUNCNAME
@@ -78,6 +79,7 @@ int MPID_nem_mxm_post_init(void)
     int mpi_errno = MPI_SUCCESS;
 
 #if MXM_API >= MXM_VERSION(3,1)
+    /* Current logic guarantees that all VCs have been initialized before post init call */
     if (_mxm_obj.conf.bulk_connect) {
         mxm_ep_wireup(_mxm_obj.mxm_ep);
     }
@@ -308,8 +310,16 @@ int MPID_nem_mxm_vc_terminate(MPIDI_VC_t * vc)
     MPIDI_STATE_DECL(MPID_STATE_MXM_VC_TERMINATE);
     MPIDI_FUNC_ENTER(MPID_STATE_MXM_VC_TERMINATE);
 
-    while ((VC_FIELD(vc, pending_sends)) > 0)
-        MPID_nem_mxm_poll(FALSE);
+    if (vc->state != MPIDI_VC_STATE_CLOSED) {
+        /* VC is terminated as a result of a fault.  Complete
+         * outstanding sends with an error and terminate connection
+         * immediately. */
+        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+    }
+    else {
+        while ((VC_FIELD(vc, pending_sends)) > 0)
+            MPID_nem_mxm_poll(FALSE);
+    }
 
     mpi_errno = MPIDI_CH3U_Handle_connection(vc, MPIDI_VC_EVENT_TERMINATED);
     if (mpi_errno)
@@ -322,11 +332,12 @@ int MPID_nem_mxm_vc_terminate(MPIDI_VC_t * vc)
     goto fn_exit;
 }
 
-static int _mxm_init(int rank, int size)
+static int _mxm_conf(void)
 {
     int mpi_errno = MPI_SUCCESS;
     mxm_error_t ret = MXM_OK;
     unsigned long cur_ver;
+    char *env_val = NULL;
 
     cur_ver = mxm_get_version();
     if (cur_ver != MXM_API) {
@@ -347,14 +358,34 @@ static int _mxm_init(int rank, int size)
              "%ld.%ld", (cur_ver >> MXM_MAJOR_BIT) & 0xff, (cur_ver >> MXM_MINOR_BIT) & 0xff);
 #endif
 
-    if (cur_ver < MXM_VERSION(3, 2)) {
+    env_val = getenv("MPICH_NETMOD_MXM_BULK_CONNECT");
+    _mxm_obj.conf.bulk_connect = (env_val ? atoi(env_val) : (cur_ver < MXM_VERSION(3, 2) ? 0 : 1));
+    env_val = getenv("MPICH_NETMOD_MXM_BULK_DISCONNECT");
+    _mxm_obj.conf.bulk_disconnect = (env_val ?
+                                     atoi(env_val) : (cur_ver < MXM_VERSION(3, 2) ? 0 : 1));
+    if (cur_ver < MXM_VERSION(3, 2) &&
+        (_mxm_obj.conf.bulk_connect || _mxm_obj.conf.bulk_disconnect)) {
         _mxm_obj.conf.bulk_connect = 0;
         _mxm_obj.conf.bulk_disconnect = 0;
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE,
+                         (MPIU_DBG_FDEST,
+                          "WARNING: MPICH runs with %s version of MXM that is less than 3.2, "
+                          "so bulk connect/disconnect cannot work properly and will be turn off.",
+                          _mxm_obj.runtime_version));
     }
-    else {
-        _mxm_obj.conf.bulk_connect = 1;
-        _mxm_obj.conf.bulk_disconnect = 1;
-    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+static int _mxm_init(int rank, int size)
+{
+    int mpi_errno = MPI_SUCCESS;
+    mxm_error_t ret = MXM_OK;
+
+    ret = _mxm_conf();
 
     ret = mxm_config_read_opts(&_mxm_obj.mxm_ctx_opts, &_mxm_obj.mxm_ep_opts, "MPICH2", NULL, 0);
     MPIU_ERR_CHKANDJUMP1(ret != MXM_OK,
@@ -369,7 +400,7 @@ static int _mxm_init(int rank, int size)
 
     ret =
         mxm_set_am_handler(_mxm_obj.mxm_context, MXM_MPICH_HID_ADI_MSG, MPID_nem_mxm_get_adi_msg,
-                           MXM_AM_FLAG_THREAD_SAFE);
+                           0);
     MPIU_ERR_CHKANDJUMP1(ret != MXM_OK, mpi_errno, MPI_ERR_OTHER, "**mxm_set_am_handler",
                          "**mxm_set_am_handler %s", mxm_error_string(ret));
 
diff --git a/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_poll.c b/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_poll.c
index 50487fa..df6fa59 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_poll.c
@@ -69,10 +69,10 @@ void MPID_nem_mxm_get_adi_msg(mxm_conn_h conn, mxm_imm_t imm, void *data,
 
     MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "MPID_nem_mxm_get_adi_msg");
 
-    _dbg_mxm_output(5, "========> Getting ADI msg (data_size %d) \n", length);
-
     vc = mxm_conn_ctx_get(conn);
 
+    _dbg_mxm_output(5, "========> Getting ADI msg (from=%d data_size %d) \n", vc->pg_rank, length);
+
     MPID_nem_handle_pkt(vc, data, (MPIDI_msg_sz_t) (length));
 }
 
@@ -379,7 +379,6 @@ static int _mxm_irecv(MPID_nem_mxm_ep_t * ep, MPID_nem_mxm_req_area * req, int i
         mpi_errno = MPI_ERR_OTHER;
         goto fn_fail;
     }
-//    list_enqueue(&ep->out_queue, &req->mxm_req->queue);
 
   fn_exit:
     return mpi_errno;
diff --git a/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_send.c b/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_send.c
index 7a8f12a..3e60d56 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_send.c
@@ -54,8 +54,10 @@ int MPID_nem_mxm_iSendContig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr, MP
         REQ_FIELD(sreq, iov_buf)[1].length = data_sz;
     }
 
-    _dbg_mxm_output(5, "iSendContig ========> Sending ADI msg for req %p (data_size %d, %d) \n",
-                    sreq, sizeof(MPIDI_CH3_Pkt_t), data_sz);
+    _dbg_mxm_output(5,
+                    "iSendContig ========> Sending ADI msg (to=%d type=%d) for req %p (data_size %d, %d) \n",
+                    vc->pg_rank, sreq->dev.pending_pkt.type, sreq, sizeof(MPIDI_CH3_Pkt_t),
+                    data_sz);
 
     (VC_FIELD(vc, pending_sends)) += 1;
     sreq->ch.vc = vc;
@@ -100,8 +102,10 @@ int MPID_nem_mxm_iStartContigMsg(MPIDI_VC_t * vc, void *hdr, MPIDI_msg_sz_t hdr_
     sreq->dev.OnDataAvail = NULL;
     sreq->dev.tmpbuf = NULL;
 
-    _dbg_mxm_output(5, "iStartContigMsg ========> Sending ADI msg for req %p (data_size %d, %d) \n",
-                    sreq, sizeof(MPIDI_CH3_Pkt_t), data_sz);
+    _dbg_mxm_output(5,
+                    "iStartContigMsg ========> Sending ADI msg (to=%d type=%d) for req %p (data_size %d, %d) \n",
+                    vc->pg_rank, sreq->dev.pending_pkt.type, sreq, sizeof(MPIDI_CH3_Pkt_t),
+                    data_sz);
 
     REQ_FIELD(sreq, ctx) = sreq;
     REQ_FIELD(sreq, iov_buf) = REQ_FIELD(sreq, tmp_buf);
@@ -151,8 +155,10 @@ int MPID_nem_mxm_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
     MPIU_Memcpy(&(sreq->dev.pending_pkt), (char *) hdr, sizeof(MPIDI_CH3_Pkt_t));
     sreq->dev.tmpbuf = NULL;
 
-    _dbg_mxm_output(5, "SendNoncontig ========> Sending ADI msg for req %p (data_size %d, %d) \n",
-                    sreq, sizeof(MPIDI_CH3_Pkt_t), sreq->dev.segment_size);
+    _dbg_mxm_output(5,
+                    "SendNoncontig ========> Sending ADI msg (to=%d type=%d) for req %p (data_size %d, %d) \n",
+                    vc->pg_rank, sreq->dev.pending_pkt.type, sreq, sizeof(MPIDI_CH3_Pkt_t),
+                    sreq->dev.segment_size);
 
     REQ_FIELD(sreq, ctx) = sreq;
     REQ_FIELD(sreq, iov_buf) = REQ_FIELD(sreq, tmp_buf);
@@ -269,8 +275,7 @@ int MPID_nem_mxm_send(MPIDI_VC_t * vc, const void *buf, int count, MPI_Datatype
 
     mpi_errno = _mxm_isend(VC_FIELD(vc, mxm_ep), REQ_BASE(sreq), MXM_MPICH_ISEND,
                            (mxm_mq_h) comm->ch.netmod_comm, comm->rank, tag, _mxm_tag_mpi2mxm(tag,
-                                                                                              comm->
-                                                                                              context_id
+                                                                                              comm->context_id
                                                                                               +
                                                                                               context_offset),
                            1);
@@ -367,8 +372,7 @@ int MPID_nem_mxm_ssend(MPIDI_VC_t * vc, const void *buf, int count, MPI_Datatype
 
     mpi_errno = _mxm_isend(VC_FIELD(vc, mxm_ep), REQ_BASE(sreq), MXM_MPICH_ISEND_SYNC,
                            (mxm_mq_h) comm->ch.netmod_comm, comm->rank, tag, _mxm_tag_mpi2mxm(tag,
-                                                                                              comm->
-                                                                                              context_id
+                                                                                              comm->context_id
                                                                                               +
                                                                                               context_offset),
                            1);
@@ -465,8 +469,7 @@ int MPID_nem_mxm_isend(MPIDI_VC_t * vc, const void *buf, int count, MPI_Datatype
 
     mpi_errno = _mxm_isend(VC_FIELD(vc, mxm_ep), REQ_BASE(sreq), MXM_MPICH_ISEND,
                            (mxm_mq_h) comm->ch.netmod_comm, comm->rank, tag, _mxm_tag_mpi2mxm(tag,
-                                                                                              comm->
-                                                                                              context_id
+                                                                                              comm->context_id
                                                                                               +
                                                                                               context_offset),
                            0);
@@ -564,8 +567,7 @@ int MPID_nem_mxm_issend(MPIDI_VC_t * vc, const void *buf, int count, MPI_Datatyp
 
     mpi_errno = _mxm_isend(VC_FIELD(vc, mxm_ep), REQ_BASE(sreq), MXM_MPICH_ISEND_SYNC,
                            (mxm_mq_h) comm->ch.netmod_comm, comm->rank, tag, _mxm_tag_mpi2mxm(tag,
-                                                                                              comm->
-                                                                                              context_id
+                                                                                              comm->context_id
                                                                                               +
                                                                                               context_offset),
                            0);
@@ -712,7 +714,6 @@ static int _mxm_isend(MPID_nem_mxm_ep_t * ep, MPID_nem_mxm_req_area * req,
 
     if (block)
         _mxm_req_wait(&mxm_sreq->base);
-//    list_enqueue(&ep->out_queue, &req->mxm_req->queue);
 
   fn_exit:
     return mpi_errno;

http://git.mpich.org/mpich.git/commitdiff/cf8ec4e69b6a675e2602e8f84d56ae283c823ddd

commit cf8ec4e69b6a675e2602e8f84d56ae283c823ddd
Author: Igor Ivanov <Igor.Ivanov at itseez.com>
Date:   Tue Jul 29 18:58:31 2014 +0300

    nemesis: Fix memory leak related to MPID_nem_register_initcomp_cb
    
    Every call of MPID_nem_register_initcomp_cb allocates memory but
    it is not freed after completion of all post netmod init callbacks.
    
    Signed-off-by: Igor Ivanov <Igor.Ivanov at itseez.com>
    
    Change-Id: Iacbbf4d95cf4ed11c2fc9f753d42608b19ef0d35
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_init.c b/src/mpid/ch3/channels/nemesis/src/ch3_init.c
index 78a44a9..0cb3a36 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_init.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_init.c
@@ -346,6 +346,7 @@ int MPIDI_CH3_InitCompleted(void)
 {
     int mpi_errno = MPI_SUCCESS;
     initcomp_cb_t *ep;
+    initcomp_cb_t *ep_tmp;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_INITCOMPLETED);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_INITCOMPLETED);
@@ -354,7 +355,9 @@ int MPIDI_CH3_InitCompleted(void)
     {
         mpi_errno = ep->callback();
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        ep_tmp = ep;
         ep = ep->next;
+        MPIU_Free(ep_tmp);
     }
 
  fn_exit:

-----------------------------------------------------------------------

Summary of changes:
 .../ch3/channels/nemesis/netmod/mxm/mxm_cancel.c   |    5 ++-
 .../ch3/channels/nemesis/netmod/mxm/mxm_init.c     |   49 ++++++++++++++++----
 .../ch3/channels/nemesis/netmod/mxm/mxm_poll.c     |    5 +-
 .../ch3/channels/nemesis/netmod/mxm/mxm_send.c     |   31 ++++++------
 src/mpid/ch3/channels/nemesis/src/ch3_init.c       |    3 +
 5 files changed, 65 insertions(+), 28 deletions(-)


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list