[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.0.4-435-g117a24f

mysql vizuser noreply at mpich.org
Sat Aug 3 18:17:50 CDT 2013


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master, has been updated
       via  117a24f5bc55ba13ca643903b5a64469d5ac946c (commit)
      from  d81904d6f6a76a44588b34e8878c5297486d9ea2 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/117a24f5bc55ba13ca643903b5a64469d5ac946c

commit 117a24f5bc55ba13ca643903b5a64469d5ac946c
Author: Pavan Balaji <balaji at mcs.anl.gov>
Date:   Sat Aug 3 13:04:53 2013 -0500

    Intel scif code contribution.
    
    This update fixes the following problems:
     * Unexpected connection, mostly appears with number of ranks > 3
     * Several problems related to SCIF DMA usage
     * Intel(R) Symmetric Communication Interface (Intel(R) SCIF)
       registered memory used for DMA was not being unregistered
    
    Current status:
     * IMB-MPI1 w/ CHECK runs well
     * MPICH tests run except spawn tests
    
    Known issues:
     * Spawn tests still fail

diff --git a/src/mpid/ch3/channels/nemesis/netmod/scif/scif_finalize.c b/src/mpid/ch3/channels/nemesis/netmod/scif/scif_finalize.c
index 0989842..ea1ed87 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/scif/scif_finalize.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/scif/scif_finalize.c
@@ -17,10 +17,21 @@
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPID_nem_scif_finalize(void)
 {
+    int i, ret;
+    scifconn_t *it_sc;
+
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_SCIF_FINALIZE);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_SCIF_FINALIZE);
 
+    for (i = 0; i < MPID_nem_scif_nranks; ++i) {
+        it_sc = &MPID_nem_scif_conns[i];
+        if (it_sc->fd == -1) {
+            continue;   /* no connection */
+        }
+        if (scif_close(it_sc->fd) == 0)
+            it_sc->fd = -1;
+    }
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_SCIF_FINALIZE);
     return MPI_SUCCESS;
 }
diff --git a/src/mpid/ch3/channels/nemesis/netmod/scif/scifrw.c b/src/mpid/ch3/channels/nemesis/netmod/scif/scifrw.c
index a4e0588..c1536d2 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/scif/scifrw.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/scif/scifrw.c
@@ -4,7 +4,7 @@
  *      See COPYRIGHT in top-level directory.
  *
  *  Portions of this code were written by Intel Corporation.
- *  Copyright (C) 2011-2013 Intel Corporation.  Intel provides this material
+ *  Copyright (C) 2011-2012 Intel Corporation.  Intel provides this material
  *  to Argonne National Laboratory subject to Software Grant and Corporate
  *  Contributor License Agreement dated February 8, 2012.
  */
@@ -56,6 +56,8 @@ int MPID_nem_scif_init_shmsend(shmchan_t * csend, int ep, int rank)
     csend->pos = -1;
     csend->reg = 0;
     csend->rank = rank;
+    csend->dma_count = 0;
+    csend->dma_chdseqno = 0;
 
   fn_exit:
     return retval;
@@ -87,6 +89,29 @@ int MPID_nem_scif_init_shmrecv(shmchan_t * crecv, int ep, off_t offs, int rank)
     return retval;
 }
 
+void MPID_nem_scif_unregmem(int ep, shmchan_t * c)
+{
+    regmem_t *rp = c->reg, *prev = c->reg;
+    uint64_t lseqno = c->dma_chdseqno;
+
+    while (rp && c->dma_count) {
+        if (rp->seqno <= lseqno) {
+            scif_unregister(ep, rp->offset, rp->size);
+            prev->next = rp->next;
+            if (c->reg == rp) {
+                c->reg = rp->next;
+            }
+            free(rp);
+            --c->dma_count;
+            rp = prev->next;
+        }
+        else {
+            prev = rp;
+            rp = rp->next;
+        }
+    }
+}
+
 static regmem_t *regmem(int ep, shmchan_t * c, void *addr, size_t len)
 {
     regmem_t *rp;
@@ -161,6 +186,8 @@ static int dma_read(int ep, shmchan_t * c, void *recv_buf, off_t raddr, size_t m
             fprintf(stderr, "recv_buf: %p raddr: 0x%lx buflen: %ld\n",
                     recv_buf, raddr + c->pos, buflen);
         }
+        scif_fence_mark(ep, SCIF_FENCE_INIT_SELF, &mark);
+        scif_fence_wait(ep, mark);
         *did_dma = 1;
         goto fn_exit;
     }
@@ -423,6 +450,11 @@ ssize_t MPID_nem_scif_writev(int ep, shmchan_t * c, const struct iovec * iov, in
         c->curp += len;
         nwritten += iovlen;
         ++c->seqno;
+
+        if (did_dma) {
+            rp->seqno = c->seqno;
+            ++c->dma_count;
+        }
     }
   fn_exit:
 #if !defined(__MIC__)
diff --git a/src/mpid/ch3/channels/nemesis/netmod/scif/scifrw.h b/src/mpid/ch3/channels/nemesis/netmod/scif/scifrw.h
index 075eaf4..f371fc0 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/scif/scifrw.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/scif/scifrw.h
@@ -24,6 +24,8 @@ typedef struct regmem {
     char *base;
     size_t size;
     off_t offset;               /* for DMA */
+    uint64_t seqno;             /* sender sequence number, */
+    /* used for unreg */
     struct regmem *next;
 } regmem_t;
 
@@ -47,6 +49,8 @@ typedef struct {
     ssize_t dmaend;             /* end of the read in the DMA buffer */
     regmem_t *reg;              /* registration list */
     int rank;                   /* mostly for debugging */
+    int dma_count;              /* count of DMA ops (send only) */
+    int dma_chdseqno;           /* last checked seqno           */
 } shmchan_t;
 
 int MPID_nem_scif_init_shmsend(shmchan_t * csend, int ep, int rank);
@@ -70,4 +74,15 @@ static inline int MPID_nem_scif_chk_seqno(shmchan_t * csend, int seqno)
 {
     return *csend->lseqno >= seqno;
 }
+
+static inline int MPID_nem_scif_chk_dma_unreg(shmchan_t * csend)
+{
+    uint64_t lseqno = *csend->lseqno;
+
+    if (lseqno != csend->dma_chdseqno) {
+        csend->dma_chdseqno = lseqno;
+        return csend->dma_count;
+    }
+    return 0;
+}
 #endif
diff --git a/src/mpid/ch3/channels/nemesis/netmod/scif/scifsm.c b/src/mpid/ch3/channels/nemesis/netmod/scif/scifsm.c
index f77ec5a..e2dd96b 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/scif/scifsm.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/scif/scifsm.c
@@ -70,10 +70,8 @@ static int MPID_nem_scif_recv_handler(scifconn_t * const sc)
                 rreq->dev.iov_count =
                     &rreq->dev.iov[rreq->dev.iov_offset + rreq->dev.iov_count] - iov;
                 rreq->dev.iov_offset = iov - rreq->dev.iov;
-                MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "bytes_recvd = %ld",
-                               (long int) bytes_recvd);
-                MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "iov len = %ld",
-                               (long int) iov->MPID_IOV_LEN);
+                MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "bytes_recvd = %ld", (long int) bytes_recvd);
+                MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "iov len = %ld", (long int) iov->MPID_IOV_LEN);
                 MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "iov_offset = %d", rreq->dev.iov_offset);
                 goto fn_exit;
             }
@@ -111,8 +109,7 @@ static int MPID_nem_scif_recv_handler(scifconn_t * const sc)
     return mpi_errno;
   fn_fail:     /* comm related failures jump here */
     {
-        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail",
-                      "**comm_fail %d", sc_vc->pg_rank);
+        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", sc_vc->pg_rank);
     }
   fn_noncomm_fail:     /* NON-comm related failures jump here */
     goto fn_exit;
@@ -143,6 +140,13 @@ static int state_commrdy_handler(int is_readable, int is_writeable, scifconn_t *
             MPIU_ERR_POP(mpi_errno);
         /* check to see if this VC is waiting for outstanding sends to
          * finish in order to terminate */
+        if (MPIDI_CH3I_Sendq_empty(sc_vc_scif->send_queue) && sc_vc_scif->terminate == 1) {
+            /* The sendq is empty, so we can immediately terminate
+             * the connection. */
+            mpi_errno = MPID_nem_scif_vc_terminated(sc_vc);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+        }
     }
   fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_STATE_COMMRDY_HANDLER);
@@ -166,6 +170,9 @@ int MPID_nem_scif_connpoll(int in_blocking_poll)
         scifconn_t *it_sc = &MPID_nem_scif_conns[i];
         if (it_sc->fd == -1)
             continue;   /* no connection */
+        if (MPID_nem_scif_chk_dma_unreg(&it_sc->csend)) {
+            MPID_nem_scif_unregmem(it_sc->fd, &it_sc->csend);
+        }
         is_readable = MPID_nem_scif_poll_recv(&it_sc->crecv);
         is_writeable = MPID_nem_scif_poll_send(it_sc->fd, &it_sc->csend);
         MPIU_ERR_CHKANDJUMP1(is_writeable == -1, mpi_errno, MPI_ERR_OTHER,
@@ -180,8 +187,7 @@ int MPID_nem_scif_connpoll(int in_blocking_poll)
             fds.revents = 0;
             MPIU_ERR_CHKANDJUMP1(poll(&fds, 1, 0) < 0, mpi_errno, MPI_ERR_OTHER,
                                  "**poll", "**poll %s", MPIU_Strerror(errno));
-            MPIU_ERR_CHKANDJUMP(fds.revents & (POLLERR | POLLHUP), mpi_errno,
-                                MPI_ERR_OTHER, "**poll");
+            MPIU_ERR_CHKANDJUMP(fds.revents & POLLERR, mpi_errno, MPI_ERR_OTHER, "**poll");
         }
     }
     if (npolls++ >= NPOLLS)
@@ -190,7 +196,6 @@ int MPID_nem_scif_connpoll(int in_blocking_poll)
   fn_exit:
     return mpi_errno;
   fn_fail:
-    MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE,
-                     (MPIU_DBG_FDEST, "failure. mpi_errno = %d", mpi_errno));
+    MPIU_DBG_MSG_FMT(NEM_SOCK_DET, VERBOSE, (MPIU_DBG_FDEST, "failure. mpi_errno = %d", mpi_errno));
     goto fn_exit;
 }

-----------------------------------------------------------------------

Summary of changes:
 .../channels/nemesis/netmod/scif/scif_finalize.c   |   11 ++++++
 src/mpid/ch3/channels/nemesis/netmod/scif/scifrw.c |   34 +++++++++++++++++++-
 src/mpid/ch3/channels/nemesis/netmod/scif/scifrw.h |   15 +++++++++
 src/mpid/ch3/channels/nemesis/netmod/scif/scifsm.c |   25 +++++++++------
 4 files changed, 74 insertions(+), 11 deletions(-)


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list