[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.1.3-152-g35de3d1

Service Account noreply at mpich.org
Thu Nov 6 22:32:43 CST 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  35de3d16d75207b1a7270a469c954e1226f25a8c (commit)
       via  cf13c7850dd482380a475f132885428ecf918051 (commit)
       via  af39138730abda2829e98f28eaa3106f4e8b3820 (commit)
      from  ed2813ae133657ac84669f29cfc12c0ce8267c2f (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/35de3d16d75207b1a7270a469c954e1226f25a8c

commit 35de3d16d75207b1a7270a469c954e1226f25a8c
Author: Huiwei Lu <huiweilu at mcs.anl.gov>
Date:   Thu Nov 6 15:58:40 2014 -0600

    Fixes env in runtest
    
    When the first 'env' parameter is parsed, an extra space is added in
    front of its value. When the script kicks off each test, that leading
    space means the value is not in the form the script expects, and it
    complains in the output: "not in a=b form".
    
    Signed-off-by: Wesley Bland <wbland at anl.gov>

diff --git a/test/mpi/runtests.in b/test/mpi/runtests.in
index 1659c8f..ba8dd6a 100644
--- a/test/mpi/runtests.in
+++ b/test/mpi/runtests.in
@@ -439,7 +439,12 @@ sub RunList {
 		    $mpiexecArgs = "$mpiexecArgs $value";
 		}
 		elsif ($key eq "env") {
-		    $progEnv = "$progEnv $value";
+		    if ($progEnv eq "") {
+			$progEnv = "$value";
+		    }
+		    else {
+			$progEnv = "$progEnv $value";
+		    }
 		}
 		elsif ($key eq "mpiversion") {
 		    $mpiVersion = $value;

http://git.mpich.org/mpich.git/commitdiff/cf13c7850dd482380a475f132885428ecf918051

commit cf13c7850dd482380a475f132885428ecf918051
Author: Huiwei Lu <huiweilu at mcs.anl.gov>
Date:   Wed Nov 5 14:55:55 2014 -0600

    Adds a CVAR to enable/disable fault tolerance
    
    MPIR_CVAR_ENABLE_FT is added to enable/disable fault tolerance related
    code. For performance reasons, FT is disabled by default.
    
    Changes FT related LMT RTS code to use this CVAR.
    
    Signed-off-by: Wesley Bland <wbland at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c b/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c
index 28662af..5df9a57 100644
--- a/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c
+++ b/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c
@@ -9,17 +9,20 @@
 /*
 === BEGIN_MPI_T_CVAR_INFO_BLOCK ===
 
+categories:
+    - name        : FT
+      description : cvars that control behavior of fault tolerance
+
 cvars:
-   - name       : MPIR_CVAR_NEM_LMT_RTS_QUEUE_SIZE
-     category   : CH3
-     type       : int
-     default    : 1024
+   - name       : MPIR_CVAR_ENABLE_FT
+     category   : FT
+     type       : boolean
+     default    : false
      class      : device
      verbosity  : MPI_T_VERBOSITY_USER_BASIC
      scope      : MPI_T_SCOPE_ALL_EQ
      description : >-
-       The initial size of the NEM_LMT_RTS_QUEUE used to track RTS
-       messages before the LMT setup.
+       Enable fault tolerance functions
 
 === END_MPI_T_CVAR_INFO_BLOCK ===
 */
@@ -123,8 +126,10 @@ int MPID_nem_lmt_RndvSend(MPID_Request **sreq_p, const void * buf, int count,
 
     MPIU_THREAD_CS_ENTER(LMT,);
     mpi_errno = vc->ch.lmt_initiate_lmt(vc, &upkt.p, sreq);
-    if (MPI_SUCCESS == mpi_errno)
-        MPID_nem_lmt_rtsq_enqueue(&vc->ch.lmt_rts_queue, sreq);
+    if (MPIR_CVAR_ENABLE_FT) {
+        if (MPI_SUCCESS == mpi_errno)
+            MPID_nem_lmt_rtsq_enqueue(&vc->ch.lmt_rts_queue, sreq);
+    }
     MPIU_THREAD_CS_EXIT(LMT,);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
@@ -319,8 +324,10 @@ static int pkt_CTS_handler(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt, MPIDI_msg_sz_t
     MPID_Request_get_ptr(cts_pkt->sender_req_id, sreq);
 
     MPIU_THREAD_CS_ENTER(LMT,);
-    /* Remove the request from the VC RTS queue. */
-    MPID_nem_lmt_rtsq_search_remove(&vc->ch.lmt_rts_queue, cts_pkt->sender_req_id, &rts_sreq);
+    if (MPIR_CVAR_ENABLE_FT) {
+        /* Remove the request from the VC RTS queue. */
+        MPID_nem_lmt_rtsq_search_remove(&vc->ch.lmt_rts_queue, cts_pkt->sender_req_id, &rts_sreq);
+    }
     MPIU_THREAD_CS_EXIT(LMT,);
 
     sreq->ch.lmt_req_id = cts_pkt->receiver_req_id;
diff --git a/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt_shm.c b/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt_shm.c
index 525ffd4..b491936 100644
--- a/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt_shm.c
+++ b/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt_shm.c
@@ -813,13 +813,15 @@ int MPID_nem_lmt_shm_vc_terminated(MPIDI_VC_t *vc)
     }
 
     /* If there is anything in the RTS queue, it needs to be cleared out. */
-    MPIU_THREAD_CS_ENTER(LMT,);
-    while (!MPID_nem_lmt_rtsq_empty(vc_ch->lmt_rts_queue)) {
-        MPID_nem_lmt_rtsq_dequeue(&vc_ch->lmt_rts_queue, &req);
-        req->status.MPI_ERROR = req_errno;
-        MPIDI_CH3U_Request_complete(req);
+    if (MPIR_CVAR_ENABLE_FT) {
+        MPIU_THREAD_CS_ENTER(LMT,);
+        while (!MPID_nem_lmt_rtsq_empty(vc_ch->lmt_rts_queue)) {
+            MPID_nem_lmt_rtsq_dequeue(&vc_ch->lmt_rts_queue, &req);
+            req->status.MPI_ERROR = req_errno;
+            MPIDI_CH3U_Request_complete(req);
+        }
+        MPIU_THREAD_CS_EXIT(LMT,);
     }
-    MPIU_THREAD_CS_EXIT(LMT,);
 
     /* We empty the vc queue, but don't remove the vc from the global
        list.  That will eventually happen when lmt_shm_progress()
diff --git a/test/mpi/ft/testlist b/test/mpi/ft/testlist
index 41fa987..d2706ce 100644
--- a/test/mpi/ft/testlist
+++ b/test/mpi/ft/testlist
@@ -1,18 +1,18 @@
-die 4 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false resultTest=TestStatusNoErrors
-abort 2 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false xfail=ticket1537
-sendalive 4 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false resultTest=TestStatusNoErrors
-isendalive 3 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false
-multi_isendalive 4 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false
-senddead 2 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
-recvdead 2 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
-isenddead 2 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
-irecvdead 2 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
-barrier 4 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
-gather 4 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
-reduce 4 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
-bcast 4 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
-scatter 4 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
-anysource 3 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
-revoke_nofail 2 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
-shrink 8 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
-agree 4 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
+die 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false resultTest=TestStatusNoErrors
+abort 2 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false xfail=ticket1537
+sendalive 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false resultTest=TestStatusNoErrors
+isendalive 3 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false
+multi_isendalive 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false
+senddead 2 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
+recvdead 2 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
+isenddead 2 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
+irecvdead 2 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
+barrier 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
+gather 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
+reduce 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
+bcast 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
+scatter 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
+anysource 3 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
+revoke_nofail 2 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
+shrink 8 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945
+agree 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=ticket1945

http://git.mpich.org/mpich.git/commitdiff/af39138730abda2829e98f28eaa3106f4e8b3820

commit af39138730abda2829e98f28eaa3106f4e8b3820
Author: Huiwei Lu <huiweilu at mcs.anl.gov>
Date:   Mon Nov 3 11:47:07 2014 -0600

    Improves RTS queue to be dynamic and VC specific
    
    For fault tolerance use, an RTS queue was added in [81b3911a] to track
    shm LMT RTS messages. However, that queue is global and static, which
    may not be scalable.
    
    This patch moves the RTS queue to struct MPIDI_CH3I_VC, to be VC
    specific as the lmt_queue is.  Also it improves the queue to use
    GENERIC_Q and the 'dev.next' field so it does not need to malloc
    additional space.
    
    Signed-off-by: Wesley Bland <wbland at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/include/mpid_nem_impl.h b/src/mpid/ch3/channels/nemesis/include/mpid_nem_impl.h
index c5b9111..a726b7d 100644
--- a/src/mpid/ch3/channels/nemesis/include/mpid_nem_impl.h
+++ b/src/mpid/ch3/channels/nemesis/include/mpid_nem_impl.h
@@ -55,10 +55,6 @@ typedef struct MPID_nem_pkt_lmt_rts
 }
 MPID_nem_pkt_lmt_rts_t;
 
-extern int *MPID_nem_lmt_rts_queue;
-extern int MPID_nem_lmt_rts_queue_last_inserted;
-extern int MPID_nem_lmt_rts_queue_size;
-
 typedef struct MPID_nem_pkt_lmt_cts
 {
     MPIDI_CH3_Pkt_type_t type;
diff --git a/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_pre.h b/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_pre.h
index 8289043..96c43f9 100644
--- a/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_pre.h
+++ b/src/mpid/ch3/channels/nemesis/include/mpidi_ch3_pre.h
@@ -7,6 +7,7 @@
 #if !defined(MPICH_MPIDI_CH3_PRE_H_INCLUDED)
 #define MPICH_MPIDI_CH3_PRE_H_INCLUDED
 #include "mpid_nem_pre.h"
+#include "mpid_nem_generic_queue.h"
 
 #if defined(HAVE_NETINET_IN_H)
     #include <netinet/in.h>
@@ -47,6 +48,30 @@ MPIDI_CH3I_VC_state_t;
 #define MPID_NEM_VC_NETMOD_AREA_LEN 128
 #define MPID_NEM_REQ_NETMOD_AREA_LEN 192
 
+/* define functions for access MPID_nem_lmt_rts_queue_t */
+typedef GENERIC_Q_DECL(struct MPID_Request) MPID_nem_lmt_rts_queue_t;
+#define MPID_nem_lmt_rtsq_empty(q) GENERIC_Q_EMPTY (q)
+#define MPID_nem_lmt_rtsq_head(q) GENERIC_Q_HEAD (q)
+#define MPID_nem_lmt_rtsq_enqueue(qp, ep) do {                                          \
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST,                         \
+                          "MPID_nem_lmt_rtsq_enqueue req=%p (handle=%#x), queue=%p",    \
+                          ep, (ep)->handle, qp));                                       \
+        GENERIC_Q_ENQUEUE (qp, ep, dev.next);                                           \
+    } while (0)
+#define MPID_nem_lmt_rtsq_dequeue(qp, epp)  do {                                        \
+        GENERIC_Q_DEQUEUE (qp, epp, dev.next);                                          \
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST,                         \
+                          "MPID_nem_lmt_rtsq_dequeue req=%p (handle=%#x), queue=%p",    \
+                          *(epp), *(epp) ? (*(epp))->handle : -1, qp));                 \
+    } while (0)
+#define MPID_nem_lmt_rtsq_search_remove(qp, req_id, epp) do {                           \
+        GENERIC_Q_SEARCH_REMOVE(qp, _e->handle == (req_id), epp,                        \
+                struct MPID_Request, dev.next);                                         \
+        MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST,                         \
+                    "MPID_nem_lmt_rtsq_search_remove req=%p (handle=%#x), queue=%p",    \
+                    *(epp), req_id, qp));                                               \
+} while (0)
+
 typedef struct MPIDI_CH3I_VC
 {
     int pg_rank;
@@ -110,6 +135,7 @@ typedef struct MPIDI_CH3I_VC
     struct {struct MPID_nem_lmt_shm_wait_element *head, *tail;} lmt_queue;
     struct MPID_nem_lmt_shm_wait_element *lmt_active_lmt;
     int lmt_enqueued; /* FIXME: used for debugging */
+    MPID_nem_lmt_rts_queue_t lmt_rts_queue;
 
     /* Pointer to per-vc packet handlers */
     MPIDI_CH3_PktHandler_Fcn **pkt_handler;
diff --git a/src/mpid/ch3/channels/nemesis/src/mpid_nem_finalize.c b/src/mpid/ch3/channels/nemesis/src/mpid_nem_finalize.c
index 1ee3f46..272ac50 100644
--- a/src/mpid/ch3/channels/nemesis/src/mpid_nem_finalize.c
+++ b/src/mpid/ch3/channels/nemesis/src/mpid_nem_finalize.c
@@ -29,7 +29,6 @@ int MPID_nem_finalize(void)
     /* these are allocated in MPID_nem_mpich_init, not MPID_nem_init */
     MPIU_Free(MPID_nem_recv_seqno);
     MPIU_Free(MPID_nem_fboxq_elem_list);
-    MPIU_Free(MPID_nem_lmt_rts_queue);
 
     /* from MPID_nem_init */
     MPIU_Free(MPID_nem_mem_region.FreeQ);
diff --git a/src/mpid/ch3/channels/nemesis/src/mpid_nem_init.c b/src/mpid/ch3/channels/nemesis/src/mpid_nem_init.c
index 6a23ed1..a92c763 100644
--- a/src/mpid/ch3/channels/nemesis/src/mpid_nem_init.c
+++ b/src/mpid/ch3/channels/nemesis/src/mpid_nem_init.c
@@ -544,6 +544,8 @@ MPID_nem_vc_init (MPIDI_VC_t *vc)
         vc_ch->lmt_queue.tail      = NULL;
         vc_ch->lmt_active_lmt      = NULL;
         vc_ch->lmt_enqueued        = FALSE;
+        vc_ch->lmt_rts_queue.head  = NULL;
+        vc_ch->lmt_rts_queue.tail  = NULL;
 
         if (MPIR_CVAR_NEMESIS_SHM_EAGER_MAX_SZ == -1)
             vc->eager_max_msg_sz = MPID_NEM_MPICH_DATA_LEN - sizeof(MPIDI_CH3_Pkt_t);
diff --git a/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c b/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c
index 5ef65de..28662af 100644
--- a/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c
+++ b/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c
@@ -90,7 +90,6 @@ int MPID_nem_lmt_RndvSend(MPID_Request **sreq_p, const void * buf, int count,
     MPID_PKT_DECL_CAST(upkt, MPID_nem_pkt_lmt_rts_t, rts_pkt);
     MPIDI_VC_t *vc;
     MPID_Request *sreq =*sreq_p;
-    int i;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_LMT_RNDVSEND);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_LMT_RNDVSEND);
@@ -124,30 +123,8 @@ int MPID_nem_lmt_RndvSend(MPID_Request **sreq_p, const void * buf, int count,
 
     MPIU_THREAD_CS_ENTER(LMT,);
     mpi_errno = vc->ch.lmt_initiate_lmt(vc, &upkt.p, sreq);
-    if (MPI_SUCCESS == mpi_errno) {
-        /* If this loops all the way around and can't find a place to put the
-         * RTS request, it will just drop the request and leave it out of the
-         * queue. It will print a message to warn the user. This should only
-         * affect FT and not matching so we'll consider this ok for now. */
-        for (i = MPID_nem_lmt_rts_queue_last_inserted + 1; ; i++) {
-            if (i == MPID_nem_lmt_rts_queue_size) {
-                i = -1;
-                continue;
-            }
-
-            if (MPID_nem_lmt_rts_queue[i] == MPI_REQUEST_NULL) {
-                MPID_nem_lmt_rts_queue[i] = sreq->handle;
-                MPID_nem_lmt_rts_queue_last_inserted = i;
-                break;
-            }
-
-            if (i == MPID_nem_lmt_rts_queue_last_inserted && !warning_printed) {
-                MPIU_Internal_error_printf("LMT RTS queue exceeded. FT not provided for overflowed messages.\n");
-                warning_printed = 1;
-                break;
-            }
-        }
-    }
+    if (MPI_SUCCESS == mpi_errno)
+        MPID_nem_lmt_rtsq_enqueue(&vc->ch.lmt_rts_queue, sreq);
     MPIU_THREAD_CS_EXIT(LMT,);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
@@ -329,7 +306,6 @@ static int pkt_CTS_handler(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt, MPIDI_msg_sz_t
     char *data_buf;
     MPIDI_msg_sz_t data_len;
     int mpi_errno = MPI_SUCCESS;
-    int i;
     MPIU_CHKPMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_PKT_CTS_HANDLER);
 
@@ -343,20 +319,8 @@ static int pkt_CTS_handler(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt, MPIDI_msg_sz_t
     MPID_Request_get_ptr(cts_pkt->sender_req_id, sreq);
 
     MPIU_THREAD_CS_ENTER(LMT,);
-    /* Remove the request from the RTS queue. */
-    for (i = MPID_nem_lmt_rts_queue_last_inserted + 1;
-            i != MPID_nem_lmt_rts_queue_last_inserted;
-            i++) {
-        if (i == MPID_nem_lmt_rts_queue_size) {
-            i = -1;
-            continue;
-        }
-
-        if (MPID_nem_lmt_rts_queue[i] == cts_pkt->sender_req_id) {
-            MPID_nem_lmt_rts_queue[i] = MPI_REQUEST_NULL;
-            break;
-        }
-    }
+    /* Remove the request from the VC RTS queue. */
+    MPID_nem_lmt_rtsq_search_remove(&vc->ch.lmt_rts_queue, cts_pkt->sender_req_id, &rts_sreq);
     MPIU_THREAD_CS_EXIT(LMT,);
 
     sreq->ch.lmt_req_id = cts_pkt->receiver_req_id;
diff --git a/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt_shm.c b/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt_shm.c
index d312f4c..525ffd4 100644
--- a/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt_shm.c
+++ b/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt_shm.c
@@ -804,31 +804,22 @@ int MPID_nem_lmt_shm_vc_terminated(MPIDI_VC_t *vc)
     MPID_nem_lmt_shm_wait_element_t *we;
     int req_errno = MPI_SUCCESS;
     MPID_Request *req = NULL;
-    int i;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_LMT_SHM_VC_TERMINATED);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_LMT_SHM_VC_TERMINATED);
 
     if (vc->state != MPIDI_VC_STATE_CLOSED) {
         MPIU_ERR_SET1(req_errno, MPIX_ERR_PROC_FAILED, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+    }
 
-        /* If there is anything in the RTS queue, it needs to be cleared out. */
-        MPIU_THREAD_CS_ENTER(LMT,);
-        for (i = 0; i < MPID_nem_lmt_rts_queue_size; i++) {
-            if (MPI_REQUEST_NULL != MPID_nem_lmt_rts_queue[i]) {
-                MPID_Request_get_ptr(MPID_nem_lmt_rts_queue[i], req);
-                MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Checking RTS message");
-
-                if (req->ch.vc != NULL && req->ch.vc->pg_rank == vc->pg_rank) {
-                    MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Removing RTS message");
-                    req->status.MPI_ERROR = req_errno;
-                    MPIDI_CH3U_Request_complete(req);
-                    MPID_nem_lmt_rts_queue[i] = MPI_REQUEST_NULL;
-                }
-            }
-        }
-        MPIU_THREAD_CS_EXIT(LMT,);
+    /* If there is anything in the RTS queue, it needs to be cleared out. */
+    MPIU_THREAD_CS_ENTER(LMT,);
+    while (!MPID_nem_lmt_rtsq_empty(vc_ch->lmt_rts_queue)) {
+        MPID_nem_lmt_rtsq_dequeue(&vc_ch->lmt_rts_queue, &req);
+        req->status.MPI_ERROR = req_errno;
+        MPIDI_CH3U_Request_complete(req);
     }
+    MPIU_THREAD_CS_EXIT(LMT,);
 
     /* We empty the vc queue, but don't remove the vc from the global
        list.  That will eventually happen when lmt_shm_progress()
diff --git a/src/mpid/ch3/channels/nemesis/src/mpid_nem_mpich.c b/src/mpid/ch3/channels/nemesis/src/mpid_nem_mpich.c
index ee267bd..4bdaefb 100644
--- a/src/mpid/ch3/channels/nemesis/src/mpid_nem_mpich.c
+++ b/src/mpid/ch3/channels/nemesis/src/mpid_nem_mpich.c
@@ -23,10 +23,6 @@ MPID_nem_cell_ptr_t MPID_nem_prefetched_cell = 0;
 
 unsigned short *MPID_nem_recv_seqno = 0;
 
-int *MPID_nem_lmt_rts_queue;
-int MPID_nem_lmt_rts_queue_size;
-int MPID_nem_lmt_rts_queue_last_inserted = 0;
-
 #undef FUNCNAME
 #define FUNCNAME MPID_nem_mpich_init
 #undef FCNAME
@@ -36,7 +32,7 @@ MPID_nem_mpich_init(void)
 {
     int mpi_errno = MPI_SUCCESS;
     int i;
-    MPIU_CHKPMEM_DECL (3);
+    MPIU_CHKPMEM_DECL (2);
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_MPICH_INIT);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_MPICH_INIT);
@@ -68,15 +64,6 @@ MPID_nem_mpich_init(void)
     MPID_nem_curr_fbox_all_poll = &MPID_nem_fboxq_elem_list[0];
     MPID_nem_fboxq_elem_list_last = &MPID_nem_fboxq_elem_list[MPID_nem_mem_region.num_local - 1];
 
-    /* Create a queue of MPID_NEM_LMT_RTS_QUEUE_SIZE ints to hold outstanding
-     * RTS requests. If we run out of space, we'll just drop the extra
-     * requests. This won't cause a matching problem, it will just prevent FT
-     * from working for those requests that get dropped. */
-    MPID_nem_lmt_rts_queue_size = MPIR_CVAR_NEM_LMT_RTS_QUEUE_SIZE;
-    MPIU_CHKPMEM_MALLOC(MPID_nem_lmt_rts_queue, int *, sizeof(int) * MPID_nem_lmt_rts_queue_size, mpi_errno, "lmt rts queue");
-    for (i = 0; i < MPID_nem_lmt_rts_queue_size; i++)
-        MPID_nem_lmt_rts_queue[i] = MPI_REQUEST_NULL;
-
     MPIU_CHKPMEM_COMMIT();
 fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_MPICH_INIT);

-----------------------------------------------------------------------

Summary of changes:
 .../ch3/channels/nemesis/include/mpid_nem_impl.h   |    4 -
 .../ch3/channels/nemesis/include/mpidi_ch3_pre.h   |   26 +++++++++
 .../ch3/channels/nemesis/src/mpid_nem_finalize.c   |    1 -
 src/mpid/ch3/channels/nemesis/src/mpid_nem_init.c  |    2 +
 src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c   |   59 +++++---------------
 .../ch3/channels/nemesis/src/mpid_nem_lmt_shm.c    |   21 ++-----
 src/mpid/ch3/channels/nemesis/src/mpid_nem_mpich.c |   15 +-----
 test/mpi/ft/testlist                               |   36 ++++++------
 test/mpi/runtests.in                               |    7 ++-
 9 files changed, 75 insertions(+), 96 deletions(-)


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list