[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.1.3-63-g81b3911

Service Account noreply at mpich.org
Fri Oct 31 09:15:01 CDT 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master, has been updated
       via  81b3911ac6a8e03d93fa6bf19644383acddb9d05 (commit)
       via  45ceb3d01f3f3bc0214f99e31f1a63da55dddb2e (commit)
      from  48ec4d8a44fdfd9fdef2b0888577817febc82adb (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/81b3911ac6a8e03d93fa6bf19644383acddb9d05

commit 81b3911ac6a8e03d93fa6bf19644383acddb9d05
Author: Wesley Bland <wbland at anl.gov>
Date:   Wed Oct 29 13:31:36 2014 -0500

    Add a queue for shm RTS messages
    
    RTS messages (the first part of the LMT sequence) had no way of being
    cancelled if an error occurred. This adds a small queue that keeps track
    of these messages. If a failure is detected, the message is removed from
    the queue and the associated request is cancelled to get out of the
    progress engine.
    
    See #1945
    
    Signed-off-by: Huiwei Lu <huiweilu at mcs.anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/include/mpid_nem_impl.h b/src/mpid/ch3/channels/nemesis/include/mpid_nem_impl.h
index a726b7d..69b145d 100644
--- a/src/mpid/ch3/channels/nemesis/include/mpid_nem_impl.h
+++ b/src/mpid/ch3/channels/nemesis/include/mpid_nem_impl.h
@@ -55,6 +55,10 @@ typedef struct MPID_nem_pkt_lmt_rts
 }
 MPID_nem_pkt_lmt_rts_t;
 
+#define MPID_NEM_LMT_RTS_QUEUE_SIZE 1024
+extern int *MPID_nem_lmt_rts_queue;
+extern int MPID_nem_lmt_rts_queue_last_inserted;
+
 typedef struct MPID_nem_pkt_lmt_cts
 {
     MPIDI_CH3_Pkt_type_t type;
diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_istartmsgv.c b/src/mpid/ch3/channels/nemesis/src/ch3_istartmsgv.c
index 02d7cb3..10af38c 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_istartmsgv.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_istartmsgv.c
@@ -32,7 +32,7 @@
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPIDI_CH3_iStartMsgv (MPIDI_VC_t *vc, MPID_IOV *iov, int n_iov, MPID_Request **sreq_ptr)
 {
-    MPID_Request * sreq = NULL;
+    MPID_Request * sreq = *sreq_ptr = NULL;
     int mpi_errno = MPI_SUCCESS;
     int in_cs = FALSE;
     int again = 0;
diff --git a/src/mpid/ch3/channels/nemesis/src/mpid_nem_finalize.c b/src/mpid/ch3/channels/nemesis/src/mpid_nem_finalize.c
index 272ac50..1ee3f46 100644
--- a/src/mpid/ch3/channels/nemesis/src/mpid_nem_finalize.c
+++ b/src/mpid/ch3/channels/nemesis/src/mpid_nem_finalize.c
@@ -29,6 +29,7 @@ int MPID_nem_finalize(void)
     /* these are allocated in MPID_nem_mpich_init, not MPID_nem_init */
     MPIU_Free(MPID_nem_recv_seqno);
     MPIU_Free(MPID_nem_fboxq_elem_list);
+    MPIU_Free(MPID_nem_lmt_rts_queue);
 
     /* from MPID_nem_init */
     MPIU_Free(MPID_nem_mem_region.FreeQ);
diff --git a/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c b/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c
index 5c10104..f7396c8 100644
--- a/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c
+++ b/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c
@@ -70,6 +70,7 @@ int MPID_nem_lmt_RndvSend(MPID_Request **sreq_p, const void * buf, int count,
     MPID_PKT_DECL_CAST(upkt, MPID_nem_pkt_lmt_rts_t, rts_pkt);
     MPIDI_VC_t *vc;
     MPID_Request *sreq =*sreq_p;
+    int i;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_LMT_RNDVSEND);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_LMT_RNDVSEND);
@@ -99,9 +100,30 @@ int MPID_nem_lmt_RndvSend(MPID_Request **sreq_p, const void * buf, int count,
     MPIDI_VC_FAI_send_seqnum(vc, seqnum);
     MPIDI_Pkt_set_seqnum(rts_pkt, seqnum);
     MPIDI_Request_set_seqnum(sreq, seqnum);
+    sreq->ch.vc = vc;
 
     MPIU_THREAD_CS_ENTER(LMT,);
     mpi_errno = vc->ch.lmt_initiate_lmt(vc, &upkt.p, sreq);
+    if (MPI_SUCCESS == mpi_errno) {
+        /* If this loops all the way around and can't find a place to put the
+         * RTS request, it will just drop the request and leave it out of the
+         * queue silently. This should only affect FT and not matching so we'll
+         * consider this ok for now. */
+        for (i = MPID_nem_lmt_rts_queue_last_inserted + 1;
+             i != MPID_nem_lmt_rts_queue_last_inserted;
+             i++) {
+            if (i == MPID_NEM_LMT_RTS_QUEUE_SIZE) {
+                i = -1;
+                continue;
+            }
+
+            if (MPID_nem_lmt_rts_queue[i] == MPI_REQUEST_NULL) {
+                MPID_nem_lmt_rts_queue[i] = sreq->handle;
+                MPID_nem_lmt_rts_queue_last_inserted = i;
+                break;
+            }
+        }
+    }
     MPIU_THREAD_CS_EXIT(LMT,);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
@@ -283,6 +305,7 @@ static int pkt_CTS_handler(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt, MPIDI_msg_sz_t
     char *data_buf;
     MPIDI_msg_sz_t data_len;
     int mpi_errno = MPI_SUCCESS;
+    int i;
     MPIU_CHKPMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_PKT_CTS_HANDLER);
 
@@ -295,6 +318,23 @@ static int pkt_CTS_handler(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt, MPIDI_msg_sz_t
 
     MPID_Request_get_ptr(cts_pkt->sender_req_id, sreq);
 
+    MPIU_THREAD_CS_ENTER(LMT,);
+    /* Remove the request from the RTS queue. */
+    for (i = MPID_nem_lmt_rts_queue_last_inserted + 1;
+            i != MPID_nem_lmt_rts_queue_last_inserted;
+            i++) {
+        if (i == MPID_NEM_LMT_RTS_QUEUE_SIZE) {
+            i = -1;
+            continue;
+        }
+
+        if (MPID_nem_lmt_rts_queue[i] == cts_pkt->sender_req_id) {
+            MPID_nem_lmt_rts_queue[i] = MPI_REQUEST_NULL;
+            break;
+        }
+    }
+    MPIU_THREAD_CS_EXIT(LMT,);
+
     sreq->ch.lmt_req_id = cts_pkt->receiver_req_id;
     sreq->ch.lmt_data_sz = cts_pkt->data_sz;
 
diff --git a/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt_shm.c b/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt_shm.c
index e8ccc6d..6a1715b 100644
--- a/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt_shm.c
+++ b/src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt_shm.c
@@ -802,22 +802,53 @@ int MPID_nem_lmt_shm_vc_terminated(MPIDI_VC_t *vc)
     int mpi_errno = MPI_SUCCESS;
     MPIDI_CH3I_VC *vc_ch = &vc->ch;
     MPID_nem_lmt_shm_wait_element_t *we;
+    int req_errno = MPI_SUCCESS;
+    MPID_Request *req = NULL;
+    int i;
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_LMT_SHM_VC_TERMINATED);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_LMT_SHM_VC_TERMINATED);
 
+    if (vc->state != MPIDI_VC_STATE_CLOSED) {
+        MPIU_ERR_SET1(req_errno, MPIX_ERR_PROC_FAILED, "**comm_fail", "**comm_fail %d", vc->pg_rank);
+
+        /* If there is anything in the RTS queue, it needs to be cleared out. */
+        MPIU_THREAD_CS_ENTER(LMT,);
+        for (i = 0; i < MPID_NEM_LMT_RTS_QUEUE_SIZE; i++) {
+            if (MPI_REQUEST_NULL != MPID_nem_lmt_rts_queue[i]) {
+                MPID_Request_get_ptr(MPID_nem_lmt_rts_queue[i], req);
+                MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Checking RTS message");
+
+                if (req->ch.vc != NULL && req->ch.vc->pg_rank == vc->pg_rank) {
+                    MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Removing RTS message");
+                    req->status.MPI_ERROR = req_errno;
+                    MPIDI_CH3U_Request_complete(req);
+                    MPID_nem_lmt_rts_queue[i] = MPI_REQUEST_NULL;
+                }
+            }
+        }
+        MPIU_THREAD_CS_EXIT(LMT,);
+    }
+
     /* We empty the vc queue, but don't remove the vc from the global
        list.  That will eventually happen when lmt_shm_progress()
        calls lmt_shm_progress_vc() and it finds an empty queue. */
 
     if (vc_ch->lmt_active_lmt) {
+        MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Clearing active LMT");
+        vc_ch->lmt_active_lmt->req->status.MPI_ERROR = req_errno;
         MPIDI_CH3U_Request_complete(vc_ch->lmt_active_lmt->req);
         MPIU_Free(vc_ch->lmt_active_lmt);
         vc_ch->lmt_active_lmt = NULL;
     }
 
+    if (!LMT_SHM_Q_EMPTY(vc_ch->lmt_queue)) {
+        MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Empty LMT queue");
+    }
+
     while (!LMT_SHM_Q_EMPTY(vc_ch->lmt_queue)) {
         LMT_SHM_Q_DEQUEUE(&vc_ch->lmt_queue, &we);
+        we->req->status.MPI_ERROR = req_errno;
         MPIDI_CH3U_Request_complete(we->req);
         MPIU_Free(we);
     }
diff --git a/src/mpid/ch3/channels/nemesis/src/mpid_nem_mpich.c b/src/mpid/ch3/channels/nemesis/src/mpid_nem_mpich.c
index 4bdaefb..9e7fc5b 100644
--- a/src/mpid/ch3/channels/nemesis/src/mpid_nem_mpich.c
+++ b/src/mpid/ch3/channels/nemesis/src/mpid_nem_mpich.c
@@ -23,6 +23,9 @@ MPID_nem_cell_ptr_t MPID_nem_prefetched_cell = 0;
 
 unsigned short *MPID_nem_recv_seqno = 0;
 
+int *MPID_nem_lmt_rts_queue;
+int MPID_nem_lmt_rts_queue_last_inserted = 0;
+
 #undef FUNCNAME
 #define FUNCNAME MPID_nem_mpich_init
 #undef FCNAME
@@ -32,7 +35,7 @@ MPID_nem_mpich_init(void)
 {
     int mpi_errno = MPI_SUCCESS;
     int i;
-    MPIU_CHKPMEM_DECL (2);
+    MPIU_CHKPMEM_DECL (3);
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_MPICH_INIT);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_MPICH_INIT);
@@ -64,6 +67,14 @@ MPID_nem_mpich_init(void)
     MPID_nem_curr_fbox_all_poll = &MPID_nem_fboxq_elem_list[0];
     MPID_nem_fboxq_elem_list_last = &MPID_nem_fboxq_elem_list[MPID_nem_mem_region.num_local - 1];
 
+    /* Create a queue of MPID_NEM_LMT_RTS_QUEUE_SIZE ints to hold outstanding
+     * RTS requests. If we run out of space, we'll just drop the extra
+     * requests. This won't cause a matching problem, it will just prevent FT
+     * from working for those requests that get dropped. */
+    MPIU_CHKPMEM_MALLOC(MPID_nem_lmt_rts_queue, int *, sizeof(int) * MPID_NEM_LMT_RTS_QUEUE_SIZE, mpi_errno, "lmt rts queue");
+    for (i = 0; i < MPID_NEM_LMT_RTS_QUEUE_SIZE; i++)
+        MPID_nem_lmt_rts_queue[i] = MPI_REQUEST_NULL;
+
     MPIU_CHKPMEM_COMMIT();
 fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_MPICH_INIT);

http://git.mpich.org/mpich.git/commitdiff/45ceb3d01f3f3bc0214f99e31f1a63da55dddb2e

commit 45ceb3d01f3f3bc0214f99e31f1a63da55dddb2e
Author: Wesley Bland <wbland at anl.gov>
Date:   Thu Oct 23 12:33:59 2014 -0500

    Improve the senddead test
    
    Added another send call to this test to take out the race condition. Now, the
    test should fail under any circumstances if the send request isn't being
    cleaned up correctly.
    
    Signed-off-by: Huiwei Lu <huiweilu at mcs.anl.gov>

diff --git a/test/mpi/ft/senddead.c b/test/mpi/ft/senddead.c
index 9524789..49b07f9 100644
--- a/test/mpi/ft/senddead.c
+++ b/test/mpi/ft/senddead.c
@@ -37,6 +37,13 @@ int main(int argc, char **argv)
         MPI_Error_class(err, &errclass);
         if ((err) && (errclass != MPIX_ERR_PROC_FAILED)) {
             fprintf(stderr, "Wrong error code (%d) returned. Expected MPIX_ERR_PROC_FAILED\n", errclass);
+        }
+#endif
+        err = MPI_Send(buf, 100000, MPI_CHAR, 1, 0, MPI_COMM_WORLD);
+#if defined (MPICH) && (MPICH_NUMVERSION >= 30100102)
+        MPI_Error_class(err, &errclass);
+        if ((err) && (errclass != MPIX_ERR_PROC_FAILED)) {
+            fprintf(stderr, "Wrong error code (%d) returned. Expected MPIX_ERR_PROC_FAILED\n", errclass);
         } else {
             printf(" No Errors\n");
             fflush(stdout);

-----------------------------------------------------------------------

Summary of changes:
 .../ch3/channels/nemesis/include/mpid_nem_impl.h   |    4 ++
 src/mpid/ch3/channels/nemesis/src/ch3_istartmsgv.c |    2 +-
 .../ch3/channels/nemesis/src/mpid_nem_finalize.c   |    1 +
 src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c   |   40 ++++++++++++++++++++
 .../ch3/channels/nemesis/src/mpid_nem_lmt_shm.c    |   31 +++++++++++++++
 src/mpid/ch3/channels/nemesis/src/mpid_nem_mpich.c |   13 ++++++-
 test/mpi/ft/senddead.c                             |    7 +++
 7 files changed, 96 insertions(+), 2 deletions(-)


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list