[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.2b4-155-gcc4f192

Service Account noreply at mpich.org
Tue Aug 18 14:28:22 CDT 2015


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  cc4f192f5e7e492c1d92459ac1ef640641abcf3c (commit)
      from  8e449a06e6db8d277558296e6a15cfe86b59c61a (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/cc4f192f5e7e492c1d92459ac1ef640641abcf3c

commit cc4f192f5e7e492c1d92459ac1ef640641abcf3c
Author: Huiwei Lu <huiweilu at mcs.anl.gov>
Date:   Tue Aug 11 20:25:54 2015 -0500

    Fixing idup error handling
    
    Adding error handling for idup context id exhaustion.
    
    Fix #2296
    
    Signed-off-by: Lena Oden <loden at anl.gov>

diff --git a/src/mpi/comm/contextid.c b/src/mpi/comm/contextid.c
index 6cace52..3f58f63 100644
--- a/src/mpi/comm/contextid.c
+++ b/src/mpi/comm/contextid.c
@@ -41,6 +41,7 @@ cvars:
  */
 static uint32_t context_mask[MPIR_MAX_CONTEXT_MASK];
 static int initialize_context_mask = 1;
+const int ALL_OWN_MASK_FLAG = MPIR_MAX_CONTEXT_MASK;
 
 /* utility function to pretty print a context ID for debugging purposes, see
  * mpiimpl.h for more info on the various fields */
@@ -325,7 +326,6 @@ int MPIR_Get_contextid_sparse_group(MPID_Comm * comm_ptr, MPID_Group * group_ptr
                                     MPIU_Context_id_t * context_id, int ignore_id)
 {
     int mpi_errno = MPI_SUCCESS;
-    const int ALL_OWN_MASK_FLAG = MPIR_MAX_CONTEXT_MASK;
     uint32_t local_mask[MPIR_MAX_CONTEXT_MASK + 1];
     int own_mask = 0;
     int own_eager_mask = 0;
@@ -602,7 +602,7 @@ struct gcn_state {
     MPID_Sched_t s;
     MPID_Comm *new_comm;
     MPID_Comm_kind_t gcn_cid_kind;
-    uint32_t local_mask[MPIR_MAX_CONTEXT_MASK];
+    uint32_t local_mask[MPIR_MAX_CONTEXT_MASK+1];
 };
 
 static int sched_cb_gcn_copy_mask(MPID_Comm * comm, int tag, void *state);
@@ -691,6 +691,7 @@ static int sched_cb_gcn_allocate_cid(MPID_Comm * comm, int tag, void *state)
     int mpi_errno = MPI_SUCCESS;
     struct gcn_state *st = state;
     MPIU_Context_id_t newctxid;
+    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
 
     if (st->own_eager_mask) {
         newctxid = find_and_allocate_context_id(st->local_mask);
@@ -720,11 +721,32 @@ static int sched_cb_gcn_allocate_cid(MPID_Comm * comm, int tag, void *state)
     }
 
     if (*st->ctx0 == 0) {
-        /* do not own mask, try again */
-        mpi_errno = MPID_Sched_cb(&sched_cb_gcn_copy_mask, st, st->s);
-        if (mpi_errno)
-            MPIR_ERR_POP(mpi_errno);
-        MPID_SCHED_BARRIER(st->s);
+        if (st->local_mask[ALL_OWN_MASK_FLAG] == 1) {
+            /* --BEGIN ERROR HANDLING-- */
+            int nfree = 0;
+            int ntotal = 0;
+            int minfree;
+            context_mask_stats(&nfree, &ntotal);
+            minfree = nfree;
+            MPIR_Allreduce_impl(MPI_IN_PLACE, &minfree, 1, MPI_INT,
+                                MPI_MIN, st->comm_ptr, &errflag);
+            if (minfree > 0) {
+                MPIR_ERR_SETANDJUMP3(mpi_errno, MPI_ERR_OTHER,
+                                     "**toomanycommfrag", "**toomanycommfrag %d %d %d",
+                                     nfree, ntotal, minfree);
+            } else {
+                MPIR_ERR_SETANDJUMP3(mpi_errno, MPI_ERR_OTHER,
+                                     "**toomanycomm", "**toomanycomm %d %d %d",
+                                     nfree, ntotal, minfree);
+            }
+            /* --END ERROR HANDLING-- */
+        } else {
+            /* do not own mask, try again */
+            mpi_errno = MPID_Sched_cb(&sched_cb_gcn_copy_mask, st, st->s);
+            if (mpi_errno)
+                MPIR_ERR_POP(mpi_errno);
+            MPID_SCHED_BARRIER(st->s);
+        }
     }
     else {
         /* Successfully allocated a context id */
@@ -738,10 +760,15 @@ static int sched_cb_gcn_allocate_cid(MPID_Comm * comm, int tag, void *state)
         MPID_SCHED_BARRIER(st->s);
     }
 
-    /* --BEGIN ERROR HANDLING-- */
-    /* --END ERROR HANDLING-- */
-  fn_fail:
+  fn_exit:
     return mpi_errno;
+  fn_fail:
+    /* In the case of failure, the new communicator was half created.
+     * So we need to clean the memory allocated for it. */
+    MPIR_Comm_map_free(st->new_comm);
+    MPIU_Handle_obj_free(&MPID_Comm_mem, st->new_comm);
+    MPIU_Free(st);
+    goto fn_exit;
 }
 
 #undef FUNCNAME
@@ -754,7 +781,7 @@ static int sched_cb_gcn_copy_mask(MPID_Comm * comm, int tag, void *state)
     struct gcn_state *st = state;
 
     if (st->first_iter) {
-        memset(st->local_mask, 0, MPIR_MAX_CONTEXT_MASK * sizeof(int));
+        memset(st->local_mask, 0, (MPIR_MAX_CONTEXT_MASK+1) * sizeof(int));
         st->own_eager_mask = 0;
 
         /* Attempt to reserve the eager mask segment */
@@ -786,6 +813,7 @@ static int sched_cb_gcn_copy_mask(MPID_Comm * comm, int tag, void *state)
             || (st->comm_ptr->idup_count > 1 && st->seqnum != st->comm_ptr->idup_next_seqnum)) {
             memset(st->local_mask, 0, MPIR_MAX_CONTEXT_MASK * sizeof(int));
             st->own_mask = 0;
+            st->local_mask[ALL_OWN_MASK_FLAG] = 0;
         }
         else {
             /* Copy safe mask segment to local_mask */
@@ -797,13 +825,13 @@ static int sched_cb_gcn_copy_mask(MPID_Comm * comm, int tag, void *state)
 
             mask_in_use = 1;
             st->own_mask = 1;
+            st->local_mask[ALL_OWN_MASK_FLAG] = 1;
         }
-
     }
 
     mpi_errno =
         st->comm_ptr->coll_fns->Iallreduce_sched(MPI_IN_PLACE, st->local_mask,
-                                                 MPIR_MAX_CONTEXT_MASK, MPI_UINT32_T, MPI_BAND,
+                                                 MPIR_MAX_CONTEXT_MASK + 1, MPI_UINT32_T, MPI_BAND,
                                                  st->comm_ptr, st->s);
     if (mpi_errno)
         MPIR_ERR_POP(mpi_errno);

-----------------------------------------------------------------------

Summary of changes:
 src/mpi/comm/contextid.c |   54 ++++++++++++++++++++++++++++++++++-----------
 1 files changed, 41 insertions(+), 13 deletions(-)


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list