[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.0.3-11-g3f38e1b

mysql vizuser noreply at mpich.org
Sun Apr 7 15:45:53 CDT 2013


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master, has been updated
       via  3f38e1b9ff5ecc9a916aec41fb964c1cec2e3622 (commit)
       via  24c1c9351a1811baa84b029dd3163d2ece0995b4 (commit)
       via  ca42e9d4fccddab38643445e891927e06a90c01e (commit)
      from  83115a2f6317a91a4858ba1202bd902e18e41845 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/3f38e1b9ff5ecc9a916aec41fb964c1cec2e3622

commit 3f38e1b9ff5ecc9a916aec41fb964c1cec2e3622
Author: Dave Goodell <goodell at mcs.anl.gov>
Date:   Thu Apr 4 21:38:08 2013 -0500

    report context ID counts on allocation failure
    
    When we run out of context IDs, we now report how many context IDs are
    free on each process in the input communicator.  This should make it
    easier to identify the causes of bad allocation patterns in the wild.
    
    Reviewed-by: dinan

diff --git a/src/mpi/comm/commutil.c b/src/mpi/comm/commutil.c
index f07bd9d..95fcb6f 100644
--- a/src/mpi/comm/commutil.c
+++ b/src/mpi/comm/commutil.c
@@ -859,11 +859,25 @@ int MPIR_Get_contextid_sparse_group(MPID_Comm *comm_ptr, MPID_Group *group_ptr,
 
     if (ignore_id) {
         *context_id = MPIR_Locate_context_bit(local_mask);
-        MPIU_ERR_CHKANDJUMP(!(*context_id), mpi_errno, MPIR_ERR_RECOVERABLE, "**toomanycomm");
+        if (*context_id == 0) {
+            int nfree = -1;
+            int ntotal = -1;
+            MPIR_ContextMaskStats(&nfree, &ntotal);
+            MPIU_ERR_SETANDJUMP3(mpi_errno, MPIR_ERR_RECOVERABLE,
+                                 "**toomanycomm", "**toomanycomm %d %d %d",
+                                 nfree, ntotal, ignore_id);
+        }
     }
     else {
         *context_id = MPIR_Find_and_allocate_context_id(local_mask);
-        MPIU_ERR_CHKANDJUMP(!(*context_id), mpi_errno, MPIR_ERR_RECOVERABLE, "**toomanycomm");
+        if (*context_id == 0) {
+            int nfree = -1;
+            int ntotal = -1;
+            MPIR_ContextMaskStats(&nfree, &ntotal);
+            MPIU_ERR_SETANDJUMP3(mpi_errno, MPIR_ERR_RECOVERABLE,
+                                 "**toomanycomm", "**toomanycomm %d %d %d",
+                                 nfree, ntotal, ignore_id);
+        }
     }
 
 fn_exit:
@@ -1114,6 +1128,8 @@ int MPIR_Get_contextid_sparse_group(MPID_Comm *comm_ptr, MPID_Group *group_ptr,
          * succeed because there is no common context ID. */
         if (*context_id == 0 && local_mask[ALL_OWN_MASK_FLAG] == 1) {
             /* --BEGIN ERROR HANDLING-- */
+            int nfree = 0;
+            int ntotal = 0;
             if (own_mask) {
                 MPIU_THREAD_CS_ENTER(CONTEXTID,);
                 mask_in_use = 0;
@@ -1124,7 +1140,10 @@ int MPIR_Get_contextid_sparse_group(MPID_Comm *comm_ptr, MPID_Group *group_ptr,
                 MPIU_THREAD_CS_EXIT(CONTEXTID,);
             }
 
-            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**toomanycomm");
+            MPIR_ContextMaskStats(&nfree, &ntotal);
+            MPIU_ERR_SETANDJUMP3(mpi_errno, MPIR_ERR_RECOVERABLE,
+                                 "**toomanycommfrag", "**toomanycommfrag %d %d %d",
+                                 nfree, ntotal, ignore_id);
             /* --END ERROR HANDLING-- */
         }
 
@@ -1169,7 +1188,14 @@ static int gcn_helper(MPID_Comm *comm, int tag, void *state)
     MPIR_Context_id_t newctxid;
 
     newctxid = MPIR_Find_and_allocate_context_id(st->local_mask);
-    MPIU_ERR_CHKANDJUMP(!newctxid, mpi_errno, MPIR_ERR_RECOVERABLE, "**toomanycomm");
+    if (!newctxid) {
+        int nfree = -1;
+        int ntotal = -1;
+        MPIR_ContextMaskStats(&nfree, &ntotal);
+        MPIU_ERR_SETANDJUMP3(mpi_errno, MPIR_ERR_RECOVERABLE,
+                             "**toomanycomm", "**toomanycomm %d %d %d",
+                             nfree, ntotal, /*ignore_id=*/0);
+    }
 
     if (st->ctx0)
         *st->ctx0 = newctxid;
@@ -1528,7 +1554,12 @@ int MPIR_Comm_copy( MPID_Comm *comm_ptr, int size, MPID_Comm **outcomm_ptr )
     }
     /* --BEGIN ERROR HANDLING-- */
     if (new_context_id == 0) {
-        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**toomanycomm" );
+        int nfree = -1;
+        int ntotal = -1;
+        MPIR_ContextMaskStats(&nfree, &ntotal);
+        MPIU_ERR_SETANDJUMP3(mpi_errno, MPIR_ERR_RECOVERABLE,
+                             "**toomanycomm", "**toomanycomm %d %d %d",
+                             nfree, ntotal, /*ignore_id=*/0);
     }
     /* --END ERROR HANDLING-- */
 
diff --git a/src/mpi/errhan/errnames.txt b/src/mpi/errhan/errnames.txt
index 32bf69d..8e78c48 100644
--- a/src/mpi/errhan/errnames.txt
+++ b/src/mpi/errhan/errnames.txt
@@ -91,6 +91,9 @@ was not started with MPI_GREQUEST_START
 **attrsentinal:Internal fields in an attribute have been overwritten; \
  possible errors in using the attribute value in user code.
 **toomanycomm:Too many communicators
+**toomanycomm %d %d %d:Too many communicators (%d/%d free on this process; ignore_id=%d)
+**toomanycommfrag: Cannot allocate context ID because of fragmentation
+**toomanycommfrag %d %d %d: Cannot allocate context ID because of fragmentation (%d/%d free on this process; ignore_id=%d)
 **commperm:Cannot free permanent communicator
 **commperm %s:Cannot free permanent communicator %s
 **group:Invalid group

http://git.mpich.org/mpich.git/commitdiff/24c1c9351a1811baa84b029dd3163d2ece0995b4

commit 24c1c9351a1811baa84b029dd3163d2ece0995b4
Author: Dave Goodell <goodell at mcs.anl.gov>
Date:   Thu Apr 4 21:36:46 2013 -0500

    add `MPIR_ContextMaskStats` debug routine
    
    This should be useful for debugging context ID allocation issues in the
    wild.
    
    Reviewed-by: dinan

diff --git a/src/mpi/comm/commutil.c b/src/mpi/comm/commutil.c
index a65a4af..f07bd9d 100644
--- a/src/mpi/comm/commutil.c
+++ b/src/mpi/comm/commutil.c
@@ -630,6 +630,41 @@ char *MPIR_ContextMaskToStr( void )
     return bufstr;
 }
 
+/* Reports debugging statistics about the context ID mask bit-vector.
+ *
+ * free_ids  - if non-NULL, receives the number of set bits in context_mask,
+ *             i.e. the number of context IDs still free on this process.
+ * total_ids - if non-NULL, receives the size of the ID space in bits.
+ *
+ * This routine is for debugging in very particular situations and does not
+ * attempt to control concurrent access to the mask vector.  Callers should
+ * own the context ID critical section, or should be prepared to suffer data
+ * races in any fine-grained locking configuration.
+ *
+ * The routine is non-static in order to permit "in the field debugging".  We
+ * provide a prototype here to keep the compiler happy. */
+void MPIR_ContextMaskStats(int *free_ids, int *total_ids);
+void MPIR_ContextMaskStats(int *free_ids, int *total_ids)
+{
+    const int bits_per_word = (int) (sizeof(context_mask[0]) * 8);
+
+    if (free_ids) {
+        int i, j;
+        *free_ids = 0;
+        /* Shift the unsigned mask word right instead of shifting a signed 1
+         * left: "0x1 << 31" overflows int, which is undefined behavior.  If
+         * this ever needs to be fast, use a popcount table or a cached count. */
+        for (i = 0; i < MPIR_MAX_CONTEXT_MASK; ++i) {
+            for (j = 0; j < bits_per_word; ++j) {
+                *free_ids += (int) ((context_mask[i] >> j) & 0x1u);
+            }
+        }
+    }
+    if (total_ids) {
+        *total_ids = MPIR_MAX_CONTEXT_MASK * bits_per_word;
+    }
+}
+
 #ifdef MPICH_DEBUG_HANDLEALLOC
 static int MPIU_CheckContextIDsOnFinalize(void *context_mask_ptr)
 {

http://git.mpich.org/mpich.git/commitdiff/ca42e9d4fccddab38643445e891927e06a90c01e

commit ca42e9d4fccddab38643445e891927e06a90c01e
Author: Dave Goodell <goodell at mcs.anl.gov>
Date:   Thu Apr 4 13:10:42 2013 -0500

    make `MPIR_ContextMaskToStr` non-static
    
    This permits external users/developers to use this routine for debugging
    context ID allocation problems in the field.  At some point we may want
    to expose this info through the `MPI_T_` interface.
    
    Note that this isn't the way I would do it if I were building this stuff
    from scratch; I'm just taking a direct path from what we've got to
    something more useful.
    
    Reviewed-by: dinan

diff --git a/src/mpi/comm/commutil.c b/src/mpi/comm/commutil.c
index 8dc7b55..a65a4af 100644
--- a/src/mpi/comm/commutil.c
+++ b/src/mpi/comm/commutil.c
@@ -599,12 +599,22 @@ int MPIR_Comm_is_node_consecutive(MPID_Comm * comm)
 static uint32_t context_mask[MPIR_MAX_CONTEXT_MASK];
 static int initialize_context_mask = 1;
 
-#ifdef USE_DBG_LOGGING
 /* Create a string that contains the context mask.  This is
    used only with the logging interface, and must be used by one thread at 
    a time (should this be enforced by the logging interface?).
-   Converts the mask to hex and returns a pointer to that string */
-static char *MPIR_ContextMaskToStr( void )
+   Converts the mask to hex and returns a pointer to that string.
+
+   Callers should own the context ID critical section, or should be prepared to
+   suffer data races in any fine-grained locking configuration.
+
+   This routine is no longer static in order to allow advanced users and
+   developers to debug context ID problems "in the field".  We provide a
+   prototype here to keep the compiler happy, but users will need to put a
+   (possibly "extern") copy of the prototype in their own code in order to call
+   this routine.
+ */
+char *MPIR_ContextMaskToStr( void );
+char *MPIR_ContextMaskToStr( void )
 {
     static char bufstr[MPIR_MAX_CONTEXT_MASK*8+1];
     int i;
@@ -619,7 +629,6 @@ static char *MPIR_ContextMaskToStr( void )
     }
     return bufstr;
 }
-#endif
 
 #ifdef MPICH_DEBUG_HANDLEALLOC
 static int MPIU_CheckContextIDsOnFinalize(void *context_mask_ptr)

-----------------------------------------------------------------------

Summary of changes:
 src/mpi/comm/commutil.c     |   93 ++++++++++++++++++++++++++++++++++++++----
 src/mpi/errhan/errnames.txt |    3 +
 2 files changed, 87 insertions(+), 9 deletions(-)


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list