[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.2b4-197-g0c2459c

Service Account noreply at mpich.org
Thu Aug 27 11:09:28 CDT 2015


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  0c2459cc4debb5344e98de6db57d1d38062f9d5d (commit)
       via  fefa62be3d0ef54835830db86d7f7e32551a53aa (commit)
       via  bddf8aa4c0df1d91f08828efdbbccf7fd153c981 (commit)
       via  9596856eb2797e003e998fc65f8ed090974afe1c (commit)
      from  eec68f7b6d2e26743b09b2a541aab68ebad5e062 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/0c2459cc4debb5344e98de6db57d1d38062f9d5d

commit 0c2459cc4debb5344e98de6db57d1d38062f9d5d
Author: Lena Oden <loden at anl.gov>
Date:   Tue Aug 25 14:57:09 2015 -0500

    Multiple comm_idups in different threads are not supported for socks
    
    Tests for socks are disabled
    Refs #2108
    
    Signed-off-by: Huiwei Lu <huiweilu at mcs.anl.gov>

diff --git a/test/mpi/threads/comm/testlist.in b/test/mpi/threads/comm/testlist.in
index 7457ad2..506bb75 100644
--- a/test/mpi/threads/comm/testlist.in
+++ b/test/mpi/threads/comm/testlist.in
@@ -6,5 +6,5 @@ comm_create_group_threads 4 mpiversion=3.0
 comm_create_group_threads2 4 mpiversion=3.0
 @comm_overlap@ comm_idup 4 mpiversion=3.0
 @comm_overlap@ ctxidup 4 mpiversion=3.0
-idup_nb 4 mpiversion=3.0
-idup_comm_gen 4 mpiversion=3.0
+@comm_overlap@ idup_nb 4 mpiversion=3.0
+@comm_overlap@ idup_comm_gen 4 mpiversion=3.0

http://git.mpich.org/mpich.git/commitdiff/fefa62be3d0ef54835830db86d7f7e32551a53aa

commit fefa62be3d0ef54835830db86d7f7e32551a53aa
Author: Lena Oden <loden at anl.gov>
Date:   Tue Aug 25 16:12:14 2015 -0500

    Fix error handling from comm_idup
    
    If comm_idup fails, it must be ensured that pending comm_idup functions
    are still scheduled and can own the mask. Therefore, the failing comm_idup
    must be removed from the linked list in the case of a failure.
    
    Fixes #2296
    
    Signed-off-by: Huiwei Lu <huiweilu at mcs.anl.gov>

diff --git a/src/mpi/comm/contextid.c b/src/mpi/comm/contextid.c
index 022b2de..704d834 100644
--- a/src/mpi/comm/contextid.c
+++ b/src/mpi/comm/contextid.c
@@ -791,6 +791,14 @@ static int sched_cb_gcn_allocate_cid(MPID_Comm * comm, int tag, void *state)
   fn_exit:
     return mpi_errno;
   fn_fail:
+    /* make sure that the pending comm_idups are still scheduled */
+     if(last_idup == st){
+        last_idup = st->next;
+     }
+     else {
+        for (tmp = last_idup; tmp->next != st; tmp = tmp->next);
+        tmp->next = st->next;
+     }
     /* In the case of failure, the new communicator was half created.
      * So we need to clean the memory allocated for it. */
     MPIR_Comm_map_free(st->new_comm);

http://git.mpich.org/mpich.git/commitdiff/bddf8aa4c0df1d91f08828efdbbccf7fd153c981

commit bddf8aa4c0df1d91f08828efdbbccf7fd153c981
Author: Lena Oden <loden at anl.gov>
Date:   Tue Aug 25 10:52:01 2015 -0500

    Orders multiple comm_idups in a list
    
    This patch modifies the handling of multiple MPI_Comm_idup.
    Instead of using a sequence number for every communicator
    to handle the ordering of comm_idups, now a linked list is used.
    Only the first comm_idup in the list can own the mask.
    
    The comm_idup operations are ordered in the list using the
    context id of the parent communicator.
    
    Signed-off-by: Huiwei Lu <huiweilu at mcs.anl.gov>

diff --git a/src/include/mpiimpl.h b/src/include/mpiimpl.h
index 48163b5..e705e5d 100644
--- a/src/include/mpiimpl.h
+++ b/src/include/mpiimpl.h
@@ -1227,12 +1227,6 @@ typedef struct MPID_Comm {
 
     int revoked;                    /* Flag to track whether the communicator
                                      * has been revoked */
-
-    int idup_count;              /* how many MPI_COMM_IDUPs duplicating from
-                                    the current communicator at the same time */
-    int idup_curr_seqnum;        /* give each child communicator a sequence number */
-    int idup_next_seqnum;        /* the smallest sequence number wins  */
-
     MPID_Info *info;                /* Hints to the communicator */
 
 #ifdef MPID_HAS_HETERO
diff --git a/src/mpi/comm/commutil.c b/src/mpi/comm/commutil.c
index 151d283..683b4e1 100644
--- a/src/mpi/comm/commutil.c
+++ b/src/mpi/comm/commutil.c
@@ -101,11 +101,6 @@ int MPIR_Comm_init(MPID_Comm * comm_p)
 
     /* Initialize the revoked flag as false */
     comm_p->revoked = 0;
-
-    comm_p->idup_count = 0;
-    comm_p->idup_curr_seqnum = 0;
-    comm_p->idup_next_seqnum = 0;
-
     comm_p->mapper_head = NULL;
     comm_p->mapper_tail = NULL;
 
diff --git a/src/mpi/comm/contextid.c b/src/mpi/comm/contextid.c
index 3f58f63..022b2de 100644
--- a/src/mpi/comm/contextid.c
+++ b/src/mpi/comm/contextid.c
@@ -603,7 +603,35 @@ struct gcn_state {
     MPID_Comm *new_comm;
     MPID_Comm_kind_t gcn_cid_kind;
     uint32_t local_mask[MPIR_MAX_CONTEXT_MASK+1];
+    struct gcn_state* next;
 };
+struct gcn_state *last_idup = NULL;
+
+/* All pending idups are added to the list of "last_idup" in the increasing
+ * order of its parent communicator context id. */
+#undef FUNCNAME
+#define FUNCNAME add_gcn_to_list
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+static int add_gcn_to_list (struct gcn_state *new_state)
+{
+    int mpi_errno = 0;
+    struct gcn_state *tmp;
+    if(last_idup == NULL) {
+        last_idup = new_state;
+        new_state->next = NULL;
+    } else if (last_idup->comm_ptr->context_id > new_state->comm_ptr->context_id) {
+        new_state->next = last_idup;
+        last_idup = new_state;
+    } else {
+        for(tmp = last_idup;
+            tmp->next!= NULL && new_state->comm_ptr->context_id >= tmp->next->comm_ptr->context_id;
+            tmp = tmp->next);
+        new_state->next = tmp->next;
+        tmp->next = new_state;
+    }
+    return mpi_errno;
+}
 
 static int sched_cb_gcn_copy_mask(MPID_Comm * comm, int tag, void *state);
 static int sched_cb_gcn_allocate_cid(MPID_Comm * comm, int tag, void *state);
@@ -689,10 +717,9 @@ static int sched_cb_gcn_bcast(MPID_Comm * comm, int tag, void *state)
 static int sched_cb_gcn_allocate_cid(MPID_Comm * comm, int tag, void *state)
 {
     int mpi_errno = MPI_SUCCESS;
-    struct gcn_state *st = state;
+    struct gcn_state *st = state, *tmp;
     MPIU_Context_id_t newctxid;
     MPIR_Errflag_t errflag = MPIR_ERR_NONE;
-
     if (st->own_eager_mask) {
         newctxid = find_and_allocate_context_id(st->local_mask);
         if (st->ctx0)
@@ -747,13 +774,14 @@ static int sched_cb_gcn_allocate_cid(MPID_Comm * comm, int tag, void *state)
                 MPIR_ERR_POP(mpi_errno);
             MPID_SCHED_BARRIER(st->s);
         }
-    }
-    else {
+    } else {
         /* Successfully allocated a context id */
-
-        st->comm_ptr->idup_next_seqnum++;
-        st->comm_ptr->idup_count--;
-
+        if(last_idup == st){
+            last_idup = st->next;
+        } else {
+            for (tmp = last_idup; tmp->next != st; tmp = tmp->next);
+            tmp->next = st->next;
+        }
         mpi_errno = MPID_Sched_cb(&sched_cb_gcn_bcast, st, st->s);
         if (mpi_errno)
             MPIR_ERR_POP(mpi_errno);
@@ -809,8 +837,7 @@ static int sched_cb_gcn_copy_mask(MPID_Comm * comm, int tag, void *state)
          * 3. for the case that multiple communicators duplicating from the
          *    same communicator at the same time, the sequence number of the
          *    current MPI_COMM_IDUP operation is not the smallest. */
-        if (mask_in_use || (st->comm_ptr->context_id != lowest_context_id) || ( st->comm_ptr->context_id == lowest_context_id && lowest_tag < st->tag)
-            || (st->comm_ptr->idup_count > 1 && st->seqnum != st->comm_ptr->idup_next_seqnum)) {
+        if (mask_in_use || lowest_tag < st->tag || st != last_idup) {
             memset(st->local_mask, 0, MPIR_MAX_CONTEXT_MASK * sizeof(int));
             st->own_mask = 0;
             st->local_mask[ALL_OWN_MASK_FLAG] = 0;
@@ -918,11 +945,6 @@ static int sched_get_cid_nonblock(MPID_Comm * comm_ptr, MPID_Comm * newcomm,
     st->own_eager_mask = 0;
     st->first_iter = 1;
     st->new_comm = newcomm;
-    /* idup_count > 1 means there are multiple communicators duplicating
-     * from the current communicator at the same time. And
-     * idup_curr_seqnum gives each duplication operation a priority */
-    st->comm_ptr->idup_count++;
-    st->seqnum = st->comm_ptr->idup_curr_seqnum++;
     st->own_mask = 0;
     if (eager_nelem < 0) {
         /* Ensure that at least one word of deadlock-free context IDs is
@@ -931,7 +953,7 @@ static int sched_get_cid_nonblock(MPID_Comm * comm_ptr, MPID_Comm * newcomm,
                     MPIR_CVAR_CTXID_EAGER_SIZE < MPIR_MAX_CONTEXT_MASK - 1);
         eager_nelem = MPIR_CVAR_CTXID_EAGER_SIZE;
     }
-
+    add_gcn_to_list(st);
     mpi_errno = MPID_Sched_cb(&sched_cb_gcn_copy_mask, st, s);
     if (mpi_errno)
         MPIR_ERR_POP(mpi_errno);

http://git.mpich.org/mpich.git/commitdiff/9596856eb2797e003e998fc65f8ed090974afe1c

commit 9596856eb2797e003e998fc65f8ed090974afe1c
Author: Lena Oden <loden at anl.gov>
Date:   Tue Aug 25 09:18:06 2015 -0500

    Modify too_many_icomms2 to wait for remaining idups
    
    The previous version of the test did not check the status of the
    non-completed comm_idups. This version uses the MPI_ERROR field in
    MPI_Status to wait for the non-completed comm_idup operations.
    
    Signed-off-by: Huiwei Lu <huiweilu at mcs.anl.gov>

diff --git a/test/mpi/errors/comm/too_many_icomms2.c b/test/mpi/errors/comm/too_many_icomms2.c
index bb0afd6..9430b81 100644
--- a/test/mpi/errors/comm/too_many_icomms2.c
+++ b/test/mpi/errors/comm/too_many_icomms2.c
@@ -26,7 +26,7 @@ static const int verbose = 0;
 int main(int argc, char **argv)
 {
     int rank, nproc, mpi_errno;
-    int i, ncomm, block;
+    int i, j, ncomm, block;
     int errors = 1;
     MPI_Comm *comm_hdls;
     MPI_Request req[WAIT_COMM];
@@ -37,6 +37,7 @@ int main(int argc, char **argv)
     MPI_Comm_size(MPI_COMM_WORLD, &nproc);
 
     MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
+    MPI_Status error_status[WAIT_COMM];
     comm_hdls = malloc(sizeof(MPI_Comm) * MAX_NCOMM);
 
 
@@ -46,16 +47,31 @@ int main(int argc, char **argv)
         /* Note: the comms we create are all dups of MPI_COMM_WORLD */
         MPI_Comm_idup(MPI_COMM_WORLD, &comm_hdls[i], &req[block++]);
         if(block == WAIT_COMM ){
-            mpi_errno = MPI_Waitall(block, req, MPI_STATUSES_IGNORE);
+            mpi_errno = MPI_Waitall(block, req, error_status);
             if (mpi_errno == MPI_SUCCESS) {
                 ncomm+=block;
             }
             else {
                 if (verbose)
                     printf("%d: Error creating comm %d\n", rank, i);
-                 errors = 0;
-                 block = 0;
-                 break;
+                for(j = 0; j <  block; j++) {
+                    if(error_status[j].MPI_ERROR == MPI_SUCCESS){
+                        ncomm+=1;
+                    }
+                    else if(error_status[j].MPI_ERROR == MPI_ERR_PENDING) {
+                        mpi_errno = MPI_Wait(&req[j], MPI_STATUSES_IGNORE);
+                        if(mpi_errno == MPI_SUCCESS) {
+                            ncomm+=1;
+                        }
+                        else {
+                            if (verbose)
+                                printf("%d: Error creating comm %d\n", rank, i);
+                        }
+                    }
+                }
+                errors = 0;
+                block = 0;
+                break;
             }
             block = 0;
         }

-----------------------------------------------------------------------

Summary of changes:
 src/include/mpiimpl.h                   |    6 ---
 src/mpi/comm/commutil.c                 |    5 --
 src/mpi/comm/contextid.c                |   62 +++++++++++++++++++++++--------
 test/mpi/errors/comm/too_many_icomms2.c |   26 ++++++++++--
 test/mpi/threads/comm/testlist.in       |    4 +-
 5 files changed, 69 insertions(+), 34 deletions(-)


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list