[mpich-commits] [mpich] MPICH primary repository branch, revoke-fixes, created. v3.2a2-169-gdead7ba
Service Account
noreply at mpich.org
Mon Feb 23 10:14:35 CST 2015
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".
The branch revoke-fixes has been created
at dead7ba7f76c532392b24bfb86a1979a1463b7d7 (commit)
- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/dead7ba7f76c532392b24bfb86a1979a1463b7d7
commit dead7ba7f76c532392b24bfb86a1979a1463b7d7
Author: Wesley Bland <wbland at anl.gov>
Date: Fri Feb 20 16:30:41 2015 -0600
Add tests for multiple shrinks
This tests that having multiple processes fail in sequence will not
cause a series of shrinks to fail.
diff --git a/test/mpi/ft/Makefile.am b/test/mpi/ft/Makefile.am
index 7c094b5..e61467f 100644
--- a/test/mpi/ft/Makefile.am
+++ b/test/mpi/ft/Makefile.am
@@ -15,4 +15,4 @@ EXTRA_DIST = testlist
noinst_PROGRAMS = die abort sendalive isendalive senddead recvdead isenddead \
irecvdead barrier gather reduce bcast scatter failure_ack \
anysource revoke_nofail shrink agree multi_isendalive \
- agree_shrink revoke_shrink nbccoll
+ agree_shrink revoke_shrink nbccoll multi_shrink
diff --git a/test/mpi/ft/multi_shrink.c b/test/mpi/ft/multi_shrink.c
new file mode 100644
index 0000000..8050954
--- /dev/null
+++ b/test/mpi/ft/multi_shrink.c
@@ -0,0 +1,68 @@
+
+
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ * (C) 2015 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+
+/*
+ * This test ensures that shrink works correctly when executed multiple times
+ */
+int main(int argc, char **argv)
+{
+ int rank, orig_size, lastsize, newsize, rc, errclass, errs = 0, fails = 0;
+ MPI_Comm world = MPI_COMM_WORLD, newcomm;
+
+ MPI_Init(&argc, &argv);
+ MPI_Comm_rank(world, &rank);
+ MPI_Comm_size(world, &orig_size);
+ lastsize = orig_size;
+ MPI_Comm_set_errhandler(world, MPI_ERRORS_RETURN);
+
+ if (orig_size < 8) {
+ fprintf(stderr, "Must run with at least 8 processes\n");
+ MPI_Abort(world, 1);
+ }
+
+ while (fails < orig_size - 1) {
+ fprintf(stderr, "ROUND: %d\n", fails+1);
+ if (orig_size - (fails + 1) == rank) {
+ fprintf(stderr, "SUICIDE!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ rc = MPI_Barrier(world);
+
+ rc = MPIX_Comm_shrink(world, &newcomm);
+ if (rc) {
+ MPI_Error_class(rc, &errclass);
+ fprintf(stderr, "Expected MPI_SUCCESS from MPIX_Comm_shrink. Received: %d\n", errclass);
+ errs++;
+ MPI_Abort(world, 1);
+ }
+ MPI_Comm_size(newcomm, &newsize);
+ if (newsize != lastsize-1) {
+ fprintf(stderr, "Expected size to be %d but found %d\n", lastsize-1, newsize);
+ errs++;
+ MPI_Abort(world, 1);
+ } else {
+ lastsize = newsize;
+ }
+
+ MPI_Comm_free(&world);
+ world = newcomm;
+ fails++;
+ }
+
+ if (0 == rank) fprintf(stdout, " No Errors\n");
+
+ MPI_Finalize();
+
+ return 0;
+}
diff --git a/test/mpi/ft/testlist b/test/mpi/ft/testlist
index 03868d4..90a8f33 100644
--- a/test/mpi/ft/testlist
+++ b/test/mpi/ft/testlist
@@ -19,3 +19,4 @@ agree 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=Te
agree_shrink 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
revoke_shrink 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10 xfail=2198
nbccoll 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
+multi_shrink 8 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false timeLimit=10
http://git.mpich.org/mpich.git/commitdiff/e32b4e49d62dacb632d2a68731431a1cbee5aad7
commit e32b4e49d62dacb632d2a68731431a1cbee5aad7
Author: Wesley Bland <wbland at anl.gov>
Date: Fri Feb 20 16:13:42 2015 -0600
Clear error bits in MPIC functions
When an operation is completed in an MPIC function, it should no longer
have the error bits set since that information has now been captured in
the errflag. This prevents unnecessary assert failures.
diff --git a/src/mpi/coll/helper_fns.c b/src/mpi/coll/helper_fns.c
index 1e5daa4..e132f14 100644
--- a/src/mpi/coll/helper_fns.c
+++ b/src/mpi/coll/helper_fns.c
@@ -231,6 +231,8 @@ int MPIC_Wait(MPID_Request * request_ptr, mpir_errflag_t *errflag)
if (request_ptr->kind == MPID_REQUEST_RECV)
MPIR_Process_status(&request_ptr->status, errflag);
+ MPIR_TAG_CLEAR_ERROR_BITS(request_ptr->status.MPI_TAG);
+
fn_exit:
MPIU_DBG_MSG_D(PT2PT, TYPICAL, "OUT: errflag = %d", *errflag);
MPIDI_PT2PT_FUNC_EXIT(MPID_STATE_MPIC_WAIT);
@@ -359,6 +361,8 @@ int MPIC_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag,
MPID_Request_release(request_ptr);
} else {
MPIR_Process_status(status, errflag);
+
+ MPIR_TAG_CLEAR_ERROR_BITS(status->MPI_TAG);
}
if (MPI_SUCCESS == MPIR_ERR_GET_CLASS(status->MPI_ERROR)) {
http://git.mpich.org/mpich.git/commitdiff/3e102e368639746cfac0964164fca58cf2698e79
commit 3e102e368639746cfac0964164fca58cf2698e79
Author: Wesley Bland <wbland at anl.gov>
Date: Fri Feb 13 10:57:48 2015 -0600
Simplify the revoke_shrink test
The revoke_shrink test was more complex than necessary to test the
functionality it was looking for. This removes the extra loops and goes
straight to the important part.
diff --git a/test/mpi/ft/revoke_shrink.c b/test/mpi/ft/revoke_shrink.c
index 2e30213..cfb1c84 100644
--- a/test/mpi/ft/revoke_shrink.c
+++ b/test/mpi/ft/revoke_shrink.c
@@ -13,11 +13,15 @@
#include "mpi.h"
MPI_Comm comm_all;
+void error_handler(MPI_Comm *communicator, int *error_code, ...);
void error_handler(MPI_Comm *communicator, int *error_code, ...) {
MPI_Comm *new_comm = malloc(sizeof(MPI_Comm));
+ fprintf(stderr, "REVOKING\n");
MPIX_Comm_revoke(comm_all);
+
+ fprintf(stderr, "SHRINKING\n");
MPIX_Comm_shrink(comm_all, new_comm);
MPI_Comm_free(&comm_all);
@@ -26,9 +30,10 @@ void error_handler(MPI_Comm *communicator, int *error_code, ...) {
}
int main(int argc, char *argv[]) {
- int rank, size, i;
+ int rank, size;
int sum = 0, val = 1;
int errs = 0;
+ int rc, ec;
MPI_Errhandler errhandler;
MPI_Init(&argc, &argv);
@@ -45,18 +50,29 @@ int main(int argc, char *argv[]) {
MPI_Comm_create_errhandler(&error_handler, &errhandler);
MPI_Comm_set_errhandler(comm_all, errhandler);
- for (i = 0; i < 10; ++i) {
- MPI_Comm_size(comm_all, &size);
- sum = 0;
- if (i == 5 && rank == 1) {
- exit(1);
- } else if (i != 5) {
- MPI_Allreduce(&val, &sum, 1, MPI_INT, MPI_SUM, comm_all);
- if (sum != size && rank == 0) {
- errs++;
- fprintf(stderr, "Incorrect answer: %d != %d\n", sum, size);
- }
- }
+ MPI_Comm_size(comm_all, &size);
+ if (rank == 1) {
+ exit(1);
+ }
+
+ rc = MPI_Allreduce(&val, &sum, 1, MPI_INT, MPI_SUM, comm_all);
+ MPI_Error_class(rc, &ec);
+ if (ec != MPIX_ERR_PROC_FAILED && ec != MPIX_ERR_REVOKED) {
+ errs++;
+ fprintf(stderr, "Expected MPIX_ERR_PROC_FAILED or MPIX_ERR_REVOKED, returned %d\n", ec);
+ }
+
+ rc = MPI_Allreduce(&val, &sum, 1, MPI_INT, MPI_SUM, comm_all);
+ if (rc != MPI_SUCCESS) {
+ MPI_Error_class(rc, &ec);
+ fprintf(stderr, "Expected MPI_SUCCESS, returned %d\n", ec);
+ errs++;
+ }
+
+ MPI_Comm_size(comm_all, &size);
+ if (sum != size && rank == 0) {
+ errs++;
+ fprintf(stderr, "Incorrect answer: %d != %d\n", sum, size);
}
if (0 == rank && errs) {
http://git.mpich.org/mpich.git/commitdiff/097092db36527c80ec5ae80a2ad84f39b02a72db
commit 097092db36527c80ec5ae80a2ad84f39b02a72db
Author: Wesley Bland <wbland at anl.gov>
Date: Fri Feb 13 10:56:37 2015 -0600
Add some debug output for revoked sends
If a send message is being silently dropped because the communicator
has been revoked, record that in the debug log.
diff --git a/src/mpid/ch3/src/ch3u_eager.c b/src/mpid/ch3/src/ch3u_eager.c
index 92440a0..25dd225 100644
--- a/src/mpid/ch3/src/ch3u_eager.c
+++ b/src/mpid/ch3/src/ch3u_eager.c
@@ -309,6 +309,7 @@ int MPIDI_CH3_PktHandler_EagerShortSend( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt,
* which this message is being sent has been revoked and we shouldn't
* bother finishing this. */
if (!found && rreq->cc == 0) {
+ MPIU_DBG_MSG(CH3_OTHER,VERBOSE, "Communicator revoked. Don't send message");
*rreqp = NULL;
goto fn_fail;
}
@@ -623,6 +624,7 @@ int MPIDI_CH3_PktHandler_EagerSend( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt,
* which this message is being sent has been revoked and we shouldn't
* bother finishing this. */
if (!found && rreq->cc == 0) {
+ MPIU_DBG_MSG(CH3_OTHER,VERBOSE, "Communicator revoked. Don't send message");
*rreqp = NULL;
goto fn_fail;
}
@@ -707,6 +709,7 @@ int MPIDI_CH3_PktHandler_ReadySend( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt,
* which this message is being sent has been revoked and we shouldn't
* bother finishing this. */
if (!found && rreq->cc == 0) {
+ MPIU_DBG_MSG(CH3_OTHER,VERBOSE, "Communicator revoked. Don't send message");
*rreqp = NULL;
goto fn_fail;
}
diff --git a/src/mpid/ch3/src/ch3u_rndv.c b/src/mpid/ch3/src/ch3u_rndv.c
index 6861f44..ea92aa9 100644
--- a/src/mpid/ch3/src/ch3u_rndv.c
+++ b/src/mpid/ch3/src/ch3u_rndv.c
@@ -132,6 +132,7 @@ int MPIDI_CH3_PktHandler_RndvReqToSend( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt,
* which this message is being sent has been revoked and we shouldn't
* bother finishing this. */
if (!found && rreq->cc == 0) {
+ MPIU_DBG_MSG(CH3_OTHER,VERBOSE, "Communicator revoked. Don't send message");
*rreqp = NULL;
goto fn_fail;
}
-----------------------------------------------------------------------
hooks/post-receive
--
MPICH primary repository
More information about the commits
mailing list