[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.2a2-21-gdc1eb33
Service Account
noreply at mpich.org
Wed Nov 26 10:49:53 CST 2014
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".
The branch, master has been updated
via dc1eb333f5961cdba9d952ff5017c0d06625d9e8 (commit)
via 6d818532d13a6aeb044cd6ac3894a3c76b046d98 (commit)
via b1e89abf9102a9690a4ce394442212e600332094 (commit)
via fef27d3bb2b5b76d0efb243bf5b0fdb776e2b3a9 (commit)
from 230c2df337b4d450e84b39e9a2093aa33432b484 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/dc1eb333f5961cdba9d952ff5017c0d06625d9e8
commit dc1eb333f5961cdba9d952ff5017c0d06625d9e8
Author: Ken Raffenetti <raffenet at mcs.anl.gov>
Date: Thu Nov 13 16:55:45 2014 -0600
fixup test for abort with fault tolerance
MPICH now behaves correctly for this test. There is no reason for it to output
" No errors", since the only thing we are testing for is that it does not
time out. We also use a non-zero error code in MPI_Abort to fit the requirements
of the test runner. Closes #1537
Signed-off-by: Pavan Balaji <balaji at anl.gov>
diff --git a/test/mpi/ft/abort.c b/test/mpi/ft/abort.c
index d9a6ad6..a2f6d59 100644
--- a/test/mpi/ft/abort.c
+++ b/test/mpi/ft/abort.c
@@ -14,11 +14,8 @@ int main(int argc, char **argv)
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
- printf(" No Errors\n");
- fflush( stdout );
-
if (rank == 0)
- MPI_Abort(MPI_COMM_WORLD, MPI_SUCCESS);
+ MPI_Abort(MPI_COMM_WORLD, 1);
while(1)
;
diff --git a/test/mpi/ft/testlist b/test/mpi/ft/testlist
index 582f06d..67676f3 100644
--- a/test/mpi/ft/testlist
+++ b/test/mpi/ft/testlist
@@ -1,5 +1,5 @@
die 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false resultTest=TestStatusNoErrors
-abort 2 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false xfail=ticket1537
+abort 2 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false resultTest=TestStatus
sendalive 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false resultTest=TestStatusNoErrors
isendalive 3 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false
multi_isendalive 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false
http://git.mpich.org/mpich.git/commitdiff/6d818532d13a6aeb044cd6ac3894a3c76b046d98
commit 6d818532d13a6aeb044cd6ac3894a3c76b046d98
Author: Ken Raffenetti <raffenet at mcs.anl.gov>
Date: Wed Nov 19 16:35:16 2014 -0600
modify fatal error handler
If a fatal error occurs, pass the MPI error code to MPID_Abort. To ensure
non-zero exit status with dynamic error codes, we set the first available
dynamic error class to 1. Refs #1537
Signed-off-by: Pavan Balaji <balaji at anl.gov>
diff --git a/src/mpi/errhan/dynerrutil.c b/src/mpi/errhan/dynerrutil.c
index 2e3bb27..8ac17af 100644
--- a/src/mpi/errhan/dynerrutil.c
+++ b/src/mpi/errhan/dynerrutil.c
@@ -46,7 +46,7 @@
static int not_initialized = 1; /* This allows us to use atomic decr */
static const char *(user_class_msgs[ERROR_MAX_NCLASS]) = { 0 };
static const char *(user_code_msgs[ERROR_MAX_NCODE]) = { 0 };
-static int first_free_class = 0;
+static int first_free_class = 1; /* class 0 is reserved */
static int first_free_code = 1; /* code 0 is reserved */
static const char empty_error_string[1] = { 0 };
diff --git a/src/mpi/errhan/errutil.c b/src/mpi/errhan/errutil.c
index ff3dd13..fbe520f 100644
--- a/src/mpi/errhan/errutil.c
+++ b/src/mpi/errhan/errutil.c
@@ -455,9 +455,9 @@ static void handleFatalError( MPID_Comm *comm_ptr,
MPIU_Snprintf(error_msg, MAX_ERRMSG_STRING, "Fatal error in %s: ", fcname);
len = (int)strlen(error_msg);
MPIR_Err_get_string(errcode, &error_msg[len], MAX_ERRMSG_STRING-len, NULL);
- /* The third argument is a return code, a value of 1 usually indicates
- an error */
- MPID_Abort(comm_ptr, MPI_SUCCESS, 1, error_msg);
+
+ /* The third argument is a return code. We simply pass the error code. */
+ MPID_Abort(comm_ptr, MPI_SUCCESS, errcode, error_msg);
}
/* --END ERROR HANDLING-- */
http://git.mpich.org/mpich.git/commitdiff/b1e89abf9102a9690a4ce394442212e600332094
commit b1e89abf9102a9690a4ce394442212e600332094
Author: Ken Raffenetti <raffenet at mcs.anl.gov>
Date: Thu Nov 13 16:42:15 2014 -0600
implement PMI_Abort
Implement abort in the Hydra PMI server and modify simple PMI to send
an abort command. Previously, we just exited the calling process and
relied on the process manager to detect it and clean up the rest of the
job. Refs #1537
Signed-off-by: Pavan Balaji <balaji at anl.gov>
diff --git a/src/pm/hydra/pm/pmiserv/pmiserv_pmi_v1.c b/src/pm/hydra/pm/pmiserv/pmiserv_pmi_v1.c
index 9955e18..b3c0dfe 100644
--- a/src/pm/hydra/pm/pmiserv/pmiserv_pmi_v1.c
+++ b/src/pm/hydra/pm/pmiserv/pmiserv_pmi_v1.c
@@ -760,6 +760,38 @@ static HYD_status fn_lookup_name(int fd, int pid, int pgid, char *args[])
goto fn_exit;
}
+static HYD_status fn_abort(int fd, int pid, int pgid, char *args[])
+{
+ int token_count;
+ struct HYD_pmcd_token *tokens;
+ /* set a default exit code of 1 */
+ int exitcode = 1;
+ HYD_status status = HYD_SUCCESS;
+
+ HYDU_FUNC_ENTER();
+
+ status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count);
+ HYDU_ERR_POP(status, "unable to convert args to tokens\n");
+
+ if (HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "exitcode") == NULL)
+ HYDU_ERR_POP(status, "cannot find token: exitcode\n");
+
+ exitcode = atoi(HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "exitcode"));
+
+ fn_exit:
+ /* clean everything up and exit */
+ status = HYDT_bsci_wait_for_completion(0);
+ exit(exitcode);
+
+ /* never get here */
+ HYDU_FUNC_EXIT();
+ return status;
+
+ fn_fail:
+ goto fn_exit;
+}
+
+
/* TODO: abort, create_kvs, destroy_kvs, getbyidx */
static struct HYD_pmcd_pmi_handle pmi_v1_handle_fns_foo[] = {
{"barrier_in", fn_barrier_in},
@@ -769,6 +801,7 @@ static struct HYD_pmcd_pmi_handle pmi_v1_handle_fns_foo[] = {
{"publish_name", fn_publish_name},
{"unpublish_name", fn_unpublish_name},
{"lookup_name", fn_lookup_name},
+ {"abort", fn_abort},
{"\0", NULL}
};
diff --git a/src/pmi/simple/simple_pmi.c b/src/pmi/simple/simple_pmi.c
index 24daf96..1327f0d 100644
--- a/src/pmi/simple/simple_pmi.c
+++ b/src/pmi/simple/simple_pmi.c
@@ -322,8 +322,15 @@ int PMI_Finalize( void )
int PMI_Abort(int exit_code, const char error_msg[])
{
- PMIU_printf(1, "aborting job:\n%s\n", error_msg);
- MPIU_Exit(exit_code);
+ char buf[PMIU_MAXLINE];
+
+ /* include exit_code in the abort command */
+ MPIU_Snprintf( buf, PMIU_MAXLINE, "cmd=abort exitcode=%d\n", exit_code);
+
+ PMIU_printf(PMI_debug, "aborting job:\n%s\n", error_msg);
+ GetResponse( buf, "", 0 );
+
+ /* the above command should not return */
return -1;
}
http://git.mpich.org/mpich.git/commitdiff/fef27d3bb2b5b76d0efb243bf5b0fdb776e2b3a9
commit fef27d3bb2b5b76d0efb243bf5b0fdb776e2b3a9
Author: Ken Raffenetti <raffenet at mcs.anl.gov>
Date: Thu Nov 13 16:36:03 2014 -0600
ch3: cleanup abort code
We simply use PMI_Abort in both the sock and nemesis code. Remove
extra functions and constants that are not useful. Refs #1537
Signed-off-by: Pavan Balaji <balaji at anl.gov>
diff --git a/src/mpid/ch3/channels/nemesis/src/Makefile.mk b/src/mpid/ch3/channels/nemesis/src/Makefile.mk
index 21857f5..d55697c 100644
--- a/src/mpid/ch3/channels/nemesis/src/Makefile.mk
+++ b/src/mpid/ch3/channels/nemesis/src/Makefile.mk
@@ -13,7 +13,6 @@ mpi_core_sources += \
src/mpid/ch3/channels/nemesis/src/ch3_istartmsg.c \
src/mpid/ch3/channels/nemesis/src/ch3_istartmsgv.c \
src/mpid/ch3/channels/nemesis/src/ch3_progress.c \
- src/mpid/ch3/channels/nemesis/src/ch3_abort.c \
src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c \
src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c \
src/mpid/ch3/channels/nemesis/src/ch3i_comm.c \
diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_abort.c b/src/mpid/ch3/channels/nemesis/src/ch3_abort.c
deleted file mode 100644
index c07fdb6..0000000
--- a/src/mpid/ch3/channels/nemesis/src/ch3_abort.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
-/*
- * (C) 2001 by Argonne National Laboratory.
- * See COPYRIGHT in top-level directory.
- */
-
-#include "mpid_nem_impl.h"
-
-#ifdef USE_PMI2_API
-#include "pmi2.h"
-#else
-#include "pmi.h"
-#endif
-
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3_Abort
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_Abort(int exit_code, char *error_msg)
-{
- MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_ABORT);
-
- MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_ABORT);
-
-#ifdef USE_PMI2_API
- PMI2_Abort(TRUE, error_msg);
-#else
- PMI_Abort(exit_code, error_msg);
-#endif
- /* if abort returns for some reason, exit here */
-
- MPIU_Error_printf("%s", error_msg);
- fflush(stderr);
-
- MPIU_Exit(exit_code);
-
- MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_ABORT);
- return MPI_ERR_INTERN;
-}
diff --git a/src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h b/src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h
index ad8b39b..6e965d0 100644
--- a/src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h
+++ b/src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h
@@ -18,7 +18,6 @@
/* FIXME: These should be removed */
#define MPIDI_DEV_IMPLEMENTS_KVS
-#define MPIDI_DEV_IMPLEMENTS_ABORT
/* FIXME: Are the following packet extensions? Can the socket connect/accept
packets be made part of the util/sock support? */
diff --git a/src/mpid/ch3/src/mpid_abort.c b/src/mpid/ch3/src/mpid_abort.c
index 74b8a56..b48e618 100644
--- a/src/mpid/ch3/src/mpid_abort.c
+++ b/src/mpid/ch3/src/mpid_abort.c
@@ -6,15 +6,11 @@
#include "mpidimpl.h"
-/* FIXME: Who uses/sets MPIDI_DEV_IMPLEMENTS_ABORT? */
-#ifdef MPIDI_DEV_IMPLEMENTS_ABORT
#ifdef USE_PMI2_API
#include "pmi2.h"
#else
#include "pmi.h"
#endif
-static int MPIDI_CH3I_PMI_Abort(int exit_code, const char *error_msg);
-#endif
/* FIXME: We should move this into a header file so that we don't
need the ifdef. Also, don't use exit (add to coding check) since
@@ -87,36 +83,6 @@ int MPID_Abort(MPID_Comm * comm, int mpi_errno, int exit_code,
MPIR_DebuggerSetAborting( error_msg );
#endif
- /* FIXME: This should not use an ifelse chain. Either define the function
- by name or set a function pointer */
-#ifdef MPIDI_CH3_IMPLEMENTS_ABORT
- MPIDI_CH3_Abort(exit_code, error_msg);
-#elif defined(MPIDI_DEV_IMPLEMENTS_ABORT)
- MPIDI_CH3I_PMI_Abort(exit_code, error_msg);
-#else
- if (error_msg[0]) MPIU_Error_printf("%s\n", error_msg);
- fflush(stderr);
-#endif
-
- /* ch3_abort should not return but if it does, exit here. If it does,
- add the function exit code before calling the final exit. */
- MPIDI_FUNC_EXIT(MPID_STATE_MPID_ABORT);
- MPIU_Exit(exit_code);
-
- return MPI_ERR_INTERN;
-}
-
-#ifdef MPIDI_DEV_IMPLEMENTS_ABORT
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_PMI_Abort
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int MPIDI_CH3I_PMI_Abort(int exit_code, const char *error_msg)
-{
- MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PMI_ABORT);
-
- MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PMI_ABORT);
-
/* Dumping the error message in MPICH and passing the same
* message to the PM as well. This might cause duplicate messages,
* but it is better to have two messages than none. Note that the
@@ -138,10 +104,10 @@ static int MPIDI_CH3I_PMI_Abort(int exit_code, const char *error_msg)
PMI_Abort(exit_code, error_msg);
#endif
- /* if abort returns for some reason, exit here */
- exit(exit_code);
+ /* pmi_abort should not return but if it does, exit here. If it does,
+ add the function exit code before calling the final exit. */
+ MPIDI_FUNC_EXIT(MPID_STATE_MPID_ABORT);
+ MPIU_Exit(exit_code);
- MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PMI_ABORT);
- return MPI_ERR_INTERN;
+ return MPI_ERR_INTERN;
}
-#endif
-----------------------------------------------------------------------
Summary of changes:
src/mpi/errhan/dynerrutil.c | 2 +-
src/mpi/errhan/errutil.c | 6 +-
src/mpid/ch3/channels/nemesis/src/Makefile.mk | 1 -
src/mpid/ch3/channels/nemesis/src/ch3_abort.c | 39 -----------------
src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h | 1 -
src/mpid/ch3/src/mpid_abort.c | 44 ++-----------------
src/pm/hydra/pm/pmiserv/pmiserv_pmi_v1.c | 33 +++++++++++++++
src/pmi/simple/simple_pmi.c | 11 ++++-
test/mpi/ft/abort.c | 5 +--
test/mpi/ft/testlist | 2 +-
10 files changed, 53 insertions(+), 91 deletions(-)
delete mode 100644 src/mpid/ch3/channels/nemesis/src/ch3_abort.c
hooks/post-receive
--
MPICH primary repository
More information about the commits
mailing list