[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.2a2-21-gdc1eb33

Service Account noreply at mpich.org
Wed Nov 26 10:49:53 CST 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  dc1eb333f5961cdba9d952ff5017c0d06625d9e8 (commit)
       via  6d818532d13a6aeb044cd6ac3894a3c76b046d98 (commit)
       via  b1e89abf9102a9690a4ce394442212e600332094 (commit)
       via  fef27d3bb2b5b76d0efb243bf5b0fdb776e2b3a9 (commit)
      from  230c2df337b4d450e84b39e9a2093aa33432b484 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/dc1eb333f5961cdba9d952ff5017c0d06625d9e8

commit dc1eb333f5961cdba9d952ff5017c0d06625d9e8
Author: Ken Raffenetti <raffenet at mcs.anl.gov>
Date:   Thu Nov 13 16:55:45 2014 -0600

    fixup test for abort with fault tolerance
    
    MPICH now behaves correctly for this test. There is no reason for it to output
    " No Errors", since the only thing we are testing for is that it does not
    time out. We also use a non-zero error code in MPI_Abort to fit the requirements
    of the test runner. Closes #1537
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/test/mpi/ft/abort.c b/test/mpi/ft/abort.c
index d9a6ad6..a2f6d59 100644
--- a/test/mpi/ft/abort.c
+++ b/test/mpi/ft/abort.c
@@ -14,11 +14,8 @@ int main(int argc, char **argv)
     MPI_Init(&argc, &argv);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 
-    printf(" No Errors\n");
-    fflush( stdout );
-
     if (rank == 0)
-        MPI_Abort(MPI_COMM_WORLD, MPI_SUCCESS);
+        MPI_Abort(MPI_COMM_WORLD, 1);
 
     while(1)
         ;
diff --git a/test/mpi/ft/testlist b/test/mpi/ft/testlist
index 582f06d..67676f3 100644
--- a/test/mpi/ft/testlist
+++ b/test/mpi/ft/testlist
@@ -1,5 +1,5 @@
 die 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false resultTest=TestStatusNoErrors
-abort 2 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false xfail=ticket1537
+abort 2 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false resultTest=TestStatus
 sendalive 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup timeLimit=10 strict=false resultTest=TestStatusNoErrors
 isendalive 3 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false
 multi_isendalive 4 env=MPIR_CVAR_ENABLE_FT=1 mpiexecarg=-disable-auto-cleanup resultTest=TestStatusNoErrors strict=false

http://git.mpich.org/mpich.git/commitdiff/6d818532d13a6aeb044cd6ac3894a3c76b046d98

commit 6d818532d13a6aeb044cd6ac3894a3c76b046d98
Author: Ken Raffenetti <raffenet at mcs.anl.gov>
Date:   Wed Nov 19 16:35:16 2014 -0600

    modify fatal error handler
    
    If a fatal error occurs, pass the MPI error code to MPID_Abort. To ensure
    non-zero exit status with dynamic error codes, we set the first available
    dynamic error class to 1. Refs #1537
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpi/errhan/dynerrutil.c b/src/mpi/errhan/dynerrutil.c
index 2e3bb27..8ac17af 100644
--- a/src/mpi/errhan/dynerrutil.c
+++ b/src/mpi/errhan/dynerrutil.c
@@ -46,7 +46,7 @@
 static int  not_initialized = 1;  /* This allows us to use atomic decr */
 static const char *(user_class_msgs[ERROR_MAX_NCLASS]) = { 0 };
 static const char *(user_code_msgs[ERROR_MAX_NCODE]) = { 0 };
-static int  first_free_class = 0;
+static int  first_free_class = 1;  /* class 0 is reserved */
 static int  first_free_code  = 1;  /* code 0 is reserved */
 static const char empty_error_string[1] = { 0 };
 
diff --git a/src/mpi/errhan/errutil.c b/src/mpi/errhan/errutil.c
index ff3dd13..fbe520f 100644
--- a/src/mpi/errhan/errutil.c
+++ b/src/mpi/errhan/errutil.c
@@ -455,9 +455,9 @@ static void handleFatalError( MPID_Comm *comm_ptr,
     MPIU_Snprintf(error_msg, MAX_ERRMSG_STRING, "Fatal error in %s: ", fcname);
     len = (int)strlen(error_msg);
     MPIR_Err_get_string(errcode, &error_msg[len], MAX_ERRMSG_STRING-len, NULL);
-    /* The third argument is a return code, a value of 1 usually indicates
-       an error */
-    MPID_Abort(comm_ptr, MPI_SUCCESS, 1, error_msg);
+
+    /* The third argument is a return code. We simply pass the error code. */
+    MPID_Abort(comm_ptr, MPI_SUCCESS, errcode, error_msg);
 }
 /* --END ERROR HANDLING-- */
 

http://git.mpich.org/mpich.git/commitdiff/b1e89abf9102a9690a4ce394442212e600332094

commit b1e89abf9102a9690a4ce394442212e600332094
Author: Ken Raffenetti <raffenet at mcs.anl.gov>
Date:   Thu Nov 13 16:42:15 2014 -0600

    implement PMI_Abort
    
    Implement abort in the Hydra PMI server and modify simple PMI to send
    an abort command. Previously, we just exited the calling process and
    relied on the process manager to detect it and cleanup the rest of the
    job. Refs #1537
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/pm/hydra/pm/pmiserv/pmiserv_pmi_v1.c b/src/pm/hydra/pm/pmiserv/pmiserv_pmi_v1.c
index 9955e18..b3c0dfe 100644
--- a/src/pm/hydra/pm/pmiserv/pmiserv_pmi_v1.c
+++ b/src/pm/hydra/pm/pmiserv/pmiserv_pmi_v1.c
@@ -760,6 +760,38 @@ static HYD_status fn_lookup_name(int fd, int pid, int pgid, char *args[])
     goto fn_exit;
 }
 
+static HYD_status fn_abort(int fd, int pid, int pgid, char *args[])
+{
+    int token_count;
+    struct HYD_pmcd_token *tokens;
+    /* set a default exit code of 1 */
+    int exitcode = 1;
+    HYD_status status = HYD_SUCCESS;
+
+    HYDU_FUNC_ENTER();
+
+    status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count);
+    HYDU_ERR_POP(status, "unable to convert args to tokens\n");
+
+    if (HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "exitcode") == NULL)
+        HYDU_ERR_POP(status, "cannot find token: exitcode\n");
+
+    exitcode = atoi(HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "exitcode"));
+
+  fn_exit:
+    /* clean everything up and exit */
+    status = HYDT_bsci_wait_for_completion(0);
+    exit(exitcode);
+
+    /* never get here */
+    HYDU_FUNC_EXIT();
+    return status;
+
+  fn_fail:
+    goto fn_exit;
+}
+
+
 /* TODO: abort, create_kvs, destroy_kvs, getbyidx */
 static struct HYD_pmcd_pmi_handle pmi_v1_handle_fns_foo[] = {
     {"barrier_in", fn_barrier_in},
@@ -769,6 +801,7 @@ static struct HYD_pmcd_pmi_handle pmi_v1_handle_fns_foo[] = {
     {"publish_name", fn_publish_name},
     {"unpublish_name", fn_unpublish_name},
     {"lookup_name", fn_lookup_name},
+    {"abort", fn_abort},
     {"\0", NULL}
 };
 
diff --git a/src/pmi/simple/simple_pmi.c b/src/pmi/simple/simple_pmi.c
index 24daf96..1327f0d 100644
--- a/src/pmi/simple/simple_pmi.c
+++ b/src/pmi/simple/simple_pmi.c
@@ -322,8 +322,15 @@ int PMI_Finalize( void )
 
 int PMI_Abort(int exit_code, const char error_msg[])
 {
-    PMIU_printf(1, "aborting job:\n%s\n", error_msg);
-    MPIU_Exit(exit_code);
+    char buf[PMIU_MAXLINE];
+
+    /* include exit_code in the abort command */
+    MPIU_Snprintf( buf, PMIU_MAXLINE, "cmd=abort exitcode=%d\n", exit_code);
+
+    PMIU_printf(PMI_debug, "aborting job:\n%s\n", error_msg);
+    GetResponse( buf, "", 0 );
+
+    /* the above command should not return */
     return -1;
 }
 

http://git.mpich.org/mpich.git/commitdiff/fef27d3bb2b5b76d0efb243bf5b0fdb776e2b3a9

commit fef27d3bb2b5b76d0efb243bf5b0fdb776e2b3a9
Author: Ken Raffenetti <raffenet at mcs.anl.gov>
Date:   Thu Nov 13 16:36:03 2014 -0600

    ch3: cleanup abort code
    
    We simply use PMI_Abort in both the sock and nemesis code. Remove
    extra functions and constants that are not useful. Refs #1537
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/src/Makefile.mk b/src/mpid/ch3/channels/nemesis/src/Makefile.mk
index 21857f5..d55697c 100644
--- a/src/mpid/ch3/channels/nemesis/src/Makefile.mk
+++ b/src/mpid/ch3/channels/nemesis/src/Makefile.mk
@@ -13,7 +13,6 @@ mpi_core_sources +=				\
     src/mpid/ch3/channels/nemesis/src/ch3_istartmsg.c		\
     src/mpid/ch3/channels/nemesis/src/ch3_istartmsgv.c		\
     src/mpid/ch3/channels/nemesis/src/ch3_progress.c		\
-    src/mpid/ch3/channels/nemesis/src/ch3_abort.c		\
     src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c             \
     src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c             \
     src/mpid/ch3/channels/nemesis/src/ch3i_comm.c		\
diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_abort.c b/src/mpid/ch3/channels/nemesis/src/ch3_abort.c
deleted file mode 100644
index c07fdb6..0000000
--- a/src/mpid/ch3/channels/nemesis/src/ch3_abort.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
-/*
- *  (C) 2001 by Argonne National Laboratory.
- *      See COPYRIGHT in top-level directory.
- */
-
-#include "mpid_nem_impl.h"
-
-#ifdef USE_PMI2_API
-#include "pmi2.h"
-#else
-#include "pmi.h"
-#endif
-
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3_Abort
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_Abort(int exit_code, char *error_msg)
-{
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_ABORT);
-
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_ABORT);
-
-#ifdef USE_PMI2_API
-    PMI2_Abort(TRUE, error_msg);
-#else
-    PMI_Abort(exit_code, error_msg);
-#endif
-    /* if abort returns for some reason, exit here */
-
-    MPIU_Error_printf("%s", error_msg);
-    fflush(stderr);
-
-    MPIU_Exit(exit_code);
-
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_ABORT);
-    return MPI_ERR_INTERN;
-}
diff --git a/src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h b/src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h
index ad8b39b..6e965d0 100644
--- a/src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h
+++ b/src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h
@@ -18,7 +18,6 @@
 
 /* FIXME: These should be removed */
 #define MPIDI_DEV_IMPLEMENTS_KVS
-#define MPIDI_DEV_IMPLEMENTS_ABORT
 
 /* FIXME: Are the following packet extensions?  Can the socket connect/accept
    packets be made part of the util/sock support? */
diff --git a/src/mpid/ch3/src/mpid_abort.c b/src/mpid/ch3/src/mpid_abort.c
index 74b8a56..b48e618 100644
--- a/src/mpid/ch3/src/mpid_abort.c
+++ b/src/mpid/ch3/src/mpid_abort.c
@@ -6,15 +6,11 @@
 
 #include "mpidimpl.h"
 
-/* FIXME: Who uses/sets MPIDI_DEV_IMPLEMENTS_ABORT? */
-#ifdef MPIDI_DEV_IMPLEMENTS_ABORT
 #ifdef USE_PMI2_API
 #include "pmi2.h"
 #else
 #include "pmi.h"
 #endif
-static int MPIDI_CH3I_PMI_Abort(int exit_code, const char *error_msg);
-#endif
 
 /* FIXME: We should move this into a header file so that we don't
    need the ifdef.  Also, don't use exit (add to coding check) since
@@ -87,36 +83,6 @@ int MPID_Abort(MPID_Comm * comm, int mpi_errno, int exit_code,
     MPIR_DebuggerSetAborting( error_msg );
 #endif
 
-    /* FIXME: This should not use an ifelse chain. Either define the function
-       by name or set a function pointer */
-#ifdef MPIDI_CH3_IMPLEMENTS_ABORT
-    MPIDI_CH3_Abort(exit_code, error_msg);
-#elif defined(MPIDI_DEV_IMPLEMENTS_ABORT)
-    MPIDI_CH3I_PMI_Abort(exit_code, error_msg);
-#else
-    if (error_msg[0]) MPIU_Error_printf("%s\n", error_msg);
-    fflush(stderr);
-#endif
-
-    /* ch3_abort should not return but if it does, exit here.  If it does,
-       add the function exit code before calling the final exit.  */
-    MPIDI_FUNC_EXIT(MPID_STATE_MPID_ABORT);
-    MPIU_Exit(exit_code);
-    
-    return MPI_ERR_INTERN;
-}
-
-#ifdef MPIDI_DEV_IMPLEMENTS_ABORT
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_PMI_Abort
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int MPIDI_CH3I_PMI_Abort(int exit_code, const char *error_msg)
-{
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PMI_ABORT);
-    
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PMI_ABORT);
-
     /* Dumping the error message in MPICH and passing the same
      * message to the PM as well. This might cause duplicate messages,
      * but it is better to have two messages than none. Note that the
@@ -138,10 +104,10 @@ static int MPIDI_CH3I_PMI_Abort(int exit_code, const char *error_msg)
     PMI_Abort(exit_code, error_msg);
 #endif
 
-    /* if abort returns for some reason, exit here */
-    exit(exit_code);
+    /* pmi_abort should not return but if it does, exit here.  If it does,
+       add the function exit code before calling the final exit.  */
+    MPIDI_FUNC_EXIT(MPID_STATE_MPID_ABORT);
+    MPIU_Exit(exit_code);
 
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PMI_ABORT);
-    return MPI_ERR_INTERN;    
+    return MPI_ERR_INTERN;
 }
-#endif

-----------------------------------------------------------------------

Summary of changes:
 src/mpi/errhan/dynerrutil.c                        |    2 +-
 src/mpi/errhan/errutil.c                           |    6 +-
 src/mpid/ch3/channels/nemesis/src/Makefile.mk      |    1 -
 src/mpid/ch3/channels/nemesis/src/ch3_abort.c      |   39 -----------------
 src/mpid/ch3/channels/sock/include/mpidi_ch3_pre.h |    1 -
 src/mpid/ch3/src/mpid_abort.c                      |   44 ++-----------------
 src/pm/hydra/pm/pmiserv/pmiserv_pmi_v1.c           |   33 +++++++++++++++
 src/pmi/simple/simple_pmi.c                        |   11 ++++-
 test/mpi/ft/abort.c                                |    5 +--
 test/mpi/ft/testlist                               |    2 +-
 10 files changed, 53 insertions(+), 91 deletions(-)
 delete mode 100644 src/mpid/ch3/channels/nemesis/src/ch3_abort.c


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list