[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.2-209-g0e37fd5

Service Account noreply at mpich.org
Fri Feb 19 16:34:34 CST 2016


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  0e37fd5d6c375aeb7d00eaaae23d709af1c88388 (commit)
       via  fe136b43a7242db04ce3eddf3e4fe5c41ae92827 (commit)
       via  3f83856bba2d70686692d38652cd69294ec46dc6 (commit)
      from  9f06000c160665b8a39b49b92c950b47bbf5dea1 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/0e37fd5d6c375aeb7d00eaaae23d709af1c88388

commit 0e37fd5d6c375aeb7d00eaaae23d709af1c88388
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Wed Oct 14 11:02:19 2015 -0500

    a test for fs-aware split_type
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/test/mpi/comm/cmsplit_type.c b/test/mpi/comm/cmsplit_type.c
index 5385fac..819812a 100644
--- a/test/mpi/comm/cmsplit_type.c
+++ b/test/mpi/comm/cmsplit_type.c
@@ -14,9 +14,10 @@
 
 int main(int argc, char *argv[])
 {
-    int rank, size, verbose = 0;
+    int rank, size, verbose = 0, errs=0, tot_errs=0;
     int wrank;
     MPI_Comm comm;
+    MPI_Info info;
 
     MPI_Init(&argc, &argv);
 
@@ -27,38 +28,87 @@ int main(int argc, char *argv[])
 
     /* Check to see if MPI_COMM_TYPE_SHARED works correctly */
     MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &comm);
-    if (comm == MPI_COMM_NULL)
+    if (comm == MPI_COMM_NULL) {
         printf("Expected a non-null communicator, but got MPI_COMM_NULL\n");
+	errs++;
+    }
+    else {
+        MPI_Comm_rank(comm, &rank);
+        MPI_Comm_size(comm, &size);
+        if (rank == 0 && verbose)
+            printf("Created shared subcommunicator of size %d\n", size);
+        MPI_Comm_free(&comm);
+    }
+
+#ifdef MPIX_COMM_TYPE_NEIGHBORHOOD
+    /* the MPICH-specific MPIX_COMM_TYPE_NEIGHBORHOOD*/
+    /* test #1: expected behavior -- user provided a directory, and we
+     * determine which processes share access to it */
+    MPI_Info_create(&info);
+    if (argc == 2)
+	    MPI_Info_set(info, "nbhd_common_dirname", argv[1]);
+    else
+	MPI_Info_set(info, "nbhd_common_dirname", ".");
+    MPI_Comm_split_type(MPI_COMM_WORLD, MPIX_COMM_TYPE_NEIGHBORHOOD, 0,
+	    info, &comm);
+    if (comm == MPI_COMM_NULL) {
+        printf("Expected a non-null communicator, but got MPI_COMM_NULL\n");
+	errs++;
+    }
     else {
         MPI_Comm_rank(comm, &rank);
         MPI_Comm_size(comm, &size);
         if (rank == 0 && verbose)
-            printf("Created subcommunicator of size %d\n", size);
+            printf("Correctly created common-file subcommunicator of size %d\n", size);
         MPI_Comm_free(&comm);
     }
 
+    /* test #2: a hint we don't know about */
+    MPI_Info_delete(info, "nbhd_common_dirname");
+    MPI_Info_set(info, "mpix_tooth_fairy", "enable");
+    MPI_Comm_split_type(MPI_COMM_WORLD, MPIX_COMM_TYPE_NEIGHBORHOOD, 0,
+	    info, &comm);
+    if (comm != MPI_COMM_NULL) {
+        printf("Expected a NULL communicator, but got something else\n");
+	errs++;
+        MPI_Comm_free(&comm);
+    }
+    else {
+        if (rank == 0 && verbose)
+            printf("Unknown hint correctly resulted in NULL communicator\n");
+    }
+
+
+    MPI_Info_free(&info);
+#endif
+
     /* Check to see if MPI_UNDEFINED is respected */
     MPI_Comm_split_type(MPI_COMM_WORLD, (wrank % 2 == 0) ? MPI_COMM_TYPE_SHARED : MPI_UNDEFINED,
                         0, MPI_INFO_NULL, &comm);
-    if ((wrank % 2) && (comm != MPI_COMM_NULL))
+    if ((wrank % 2) && (comm != MPI_COMM_NULL)) {
         printf("Expected MPI_COMM_NULL, but did not get one\n");
+	errs++;
+    }
     if (wrank % 2 == 0) {
-        if (comm == MPI_COMM_NULL)
+        if (comm == MPI_COMM_NULL) {
             printf("Expected a non-null communicator, but got MPI_COMM_NULL\n");
+	    errs++;
+	}
         else {
             MPI_Comm_rank(comm, &rank);
             MPI_Comm_size(comm, &size);
             if (rank == 0 && verbose)
-                printf("Created subcommunicator of size %d\n", size);
+                printf("Created shared subcommunicator of size %d\n", size);
             MPI_Comm_free(&comm);
         }
     }
+    MPI_Reduce(&errs, &tot_errs, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
 
     /* Use wrank because Comm_split_type may return more than one communicator
      * across the job, and if so, each will have a rank 0 entry.  Test
      * output rules are for a single process to write the successful
      * test (No Errors) output. */
-    if (wrank == 0)
+    if (wrank == 0 && errs == 0)
         printf(" No errors\n");
 
     MPI_Finalize();
diff --git a/test/mpi/comm/testlist b/test/mpi/comm/testlist
index c14e078..8e0411e 100644
--- a/test/mpi/comm/testlist
+++ b/test/mpi/comm/testlist
@@ -15,7 +15,11 @@ cmfree 4
 cmsplit 4
 cmsplit2 12
 probe-intercomm 2
-cmsplit_type 4 mpiversion=3.0
+# one way to split a communicator, if implementation supports it, is by access
+# to a common file system. argument is a directory whose presence or
+# non-presence will determine the communicator split.   the other split_type
+# cases will ignore the argument
+cmsplit_type 4 mpiversion=3.0 arg="."
 comm_create_group 4 mpiversion=3.0
 comm_create_group 8 mpiversion=3.0
 comm_group_half 2 mpiversion=3.0

http://git.mpich.org/mpich.git/commitdiff/fe136b43a7242db04ce3eddf3e4fe5c41ae92827

commit fe136b43a7242db04ce3eddf3e4fe5c41ae92827
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Thu Oct 15 15:12:37 2015 -0500

    extend comm_split_type based on neighborhood props
    
    In this first extension, we split the communicator based on common
    access to a filesystem.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/include/mpi.h.in b/src/include/mpi.h.in
index eeb656c..d95e4a9 100644
--- a/src/include/mpi.h.in
+++ b/src/include/mpi.h.in
@@ -545,6 +545,9 @@ typedef int MPI_Info;
 /* predefined types for MPI_Comm_split_type */
 #define MPI_COMM_TYPE_SHARED    1
 
+/* MPICH-specific types */
+#define MPIX_COMM_TYPE_NEIGHBORHOOD 2
+
 /* Definitions that are determined by configure. */
 typedef @MPI_AINT@ MPI_Aint;
 typedef @MPI_FINT@ MPI_Fint;
diff --git a/src/include/mpiimpl.h b/src/include/mpiimpl.h
index 23ef928..73c5472 100644
--- a/src/include/mpiimpl.h
+++ b/src/include/mpiimpl.h
@@ -70,6 +70,10 @@ int usleep(useconds_t usec);
 #include "mpidpre.h"
 #include "mpir_refcount.h"
 
+#if defined(HAVE_ROMIO)
+int MPIR_Comm_split_filesystem(MPI_Comm comm, int key, const char *dirname, MPI_Comm *newcomm);
+#endif
+
 #if defined(HAVE_LONG_LONG_INT)
 /* tt#1776: some platforms have "long long" but not a LLONG_MAX/ULLONG_MAX,
  * usually because some feature test macro has turned them off in glibc's
diff --git a/src/mpi/comm/comm_split_type.c b/src/mpi/comm/comm_split_type.c
index c4e2f59..838c561 100644
--- a/src/mpi/comm/comm_split_type.c
+++ b/src/mpi/comm/comm_split_type.c
@@ -35,8 +35,43 @@ int MPIR_Comm_split_type_impl(MPID_Comm * comm_ptr, int split_type, int key,
 {
     int mpi_errno = MPI_SUCCESS;
 
-    /* Only MPI_COMM_TYPE_SHARED and MPI_UNDEFINED are supported */
-    MPIU_Assert(split_type == MPI_COMM_TYPE_SHARED || split_type == MPI_UNDEFINED);
+    /* Only MPI_COMM_TYPE_SHARED, MPI_UNDEFINED, and
+     * NEIGHBORHOOD are supported */
+    MPIU_Assert(split_type == MPI_COMM_TYPE_SHARED ||
+                split_type == MPI_UNDEFINED ||
+                split_type == MPIX_COMM_TYPE_NEIGHBORHOOD);
+
+    if (split_type == MPIX_COMM_TYPE_NEIGHBORHOOD) {
+	int flag;
+	char hintval[MPI_MAX_INFO_VAL+1];
+
+	/* We plan on dispatching different NEIGHBORHOOD support to
+	 * different parts of MPICH, based on the key provided in the
+	 * info object.  Right now, the one NEIGHBORHOOD we support is
+	 * "nbhd_common_dirname", implementation of which lives in ROMIO */
+
+	MPIR_Info_get_impl(info_ptr, "nbhd_common_dirname", MPI_MAX_INFO_VAL, hintval,
+                           &flag);
+	if (flag) {
+	    MPI_Comm dummycomm;
+	    MPID_Comm * dummycomm_ptr;
+
+	    mpi_errno = MPIR_Comm_split_filesystem(comm_ptr->handle, key,
+                                                   hintval, &dummycomm);
+	    MPID_Comm_get_ptr(dummycomm, dummycomm_ptr);
+	    *newcomm_ptr = dummycomm_ptr;
+
+	    goto fn_exit;
+	}
+	/* we don't work with other hints yet, but if we did (e.g.
+	 * nbhd_network, nbhd_partition), we'd do so here */
+
+	/* In the mean time, the user passed in COMM_TYPE_NEIGHBORHOOD
+	 * but did not give us an info we know how to work with.
+	 * Throw up our hands and treat it like UNDEFINED.  This will
+	 * result in MPI_COMM_NULL being returned to the user. */
+	split_type = MPI_UNDEFINED;
+    }
 
     if (MPID_Comm_fns == NULL || MPID_Comm_fns->split_type == NULL) {
         int color = (split_type == MPI_COMM_TYPE_SHARED) ? comm_ptr->rank : MPI_UNDEFINED;
diff --git a/src/mpi/romio/mpi-io/Makefile.mk b/src/mpi/romio/mpi-io/Makefile.mk
index d4d5a29..275cb98 100644
--- a/src/mpi/romio/mpi-io/Makefile.mk
+++ b/src/mpi/romio/mpi-io/Makefile.mk
@@ -77,7 +77,8 @@ romio_other_sources +=       \
     mpi-io/mpich_fileutil.c \
     mpi-io/mpir-mpioinit.c   \
     mpi-io/mpiu_greq.c \
-    mpi-io/mpiu_external32.c
+    mpi-io/mpiu_external32.c \
+    mpi-io/mpir_cst_filesys.c
 
 # helper variables for conditionally compiled sources
 mpio_request_sources=   \
diff --git a/src/mpi/romio/mpi-io/mpir_cst_filesys.c b/src/mpi/romio/mpi-io/mpir_cst_filesys.c
new file mode 100644
index 0000000..74df86a
--- /dev/null
+++ b/src/mpi/romio/mpi-io/mpir_cst_filesys.c
@@ -0,0 +1,184 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2016 UChicago/Argonne LLC
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "mpioimpl.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+
+/* not to be called directly (note the MPIR_ prefix), but instead from
+ * MPI-level MPI_Comm_split_type implementation (e.g.
+ * MPIR_Comm_split_type_impl). */
+#undef FUNCNAME
+#define FUNCNAME MPIR_Comm_split_filesystem
+#undef FCNAME
+#define FCNAME MPL_QUOTE(FUNCNAME)
+
+/* Split communicator 'comm' based on access to directory 'dirname'.
+ *
+ * Collective over 'comm'.  Rank 0 creates a uniquely named probe file in
+ * 'dirname' (or "." when 'dirname' is NULL) and one "challenge" rank,
+ * preferably on a different node, tries to open it.
+ *
+ *  - if the challenge rank can open the file, the directory is treated as
+ *    globally visible and 'newcomm' is a duplicate of 'comm'
+ *  - otherwise the directory is treated as node-local and 'comm' is split
+ *    by node id, with 'key' ordering the ranks inside each new communicator
+ *
+ * Returns MPI_SUCCESS or an MPI error code.  If the probe file cannot be
+ * created, every rank returns an error and '*newcomm' is MPI_COMM_NULL. */
+int MPIR_Comm_split_filesystem(MPI_Comm comm, int key, const char *dirname, MPI_Comm * newcomm)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int create_errno = MPI_SUCCESS;
+    int rank, nprocs;
+    int id, root_id;
+    int candidate, challenge_rank;
+    int created = 0, globally_visible = 0;
+    int verdict[2] = { 0, 0 };
+    /* fixed-size buffer: no allocation that could fail on only some ranks
+     * in the middle of a collective operation */
+    char filename[PATH_MAX] = { 0 };
+    MPI_Request check_req;
+
+    *newcomm = MPI_COMM_NULL;
+
+    MPI_Comm_rank(comm, &rank);
+    MPI_Comm_size(comm, &nprocs);
+    MPIR_Get_node_id(comm, rank, &id);
+
+    /* We could detect the common file systems by parsing 'df'-style
+     * output, but that's fidgety, fragile, and error prone.  Instead,
+     * determine who shares a file system through testing.
+     *
+     * We shouldn't create a lot of files, though -- we want something
+     * that could work at hundreds of thousands of nodes, and creating a
+     * hundred thousand files in a directory is a recipe for sadness.
+     *
+     * In CH3 and in wider practice "shared memory" is the same as "on
+     * the same node", so let's start there.
+     *
+     * - Create file on one processor
+     * - pick a processor outside the "on this node" group
+     * - if that processor can see the file, then assume the file is
+     *   visible to all groups.
+     *
+     * Note that this scheme works really well for traditional linux
+     * clusters: think nodes with a local scratch drive.  This scheme works
+     * less well for a deeper hierarchy.  What if the directory in question
+     * was hosted by an i/o forwarding agent?
+     */
+
+    /* As a scalable optimization we want to check the file's presence from
+     * a node other than the one that created it: the challenge rank is the
+     * lowest rank whose node id differs from rank 0's.  A broadcast plus a
+     * MIN reduction finds it without an O(nprocs) buffer on any process. */
+    root_id = id;
+    MPI_Bcast(&root_id, 1, MPI_INT, 0, comm);
+    candidate = (id != root_id) ? rank : nprocs;
+    MPI_Allreduce(&candidate, &challenge_rank, 1, MPI_INT, MPI_MIN, comm);
+    if (challenge_rank >= nprocs) {
+        /* everyone is on the same node; pick a process that's not rank 0
+         * just in case the file system is really weird */
+        challenge_rank = nprocs - 1;
+    }
+
+    /* now that we've informally lumped everyone into groups based on node
+     * (like shared memory does) it's time to poke the file system and see
+     * which group can see what files */
+
+    /* here come a bunch of assumptions:
+     * - file system layouts are homogeneous: if one system has /scratch,
+     *   all have /scratch
+     * - a globally visible parallel file system will have the same name
+     *   everywhere: e.g /gpfs/users/something
+     * - a file created on one node will be deterministically visible on
+     *   another.  NFS has problems with this
+     * - if a process from one group creates a file, and a process from
+     *   another group finds that file, then a process from all groups
+     *   can find that file
+     */
+
+    /* is the file globally visible to all?  create on rank 0, test on a
+     * different off-group rank.
+     * Use a single short message to force check after create: ordering
+     * is a little odd in case we are creating and checking on the same
+     * rank  */
+
+    if (rank == 0) {
+        int salt, pid;
+
+        /* same algorithm as shared file pointer name */
+        srand(time(NULL));
+        salt = rand();
+        pid = (int) getpid();
+
+        MPL_snprintf(filename, PATH_MAX, "%s/.commonfstest.%d.%d.%d",
+                     dirname == NULL ? "." : dirname, rank, salt, pid);
+    }
+
+    MPI_Bcast(filename, PATH_MAX, MPI_BYTE, 0, comm);
+
+    /* post the receive first so the send below completes even when rank 0
+     * is also the challenge rank */
+    if (rank == challenge_rank) {
+        MPI_Irecv(&created, 1, MPI_INT, 0, 0, comm, &check_req);
+    }
+
+    if (rank == 0) {
+        MPI_File fh;
+        int create_ok = 0;
+
+        create_errno = MPI_File_open(MPI_COMM_SELF, filename,
+                                     MPI_MODE_CREATE | MPI_MODE_EXCL | MPI_MODE_WRONLY,
+                                     MPI_INFO_NULL, &fh);
+        if (create_errno == MPI_SUCCESS) {
+            MPI_File_close(&fh);
+            create_ok = 1;
+        }
+        /* the check for file has to happen after file created. only need one
+         * process, though, not a full barrier.  The message is sent even
+         * when the create failed (carrying that outcome) so that the
+         * challenge rank is never left waiting. */
+        MPI_Send(&create_ok, 1, MPI_INT, challenge_rank, 0, comm);
+    }
+
+    if (rank == challenge_rank) {
+        MPI_Wait(&check_req, MPI_STATUS_IGNORE);
+
+        /* too bad there's no ADIO equivalent of access: we'll have to
+         * open/close the file instead */
+        if (created) {
+            MPI_File fh;
+
+            /* failing to open the file is not an error: it is the answer
+             * "not visible from here".  Only a successful open yields a
+             * handle that needs closing. */
+            if (MPI_File_open(MPI_COMM_SELF, filename, MPI_MODE_RDONLY,
+                              MPI_INFO_NULL, &fh) == MPI_SUCCESS) {
+                globally_visible = 1;
+                MPI_File_close(&fh);
+            }
+        }
+        verdict[0] = created;
+        verdict[1] = globally_visible;
+    }
+    /* every rank reaches this broadcast, whatever happened above */
+    MPI_Bcast(verdict, 2, MPI_INT, challenge_rank, comm);
+    created = verdict[0];
+    globally_visible = verdict[1];
+
+    if (!created) {
+        /* rank 0 reports the precise failure; the other ranks only know
+         * that the probe file could not be created */
+        mpi_errno = (rank == 0) ? create_errno : MPI_ERR_IO;
+        goto fn_fail;
+    }
+
+    /*   with the above assumptions, we have two cases for a file
+     *   created on one process:
+     *   -- either a process not in the group can access it (globally
+     *      accessible parallel file system)
+     *   -- or a process not in the group cannot access it (node-local
+     *      storage of some sort) */
+
+    if (globally_visible) {
+        mpi_errno = MPI_Comm_dup(comm, newcomm);
+    }
+    else {
+        mpi_errno = MPI_Comm_split(comm, id, key, newcomm);
+    }
+    if (rank == 0)
+        MPI_File_delete(filename, MPI_INFO_NULL);
+
+  fn_exit:
+    return mpi_errno;
+
+    /* --BEGIN ERROR HANDLING-- */
+  fn_fail:
+    *newcomm = MPI_COMM_NULL;
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 noexpandtab
+ */

http://git.mpich.org/mpich.git/commitdiff/3f83856bba2d70686692d38652cd69294ec46dc6

commit 3f83856bba2d70686692d38652cd69294ec46dc6
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Fri Feb 12 10:33:01 2016 -0600

    Expose get_node_id to ROMIO
    
    some optimizations are easier for ROMIO if it has access to the node id.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/glue/romio/glue_romio.c b/src/glue/romio/glue_romio.c
index 6080327..c44928d 100644
--- a/src/glue/romio/glue_romio.c
+++ b/src/glue/romio/glue_romio.c
@@ -97,3 +97,16 @@ fn_fail:
     return mpi_errno;
 }
 
+/* Hand ROMIO the node id of process 'rank' in communicator 'comm', so it
+ * does not have to work out node placement by parsing hostnames.  The id
+ * is stored in '*id'; the function always returns MPI_SUCCESS. */
+int MPIR_Get_node_id(MPI_Comm comm, int rank, int *id)
+{
+    MPID_Node_id_t nid;
+    MPID_Comm *cptr;
+
+    /* translate the public handle into the internal communicator object */
+    MPID_Comm_get_ptr(comm, cptr);
+    MPID_Get_node_id(cptr, rank, &nid);
+
+    *id = nid;
+    return MPI_SUCCESS;
+}
diff --git a/src/include/glue_romio.h.in b/src/include/glue_romio.h.in
index 0acc5bd..fc16212 100644
--- a/src/include/glue_romio.h.in
+++ b/src/include/glue_romio.h.in
@@ -41,5 +41,8 @@ void MPIR_Ext_cs_yield(void);
 /* to facilitate error checking */
 int MPIR_Ext_datatype_iscommitted(MPI_Datatype datatype);
 
+/* make comm split based on access to a common file system easier */
+int MPIR_Get_node_id(MPI_Comm comm, int rank, int *id);
+
 #endif /* defined(GLUE_ROMIO_H_INCLUDED) */
 

-----------------------------------------------------------------------

Summary of changes:
 src/glue/romio/glue_romio.c             |   13 ++
 src/include/glue_romio.h.in             |    3 +
 src/include/mpi.h.in                    |    3 +
 src/include/mpiimpl.h                   |    4 +
 src/mpi/comm/comm_split_type.c          |   39 ++++++-
 src/mpi/romio/mpi-io/Makefile.mk        |    3 +-
 src/mpi/romio/mpi-io/mpir_cst_filesys.c |  184 +++++++++++++++++++++++++++++++
 test/mpi/comm/cmsplit_type.c            |   64 ++++++++++-
 test/mpi/comm/testlist                  |    6 +-
 9 files changed, 308 insertions(+), 11 deletions(-)
 create mode 100644 src/mpi/romio/mpi-io/mpir_cst_filesys.c


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list