[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.1.3-199-gf953564

Service Account noreply at mpich.org
Thu Nov 13 12:52:07 CST 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  f953564369f6c22a95fa3f9ccb31ca1c7f0a034a (commit)
       via  96d8f4e992d728e0e667dfb9d09aa4081fe7285a (commit)
       via  c26c66278ab28809bafaf6a5f9de4ac7789ba57d (commit)
      from  f0f2c00ae54e97450d34db4a66e4707892a9c674 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/f953564369f6c22a95fa3f9ccb31ca1c7f0a034a

commit f953564369f6c22a95fa3f9ccb31ca1c7f0a034a
Author: Sangmin Seo <sseo at anl.gov>
Date:   Mon Nov 10 16:08:39 2014 -0600

    Add more nonblocking collective I/O tests.
    
    Ported the collective I/O test programs in the ROMIO test directory
    to nonblocking collective I/O versions. They are added to the MPICH
    test directory for now so that they run with Jenkins and the nightly
    tests, but they may need to be moved to the ROMIO test directory
    later.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>
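
For reference, every port in this commit applies the same transformation:
a blocking collective I/O call (e.g. MPI_File_write_at_all) becomes its
nonblocking MPIX_ counterpart followed by an explicit completion call. A
minimal sketch in C (the helper name write_block and the parameters fh,
offset, buf, and count are illustrative, not part of the commit):

    #include <mpi.h>

    /* Sketch: blocking -> nonblocking collective write. */
    static void write_block(MPI_File fh, MPI_Offset offset, int *buf, int count)
    {
        MPI_Request req;
        MPI_Status status;

        /* The original ROMIO tests call the blocking form:
         *     MPI_File_write_at_all(fh, offset, buf, count, MPI_INT, &status);
         * The ported tests issue the nonblocking form instead: */
        MPIX_File_iwrite_at_all(fh, offset, buf, count, MPI_INT, &req);
        /* ...independent work may overlap the outstanding I/O here... */
        MPI_Wait(&req, &status);
    }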

diff --git a/test/mpi/io/Makefile.am b/test/mpi/io/Makefile.am
index ea6751e..f1932cd 100644
--- a/test/mpi/io/Makefile.am
+++ b/test/mpi/io/Makefile.am
@@ -28,11 +28,19 @@ noinst_PROGRAMS = \
     hindexed_io
 
 if BUILD_MPIX_TESTS
-noinst_PROGRAMS += \
-    i_bigtype     \
-    i_hindexed_io \
-    i_rdwrord     \
-    i_setviewcur
+noinst_PROGRAMS +=      \
+    i_bigtype           \
+    i_hindexed_io       \
+    i_rdwrord           \
+    i_setviewcur        \
+    i_aggregation1      \
+    i_aggregation2      \
+    i_coll_test         \
+    i_darray_read       \
+    i_hindexed          \
+    i_noncontig_coll    \
+    i_noncontig_coll2   \
+    i_types_with_zeros
 endif
 
 clean-local:
diff --git a/test/mpi/io/i_aggregation1.c b/test/mpi/io/i_aggregation1.c
new file mode 100644
index 0000000..d36d821
--- /dev/null
+++ b/test/mpi/io/i_aggregation1.c
@@ -0,0 +1,304 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/* Test case from John Bent (ROMIO req #835)
+ * Aggregation code was not handling certain access patterns when collective
+ * buffering forced */
+
+/* Uses nonblocking collective I/O.*/
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <string.h>
+
+#define NUM_OBJS 4
+#define OBJ_SIZE 1048576
+
+extern char *optarg;
+extern int optind, opterr, optopt;
+
+
+char *prog = NULL;
+int debug = 0;
+
+static void Usage(int line)
+{
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    if (rank == 0) {
+        fprintf(stderr,
+                "Usage (line %d): %s [-d] [-h] -f filename\n"
+                "\t-d for debugging\n"
+                "\t-h to turn on the hints to force collective aggregation\n",
+                line, prog);
+    }
+    exit(0);
+}
+
+static void fatal_error(int mpi_ret, MPI_Status *mpi_stat, const char *msg)
+{
+    fprintf(stderr, "Fatal error %s: %d\n", msg, mpi_ret);
+    MPI_Abort(MPI_COMM_WORLD, -1);
+}
+
+static void print_hints(int rank, MPI_File *mfh)
+{
+    MPI_Info info;
+    int nkeys;
+    int i, dummy_int;
+    char key[1024];
+    char value[1024];
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) {
+        MPI_File_get_info(*mfh, &info);
+        MPI_Info_get_nkeys(info, &nkeys);
+
+        printf("HINTS:\n");
+        for (i = 0; i < nkeys; i++) {
+            MPI_Info_get_nthkey(info, i, key);
+            printf("%35s -> ", key);
+            MPI_Info_get(info, key, 1024, value, &dummy_int);
+            printf("%s\n", value);
+        }
+        MPI_Info_free(&info);
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+}
+
+static void fill_buffer(char *buffer, int bufsize, int rank, MPI_Offset offset)
+{
+    memset((void *)buffer, 0, bufsize);
+    snprintf(buffer, bufsize, "Hello from %d at %lld\n", rank, (long long)offset);
+}
+
+static MPI_Offset get_offset(int rank, int num_objs, int obj_size, int which_obj)
+{
+    MPI_Offset offset;
+    offset = (MPI_Offset)rank * num_objs * obj_size + which_obj * obj_size;
+    return offset;
+}
+
+static void write_file(char *target, int rank, MPI_Info *info)
+{
+    MPI_File wfh;
+    MPI_Request *request;
+    MPI_Status *mpi_stat;
+    int mpi_ret;
+    int i;
+    char **buffer;
+
+    request = (MPI_Request *)malloc(NUM_OBJS * sizeof(MPI_Request));
+    mpi_stat = (MPI_Status *)malloc(NUM_OBJS * sizeof(MPI_Status));
+    buffer = (char **)malloc(NUM_OBJS * sizeof(char *));
+
+    if (debug) printf("%d writing file %s\n", rank, target);
+
+    if ((mpi_ret = MPI_File_open(MPI_COMM_WORLD, target,
+                                 MPI_MODE_WRONLY | MPI_MODE_CREATE,
+                                 *info, &wfh))
+        != MPI_SUCCESS) {
+        fatal_error(mpi_ret, NULL, "open for write");
+    }
+
+    /* nonblocking collective write */
+    for (i = 0; i < NUM_OBJS; i++) {
+        MPI_Offset offset = get_offset(rank, NUM_OBJS, OBJ_SIZE, i);
+        buffer[i] = (char *)malloc(OBJ_SIZE);
+        fill_buffer(buffer[i], OBJ_SIZE, rank, offset);
+        if (debug) printf("%s", buffer[i]);
+        if ((mpi_ret = MPIX_File_iwrite_at_all(wfh, offset, buffer[i], OBJ_SIZE,
+                                              MPI_CHAR, &request[i]))
+            != MPI_SUCCESS) {
+            fatal_error(mpi_ret, NULL, "write");
+        }
+    }
+
+    if (debug) print_hints(rank, &wfh);
+
+    MPI_Waitall(NUM_OBJS, request, mpi_stat);
+
+    if ((mpi_ret = MPI_File_close(&wfh)) != MPI_SUCCESS) {
+        fatal_error(mpi_ret, NULL, "close for write");
+    }
+    if (debug) printf("%d wrote file %s\n", rank, target);
+
+    for (i = 0; i < NUM_OBJS; i++) free(buffer[i]);
+    free(buffer);
+    free(mpi_stat);
+    free(request);
+}
+
+static int reduce_corruptions(int corrupt_blocks)
+{
+    int mpi_ret;
+    int sum;
+    if ((mpi_ret = MPI_Reduce(&corrupt_blocks, &sum, 1, MPI_INT, MPI_SUM, 0,
+                              MPI_COMM_WORLD)) != MPI_SUCCESS) {
+        fatal_error(mpi_ret, NULL, "MPI_Reduce");
+    }
+    return sum;
+}
+
+static void read_file(char *target, int rank, MPI_Info *info, int *corrupt_blocks)
+{
+    MPI_File rfh;
+    MPI_Offset *offset;
+    MPI_Request *request;
+    MPI_Status *mpi_stat;
+    int mpi_ret;
+    int i;
+    char **buffer;
+    char **verify_buf = NULL;
+
+    offset = (MPI_Offset *)malloc(NUM_OBJS * sizeof(MPI_Offset));
+    request = (MPI_Request *)malloc(NUM_OBJS * sizeof(MPI_Request));
+    mpi_stat = (MPI_Status *)malloc(NUM_OBJS * sizeof(MPI_Status));
+    buffer = (char **)malloc(NUM_OBJS * sizeof(char *));
+    verify_buf = (char **)malloc(NUM_OBJS * sizeof(char *));
+
+    if (debug) printf("%d reading file %s\n", rank, target);
+
+    if ((mpi_ret = MPI_File_open(MPI_COMM_WORLD, target, MPI_MODE_RDONLY,
+                                 *info, &rfh)) != MPI_SUCCESS) {
+        fatal_error(mpi_ret, NULL, "open for read");
+    }
+
+    /* nonblocking collective read */
+    for (i = 0; i < NUM_OBJS; i++) {
+        offset[i] = get_offset(rank, NUM_OBJS, OBJ_SIZE, i);
+        buffer[i] = (char *)malloc(OBJ_SIZE);
+        verify_buf[i] = (char *)malloc(OBJ_SIZE);
+        fill_buffer(verify_buf[i], OBJ_SIZE, rank, offset[i]);
+        if (debug) printf("Expecting %s", verify_buf[i]);
+        if ((mpi_ret = MPIX_File_iread_at_all(rfh, offset[i], buffer[i],
+                                             OBJ_SIZE, MPI_CHAR, &request[i]))
+            != MPI_SUCCESS) {
+            fatal_error(mpi_ret, NULL, "read");
+        }
+    }
+
+    MPI_Waitall(NUM_OBJS, request, mpi_stat);
+
+    /* verification */
+    for (i = 0; i < NUM_OBJS; i++) {
+        if (memcmp(verify_buf[i], buffer[i], OBJ_SIZE) != 0) {
+            (*corrupt_blocks)++;
+            printf("Corruption at %lld\n", offset[i]);
+            if (debug) {
+                printf("\tExpecting %s\n" "\tRecieved  %s\n",
+                       verify_buf[i], buffer[i]);
+            }
+        }
+    }
+
+    if ((mpi_ret = MPI_File_close(&rfh)) != MPI_SUCCESS) {
+        fatal_error(mpi_ret, NULL, "close for read");
+    }
+
+    for (i = 0; i < NUM_OBJS; i++) {
+        free(verify_buf[i]);
+        free(buffer[i]);
+    }
+    free(verify_buf);
+    free(buffer);
+    free(mpi_stat);
+    free(request);
+    free(offset);
+}
+
+static void set_hints(MPI_Info *info)
+{
+    MPI_Info_set(*info, "romio_cb_write", "enable");
+    MPI_Info_set(*info, "romio_no_indep_rw", "1");
+    MPI_Info_set(*info, "cb_nodes", "1");
+    MPI_Info_set(*info, "cb_buffer_size", "4194304");
+}
+
+/*
+void
+set_hints(MPI_Info *info, char *hints) {
+    char *delimiter = " ";
+    char *hints_cp  = strdup(hints);
+    char *key = strtok(hints_cp, delimiter);
+    char *val;
+    while (key) {
+        val = strtok(NULL, delimiter);
+        if (debug) printf("HINT: %s = %s\n", key, val);
+        if (! val) {
+            Usage(__LINE__);
+        }
+        MPI_Info_set(*info, key, val);
+        key = strtok(NULL, delimiter);
+    }
+    free(hints_cp);
+}
+*/
+
+int main(int argc, char *argv[])
+{
+    int nproc = 1, rank = 0;
+    char *target = NULL;
+    int c;
+    MPI_Info info;
+    int mpi_ret;
+    int corrupt_blocks = 0;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    if ((mpi_ret = MPI_Info_create(&info)) != MPI_SUCCESS) {
+        if (rank == 0) fatal_error(mpi_ret, NULL, "MPI_Info_create");
+    }
+
+    prog = strdup(argv[0]);
+
+    if (argc > 1) {
+        while ((c = getopt(argc, argv, "df:h")) != EOF) {
+            switch (c) {
+            case 'd':
+                debug = 1;
+                break;
+            case 'f':
+                target = strdup(optarg);
+                break;
+            case 'h':
+                set_hints(&info);
+                break;
+            default:
+                Usage(__LINE__);
+            }
+        }
+        if (!target) {
+            Usage(__LINE__);
+        }
+    } else {
+        target = "testfile";
+        set_hints(&info);
+    }
+
+    write_file(target, rank, &info);
+    read_file(target, rank, &info, &corrupt_blocks);
+
+    corrupt_blocks = reduce_corruptions(corrupt_blocks);
+    if (rank == 0) {
+        if (corrupt_blocks == 0) {
+            fprintf(stdout, " No Errors\n");
+        }
+        else {
+            fprintf(stdout, "%d/%d blocks corrupt\n", corrupt_blocks,
+                    nproc * NUM_OBJS);
+        }
+    }
+    MPI_Info_free(&info);
+
+    MPI_Finalize();
+    free(prog);
+    exit(0);
+}
diff --git a/test/mpi/io/i_aggregation2.c b/test/mpi/io/i_aggregation2.c
new file mode 100644
index 0000000..0ead78e
--- /dev/null
+++ b/test/mpi/io/i_aggregation2.c
@@ -0,0 +1,97 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/* Look for regressions in aggregator code.  A more simple access pattern than
+ * aggregation1 */
+
+/* Uses nonblocking collective I/O.*/
+
+#include <mpi.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string.h>
+
+#define BUFSIZE 512
+
+static void handle_error(int errcode, const char *str)
+{
+    char msg[MPI_MAX_ERROR_STRING];
+    int resultlen;
+    MPI_Error_string(errcode, msg, &resultlen);
+    fprintf(stderr, "%s: %s\n", str, msg);
+    MPI_Abort(MPI_COMM_WORLD, 1);
+}
+
+int main(int argc, char **argv)
+{
+    MPI_Info info = MPI_INFO_NULL;
+    MPI_File fh;
+    MPI_Offset off = 0;
+    MPI_Status status;
+    int errcode;
+    int i, rank, errs = 0, toterrs, buffer[BUFSIZE], buf2[BUFSIZE];
+    MPI_Request request;
+    char *filename = NULL;
+
+    filename = (argc > 1) ? argv[1] : "testfile";
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    MPI_Info_create(&info);
+    MPI_Info_set(info, "romio_cb_write", "enable");
+    MPI_Info_set(info, "cb_nodes", "1");
+
+    for (i = 0; i < BUFSIZE; i++) {
+        buffer[i] = 10000 + rank;
+    }
+    off = rank * sizeof(buffer);
+
+    errcode = MPI_File_open(MPI_COMM_WORLD, filename,
+                            MPI_MODE_WRONLY | MPI_MODE_CREATE, info, &fh);
+    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_open");
+    errcode = MPIX_File_iwrite_at_all(fh, off, buffer, BUFSIZE, MPI_INT,
+                                     &request);
+    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPIX_File_iwrite_at_all");
+    MPI_Wait(&request, &status);
+    errcode = MPI_File_close(&fh);
+    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_close");
+
+    errcode = MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_RDONLY, info,
+                            &fh);
+    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_open");
+    errcode = MPIX_File_iread_at_all(fh, off, buf2, BUFSIZE, MPI_INT,
+                                     &request);
+    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPIX_File_iread_at_all");
+    MPI_Wait(&request, &status);
+    errcode = MPI_File_close(&fh);
+    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_close");
+
+    for (i = 0; i < BUFSIZE; i++) {
+        if (buf2[i] != 10000 + rank)
+            errs++;
+    }
+    MPI_Allreduce(&errs, &toterrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    if (rank == 0) {
+        if (toterrs > 0) {
+            fprintf(stderr, "Found %d errors\n", toterrs);
+        }
+        else {
+            fprintf(stdout, " No Errors\n");
+        }
+    }
+    MPI_Info_free(&info);
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/test/mpi/io/i_coll_test.c b/test/mpi/io/i_coll_test.c
new file mode 100644
index 0000000..982f4e5
--- /dev/null
+++ b/test/mpi/io/i_coll_test.c
@@ -0,0 +1,198 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "mpi.h"
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+/* A 32^3 array. For other array sizes, change array_of_gsizes below. */
+
+/* Uses nonblocking collective I/O. Writes a 3D block-distributed array to
+   a file corresponding to the global array in row-major (C) order, reads it
+   back, and checks that the data read is correct. */
+
+/* Note that the file access pattern is noncontiguous. */
+
+void handle_error(int errcode, const char *str);
+
+void handle_error(int errcode, const char *str)
+{
+    char msg[MPI_MAX_ERROR_STRING];
+    int resultlen;
+    MPI_Error_string(errcode, msg, &resultlen);
+    fprintf(stderr, "%s: %s\n", str, msg);
+    MPI_Abort(MPI_COMM_WORLD, 1);
+}
+
+int main(int argc, char **argv)
+{
+    MPI_Datatype newtype;
+    int i, ndims, array_of_gsizes[3], array_of_distribs[3];
+    int order, nprocs, j, len;
+    int array_of_dargs[3], array_of_psizes[3];
+    int *readbuf, *writebuf, mynod, *tmpbuf, array_size;
+    MPI_Count bufcount;
+    char *filename;
+    int errs = 0, toterrs;
+    MPI_File fh;
+    MPI_Status status;
+    MPI_Request request;
+    MPI_Info info = MPI_INFO_NULL;
+    int errcode;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &mynod);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    /* process 0 broadcasts the file name to other processes */
+    if (!mynod) {
+        filename = "testfile";
+        len = strlen(filename);
+        MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD);
+        MPI_Bcast(filename, len + 1, MPI_CHAR, 0, MPI_COMM_WORLD);
+    }
+    else {
+        MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD);
+        filename = (char *)malloc(len + 1);
+        MPI_Bcast(filename, len + 1, MPI_CHAR, 0, MPI_COMM_WORLD);
+    }
+
+
+    /* create the distributed array filetype */
+    ndims = 3;
+    order = MPI_ORDER_C;
+
+    array_of_gsizes[0] = 32;
+    array_of_gsizes[1] = 32;
+    array_of_gsizes[2] = 32;
+
+    array_of_distribs[0] = MPI_DISTRIBUTE_BLOCK;
+    array_of_distribs[1] = MPI_DISTRIBUTE_BLOCK;
+    array_of_distribs[2] = MPI_DISTRIBUTE_BLOCK;
+
+    array_of_dargs[0] = MPI_DISTRIBUTE_DFLT_DARG;
+    array_of_dargs[1] = MPI_DISTRIBUTE_DFLT_DARG;
+    array_of_dargs[2] = MPI_DISTRIBUTE_DFLT_DARG;
+
+    for (i = 0; i < ndims; i++) array_of_psizes[i] = 0;
+    MPI_Dims_create(nprocs, ndims, array_of_psizes);
+
+    MPI_Type_create_darray(nprocs, mynod, ndims, array_of_gsizes,
+                           array_of_distribs, array_of_dargs,
+                           array_of_psizes, order, MPI_INT, &newtype);
+    MPI_Type_commit(&newtype);
+
+    /* initialize writebuf */
+
+    MPI_Type_size_x(newtype, &bufcount);
+    bufcount = bufcount / sizeof(int);
+    writebuf = (int *)malloc(bufcount * sizeof(int));
+    for (i = 0; i < bufcount; i++) writebuf[i] = 1;
+
+    array_size = array_of_gsizes[0] * array_of_gsizes[1] * array_of_gsizes[2];
+    tmpbuf = (int *) calloc(array_size, sizeof(int));
+    MPI_Irecv(tmpbuf, 1, newtype, mynod, 10, MPI_COMM_WORLD, &request);
+    MPI_Send(writebuf, bufcount, MPI_INT, mynod, 10, MPI_COMM_WORLD);
+    MPI_Wait(&request, &status);
+
+    j = 0;
+    for (i = 0; i < array_size; i++)
+        if (tmpbuf[i]) {
+            writebuf[j] = i;
+            j++;
+        }
+    free(tmpbuf);
+
+    if (j != bufcount) {
+        fprintf(stderr, "Error in initializing writebuf on process %d\n",
+                mynod);
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    /* end of initialization */
+
+    /* write the array to the file */
+    errcode = MPI_File_open(MPI_COMM_WORLD, filename,
+                            MPI_MODE_CREATE | MPI_MODE_RDWR, info, &fh);
+    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_open");
+
+    errcode = MPI_File_set_view(fh, 0, MPI_INT, newtype, "native", info);
+    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_set_view");
+
+    errcode = MPIX_File_iwrite_all(fh, writebuf, bufcount, MPI_INT, &request);
+    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPIX_File_iwrite_all");
+    MPI_Wait(&request, &status);
+
+    errcode = MPI_File_close(&fh);
+    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_close");
+
+    if (!mynod) {
+        /* wkl suggests potential for false " No Errors" if both read
+         * and write use the same file view */
+        /* solution: rank 0 reads entire file and checks write values */
+        errcode = MPI_File_open(MPI_COMM_SELF, filename, MPI_MODE_RDONLY, info,
+                                &fh);
+        if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_open");
+
+        readbuf = (int *)malloc(array_size * sizeof(int));
+        errcode = MPI_File_read(fh, readbuf, array_size, MPI_INT, &status);
+        if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_read");
+
+        errcode = MPI_File_close(&fh);
+        if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_close");
+
+        for (i = 0; i < array_size; i++)
+            if (readbuf[i] != i) {
+                errs++;
+                fprintf(stderr, "Error: write integer %d but read %d\n",
+                        i, readbuf[i]);
+                break;
+            }
+        free(readbuf);
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* now read it back */
+    readbuf = (int *)malloc(bufcount * sizeof(int));
+    errcode = MPI_File_open(MPI_COMM_WORLD, filename,
+                            MPI_MODE_CREATE | MPI_MODE_RDWR, info, &fh);
+    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_open");
+
+    errcode = MPI_File_set_view(fh, 0, MPI_INT, newtype, "native", info);
+    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_set_view");
+    errcode = MPIX_File_iread_all(fh, readbuf, bufcount, MPI_INT, &request);
+    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPIX_File_iread_all");
+    MPI_Wait(&request, &status);
+    errcode = MPI_File_close(&fh);
+    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_close");
+
+    /* check the data read */
+    for (i = 0; i < bufcount; i++) {
+        if (readbuf[i] != writebuf[i]) {
+            errs++;
+            fprintf(stderr, "Process %d, readbuf %d, writebuf %d, i %d\n",
+                    mynod, readbuf[i], writebuf[i], i);
+        }
+    }
+
+    MPI_Allreduce(&errs, &toterrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    if (mynod == 0) {
+        if (toterrs > 0) {
+            fprintf(stderr, "Found %d errors\n", toterrs);
+        }
+        else {
+            fprintf(stdout, " No Errors\n");
+        }
+    }
+
+    MPI_Type_free(&newtype);
+    free(readbuf);
+    free(writebuf);
+    if (mynod) free(filename);
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/test/mpi/io/i_darray_read.c b/test/mpi/io/i_darray_read.c
new file mode 100644
index 0000000..dccb660
--- /dev/null
+++ b/test/mpi/io/i_darray_read.c
@@ -0,0 +1,137 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+#define NSIDE 5
+#define NBLOCK 3
+#define NPROC 2
+
+#define CHECK(fn) {int errcode; errcode = (fn); if (errcode != MPI_SUCCESS) handle_error(errcode, #fn);}
+
+static void handle_error(int errcode, const char *str)
+{
+    char msg[MPI_MAX_ERROR_STRING];
+    int resultlen;
+    MPI_Error_string(errcode, msg, &resultlen);
+    fprintf(stderr, "%s: %s\n", str, msg);
+    MPI_Abort(MPI_COMM_WORLD, 1);
+}
+
+
+int main(int argc, char *argv[])
+{
+    int i, j, nerrors = 0, total_errors = 0;
+
+    int rank, size;
+    int bpos;
+
+    MPI_Datatype darray;
+    MPI_Request request;
+    MPI_Status status;
+    MPI_File mpi_fh;
+
+    /* Define array distribution
+     * A 2x2 block size works with ROMIO, a 3x3 block size breaks it. */
+    int distrib[2] = { MPI_DISTRIBUTE_CYCLIC, MPI_DISTRIBUTE_CYCLIC };
+    int bsize[2] = { NBLOCK, NBLOCK };
+    int gsize[2] = { NSIDE, NSIDE };
+    int psize[2] = { NPROC, NPROC };
+
+    double data[NSIDE * NSIDE];
+    double *ldata, *pdata;
+
+    int tsize, nelem;
+    char *filename;
+
+    MPI_File dfile;
+
+    filename = (argc > 1) ? argv[1] : "testfile";
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    /* Set up type */
+    CHECK(MPI_Type_create_darray(size, rank, 2, gsize, distrib,
+                                 bsize, psize, MPI_ORDER_FORTRAN, MPI_DOUBLE,
+                                 &darray));
+    CHECK(MPI_Type_commit(&darray));
+    CHECK(MPI_Type_size(darray, &tsize));
+    nelem = tsize / sizeof(double);
+
+    for (i = 0; i < (NSIDE * NSIDE); i++) data[i] = i;
+
+    if (rank == 0) {
+        CHECK(MPI_File_open(MPI_COMM_SELF, filename,
+                            MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL,
+                            &dfile));
+        CHECK(MPI_File_write(dfile, data, NSIDE * NSIDE, MPI_DOUBLE, &status));
+        CHECK(MPI_File_close(&dfile));
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Allocate buffer */
+    ldata = (double *)malloc(tsize);
+    pdata = (double *)malloc(tsize);
+
+    /* Use Pack to pull out array */
+    bpos = 0;
+    CHECK(MPI_Pack(data, 1, darray, pdata, tsize, &bpos, MPI_COMM_WORLD));
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Read in array from file.  */
+    CHECK(MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_RDONLY,
+                        MPI_INFO_NULL, &mpi_fh));
+    CHECK(MPI_File_set_view(mpi_fh, 0, MPI_DOUBLE, darray, "native",
+                            MPI_INFO_NULL));
+    CHECK(MPIX_File_iread_all(mpi_fh, ldata, nelem, MPI_DOUBLE, &request));
+    CHECK(MPI_Wait(&request, &status));
+    CHECK(MPI_File_close(&mpi_fh));
+
+    for (i = 0; i < size; i++) {
+#ifdef VERBOSE
+        MPI_Barrier(MPI_COMM_WORLD);
+        if (rank == i) {
+            printf("=== Rank %i === (%i elements) \nPacked: ", rank, nelem);
+            for (j = 0; j < nelem; j++) {
+                printf("%4.1f ", pdata[j]);
+                fflush(stdout);
+            }
+            printf("\nRead:   ");
+            for (j = 0; j < nelem; j++) {
+                printf("%4.1f ", ldata[j]);
+                fflush(stdout);
+            }
+            printf("\n\n");
+            fflush(stdout);
+        }
+#endif
+        if (rank == i) {
+            for (j = 0; j < nelem; j++) {
+                if (pdata[j] != ldata[j]) {
+                    fprintf(stderr, "rank %d at index %d: packbuf %4.1f filebuf %4.1f\n",
+                            rank, j, pdata[j], ldata[j]);
+                    nerrors++;
+                }
+            }
+        }
+    }
+    MPI_Allreduce(&nerrors, &total_errors, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    if (rank == 0 && total_errors == 0)
+        printf(" No Errors\n");
+
+    free(ldata);
+    free(pdata);
+    MPI_Type_free(&darray);
+    MPI_Finalize();
+
+    exit(total_errors);
+}
diff --git a/test/mpi/io/i_hindexed.c b/test/mpi/io/i_hindexed.c
new file mode 100644
index 0000000..f8523ed
--- /dev/null
+++ b/test/mpi/io/i_hindexed.c
@@ -0,0 +1,277 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/* Wei-keng Liao (wkliao at ece.northwestern.edu) September 8, 2008 */
+
+/* Uses nonblocking collective I/O.*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <mpi.h>
+
+#define YLEN 5
+#define XLEN 10
+#define SUB_XLEN 3
+
+/* rjl: I was just too lazy to compute this at run-time */
+char compare_buf[XLEN * 4][YLEN * 4] = {
+    {'0', '1', '2', 0, 0, '3', '4', '5', 0, 0, 'D', 'E', 'F', 0, 0, 'G', 'H', 'I'},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {'6', '7', '8', 0, 0, '9', ':', ';', 0, 0, 'J', 'K', 'L', 0, 0, 'M', 'N', 'O'},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {'X', 'Y', 'Z', 0, 0, '[', '\\', ']', 0, 0, 'l', 'm', 'n', 0, 0, 'o', 'p', 'q'},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {'^', '_', '`', 0, 0, 'a', 'b', 'c', 0, 0, 'r', 's', 't', 0, 0, 'u', 'v', 'w'},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {'0', '1', '2', 0, 0, '3', '4', '5', 0, 0, 'D', 'E', 'F', 0, 0, 'G', 'H', 'I'},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {'6', '7', '8', 0, 0, '9', ':', ';', 0, 0, 'J', 'K', 'L', 0, 0, 'M', 'N', 'O'},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {'X', 'Y', 'Z', 0, 0, '[', '\\', ']', 0, 0, 'l', 'm', 'n', 0, 0, 'o', 'p', 'q'},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {'^', '_', '`', 0, 0, 'a', 'b', 'c', 0, 0, 'r', 's', 't', 0, 0, 'u', 'v', 'w'},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+};
+
+
+/* set this if you want a dump of the global array
+#define VERBOSE 1
+*/
+
+/*----< main() >------------------------------------------------------------*/
+int main(int argc, char **argv)
+{
+    int i, j, err, rank, np, num_io;
+    char *buf, *filename;
+    int rank_dim[2], array_of_sizes[2];
+    int array_of_subsizes[2];
+    int count, *blocklengths, global_array_size;
+    MPI_Count ftype_size;
+    MPI_Aint *displacements;
+    MPI_File fh;
+    MPI_Datatype ftype;
+    MPI_Request *request;
+    MPI_Status *statuses;
+    MPI_Status status;
+    MPI_Offset offset = 0;
+    int nr_errors = 0;
+#ifdef VERBOSE
+    int k;
+#endif
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &np);
+
+    if (np != 4) {
+        if (!rank)
+            printf("Please run with 4 processes. Exiting ...\n\n");
+        MPI_Finalize();
+        return 1;
+    }
+
+    filename = (argc > 1) ? argv[1] : "testfile";
+
+    num_io = 2;
+
+    request = (MPI_Request *)malloc(num_io * sizeof(MPI_Request));
+    statuses = (MPI_Status *)malloc(num_io * sizeof(MPI_Status));
+
+    /*-----------------------------------------------------------------------*/
+    /* process rank in each dimension */
+    rank_dim[0] = rank / 2;
+    rank_dim[1] = rank % 2;
+
+    /* global 2D array size */
+    array_of_sizes[0] = YLEN * 2;
+    array_of_sizes[1] = XLEN * 2;
+
+    global_array_size = array_of_sizes[0] * array_of_sizes[1];
+
+    array_of_subsizes[0] = YLEN / 2;
+    array_of_subsizes[1] = XLEN * SUB_XLEN / 5;
+
+    offset = rank_dim[0] * YLEN * array_of_sizes[1] + rank_dim[1] * XLEN;
+
+    /* define data type for file view */
+    count = array_of_subsizes[0] * 2;   /* 2 is the no. blocks along X */
+    blocklengths = (int *)malloc(count * sizeof(int));
+    displacements = (MPI_Aint *)malloc(count * sizeof(MPI_Aint));
+    for (i = 0; i < count; i++)
+        blocklengths[i] = array_of_subsizes[1] / 2;
+    for (i = 0; i < array_of_subsizes[0]; i++)
+        for (j = 0; j < 2; j++)
+            displacements[i * 2 + j] = offset + i * 2 * array_of_sizes[1]
+                                     + j * XLEN / 2;
+    MPI_Type_create_hindexed(count, blocklengths, displacements, MPI_CHAR,
+                             &ftype);
+    MPI_Type_commit(&ftype);
+    MPI_Type_size_x(ftype, &ftype_size);
+
+/* subarray's layout in the global array
+
+   P0's 's layout                               P1's layout
+   [ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9] | [ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]
+[ 0] 0 1 2     3 4 5                          |                       D E F     G H I
+[ 1]                                          |
+[ 2] 6 7 8     9 : ;                          |                       J K L     M N O
+[ 3]                                          |
+[ 4]                                          |
+[ 5]                                          |
+[ 6]                                          |
+[ 7]                                          |
+[ 8]                                          |
+[ 9]                                          |
+
+   P2's layout                                  P3's layout
+   [ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9] | [ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]
+[ 0]                                          |
+[ 1]                                          |
+[ 2]                                          |
+[ 3]                                          |
+[ 4]                                          |
+[ 5] X Y Z     [ \ ]                          |                       l m n     o p q
+[ 6]                                          |
+[ 7] ^ _ `     a b c                          |                       r s t     u v w
+[ 8]                                          |
+[ 9]                                          |
+*/
+
+    /* initialize the write buffer */
+    buf = (char *)malloc(array_of_subsizes[0] * array_of_subsizes[1]);
+    for (i = 0; i < array_of_subsizes[0] * array_of_subsizes[1]; i++)
+        buf[i] = '0' + rank * 20 + i % 79;
+
+    /* zero file contents --------------------------------------------------- */
+    if (rank == 0) {
+        char *wr_buf = (char *)calloc(num_io * global_array_size, 1);
+        MPI_File_open(MPI_COMM_SELF, filename,
+                      MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
+        MPI_File_write(fh, wr_buf, num_io * global_array_size, MPI_CHAR,
+                       &status);
+        MPI_File_close(&fh);
+        free(wr_buf);
+    }
+    /* open the file -------------------------------------------------------- */
+    err = MPI_File_open(MPI_COMM_WORLD, filename,
+                        MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
+    if (err != MPI_SUCCESS) {
+        printf("Error: MPI_File_open() filename %s\n", filename);
+        MPI_Abort(MPI_COMM_WORLD, -1);
+        exit(1);
+    }
+
+    /* MPI nonblocking collective write */
+    for (i = 0; i < num_io; i++) {
+        offset = i * global_array_size;
+        /* set the file view */
+        MPI_File_set_view(fh, offset, MPI_BYTE, ftype, "native", MPI_INFO_NULL);
+        MPIX_File_iwrite_all(fh, buf, ftype_size, MPI_CHAR, &request[i]);
+    }
+    MPI_Waitall(num_io, request, statuses);
+    MPI_File_close(&fh);
+
+    /* read and print file contents ----------------------------------------- */
+    if (rank == 0) {
+        char *ptr;
+        char *rd_buf = (char *)calloc(num_io * global_array_size, 1);
+        MPI_File_open(MPI_COMM_SELF, filename, MPI_MODE_RDONLY, MPI_INFO_NULL,
+                      &fh);
+        MPI_File_read(fh, rd_buf, num_io * global_array_size, MPI_CHAR, &status);
+        MPI_File_close(&fh);
+
+#ifdef VERBOSE
+        printf("-------------------------------------------------------\n");
+        printf("   [");
+        for (i = 0; i < 2; i++) {
+            for (j = 0; j < XLEN; j++)
+                printf(" %d", j);
+            printf(" ");
+        }
+        printf("]\n\n");
+
+
+        ptr = rd_buf;
+        for (k = 0; k < num_io; k++) {
+            for (i = 0; i < 2 * YLEN; i++) {
+                printf("[%2d]", k * 2 * YLEN + i);
+                for (j = 0; j < 2 * XLEN; j++) {
+                    if (j > 0 && j % XLEN == 0)
+                        printf(" ");
+                    if (*ptr != 0)
+                        printf(" %c", *ptr);
+                    else
+                        printf("  ");
+                    ptr++;
+                }
+                printf("\n");
+            }
+            printf("\n");
+        }
+#endif
+        ptr = rd_buf;
+        for (i = 0; i < 2 * YLEN * num_io; i++) {
+            for (j = 0; j < 2 * XLEN; j++) {
+                if (*ptr != compare_buf[i][j]) {
+                    fprintf(stderr, "expected %d got %d at [%d][%d]\n",
+                            compare_buf[i][j], *ptr, i, j);
+                    nr_errors++;
+                }
+                ptr++;
+            }
+        }
+        free(rd_buf);
+
+        if (nr_errors == 0)
+            fprintf(stdout, " No Errors\n");
+        else
+            fprintf(stderr, "Found %d errors\n", nr_errors);
+    }
+
+    free(blocklengths);
+    free(displacements);
+    free(buf);
+    free(request);
+    free(statuses);
+    MPI_Type_free(&ftype);
+    MPI_Finalize();
+    return 0;
+}
+
+/* command-line outputs are: (the global array is written twice)
+
+% mpiexec -n 4 wkl_subarray
+-------------------------------------------------------
+   [ 0 1 2 3 4 5 6 7 8 9  0 1 2 3 4 5 6 7 8 9 ]
+
+[ 0] 0 1 2     3 4 5      D E F     G H I
+[ 1]
+[ 2] 6 7 8     9 : ;      J K L     M N O
+[ 3]
+[ 4]
+[ 5] X Y Z     [ \ ]      l m n     o p q
+[ 6]
+[ 7] ^ _ `     a b c      r s t     u v w
+[ 8]
+[ 9]
+
+[10] 0 1 2     3 4 5      D E F     G H I
+[11]
+[12] 6 7 8     9 : ;      J K L     M N O
+[13]
+[14]
+[15] X Y Z     [ \ ]      l m n     o p q
+[16]
+[17] ^ _ `     a b c      r s t     u v w
+[18]
+[19]
+
+*/
diff --git a/test/mpi/io/i_noncontig_coll.c b/test/mpi/io/i_noncontig_coll.c
new file mode 100644
index 0000000..f625e9b
--- /dev/null
+++ b/test/mpi/io/i_noncontig_coll.c
@@ -0,0 +1,240 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "mpi.h"
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+/* tests noncontiguous reads/writes using nonblocking collective I/O */
+
+#define SIZE 5000
+
+#define VERBOSE 0
+int main(int argc, char **argv)
+{
+    int *buf, i, mynod, nprocs, len, b[3];
+    int errs = 0, toterrs;
+    MPI_Aint d[3];
+    MPI_File fh;
+    MPI_Request request;
+    MPI_Status status;
+    char *filename;
+    MPI_Datatype typevec, newtype, t[3];
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &mynod);
+
+    if (nprocs != 2) {
+        fprintf(stderr, "Run this program on two processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    /* process 0 broadcasts the file name to other processes */
+    if (!mynod) {
+        filename = "testfile";
+        len = strlen(filename);
+        MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD);
+        MPI_Bcast(filename, len + 1, MPI_CHAR, 0, MPI_COMM_WORLD);
+    }
+    else {
+        MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD);
+        filename = (char *)malloc(len + 1);
+        MPI_Bcast(filename, len + 1, MPI_CHAR, 0, MPI_COMM_WORLD);
+    }
+
+    buf = (int *)malloc(SIZE * sizeof(int));
+
+    MPI_Type_vector(SIZE / 2, 1, 2, MPI_INT, &typevec);
+
+    b[0] = b[1] = b[2] = 1;
+    d[0] = 0;
+    d[1] = mynod * sizeof(int);
+    d[2] = SIZE * sizeof(int);
+    t[0] = MPI_LB;
+    t[1] = typevec;
+    t[2] = MPI_UB;
+
+    MPI_Type_struct(3, b, d, t, &newtype);
+    MPI_Type_commit(&newtype);
+    MPI_Type_free(&typevec);
+
+    if (!mynod) {
+#if VERBOSE
+        fprintf(stderr, "\ntesting noncontiguous in memory, noncontiguous in "
+                        "file using collective I/O\n");
+#endif
+        MPI_File_delete(filename, MPI_INFO_NULL);
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_CREATE | MPI_MODE_RDWR,
+                  MPI_INFO_NULL, &fh);
+
+    MPI_File_set_view(fh, 0, MPI_INT, newtype, "native", MPI_INFO_NULL);
+
+    for (i = 0; i < SIZE; i++)
+        buf[i] = i + mynod * SIZE;
+    MPIX_File_iwrite_all(fh, buf, 1, newtype, &request);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Wait(&request, &status);
+
+    for (i = 0; i < SIZE; i++)
+        buf[i] = -1;
+
+    MPIX_File_iread_at_all(fh, 0, buf, 1, newtype, &request);
+    MPI_Wait(&request, &status);
+
+    for (i = 0; i < SIZE; i++) {
+        if (!mynod) {
+            if ((i % 2) && (buf[i] != -1)) {
+                errs++;
+                fprintf(stderr, "Process %d: buf %d is %d, should be -1\n",
+                        mynod, i, buf[i]);
+            }
+            if (!(i % 2) && (buf[i] != i)) {
+                errs++;
+                fprintf(stderr, "Process %d: buf %d is %d, should be %d\n",
+                        mynod, i, buf[i], i);
+            }
+        }
+        else {
+            if ((i % 2) && (buf[i] != i + mynod * SIZE)) {
+                errs++;
+                fprintf(stderr, "Process %d: buf %d is %d, should be %d\n",
+                        mynod, i, buf[i], i + mynod * SIZE);
+            }
+            if (!(i % 2) && (buf[i] != -1)) {
+                errs++;
+                fprintf(stderr, "Process %d: buf %d is %d, should be -1\n",
+                        mynod, i, buf[i]);
+            }
+        }
+    }
+
+    MPI_File_close(&fh);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (!mynod) {
+#if VERBOSE
+        fprintf(stderr, "\ntesting noncontiguous in memory, contiguous in file "
+                        "using collective I/O\n");
+#endif
+        MPI_File_delete(filename, MPI_INFO_NULL);
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_CREATE | MPI_MODE_RDWR,
+                  MPI_INFO_NULL, &fh);
+
+    for (i = 0; i < SIZE; i++) buf[i] = i + mynod * SIZE;
+    MPIX_File_iwrite_at_all(fh, mynod * (SIZE / 2) * sizeof(int), buf, 1,
+                           newtype, &request);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Wait(&request, &status);
+
+    for (i = 0; i < SIZE; i++) buf[i] = -1;
+
+    MPIX_File_iread_at_all(fh, mynod * (SIZE / 2) * sizeof(int), buf, 1,
+                           newtype, &request);
+    MPI_Wait(&request, &status);
+
+    for (i = 0; i < SIZE; i++) {
+        if (!mynod) {
+            if ((i % 2) && (buf[i] != -1)) {
+                errs++;
+                fprintf(stderr, "Process %d: buf %d is %d, should be -1\n",
+                        mynod, i, buf[i]);
+            }
+            if (!(i % 2) && (buf[i] != i)) {
+                errs++;
+                fprintf(stderr, "Process %d: buf %d is %d, should be %d\n",
+                        mynod, i, buf[i], i);
+            }
+        }
+        else {
+            if ((i % 2) && (buf[i] != i + mynod * SIZE)) {
+                errs++;
+                fprintf(stderr, "Process %d: buf %d is %d, should be %d\n",
+                        mynod, i, buf[i], i + mynod * SIZE);
+            }
+            if (!(i % 2) && (buf[i] != -1)) {
+                errs++;
+                fprintf(stderr, "Process %d: buf %d is %d, should be -1\n",
+                        mynod, i, buf[i]);
+            }
+        }
+    }
+
+    MPI_File_close(&fh);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (!mynod) {
+#if VERBOSE
+        fprintf(stderr, "\ntesting contiguous in memory, noncontiguous in file "
+                        "using collective I/O\n");
+#endif
+        MPI_File_delete(filename, MPI_INFO_NULL);
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_CREATE | MPI_MODE_RDWR,
+                  MPI_INFO_NULL, &fh);
+
+    MPI_File_set_view(fh, 0, MPI_INT, newtype, "native", MPI_INFO_NULL);
+
+    for (i = 0; i < SIZE; i++) buf[i] = i + mynod * SIZE;
+    MPIX_File_iwrite_all(fh, buf, SIZE, MPI_INT, &request);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Wait(&request, &status);
+
+    for (i = 0; i < SIZE; i++)
+        buf[i] = -1;
+
+    MPIX_File_iread_at_all(fh, 0, buf, SIZE, MPI_INT, &request);
+    MPI_Wait(&request, &status);
+
+    for (i = 0; i < SIZE; i++) {
+        if (!mynod) {
+            if (buf[i] != i) {
+                errs++;
+                fprintf(stderr, "Process %d: buf %d is %d, should be %d\n",
+                        mynod, i, buf[i], i);
+            }
+        }
+        else {
+            if (buf[i] != i + mynod * SIZE) {
+                errs++;
+                fprintf(stderr, "Process %d: buf %d is %d, should be %d\n",
+                        mynod, i, buf[i], i + mynod * SIZE);
+            }
+        }
+    }
+
+    MPI_File_close(&fh);
+
+    MPI_Allreduce(&errs, &toterrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    if (mynod == 0) {
+        if (toterrs > 0) {
+            fprintf(stderr, "Found %d errors\n", toterrs);
+        }
+        else {
+            fprintf(stdout, " No Errors\n");
+        }
+    }
+
+    MPI_Type_free(&newtype);
+    free(buf);
+    if (mynod) free(filename);
+    MPI_Finalize();
+    return 0;
+}
diff --git a/test/mpi/io/i_noncontig_coll2.c b/test/mpi/io/i_noncontig_coll2.c
new file mode 100644
index 0000000..83d8ab8
--- /dev/null
+++ b/test/mpi/io/i_noncontig_coll2.c
@@ -0,0 +1,560 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "mpi.h"
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+/* tests noncontiguous reads/writes using nonblocking collective I/O */
+
+/* this test is almost exactly like i_noncontig_coll.c with the following changes:
+ *
+ * . generalized file writing/reading to handle arbitrary number of processors
+ * . provides the "cb_config_list" hint with several permutations of the
+ *   available processors.
+ *   [ makes use of code copied from ROMIO's ADIO code to collect the names of
+ *   the processors ]
+ */
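+
+/* For reference, the "cb_config_list" value built by the permutation
+ * helpers below is simply a comma-separated host list. With four
+ * (hypothetical) hosts, simple_shuffle_str would lead test_file() to set:
+ *
+ *     MPI_Info_set(info, "cb_config_list", "node2,node3,node0,node1");
+ */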
+
+/* we are going to muck with this later to make it evenly divisible by however many compute nodes we have */
+#define STARTING_SIZE 5000
+
+int test_file(char *filename, int mynod, int nprocs, char *cb_hosts,
+              const char *msg, int verbose);
+
+#define ADIOI_Free free
+#define ADIOI_Malloc malloc
+#define FPRINTF fprintf
+/* I have no idea what the "D" stands for; it's how things are done in adio.h
+ */
+struct ADIO_cb_name_arrayD {
+    int refct;
+    int namect;
+    char **names;
+};
+typedef struct ADIO_cb_name_arrayD *ADIO_cb_name_array;
+
+void handle_error(int errcode, const char *str);
+int cb_gather_name_array(MPI_Comm comm, ADIO_cb_name_array * arrayp);
+void default_str(int mynod, int len, ADIO_cb_name_array array, char *dest);
+void reverse_str(int mynod, int len, ADIO_cb_name_array array, char *dest);
+void reverse_alternating_str(int mynod, int len, ADIO_cb_name_array array, char *dest);
+void simple_shuffle_str(int mynod, int len, ADIO_cb_name_array array, char *dest);
+
+
+void handle_error(int errcode, const char *str)
+{
+    char msg[MPI_MAX_ERROR_STRING];
+    int resultlen;
+    MPI_Error_string(errcode, msg, &resultlen);
+    fprintf(stderr, "%s: %s\n", str, msg);
+    MPI_Abort(MPI_COMM_WORLD, 1);
+}
+
+
+/* cb_gather_name_array() - gather a list of processor names from all processes
+ *                          in a communicator and store them on rank 0.
+ *
+ * This is a collective call on the communicator(s) passed in.
+ *
+ * Obtains a rank-ordered list of processor names from the processes in
+ * "dupcomm".
+ *
+ * Returns 0 on success, -1 on failure.
+ *
+ * NOTE: Needs some work to cleanly handle out of memory cases!
+ */
+int cb_gather_name_array(MPI_Comm comm, ADIO_cb_name_array * arrayp)
+{
+    /* this is copied from ROMIO, but since this test is for correctness,
+     * not performance, note that we have removed the parts where ROMIO
+     * uses a keyval to cache the name array.  We'll just rebuild it if we
+     * need to */
+
+    char my_procname[MPI_MAX_PROCESSOR_NAME], **procname = 0;
+    int *procname_len = NULL, my_procname_len, *disp = NULL, i;
+    int commsize, commrank;
+    ADIO_cb_name_array array = NULL;
+
+    MPI_Comm_size(comm, &commsize);
+    MPI_Comm_rank(comm, &commrank);
+
+    MPI_Get_processor_name(my_procname, &my_procname_len);
+
+    /* allocate space for everything */
+    array = (ADIO_cb_name_array)malloc(sizeof(*array));
+    if (array == NULL) {
+        return -1;
+    }
+    array->refct = 1;
+
+    if (commrank == 0) {
+        /* process 0 keeps the real list */
+        array->namect = commsize;
+
+        array->names = (char **)ADIOI_Malloc(sizeof(char *) * commsize);
+        if (array->names == NULL) {
+            return -1;
+        }
+        procname = array->names;        /* simpler to read */
+
+        procname_len = (int *)ADIOI_Malloc(commsize * sizeof(int));
+        if (procname_len == NULL) {
+            return -1;
+        }
+    }
+    else {
+        /* everyone else just keeps an empty list as a placeholder */
+        array->namect = 0;
+        array->names = NULL;
+    }
+    /* gather lengths first */
+    MPI_Gather(&my_procname_len, 1, MPI_INT, procname_len, 1, MPI_INT, 0, comm);
+
+    if (commrank == 0) {
+#ifdef CB_CONFIG_LIST_DEBUG
+        for (i = 0; i < commsize; i++) {
+            FPRINTF(stderr, "len[%d] = %d\n", i, procname_len[i]);
+        }
+#endif
+
+        for (i = 0; i < commsize; i++) {
+            /* add one to the lengths because we need to count the
+             * terminator, and we are going to use this list of lengths
+             * again in the gatherv.
+             */
+            procname_len[i]++;
+            procname[i] = malloc(procname_len[i]);
+            if (procname[i] == NULL) {
+                return -1;
+            }
+        }
+
+        /* create our list of displacements for the gatherv.  we're going
+         * to do everything relative to the start of the region allocated
+         * for procname[0]
+         *
+         * I suppose it is theoretically possible that the distance between
+         * malloc'd regions could be more than will fit in an int.  We don't
+         * cover that case.
+         */
+        disp = malloc(commsize * sizeof(int));
+        disp[0] = 0;
+        for (i = 1; i < commsize; i++) {
+            disp[i] = (int)(procname[i] - procname[0]);
+        }
+
+    }
+
+    /* now gather strings */
+    if (commrank == 0) {
+        MPI_Gatherv(my_procname, my_procname_len + 1, MPI_CHAR,
+                    procname[0], procname_len, disp, MPI_CHAR, 0, comm);
+    }
+    else {
+        /* if we didn't do this, we would need to allocate procname[]
+         * on all processes...which seems a little silly.
+         */
+        MPI_Gatherv(my_procname, my_procname_len + 1, MPI_CHAR,
+                    NULL, NULL, NULL, MPI_CHAR, 0, comm);
+    }
+
+    if (commrank == 0) {
+        /* no longer need the displacements or lengths */
+        free(disp);
+        free(procname_len);
+
+#ifdef CB_CONFIG_LIST_DEBUG
+        for (i = 0; i < commsize; i++) {
+            fprintf(stderr, "name[%d] = %s\n", i, procname[i]);
+        }
+#endif
+    }
+
+    *arrayp = array;
+    return 0;
+}
+
+void default_str(int mynod, int len, ADIO_cb_name_array array, char *dest)
+{
+    char *ptr;
+    int i, p;
+    if (!mynod) {
+        ptr = dest;
+        for (i = 0; i < array->namect; i++) {
+            p = snprintf(ptr, len, "%s,", array->names[i]);
+            ptr += p;
+        }
+        /* chop off that last comma */
+        dest[strlen(dest) - 1] = '\0';
+    }
+    MPI_Bcast(dest, len, MPI_CHAR, 0, MPI_COMM_WORLD);
+}
+
+void reverse_str(int mynod, int len, ADIO_cb_name_array array, char *dest)
+{
+    char *ptr;
+    int i, p;
+    if (!mynod) {
+        ptr = dest;
+        for (i = (array->namect - 1); i >= 0; i--) {
+            p = snprintf(ptr, len, "%s,", array->names[i]);
+            ptr += p;
+        }
+        dest[strlen(dest) - 1] = '\0';
+    }
+    MPI_Bcast(dest, len, MPI_CHAR, 0, MPI_COMM_WORLD);
+}
+
+void reverse_alternating_str(int mynod, int len, ADIO_cb_name_array array, char *dest)
+{
+    char *ptr;
+    int i, p;
+    if (!mynod) {
+        ptr = dest;
+        /* every other name, starting from the last */
+        for (i = (array->namect - 1); i >= 0; i -= 2) {
+            p = snprintf(ptr, len, "%s,", array->names[i]);
+            ptr += p;
+        }
+        /* then the names skipped above */
+        for (i = (array->namect - 2); i >= 0; i -= 2) {
+            p = snprintf(ptr, len, "%s,", array->names[i]);
+            ptr += p;
+        }
+        dest[strlen(dest) - 1] = '\0';
+    }
+    MPI_Bcast(dest, len, MPI_CHAR, 0, MPI_COMM_WORLD);
+}
+
+void simple_shuffle_str(int mynod, int len, ADIO_cb_name_array array, char *dest)
+{
+    char *ptr;
+    int i, p;
+    if (!mynod) {
+        ptr = dest;
+        for (i = (array->namect / 2); i < array->namect; i++) {
+            p = snprintf(ptr, len, "%s,", array->names[i]);
+            ptr += p;
+        }
+        for (i = 0; i < (array->namect / 2); i++) {
+            p = snprintf(ptr, len, "%s,", array->names[i]);
+            ptr += p;
+        }
+        dest[strlen(dest) - 1] = '\0';
+    }
+    MPI_Bcast(dest, len, MPI_CHAR, 0, MPI_COMM_WORLD);
+}
+
+int main(int argc, char **argv)
+{
+    int i, mynod, nprocs, len, errs = 0, sum_errs = 0, verbose = 0;
+    char *filename;
+    char *cb_config_string;
+    int cb_config_len;
+    ADIO_cb_name_array array;
+
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &mynod);
+
+
+    /* process 0 takes the file name as a command-line argument and
+     * broadcasts it to other processes */
+    if (!mynod) {
+        filename = "testfile";
+        len = strlen(filename);
+        MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD);
+        MPI_Bcast(filename, len + 1, MPI_CHAR, 0, MPI_COMM_WORLD);
+    }
+    else {
+        MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD);
+        filename = (char *)malloc(len + 1);
+        MPI_Bcast(filename, len + 1, MPI_CHAR, 0, MPI_COMM_WORLD);
+    }
+
+    /* want to hint the cb_config_list, but do so in a non-sequential way */
+    cb_gather_name_array(MPI_COMM_WORLD, &array);
+
+    /* sanity check */
+    if (!mynod) {
+        if (array->namect < 2) {
+            fprintf(stderr, "Run this test on two or more hosts\n");
+            MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+    }
+    /* get space for the permuted cb_config_string */
+    if (!mynod) {
+        cb_config_len = 0;
+        for (i = 0; i < array->namect; i++) {
+            /* +1: space for either a , or \0 if last */
+            cb_config_len += strlen(array->names[i]) + 1;
+        }
+        ++cb_config_len;
+    }
+    MPI_Bcast(&cb_config_len, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    if ((cb_config_string = malloc(cb_config_len)) == NULL) {
+        perror("malloc");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    /* first, no hinting */
+    errs += test_file(filename, mynod, nprocs, NULL,
+                      "collective w/o hinting", verbose);
+
+    /* hint, but no change in order */
+    default_str(mynod, cb_config_len, array, cb_config_string);
+    errs += test_file(filename, mynod, nprocs, cb_config_string,
+                      "collective w/ hinting: default order", verbose);
+
+    /*  reverse order */
+    reverse_str(mynod, cb_config_len, array, cb_config_string);
+    errs += test_file(filename, mynod, nprocs, cb_config_string,
+                      "collective w/ hinting: reverse order", verbose);
+
+    /* reverse, every other */
+    reverse_alternating_str(mynod, cb_config_len, array, cb_config_string);
+    errs += test_file(filename, mynod, nprocs, cb_config_string,
+                      "collective w/ hinting: permutation1", verbose);
+
+    /* second half, first half */
+    simple_shuffle_str(mynod, cb_config_len, array, cb_config_string);
+    errs += test_file(filename, mynod, nprocs, cb_config_string,
+                      "collective w/ hinting: permutation2", verbose);
+
+    MPI_Allreduce(&errs, &sum_errs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+
+    if (!mynod) {
+        if (sum_errs) fprintf(stderr, "Found %d error cases\n", sum_errs);
+        else printf(" No Errors\n");
+    }
+    if (mynod) free(filename);
+    free(cb_config_string);
+    MPI_Finalize();
+    return 0;
+}
+
+#define SEEDER(x,y,z) ((x)*1000000 + (y) + (x)*(z))
+
+int test_file(char *filename, int mynod, int nprocs, char *cb_hosts,
+              const char *msg, int verbose)
+{
+    MPI_Datatype typevec, newtype, t[3];
+    int *buf, i, b[3], errcode, errors = 0;
+    MPI_File fh;
+    MPI_Aint d[3];
+    MPI_Request request;
+    MPI_Status status;
+    int SIZE = (STARTING_SIZE / nprocs) * nprocs;
+    MPI_Info info;
+
+    if (mynod == 0 && verbose)
+        fprintf(stderr, "%s\n", msg);
+
+    buf = (int *)malloc(SIZE * sizeof(int));
+    if (buf == NULL) {
+        perror("test_file");
+        MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+
+
+    if (cb_hosts != NULL) {
+        MPI_Info_create(&info);
+        MPI_Info_set(info, "cb_config_list", cb_hosts);
+    }
+    else {
+        info = MPI_INFO_NULL;
+    }
+
+    MPI_Type_vector(SIZE / nprocs, 1, nprocs, MPI_INT, &typevec);
+
+    b[0] = b[1] = b[2] = 1;
+    d[0] = 0;
+    d[1] = mynod * sizeof(int);
+    d[2] = SIZE * sizeof(int);
+    t[0] = MPI_LB;
+    t[1] = typevec;
+    t[2] = MPI_UB;
+
+    MPI_Type_struct(3, b, d, t, &newtype);
+    MPI_Type_commit(&newtype);
+    MPI_Type_free(&typevec);
+
+    if (!mynod) {
+        if (verbose)
+            fprintf(stderr, "\ntesting noncontiguous in memory, noncontiguous "
+                            "in file using collective I/O\n");
+        MPI_File_delete(filename, info);
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    errcode = MPI_File_open(MPI_COMM_WORLD, filename,
+                            MPI_MODE_CREATE | MPI_MODE_RDWR, info, &fh);
+    if (errcode != MPI_SUCCESS) {
+        handle_error(errcode, "MPI_File_open");
+    }
+
+    MPI_File_set_view(fh, 0, MPI_INT, newtype, "native", info);
+
+    for (i = 0; i < SIZE; i++) buf[i] = SEEDER(mynod, i, SIZE);
+    errcode = MPIX_File_iwrite_all(fh, buf, 1, newtype, &request);
+    if (errcode != MPI_SUCCESS) {
+        handle_error(errcode, "nc mem - nc file: MPIX_File_iwrite_all");
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Wait(&request, &status);
+
+    for (i = 0; i < SIZE; i++) buf[i] = -1;
+
+    errcode = MPIX_File_iread_at_all(fh, 0, buf, 1, newtype, &request);
+    if (errcode != MPI_SUCCESS) {
+        handle_error(errcode, "nc mem - nc file: MPIX_File_iread_at_all");
+    }
+    MPI_Wait(&request, &status);
+
+    /* the verification for N compute nodes is tricky. Say we have 3
+     * processors.
+     * process 0 sees: 0 -1 -1 3 -1 -1 ...
+     * process 1 sees: -1 34 -1 -1 37 -1 ...
+     * process 2 sees: -1 -1 68 -1 -1 71 ... */
+
+    /* verify those leading -1s exist if they should */
+    for (i = 0; i < mynod; i++) {
+        if (buf[i] != -1) {
+            if (verbose)
+                fprintf(stderr, "Process %d: buf is %d, should be -1\n",
+                        mynod, buf[i]);
+            errors++;
+        }
+    }
+    /* now the modulo games are hairy.  processor 0 sees real data in the 0th,
+     * 3rd, 6th... elements of the buffer (assuming nprocs==3).  proc 1 sees
+     * the data in 1st, 4th, 7th..., and proc 2 sees it in 2nd, 5th, 8th */
+
+    for (/* 'i' set in above loop */ ; i < SIZE; i++) {
+        if (((i - mynod) % nprocs) && buf[i] != -1) {
+            if (verbose)
+                fprintf(stderr, "Process %d: buf %d is %d, should be -1\n",
+                        mynod, i, buf[i]);
+            errors++;
+        }
+        if (!((i - mynod) % nprocs) && buf[i] != SEEDER(mynod, i, SIZE)) {
+            if (verbose)
+                fprintf(stderr, "Process %d: buf %d is %d, should be %d\n",
+                        mynod, i, buf[i], SEEDER(mynod, i, SIZE));
+            errors++;
+        }
+    }
+    MPI_File_close(&fh);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (!mynod) {
+        if (verbose)
+            fprintf(stderr, "\ntesting noncontiguous in memory, contiguous in "
+                            "file using collective I/O\n");
+        MPI_File_delete(filename, info);
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_CREATE | MPI_MODE_RDWR,
+                  info, &fh);
+
+    for (i = 0; i < SIZE; i++) buf[i] = SEEDER(mynod, i, SIZE);
+    errcode = MPIX_File_iwrite_at_all(fh, mynod * (SIZE / nprocs) * sizeof(int),
+                                     buf, 1, newtype, &request);
+    if (errcode != MPI_SUCCESS)
+        handle_error(errcode, "nc mem - c file: MPIX_File_iwrite_at_all");
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Wait(&request, &status);
+
+    for (i = 0; i < SIZE; i++)
+        buf[i] = -1;
+
+    errcode = MPIX_File_iread_at_all(fh, mynod * (SIZE / nprocs) * sizeof(int),
+                                    buf, 1, newtype, &request);
+    if (errcode != MPI_SUCCESS)
+        handle_error(errcode, "nc mem - c file: MPIX_File_iread_at_all");
+    MPI_Wait(&request, &status);
+
+    /* just as above */
+    for (i = 0; i < mynod; i++) {
+        if (buf[i] != -1) {
+            if (verbose)
+                fprintf(stderr, "Process %d: buf is %d, should be -1\n",
+                        mynod, buf[i]);
+            errors++;
+        }
+    }
+    for (/* i set in above loop */ ; i < SIZE; i++) {
+        if (((i - mynod) % nprocs) && buf[i] != -1) {
+            if (verbose)
+                fprintf(stderr, "Process %d: buf %d is %d, should be -1\n",
+                        mynod, i, buf[i]);
+            errors++;
+        }
+        if (!((i - mynod) % nprocs) && buf[i] != SEEDER(mynod, i, SIZE)) {
+            if (verbose)
+                fprintf(stderr, "Process %d: buf %d is %d, should be %d\n",
+                        mynod, i, buf[i], SEEDER(mynod, i, SIZE));
+            errors++;
+        }
+    }
+
+    MPI_File_close(&fh);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (!mynod) {
+        if (verbose)
+            fprintf(stderr, "\ntesting contiguous in memory, noncontiguous in "
+                            "file using collective I/O\n");
+        MPI_File_delete(filename, info);
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_CREATE | MPI_MODE_RDWR,
+                  info, &fh);
+
+    MPI_File_set_view(fh, 0, MPI_INT, newtype, "native", info);
+
+    for (i = 0; i < SIZE; i++)
+        buf[i] = SEEDER(mynod, i, SIZE);
+    errcode = MPIX_File_iwrite_all(fh, buf, SIZE, MPI_INT, &request);
+    if (errcode != MPI_SUCCESS)
+        handle_error(errcode, "c mem - nc file: MPIX_File_iwrite_all");
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Wait(&request, &status);
+
+    for (i = 0; i < SIZE; i++) buf[i] = -1;
+
+    errcode = MPIX_File_iread_at_all(fh, 0, buf, SIZE, MPI_INT, &request);
+    if (errcode != MPI_SUCCESS)
+        handle_error(errcode, "c mem - nc file: MPIX_File_iread_at_all");
+    MPI_Wait(&request, &status);
+
+    /* same crazy checking */
+    for (i = 0; i < SIZE; i++) {
+        if (buf[i] != SEEDER(mynod, i, SIZE)) {
+            if (verbose)
+                fprintf(stderr, "Process %d: buf %d is %d, should be %d\n",
+                        mynod, i, buf[i], SEEDER(mynod, i, SIZE));
+            errors++;
+        }
+    }
+
+    MPI_File_close(&fh);
+
+    MPI_Type_free(&newtype);
+    free(buf);
+    if (info != MPI_INFO_NULL) MPI_Info_free(&info);
+    return errors;
+}
diff --git a/test/mpi/io/i_types_with_zeros.c b/test/mpi/io/i_types_with_zeros.c
new file mode 100644
index 0000000..3f32030
--- /dev/null
+++ b/test/mpi/io/i_types_with_zeros.c
@@ -0,0 +1,155 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+
+#include <mpi.h>
+
+#define MAXLEN 9
+
+static void handle_error(int errcode, const char *str)
+{
+    char msg[MPI_MAX_ERROR_STRING];
+    int resultlen;
+    MPI_Error_string(errcode, msg, &resultlen);
+    fprintf(stderr, "%s: %s\n", str, msg);
+    MPI_Abort(MPI_COMM_WORLD, 1);
+}
+
+enum {
+    INDEXED,
+    HINDEXED,
+    STRUCT
+} testcases;
+
+static int test_indexed_with_zeros(char *filename, int testcase)
+{
+    int i, rank, np, buflen, num, err, nr_errors = 0;
+    int nelms[MAXLEN], buf[MAXLEN], indices[MAXLEN], blocklen[MAXLEN];
+    MPI_File fh;
+    MPI_Request request;
+    MPI_Status status;
+    MPI_Datatype filetype;
+    MPI_Datatype types[MAXLEN];
+    MPI_Aint addrs[MAXLEN];
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &np);
+
+    /* set up the number of integers to write in each iteration */
+    for (i = 0; i < MAXLEN; i++) nelms[i] = 0;
+    if (rank == 0)
+        nelms[4] = nelms[5] = nelms[7] = 1;
+    if (rank == 1)
+        nelms[0] = nelms[1] = nelms[2] = nelms[3] = nelms[6] = nelms[8] = 1;
+
+    /* pre-fill the file with integers -999 */
+    if (rank == 0) {
+        for (i = 0; i < MAXLEN; i++)
+            buf[i] = -999;
+        err = MPI_File_open(MPI_COMM_SELF, filename,
+                            MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL,
+                            &fh);
+        if (err != MPI_SUCCESS) handle_error(err, "MPI_File_open");
+        err = MPI_File_write(fh, buf, MAXLEN, MPI_INT, &status);
+        if (err != MPI_SUCCESS) handle_error(err, "MPI_File_write");
+        err = MPI_File_close(&fh);
+        if (err != MPI_SUCCESS) handle_error(err, "MPI_File_close");
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* define a filetype with spurious leading zeros */
+    buflen = num = 0;
+    for (i = 0; i < MAXLEN; i++) {
+        buflen += nelms[i];
+        indices[num] = i;
+        addrs[num] = i * sizeof(int);
+        blocklen[num] = nelms[i];
+        types[num] = MPI_INT;
+        num++;
+    }
+    switch (testcase) {
+        case INDEXED:
+            MPI_Type_indexed(num, blocklen, indices, MPI_INT, &filetype);
+            break;
+        case HINDEXED:
+            MPI_Type_hindexed(num, blocklen, addrs, MPI_INT, &filetype);
+            break;
+        case STRUCT:
+            MPI_Type_create_struct(num, blocklen, addrs, types, &filetype);
+            break;
+        default:
+            fprintf(stderr, "unknown testcase!\n");
+            return (-100);
+    }
+
+    MPI_Type_commit(&filetype);
+
+    /* initialize write buffer and write to file */
+    for (i = 0; i < MAXLEN; i++) buf[i] = 1;
+    err = MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_WRONLY,
+                        MPI_INFO_NULL, &fh);
+    if (err != MPI_SUCCESS) handle_error(err, "MPI_File_open");
+    err = MPI_File_set_view(fh, 0, MPI_INT, filetype, "native", MPI_INFO_NULL);
+    if (err != MPI_SUCCESS) handle_error(err, "MPI_File_set_view");
+    err = MPIX_File_iwrite_all(fh, buf, buflen, MPI_INT, &request);
+    if (err != MPI_SUCCESS) handle_error(err, "MPIX_File_iwrite_all");
+    err = MPI_Wait(&request, &status);
+    if (err != MPI_SUCCESS) handle_error(err, "MPI_Wait");
+    MPI_Type_free(&filetype);
+    err = MPI_File_close(&fh);
+    if (err != MPI_SUCCESS) handle_error(err, "MPI_File_close");
+
+    /* read back and check */
+    if (rank == 0) {
+        err = MPI_File_open(MPI_COMM_SELF, filename, MPI_MODE_RDONLY,
+                            MPI_INFO_NULL, &fh);
+        if (err != MPI_SUCCESS) handle_error(err, "MPI_File_open");
+        err = MPI_File_read(fh, buf, MAXLEN, MPI_INT, &status);
+        if (err != MPI_SUCCESS) handle_error(err, "MPI_File_read");
+        err = MPI_File_close(&fh);
+        if (err != MPI_SUCCESS) handle_error(err, "MPI_File_close");
+        for (i = 0; i < MAXLEN; i++) {
+            if (buf[i] < 0) {
+                nr_errors++;
+                printf("Error: unexpected value for case %d at buf[%d] == %d\n",
+                       testcase, i, buf[i]);
+            }
+        }
+    }
+    return nr_errors;
+}
+
+int main(int argc, char **argv)
+{
+    int nr_errors, rank, np;
+    char *filename;
+
+    filename = (argc > 1) ? argv[1] : "testfile";
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &np);
+
+    if (np != 2) {
+        if (rank == 0) fprintf(stderr, "Must run on 2 MPI processes\n");
+        MPI_Finalize();
+        return 1;
+    }
+    nr_errors = test_indexed_with_zeros(filename, INDEXED);
+    nr_errors += test_indexed_with_zeros(filename, HINDEXED);
+    nr_errors += test_indexed_with_zeros(filename, STRUCT);
+
+    if (rank == 0 && nr_errors == 0) printf(" No Errors\n");
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/test/mpi/io/testlist.in b/test/mpi/io/testlist.in
index 7422238..89ccf48 100644
--- a/test/mpi/io/testlist.in
+++ b/test/mpi/io/testlist.in
@@ -15,3 +15,11 @@ hindexed_io 1
 @mpix@ i_hindexed_io 1
 @mpix@ i_rdwrord 4
 @mpix@ i_setviewcur 4
+@mpix@ i_aggregation1 4
+@mpix@ i_aggregation2 4
+@mpix@ i_coll_test 4
+@mpix@ i_darray_read 4
+@mpix@ i_hindexed 4
+@mpix@ i_noncontig_coll 2
+@mpix@ i_noncontig_coll2 4
+@mpix@ i_types_with_zeros 2

http://git.mpich.org/mpich.git/commitdiff/96d8f4e992d728e0e667dfb9d09aa4081fe7285a

commit 96d8f4e992d728e0e667dfb9d09aa4081fe7285a
Author: Sangmin Seo <sseo at anl.gov>
Date:   Mon Nov 10 09:26:45 2014 -0600

    Add tests for nonblocking collective I/O.
    
    Added nonblocking version of bigtype, hindexed_io, rdwrord, and setviewcur
    for testing nonblocking collective I/O functions.
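    
    Each port follows the same pattern: the blocking collective call is
    replaced by its request-based MPIX counterpart plus an explicit wait.
    Roughly (an illustrative sketch, not part of the diffs below):
    
        MPI_Request request;
        MPI_Status status;
        err = MPIX_File_iwrite_all(fh, buf, count, datatype, &request);
        if (err != MPI_SUCCESS)
            handle_error(err, "MPIX_File_iwrite_all");
        MPI_Wait(&request, &status);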
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/test/mpi/configure.ac b/test/mpi/configure.ac
index d8f0ef8..2ce7df8 100644
--- a/test/mpi/configure.ac
+++ b/test/mpi/configure.ac
@@ -1521,6 +1521,7 @@ AC_OUTPUT(maint/testmerge \
           spawn/Makefile \
           topo/Makefile \
           io/Makefile \
+          io/testlist \
           f77/Makefile \
           f77/attr/Makefile \
           f77/attr/attraints.h \
diff --git a/test/mpi/io/Makefile.am b/test/mpi/io/Makefile.am
index 4d5ba77..ea6751e 100644
--- a/test/mpi/io/Makefile.am
+++ b/test/mpi/io/Makefile.am
@@ -27,5 +27,13 @@ noinst_PROGRAMS = \
     bigtype       \
     hindexed_io
 
+if BUILD_MPIX_TESTS
+noinst_PROGRAMS += \
+    i_bigtype     \
+    i_hindexed_io \
+    i_rdwrord     \
+    i_setviewcur
+endif
+
 clean-local:
 	-rm -f testfile testfile.*
diff --git a/test/mpi/io/i_bigtype.c b/test/mpi/io/i_bigtype.c
new file mode 100644
index 0000000..301b54c
--- /dev/null
+++ b/test/mpi/io/i_bigtype.c
@@ -0,0 +1,145 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include <mpi.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <assert.h>
+
+//#define NUM_X 536870911
+#define NUM_X 536870912
+#define NUM_Y 1
+
+//#define BIGDT 2147483643
+#define BIGDT 2147483647
+
+/*
+static char MTEST_Descrip[] = "Nonblocking file read/write for bigtype";
+*/
+
+int main(int argc, char **argv)
+{
+    MPI_File fh;
+    int i, j;
+    size_t k;
+    MPI_Datatype inner_type, rem_type, mem_type;
+    MPI_Datatype int_type, file_type;
+    int *buf_write, *buf_read;
+    int rc;
+    MPI_Aint disp[2];
+    int block_len[2];
+    MPI_Datatype type[2];
+    MPI_Status status;
+    MPI_Request request;
+
+    MPI_Init(&argc, &argv);
+
+    if (sizeof(MPI_Aint) <= sizeof(int)) {
+        /* can't test on this platform... */
+        goto exit;
+    }
+
+    k = 0;
+    /* create two large buffers */
+    buf_write = malloc(NUM_X * NUM_Y * sizeof(int));
+    buf_read = malloc(NUM_X * NUM_Y * sizeof(int));
+    memset(buf_read, 0, NUM_X * NUM_Y * sizeof(int));
+
+    for (i = 0; i < NUM_X; i++) {
+        for (j = 0; j < NUM_Y; j++) {
+            buf_write[k] = k;
+            k++;
+        }
+    }
+
+    /* Big Datatype (2^31 - 1 bytes) */
+    MPI_Type_contiguous(BIGDT, MPI_BYTE, &inner_type);
+    /* Small Datatype (1 byte) */
+    MPI_Type_contiguous(1, MPI_BYTE, &rem_type);
+
+    type[0] = inner_type;
+    type[1] = rem_type;
+    block_len[0] = 1;
+    block_len[1] = 1;
+    disp[0] = 0;
+    disp[1] = BIGDT;
+
+    /* combine both types */
+    MPI_Type_struct(2, block_len, disp, type, &mem_type);
+
+    MPI_Type_commit(&mem_type);
+    MPI_Type_free(&rem_type);
+    MPI_Type_free(&inner_type);
+
+    MPI_Type_contiguous(4, MPI_BYTE, &int_type);
+    {
+        /* This creates a big type that is actually contiguous, touching an
+         * optimization that was at one point buggy */
+        MPI_Type_vector(1, NUM_X, 1, int_type, &file_type);
+    }
+
+    MPI_Type_commit(&file_type);
+    MPI_Type_free(&int_type);
+
+    rc = MPI_File_open(MPI_COMM_WORLD, "testfile",
+                       MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
+    if (rc != MPI_SUCCESS) {
+        printf("Can't open file: %s\n", "testfile");
+        exit(1);
+    }
+
+    rc = MPI_File_set_view(fh, 2144, MPI_BYTE, file_type, "native",
+                           MPI_INFO_NULL);
+    if (rc != MPI_SUCCESS) {
+        printf("ERROR SET VIEW\n");
+        exit(1);
+    }
+
+    /* write everything */
+    rc = MPIX_File_iwrite_at_all(fh, 0, buf_write, 1, mem_type, &request);
+    if (rc != MPI_SUCCESS) {
+        printf("%d ERROR IWRITE AT ALL\n", rc);
+        exit(1);
+    }
+    MPI_Wait(&request, &status);
+
+    rc = MPI_File_set_view(fh, 2144, MPI_BYTE, file_type, "native",
+                           MPI_INFO_NULL);
+    if (rc != MPI_SUCCESS) {
+        printf("ERROR SET VIEW\n");
+        exit(1);
+    }
+
+    /* read everything */
+    rc = MPIX_File_iread_at_all(fh, 0, buf_read, 1, mem_type, &request);
+    if (rc != MPI_SUCCESS) {
+        printf("%d ERROR IREAD AT ALL\n", rc);
+        exit(1);
+    }
+    MPI_Wait(&request, &status);
+
+    for (k = 0; k < NUM_X * NUM_Y; k++) {
+        if (buf_read[k] != buf_write[k]) {
+            fprintf(stderr, "Verfiy Failed index %zu: expected %d found %d\n",
+                    k, buf_write[k], buf_read[k]);
+            assert(0);
+        }
+    }
+
+    free(buf_write);
+    free(buf_read);
+    MPI_File_close(&fh);
+
+    MPI_Type_free(&mem_type);
+    MPI_Type_free(&file_type);
+
+  exit:
+    MPI_Finalize();
+    printf(" No Errors\n");
+
+    return 0;
+}
diff --git a/test/mpi/io/i_hindexed_io.c b/test/mpi/io/i_hindexed_io.c
new file mode 100644
index 0000000..5b17819
--- /dev/null
+++ b/test/mpi/io/i_hindexed_io.c
@@ -0,0 +1,118 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include <mpi.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#define DATA_SIZE 324*4
+#define PAD 256
+#define HEADER 144
+#define BLK_COUNT 3
+
+static void handle_error(int errcode, char *str)
+{
+    char msg[MPI_MAX_ERROR_STRING];
+    int resultlen;
+    MPI_Error_string(errcode, msg, &resultlen);
+    fprintf(stderr, "%s: %s\n", str, msg);
+    MPI_Abort(MPI_COMM_WORLD, 1);
+}
+
+#define CHECK(fn) { int errcode; errcode = (fn); if (errcode != MPI_SUCCESS) handle_error(errcode, #fn); }
+
+int main(int argc, char **argv)
+{
+    MPI_File fh;
+    MPI_Datatype file_type, mem_type;
+    int *data = NULL;
+    int *verify = NULL;
+    int data_size = DATA_SIZE;
+    int i, j, k, nr_errors = 0;
+    MPI_Aint disp[BLK_COUNT];
+    int block_lens[BLK_COUNT];
+    char *filename = "unnamed.dat";
+    MPI_Status status;
+    MPI_Request request;
+
+    MPI_Init(&argc, &argv);
+    disp[0] = (MPI_Aint) (PAD);
+    disp[1] = (MPI_Aint) (data_size * 1 + PAD);
+    disp[2] = (MPI_Aint) (data_size * 2 + PAD);
+
+    block_lens[0] = data_size;
+    block_lens[1] = data_size;
+    block_lens[2] = data_size;
+
+    data = malloc(data_size);
+    verify = malloc(data_size * BLK_COUNT + HEADER + PAD);
+    for (i = 0; i < data_size / sizeof(int); i++)
+        data[i] = i;
+
+    MPI_Type_create_hindexed_block(BLK_COUNT, data_size, disp, MPI_BYTE,
+                                   &file_type);
+    MPI_Type_commit(&file_type);
+
+    MPI_Type_create_hvector(BLK_COUNT, data_size, 0, MPI_BYTE, &mem_type);
+    MPI_Type_commit(&mem_type);
+
+    if (1 < argc)
+        filename = argv[1];
+
+    CHECK(MPI_File_open(MPI_COMM_WORLD, filename,
+                MPI_MODE_RDWR | MPI_MODE_CREATE | MPI_MODE_DELETE_ON_CLOSE,
+                MPI_INFO_NULL, &fh));
+
+    CHECK(MPI_File_set_view(fh, HEADER, MPI_BYTE, file_type, "native",
+                            MPI_INFO_NULL));
+
+    /* write everything */
+    CHECK(MPIX_File_iwrite_at_all(fh, 0, data, 1, mem_type, &request));
+    MPI_Wait(&request, &status);
+
+    /* verify */
+    CHECK(MPI_File_set_view(fh, 0, MPI_BYTE, MPI_BYTE, "native",
+          MPI_INFO_NULL));
+    CHECK(MPIX_File_iread_at_all(fh, 0,
+          verify, (HEADER + PAD + BLK_COUNT * DATA_SIZE) / sizeof(int),
+          MPI_INT, &request));
+    MPI_Wait(&request, &status);
+
+    /* header and block padding should have no data */
+    for (i = 0; i < (HEADER + PAD) / sizeof(int); i++) {
+        if (verify[i] != 0) {
+            nr_errors++;
+            fprintf(stderr, "expected 0, read %d\n", verify[i]);
+        }
+    }
+    /* blocks are replicated */
+    for (j = 0; j < BLK_COUNT; j++) {
+        for (k = 0; k < (DATA_SIZE / sizeof(int)); k++) {
+            if (verify[(HEADER+PAD)/sizeof(int) + k + j*(DATA_SIZE/sizeof(int))]
+                != data[k]) {
+                nr_errors++;
+                fprintf(stderr, "expcted %d, read %d\n", data[k],
+                        verify[(HEADER+PAD)/sizeof(int) + k +
+                                j*(DATA_SIZE/sizeof(int))]);
+            }
+            i++;
+        }
+    }
+
+    MPI_File_close(&fh);
+
+    MPI_Type_free(&mem_type);
+    MPI_Type_free(&file_type);
+
+    if (nr_errors == 0)
+        printf(" No Errors\n");
+
+    MPI_Finalize();
+
+    free(data);
+    return 0;
+}
diff --git a/test/mpi/io/i_rdwrord.c b/test/mpi/io/i_rdwrord.c
new file mode 100644
index 0000000..ca4d625
--- /dev/null
+++ b/test/mpi/io/i_rdwrord.c
@@ -0,0 +1,73 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+
+/*
+static char MTEST_Descrip[] = "Test reading and writing ordered output";
+*/
+
+int main(int argc, char *argv[])
+{
+    int errs = 0;
+    int size, rank, i, *buf, rc;
+    MPI_File fh;
+    MPI_Comm comm;
+    MPI_Status status;
+    MPI_Request request;
+
+    MTest_Init(&argc, &argv);
+
+    comm = MPI_COMM_WORLD;
+    MPI_File_open(comm, (char *)"test.ord",
+                  MPI_MODE_RDWR | MPI_MODE_CREATE |
+                  MPI_MODE_DELETE_ON_CLOSE, MPI_INFO_NULL, &fh);
+
+    MPI_Comm_size(comm, &size);
+    MPI_Comm_rank(comm, &rank);
+    buf = (int *)malloc(size * sizeof(int));
+    buf[0] = rank;
+    rc = MPI_File_write_ordered(fh, buf, 1, MPI_INT, &status);
+    if (rc != MPI_SUCCESS) {
+        MTestPrintErrorMsg("File_write_ordered", rc);
+        errs++;
+    }
+    /* make sure all writes finish before we seek/read */
+    MPI_Barrier(comm);
+
+    /* Set the individual pointer to 0, since we want to use an iread_all */
+    MPI_File_seek(fh, 0, MPI_SEEK_SET);
+    rc = MPIX_File_iread_all(fh, buf, size, MPI_INT, &request);
+    if (rc != MPI_SUCCESS) {
+        MTestPrintErrorMsg("File_iread_all", rc);
+        errs++;
+    }
+    MPI_Wait(&request, &status);
+
+    for (i = 0; i < size; i++) {
+        if (buf[i] != i) {
+            errs++;
+            fprintf(stderr, "%d: buf[%d] = %d\n", rank, i, buf[i]);
+        }
+    }
+
+    MPI_File_seek_shared(fh, 0, MPI_SEEK_SET);
+    for (i = 0; i < size; i++) buf[i] = -1;
+    MPI_File_read_ordered(fh, buf, 1, MPI_INT, &status);
+    if (buf[0] != rank) {
+        errs++;
+        fprintf(stderr, "%d: buf[0] = %d\n", rank, buf[0]);
+    }
+
+    free(buf);
+    MPI_File_close(&fh);
+
+    MTest_Finalize(errs);
+    MPI_Finalize();
+    return 0;
+}
diff --git a/test/mpi/io/i_setviewcur.c b/test/mpi/io/i_setviewcur.c
new file mode 100644
index 0000000..c696628
--- /dev/null
+++ b/test/mpi/io/i_setviewcur.c
@@ -0,0 +1,129 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+
+/*
+static char MTEST_Descrip[] = "Test set_view with DISPLACEMENT_CURRENT";
+*/
+
+int main(int argc, char *argv[])
+{
+    int errs = 0, err;
+    int size, rank, *buf;
+    MPI_Offset offset;
+    MPI_File fh;
+    MPI_Comm comm;
+    MPI_Status status;
+    MPI_Request request;
+
+    MTest_Init(&argc, &argv);
+
+    /* This test reads a header then sets the view to every "size" int,
+     * using set view and current displacement.  The file is first written
+     * using a combination of collective and ordered writes */
+
+    comm = MPI_COMM_WORLD;
+    err = MPI_File_open(comm, (char *) "test.ord",
+                        MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
+    if (err) {
+        errs++;
+        MTestPrintErrorMsg("Open(1)", err);
+    }
+    MPI_Comm_size(comm, &size);
+    MPI_Comm_rank(comm, &rank);
+    buf = (int *) malloc(size * sizeof(int));
+    buf[0] = size;
+    err = MPIX_File_iwrite_all(fh, buf, 1, MPI_INT, &request);
+    if (err) {
+        errs++;
+        MTestPrintErrorMsg("Iwrite_all", err);
+    }
+    err = MPI_Wait(&request, &status);
+    if (err) {
+        errs++;
+        MTestPrintErrorMsg("Wait", err);
+    }
+
+    err = MPI_File_get_position(fh, &offset);
+    if (err) {
+        errs++;
+        MTestPrintErrorMsg("Get_position", err);
+    }
+    err = MPI_File_seek_shared(fh, offset, MPI_SEEK_SET);
+    if (err) {
+        errs++;
+        MTestPrintErrorMsg("Seek_shared", err);
+    }
+    buf[0] = rank;
+    err = MPI_File_write_ordered(fh, buf, 1, MPI_INT, &status);
+    if (err) {
+        errs++;
+        MTestPrintErrorMsg("Write_ordered", err);
+    }
+    err = MPI_File_close(&fh);
+    if (err) {
+        errs++;
+        MTestPrintErrorMsg("Close(1)", err);
+    }
+
+    /* Reopen the file as sequential */
+    err = MPI_File_open(comm, (char *) "test.ord",
+                        MPI_MODE_RDONLY | MPI_MODE_SEQUENTIAL |
+                        MPI_MODE_DELETE_ON_CLOSE, MPI_INFO_NULL, &fh);
+    if (err) {
+        errs++;
+        MTestPrintErrorMsg("Open(Read)", err);
+    }
+
+    if (rank == 0) {
+        err = MPI_File_read_shared(fh, buf, 1, MPI_INT, &status);
+        if (err) {
+            errs++;
+            MTestPrintErrorMsg("Read_all", err);
+        }
+        if (buf[0] != size) {
+            errs++;
+            fprintf(stderr, "Unexpected value for the header = %d, should be %d\n",
+                    buf[0], size);
+            fflush(stderr);
+        }
+    }
+    MPI_Barrier(comm);
+    /* All processes must provide the same file view for MODE_SEQUENTIAL */
+    /* See MPI 2.1, 13.3 - DISPLACEMENT_CURRENT is *required* for
+     * MODE_SEQUENTIAL files */
+    err = MPI_File_set_view(fh, MPI_DISPLACEMENT_CURRENT, MPI_INT,
+                            MPI_INT, (char *) "native", MPI_INFO_NULL);
+    if (err) {
+        errs++;
+        MTestPrintErrorMsg("Set_view (DISPLACEMENT_CURRENT)", err);
+    }
+    buf[0] = -1;
+    err = MPI_File_read_ordered(fh, buf, 1, MPI_INT, &status);
+    if (err) {
+        errs++;
+        MTestPrintErrorMsg("Read_all", err);
+    }
+    if (buf[0] != rank) {
+        errs++;
+        fprintf(stderr, "%d: buf[0] = %d\n", rank, buf[0]);
+        fflush(stderr);
+    }
+
+    free(buf);
+    err = MPI_File_close(&fh);
+    if (err) {
+        errs++;
+        MTestPrintErrorMsg("Close(2)", err);
+    }
+
+    MTest_Finalize(errs);
+    MPI_Finalize();
+    return 0;
+}
diff --git a/test/mpi/io/testlist b/test/mpi/io/testlist
deleted file mode 100644
index 2549091..0000000
--- a/test/mpi/io/testlist
+++ /dev/null
@@ -1,13 +0,0 @@
-rdwrord 4
-rdwrzero 4
-getextent 2
-setinfo 4
-setviewcur 4
-i_noncontig 2
-async 4
-async_any 4
-userioerr 1
-resized 1
-resized2 1 xfail=ticket2088
-bigtype 1
-hindexed_io 1
diff --git a/test/mpi/io/testlist.in b/test/mpi/io/testlist.in
new file mode 100644
index 0000000..7422238
--- /dev/null
+++ b/test/mpi/io/testlist.in
@@ -0,0 +1,17 @@
+rdwrord 4
+rdwrzero 4
+getextent 2
+setinfo 4
+setviewcur 4
+i_noncontig 2
+async 4
+async_any 4
+userioerr 1
+resized 1
+resized2 1 xfail=ticket2088
+bigtype 1
+hindexed_io 1
+@mpix@ i_bigtype 1
+@mpix@ i_hindexed_io 1
+@mpix@ i_rdwrord 4
+@mpix@ i_setviewcur 4

http://git.mpich.org/mpich.git/commitdiff/c26c66278ab28809bafaf6a5f9de4ac7789ba57d

commit c26c66278ab28809bafaf6a5f9de4ac7789ba57d
Author: Sangmin Seo <sseo at anl.gov>
Date:   Fri Nov 7 14:24:37 2014 -0600

    Add nonblocking collective I/O functions.
    
    This patch implemented four functions for nonblocking collective I/O,
    which will be added to MPI 3.1 standard. Details for these functions
    can be found in the MPI-Forum ticket,
    https://svn.mpi-forum.org/trac/mpi-forum-web/ticket/273.
    Currently, they are implemented as MPIX functions.
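    
    The four entry points mirror their blocking counterparts but return an
    MPI_Request instead of taking an MPI_Status (argument order as
    exercised by the tests in the commits above):
    
        int MPIX_File_iread_all(MPI_File fh, void *buf, int count,
                                MPI_Datatype datatype, MPI_Request *request);
        int MPIX_File_iread_at_all(MPI_File fh, MPI_Offset offset, void *buf,
                                   int count, MPI_Datatype datatype,
                                   MPI_Request *request);
        int MPIX_File_iwrite_all(MPI_File fh, void *buf, int count,
                                 MPI_Datatype datatype, MPI_Request *request);
        int MPIX_File_iwrite_at_all(MPI_File fh, MPI_Offset offset, void *buf,
                                    int count, MPI_Datatype datatype,
                                    MPI_Request *request);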
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/binding/fortran/use_mpi_f08/wrappers_c/buildiface b/src/binding/fortran/use_mpi_f08/wrappers_c/buildiface
index 91cbf73..214f35b 100755
--- a/src/binding/fortran/use_mpi_f08/wrappers_c/buildiface
+++ b/src/binding/fortran/use_mpi_f08/wrappers_c/buildiface
@@ -145,7 +145,11 @@ my %bufpos = (
     "MPI_Unpack_external" => [1, -1, -1, 4, 5, 6],
     "MPI_Win_attach" => [1, -1, -1],
     "MPI_Win_create" => [0, -1, -1],
-    "MPI_Win_detach" => [1, -1, -1]
+    "MPI_Win_detach" => [1, -1, -1],
+    "MPIX_File_iread_all" => [1, 2, 3],
+    "MPIX_File_iread_at_all" => [2, 3, 4],
+    "MPIX_File_iwrite_all" => [1, 2, 3],
+    "MPIX_File_iwrite_at_all" => [2, 3, 4]
 );
 
 # Choice buffers in some functions can be passed in MPI_IN_PLACE. We store such
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs.c
index 4be147f..f376150 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs.c
@@ -56,6 +56,8 @@ struct ADIOI_Fns_struct ADIO_GPFS_operations = {
 #elif PEPLATFORM
     "GPFS+PE: IBM GPFS for PE",
 #else
-    "GPFS: IBM GPFS"
+    "GPFS: IBM GPFS",
 #endif
+    ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+    ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
 };
diff --git a/src/mpi/romio/adio/ad_gridftp/ad_gridftp.c b/src/mpi/romio/adio/ad_gridftp/ad_gridftp.c
index f08f112..f3767dc 100644
--- a/src/mpi/romio/adio/ad_gridftp/ad_gridftp.c
+++ b/src/mpi/romio/adio/ad_gridftp/ad_gridftp.c
@@ -34,4 +34,6 @@ struct ADIOI_Fns_struct ADIO_GRIDFTP_operations = {
     ADIOI_GRIDFTP_Resize, /* Resize */
     ADIOI_GRIDFTP_Delete, /* Delete */
     ADIOI_GRIDFTP_Feature, /* Features */
+    ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+    ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
 };
diff --git a/src/mpi/romio/adio/ad_hfs/ad_hfs.c b/src/mpi/romio/adio/ad_hfs/ad_hfs.c
index ad99ff7..5b34354 100644
--- a/src/mpi/romio/adio/ad_hfs/ad_hfs.c
+++ b/src/mpi/romio/adio/ad_hfs/ad_hfs.c
@@ -33,4 +33,6 @@ struct ADIOI_Fns_struct ADIO_HFS_operations = {
     ADIOI_GEN_Flush, /* Flush */
     ADIOI_HFS_Resize, /* Resize */
     ADIOI_GEN_Delete, /* Delete */
+    ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+    ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
 };
diff --git a/src/mpi/romio/adio/ad_lustre/ad_lustre.c b/src/mpi/romio/adio/ad_lustre/ad_lustre.c
index d524dd8..f3bc3d0 100644
--- a/src/mpi/romio/adio/ad_lustre/ad_lustre.c
+++ b/src/mpi/romio/adio/ad_lustre/ad_lustre.c
@@ -41,4 +41,6 @@ struct ADIOI_Fns_struct ADIO_LUSTRE_operations = {
     ADIOI_GEN_Delete, /* Delete */
     ADIOI_GEN_Feature, /* Features */
     "LUSTRE:",
+    ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+    ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
 };
diff --git a/src/mpi/romio/adio/ad_nfs/ad_nfs.c b/src/mpi/romio/adio/ad_nfs/ad_nfs.c
index 725c4d1..763a1b4 100644
--- a/src/mpi/romio/adio/ad_nfs/ad_nfs.c
+++ b/src/mpi/romio/adio/ad_nfs/ad_nfs.c
@@ -37,5 +37,7 @@ struct ADIOI_Fns_struct ADIO_NFS_operations = {
     ADIOI_NFS_Resize, /* Resize */
     ADIOI_GEN_Delete, /* Delete */
     ADIOI_NFS_Feature, /* Features */
-    "NFS:"  /* fsname: just a string */
+    "NFS:",  /* fsname: just a string */
+    ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+    ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
 };
diff --git a/src/mpi/romio/adio/ad_ntfs/ad_ntfs.c b/src/mpi/romio/adio/ad_ntfs/ad_ntfs.c
index 8789fc3..9788274 100644
--- a/src/mpi/romio/adio/ad_ntfs/ad_ntfs.c
+++ b/src/mpi/romio/adio/ad_ntfs/ad_ntfs.c
@@ -34,5 +34,7 @@ struct ADIOI_Fns_struct ADIO_NTFS_operations = {
     ADIOI_NTFS_Flush, /* Flush */
     ADIOI_NTFS_Resize, /* Resize */
     ADIOI_GEN_Delete, /* Delete */
-    ADIOI_NTFS_Feature /* Features */
+    ADIOI_NTFS_Feature, /* Features */
+    ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+    ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
 };
diff --git a/src/mpi/romio/adio/ad_panfs/ad_panfs.c b/src/mpi/romio/adio/ad_panfs/ad_panfs.c
index fd9cd93..fc75f7d 100644
--- a/src/mpi/romio/adio/ad_panfs/ad_panfs.c
+++ b/src/mpi/romio/adio/ad_panfs/ad_panfs.c
@@ -41,5 +41,7 @@ struct ADIOI_Fns_struct ADIO_PANFS_operations = {
     ADIOI_PANFS_Resize, /* Resize */
     ADIOI_GEN_Delete, /* Delete */
     ADIOI_GEN_Feature,
-    "PANFS: Panasas PanFS"
+    "PANFS: Panasas PanFS",
+    ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+    ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
 };
diff --git a/src/mpi/romio/adio/ad_pfs/ad_pfs.c b/src/mpi/romio/adio/ad_pfs/ad_pfs.c
index 62a4305..cc480b0 100644
--- a/src/mpi/romio/adio/ad_pfs/ad_pfs.c
+++ b/src/mpi/romio/adio/ad_pfs/ad_pfs.c
@@ -33,4 +33,6 @@ struct ADIOI_Fns_struct ADIO_PFS_operations = {
     ADIOI_PFS_Flush, /* Flush */
     ADIOI_GEN_Resize, /* Resize */
     ADIOI_GEN_Delete, /* Delete */
+    ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+    ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
 };
diff --git a/src/mpi/romio/adio/ad_piofs/ad_piofs.c b/src/mpi/romio/adio/ad_piofs/ad_piofs.c
index 29d8c30..726bbf1 100644
--- a/src/mpi/romio/adio/ad_piofs/ad_piofs.c
+++ b/src/mpi/romio/adio/ad_piofs/ad_piofs.c
@@ -34,4 +34,6 @@ struct ADIOI_Fns_struct ADIO_PIOFS_operations = {
     ADIOI_GEN_Resize, /* Resize */
     ADIOI_GEN_Delete, /* Delete */
     ADIOI_PIOFS_Feature, 
+    ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+    ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
 };
diff --git a/src/mpi/romio/adio/ad_pvfs/ad_pvfs.c b/src/mpi/romio/adio/ad_pvfs/ad_pvfs.c
index 92b6df6..27a3df8 100644
--- a/src/mpi/romio/adio/ad_pvfs/ad_pvfs.c
+++ b/src/mpi/romio/adio/ad_pvfs/ad_pvfs.c
@@ -34,4 +34,6 @@ struct ADIOI_Fns_struct ADIO_PVFS_operations = {
     ADIOI_PVFS_Resize, /* Resize */
     ADIOI_PVFS_Delete, /* Delete */
     ADIOI_PVFS_Feature, /* Features */
+    ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+    ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
 };
diff --git a/src/mpi/romio/adio/ad_pvfs2/ad_pvfs2.c b/src/mpi/romio/adio/ad_pvfs2/ad_pvfs2.c
index a55c3c8..bdebe19 100644
--- a/src/mpi/romio/adio/ad_pvfs2/ad_pvfs2.c
+++ b/src/mpi/romio/adio/ad_pvfs2/ad_pvfs2.c
@@ -39,7 +39,9 @@ struct ADIOI_Fns_struct ADIO_PVFS2_operations = {
     ADIOI_PVFS2_Resize, /* Resize */
     ADIOI_PVFS2_Delete, /* Delete */
     ADIOI_PVFS2_Feature,
-    "PVFS2: the PVFS v2 or OrangeFS file systems"
+    "PVFS2: the PVFS v2 or OrangeFS file systems",
+    ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+    ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
 };
 
 /* 
diff --git a/src/mpi/romio/adio/ad_sfs/ad_sfs.c b/src/mpi/romio/adio/ad_sfs/ad_sfs.c
index 929dfd9..c4c1609 100644
--- a/src/mpi/romio/adio/ad_sfs/ad_sfs.c
+++ b/src/mpi/romio/adio/ad_sfs/ad_sfs.c
@@ -33,4 +33,6 @@ struct ADIOI_Fns_struct ADIO_SFS_operations = {
     ADIOI_SFS_Flush, /* Flush */
     ADIOI_GEN_Resize, /* Resize */
     ADIOI_GEN_Delete, /* Delete */
+    ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+    ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
 };
diff --git a/src/mpi/romio/adio/ad_testfs/ad_testfs.c b/src/mpi/romio/adio/ad_testfs/ad_testfs.c
index 6823468..6696847 100644
--- a/src/mpi/romio/adio/ad_testfs/ad_testfs.c
+++ b/src/mpi/romio/adio/ad_testfs/ad_testfs.c
@@ -35,5 +35,7 @@ struct ADIOI_Fns_struct ADIO_TESTFS_operations = {
     ADIOI_TESTFS_Resize, /* Resize */
     ADIOI_TESTFS_Delete, /* Delete */
     ADIOI_GEN_Feature, /* Features */
-    "TESTFS: the logging-only file system"
+    "TESTFS: the logging-only file system",
+    ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+    ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
 };
diff --git a/src/mpi/romio/adio/ad_ufs/ad_ufs.c b/src/mpi/romio/adio/ad_ufs/ad_ufs.c
index 66b183e..a7134dc 100644
--- a/src/mpi/romio/adio/ad_ufs/ad_ufs.c
+++ b/src/mpi/romio/adio/ad_ufs/ad_ufs.c
@@ -41,4 +41,6 @@ struct ADIOI_Fns_struct ADIO_UFS_operations = {
     ADIOI_GEN_Delete, /* Delete */
     ADIOI_GEN_Feature, /* Features */
     "UFS: Generic ROMIO driver for all UNIX-like file systems",
+    ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+    ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
 };
diff --git a/src/mpi/romio/adio/ad_xfs/ad_xfs.c b/src/mpi/romio/adio/ad_xfs/ad_xfs.c
index b748a8a..f43e0e8 100644
--- a/src/mpi/romio/adio/ad_xfs/ad_xfs.c
+++ b/src/mpi/romio/adio/ad_xfs/ad_xfs.c
@@ -40,5 +40,7 @@ struct ADIOI_Fns_struct ADIO_XFS_operations = {
     ADIOI_XFS_Resize, /* Resize */
     ADIOI_GEN_Delete, /* Delete */
     ADIOI_GEN_Feature, /* Features */
-    "XFS: SGI XFS"
+    "XFS: SGI XFS",
+    ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+    ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
 };
diff --git a/src/mpi/romio/adio/ad_zoidfs/ad_zoidfs.c b/src/mpi/romio/adio/ad_zoidfs/ad_zoidfs.c
index a0eadfb..3c9131a 100644
--- a/src/mpi/romio/adio/ad_zoidfs/ad_zoidfs.c
+++ b/src/mpi/romio/adio/ad_zoidfs/ad_zoidfs.c
@@ -35,6 +35,8 @@ struct ADIOI_Fns_struct ADIO_ZOIDFS_operations = {
     ADIOI_ZOIDFS_Resize, /* Resize */
     ADIOI_ZOIDFS_Delete, /* Delete */
     ADIOI_ZOIDFS_Feature,
+    ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+    ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
 };
 
 /* 
diff --git a/src/mpi/romio/adio/common/Makefile.mk b/src/mpi/romio/adio/common/Makefile.mk
index 6cfd118..c2c379c 100644
--- a/src/mpi/romio/adio/common/Makefile.mk
+++ b/src/mpi/romio/adio/common/Makefile.mk
@@ -26,8 +26,10 @@ romio_other_sources +=                  \
     adio/common/ad_io_coll.c            \
     adio/common/ad_iopen.c              \
     adio/common/ad_iread.c              \
+    adio/common/ad_iread_coll.c         \
     adio/common/ad_iread_fake.c         \
     adio/common/ad_iwrite.c             \
+    adio/common/ad_iwrite_coll.c        \
     adio/common/ad_iwrite_fake.c        \
     adio/common/ad_open.c               \
     adio/common/ad_opencoll.c           \
diff --git a/src/mpi/romio/adio/common/ad_aggregate.c b/src/mpi/romio/adio/common/ad_aggregate.c
index 55b5544..77c69bd 100644
--- a/src/mpi/romio/adio/common/ad_aggregate.c
+++ b/src/mpi/romio/adio/common/ad_aggregate.c
@@ -513,3 +513,141 @@ void ADIOI_Calc_others_req(ADIO_File fd, int count_my_req_procs,
     MPE_Log_event (5027, 0, NULL);
 #endif
 }
+
+
+/* Nonblocking version of ADIOI_Calc_others_req().
+   It consists of three functions - ADIOI_Icalc_others_req(),
+   ADIOI_Icalc_others_req_main(), and ADIOI_Icalc_others_req_fini(). */
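+/* The three-way split follows the progress-engine style of the nonblocking
+   collectives: ADIOI_Icalc_others_req() posts an MPI_Ialltoall and records
+   the next state; _main() runs once that exchange completes and posts the
+   point-to-point sends/receives of offsets and lengths; _fini() frees the
+   temporaries and chains to the caller-supplied next_fn. */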
+void ADIOI_Icalc_others_req(ADIOI_NBC_Request *nbc_req, int *error_code)
+{
+    ADIOI_Icalc_others_req_vars *vars = nbc_req->cor_vars;
+
+    /* count_others_req_per_proc[i] indicates how many separate contiguous
+       requests of proc. i lie in this process's file domain. */
+
+    /* first find out how much to send/recv and from/to whom */
+#ifdef AGGREGATION_PROFILE
+    MPE_Log_event(5026, 0, NULL);
+#endif
+    vars->count_others_req_per_proc =
+        (int *)ADIOI_Malloc(vars->nprocs * sizeof(int));
+
+    *error_code = MPI_Ialltoall(vars->count_my_req_per_proc, 1, MPI_INT,
+            vars->count_others_req_per_proc, 1, MPI_INT, vars->fd->comm,
+            &vars->req1);
+
+    if (nbc_req->rdwr == ADIOI_READ) {
+        nbc_req->data.rd.state = ADIOI_IRC_STATE_ICALC_OTHERS_REQ;
+    } else {
+        ADIOI_Assert(nbc_req->rdwr == ADIOI_WRITE);
+        nbc_req->data.wr.state = ADIOI_IWC_STATE_ICALC_OTHERS_REQ;
+    }
+}
+
+void ADIOI_Icalc_others_req_main(ADIOI_NBC_Request *nbc_req, int *error_code)
+{
+    ADIOI_Icalc_others_req_vars *vars = nbc_req->cor_vars;
+    ADIO_File fd = vars->fd;
+    int count_my_req_procs = vars->count_my_req_procs;
+    ADIOI_Access *my_req = vars->my_req;
+    int nprocs = vars->nprocs;
+    int myrank = vars->myrank;
+    ADIOI_Access **others_req_ptr = vars->others_req_ptr;
+
+    /* determine what requests of other processes lie in this process's
+       file domain */
+
+    /* count_others_req_procs = number of processes whose requests lie in
+       this process's file domain (including this process itself)
+       count_others_req_per_proc[i] indicates how many separate contiguous
+       requests of proc. i lie in this process's file domain. */
+
+    int *count_others_req_per_proc = vars->count_others_req_per_proc;
+    int count_others_req_procs;
+    int i, j;
+    ADIOI_Access *others_req;
+
+    *others_req_ptr = (ADIOI_Access *)ADIOI_Malloc(nprocs*sizeof(ADIOI_Access));
+    others_req = *others_req_ptr;
+
+    count_others_req_procs = 0;
+    for (i = 0; i < nprocs; i++) {
+        if (count_others_req_per_proc[i]) {
+            others_req[i].count = count_others_req_per_proc[i];
+            others_req[i].offsets = (ADIO_Offset *)
+                ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(ADIO_Offset));
+            others_req[i].lens =
+                ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(ADIO_Offset));
+            others_req[i].mem_ptrs = (MPI_Aint *)
+                ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(MPI_Aint));
+            count_others_req_procs++;
+        }
+        else others_req[i].count = 0;
+    }
+    vars->count_others_req_procs = count_others_req_procs;
+
+    /* now send the calculated offsets and lengths to respective processes */
+
+    vars->req2 = (MPI_Request *)
+        ADIOI_Malloc(1+2*(count_my_req_procs+count_others_req_procs)
+                     *sizeof(MPI_Request));
+    /* +1 to avoid a 0-size malloc */
+
+    j = 0;
+    for (i = 0; i < nprocs; i++) {
+        if (others_req[i].count) {
+            MPI_Irecv(others_req[i].offsets, others_req[i].count,
+                    ADIO_OFFSET, i, i+myrank, fd->comm, &vars->req2[j]);
+            j++;
+            MPI_Irecv(others_req[i].lens, others_req[i].count,
+                    ADIO_OFFSET, i, i+myrank+1, fd->comm, &vars->req2[j]);
+            j++;
+        }
+    }
+
+    for (i=0; i < nprocs; i++) {
+        if (my_req[i].count) {
+            MPI_Isend(my_req[i].offsets, my_req[i].count,
+                    ADIO_OFFSET, i, i+myrank, fd->comm, &vars->req2[j]);
+            j++;
+            MPI_Isend(my_req[i].lens, my_req[i].count,
+                    ADIO_OFFSET, i, i+myrank+1, fd->comm, &vars->req2[j]);
+            j++;
+        }
+    }
+
+    /* keep the number of requests */
+    vars->num_req2 = j;
+
+    if (nbc_req->rdwr == ADIOI_READ) {
+        nbc_req->data.rd.state = ADIOI_IRC_STATE_ICALC_OTHERS_REQ_MAIN;
+    } else {
+        ADIOI_Assert(nbc_req->rdwr == ADIOI_WRITE);
+        nbc_req->data.wr.state = ADIOI_IWC_STATE_ICALC_OTHERS_REQ_MAIN;
+    }
+}
+
+void ADIOI_Icalc_others_req_fini(ADIOI_NBC_Request *nbc_req, int *error_code)
+{
+    ADIOI_Icalc_others_req_vars *vars = nbc_req->cor_vars;
+    void (*next_fn)(ADIOI_NBC_Request *, int *);
+
+    ADIOI_Free(vars->req2);
+    ADIOI_Free(vars->count_others_req_per_proc);
+
+    *vars->count_others_req_procs_ptr = vars->count_others_req_procs;
+#ifdef AGGREGATION_PROFILE
+    MPE_Log_event(5027, 0, NULL);
+#endif
+    /* end of the calculation */
+
+    next_fn = vars->next_fn;
+
+    /* free the struct for parameters and variables */
+    ADIOI_Free(vars);
+    nbc_req->cor_vars = NULL;
+
+    /* move to the next function */
+    next_fn(nbc_req, error_code);
+}
+
diff --git a/src/mpi/romio/adio/common/ad_iread_coll.c b/src/mpi/romio/adio/common/ad_iread_coll.c
new file mode 100644
index 0000000..8a0b7a1
--- /dev/null
+++ b/src/mpi/romio/adio/common/ad_iread_coll.c
@@ -0,0 +1,1311 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "adio.h"
+#include "adio_extern.h"
+#include "mpiu_greq.h"
+
+#ifdef USE_DBG_LOGGING
+  #define RDCOLL_DEBUG 1
+#endif
+#ifdef AGGREGATION_PROFILE
+#include "mpe.h"
+#endif
+
+/* ADIOI_GEN_IreadStridedColl */
+struct ADIOI_GEN_IreadStridedColl_vars {
+    /* requests */
+    MPI_Request req_offset[2];  /* ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL */
+    MPI_Request req_ind_io;     /* ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL_INDIO */
+
+    /* parameters */
+    ADIO_File fd;
+    void *buf;
+    int count;
+    MPI_Datatype datatype;
+    int file_ptr_type;
+    ADIO_Offset offset;
+
+    /* stack variables */
+    ADIOI_Access *my_req;
+    /* array of nprocs structures, one for each other process in
+       whose file domain this process's request lies */
+
+    ADIOI_Access *others_req;
+    /* array of nprocs structures, one for each other process
+       whose request lies in this process's file domain. */
+
+    int nprocs;
+    int nprocs_for_coll;
+    int myrank;
+    int contig_access_count;
+    int interleave_count;
+    int buftype_is_contig;
+    int *count_my_req_per_proc;
+    int count_my_req_procs;
+    int count_others_req_procs;
+    ADIO_Offset orig_fp;
+    ADIO_Offset fd_size;
+    ADIO_Offset min_st_offset;
+    ADIO_Offset *offset_list;
+    ADIO_Offset *st_offsets;
+    ADIO_Offset *fd_start;
+    ADIO_Offset *fd_end;
+    ADIO_Offset *end_offsets;
+    ADIO_Offset *len_list;
+    int *buf_idx;
+};
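+/* Because control returns to the caller between phases, what would be
+   stack variables in the blocking ADIOI_GEN_ReadStridedColl() must live
+   in heap-allocated *_vars structures for the whole operation. */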
+
+/* ADIOI_Iread_and_exch */
+struct ADIOI_Iread_and_exch_vars {
+    /* requests */
+    MPI_Request req1;   /* ADIOI_IRC_STATE_IREAD_AND_EXCH */
+    MPI_Request req2;   /* ADIOI_IRC_STATE_IREAD_AND_EXCH_L1_BEGIN */
+
+    /* parameters */
+    ADIO_File fd;
+    void *buf;
+    MPI_Datatype datatype;
+    int nprocs;
+    int myrank;
+    ADIOI_Access *others_req;
+    ADIO_Offset *offset_list;
+    ADIO_Offset *len_list;
+    int contig_access_count;
+    ADIO_Offset min_st_offset;
+    ADIO_Offset fd_size;
+    ADIO_Offset *fd_start;
+    ADIO_Offset *fd_end;
+    int *buf_idx;
+
+    /* stack variables */
+    int m;
+    int ntimes;
+    int max_ntimes;
+    int buftype_is_contig;
+    ADIO_Offset st_loc;
+    ADIO_Offset end_loc;
+    ADIO_Offset off;
+    ADIO_Offset done;
+    char *read_buf;
+    int *curr_offlen_ptr;
+    int *count;
+    int *send_size;
+    int *recv_size;
+    int *partial_send;
+    int *recd_from_proc;
+    int *start_pos;
+    /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets*/
+    ADIO_Offset size;
+    ADIO_Offset real_size;
+    ADIO_Offset for_curr_iter;
+    ADIO_Offset for_next_iter;
+    ADIOI_Flatlist_node *flat_buf;
+    MPI_Aint buftype_extent;
+    int coll_bufsize;
+
+    /* next function to be called */
+    void (*next_fn)(ADIOI_NBC_Request *, int *);
+};
+
+/* ADIOI_R_Iexchange_data */
+struct ADIOI_R_Iexchange_data_vars {
+    /* requests */
+    MPI_Request req1;   /* ADIOI_IRC_STATE_R_IEXCHANGE_DATA */
+    MPI_Request *req2;  /* ADIOI_IRC_STATE_R_IEXCHANGE_DATA_RECV & FILL */
+
+    /* parameters */
+    ADIO_File fd;
+    void *buf;
+    ADIOI_Flatlist_node *flat_buf;
+    ADIO_Offset *offset_list;
+    ADIO_Offset *len_list;
+    int *send_size;
+    int *recv_size;
+    int *count;
+    int *start_pos;
+    int *partial_send;
+    int *recd_from_proc;
+    int nprocs;
+    int myrank;
+    int buftype_is_contig;
+    int contig_access_count;
+    ADIO_Offset min_st_offset;
+    ADIO_Offset fd_size;
+    ADIO_Offset *fd_start;
+    ADIO_Offset *fd_end;
+    ADIOI_Access *others_req;
+    int iter;
+    MPI_Aint buftype_extent;
+    int *buf_idx;
+
+    /* stack variables */
+    int nprocs_recv;
+    int nprocs_send;
+    char **recv_buf;
+
+    /* next function to be called */
+    void (*next_fn)(ADIOI_NBC_Request *, int *);
+};
+
+
+void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
+                   *flat_buf, char **recv_buf, ADIO_Offset
+                   *offset_list, ADIO_Offset *len_list,
+                   unsigned *recv_size,
+                   MPI_Request *requests, MPI_Status *statuses,
+                   int *recd_from_proc, int nprocs,
+                   int contig_access_count,
+                   ADIO_Offset min_st_offset,
+                   ADIO_Offset fd_size, ADIO_Offset *fd_start,
+                   ADIO_Offset *fd_end,
+                   MPI_Aint buftype_extent);
+
+/* prototypes of functions used for nonblocking collective reads only. */
+static void ADIOI_GEN_IreadStridedColl_inter(ADIOI_NBC_Request *, int *);
+static void ADIOI_GEN_IreadStridedColl_indio(ADIOI_NBC_Request *, int *);
+static void ADIOI_GEN_IreadStridedColl_read(ADIOI_NBC_Request *, int *);
+static void ADIOI_GEN_IreadStridedColl_free(ADIOI_NBC_Request *, int *);
+static void ADIOI_GEN_IreadStridedColl_fini(ADIOI_NBC_Request *, int *);
+
+static void ADIOI_Iread_and_exch(ADIOI_NBC_Request *, int *);
+static void ADIOI_Iread_and_exch_l1_begin(ADIOI_NBC_Request *, int *);
+static void ADIOI_Iread_and_exch_l1_end(ADIOI_NBC_Request *, int *);
+static void ADIOI_Iread_and_exch_reset(ADIOI_NBC_Request *, int *);
+static void ADIOI_Iread_and_exch_l2_begin(ADIOI_NBC_Request *, int *);
+static void ADIOI_Iread_and_exch_l2_end(ADIOI_NBC_Request *, int *);
+static void ADIOI_Iread_and_exch_fini(ADIOI_NBC_Request *, int *);
+
+static void ADIOI_R_Iexchange_data(ADIOI_NBC_Request *, int *);
+static void ADIOI_R_Iexchange_data_recv(ADIOI_NBC_Request *, int *);
+static void ADIOI_R_Iexchange_data_fill(ADIOI_NBC_Request *, int *);
+static void ADIOI_R_Iexchange_data_fini(ADIOI_NBC_Request *, int *);
+
+static MPIX_Grequest_class ADIOI_GEN_greq_class = 0;
+static int ADIOI_GEN_irc_query_fn(void *extra_state, MPI_Status *status);
+static int ADIOI_GEN_irc_free_fn(void *extra_state);
+static int ADIOI_GEN_irc_poll_fn(void *extra_state, MPI_Status *status);
+static int ADIOI_GEN_irc_wait_fn(int count, void **array_of_states,
+                                 double timeout, MPI_Status *status);
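+/* The collective as a whole is exposed to the caller as an extended
+   generalized request; the poll and wait callbacks registered below are
+   what drive the state machine formed by the functions declared above. */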
+
+
+/* Nonblocking version of ADIOI_GEN_ReadStridedColl() */
+void ADIOI_GEN_IreadStridedColl(ADIO_File fd, void *buf, int count,
+                   MPI_Datatype datatype, int file_ptr_type,
+                   ADIO_Offset offset, MPI_Request *request,
+                   int *error_code)
+{
+    /* Uses a generalized version of the extended two-phase method described
+       in "An Extended Two-Phase Method for Accessing Sections of
+       Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
+       Scientific Programming, (5)4:301--317, Winter 1996.
+       http://www.mcs.anl.gov/home/thakur/ext2ph.ps */
+
+    ADIOI_NBC_Request *nbc_req = NULL;
+    ADIOI_GEN_IreadStridedColl_vars *vars = NULL;
+    int nprocs, myrank;
+#ifdef RDCOLL_DEBUG
+    int i;
+#endif
+    ADIO_Offset start_offset, end_offset;
+
+    /* FIXME: need an implementation of ADIOI_IOIstridedColl
+    if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
+        ADIOI_IOIstridedColl(fd, buf, count, ADIOI_READ, datatype,
+                             file_ptr_type, offset, request, error_code);
+        return;
+    }
+    */
+
+    /* top-level struct keeping the status of function progress */
+    nbc_req = (ADIOI_NBC_Request *)ADIOI_Calloc(1, sizeof(ADIOI_NBC_Request));
+    nbc_req->rdwr = ADIOI_READ;
+
+    /* create a generalized request */
+    if (ADIOI_GEN_greq_class == 0) {
+        MPIX_Grequest_class_create(ADIOI_GEN_irc_query_fn,
+                ADIOI_GEN_irc_free_fn, MPIU_Greq_cancel_fn,
+                ADIOI_GEN_irc_poll_fn, ADIOI_GEN_irc_wait_fn,
+                &ADIOI_GEN_greq_class);
+    }
+    MPIX_Grequest_class_allocate(ADIOI_GEN_greq_class, nbc_req, request);
+    memcpy(&nbc_req->req, request, sizeof(MPI_Request));
+
+    /* create a struct for parameters and variables */
+    vars = (ADIOI_GEN_IreadStridedColl_vars *)ADIOI_Calloc(
+            1, sizeof(ADIOI_GEN_IreadStridedColl_vars));
+    nbc_req->data.rd.rsc_vars = vars;
+
+    /* save the parameters */
+    vars->fd = fd;
+    vars->buf = buf;
+    vars->count = count;
+    vars->datatype = datatype;
+    vars->file_ptr_type = file_ptr_type;
+    vars->offset = offset;
+
+    MPI_Comm_size(fd->comm, &nprocs);
+    MPI_Comm_rank(fd->comm, &myrank);
+    vars->nprocs = nprocs;
+    vars->myrank = myrank;
+
+    /* number of aggregators, cb_nodes, is stored in the hints */
+    vars->nprocs_for_coll = fd->hints->cb_nodes;
+    vars->orig_fp = fd->fp_ind;
+
+    /* only check for interleaving if cb_read isn't disabled */
+    if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
+        /* For this process's request, calculate the list of offsets and
+           lengths in the file and determine the start and end offsets. */
+
+        /* Note: end_offset points to the last byte-offset that will be accessed.
+           e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/
+
+        ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
+                              &vars->offset_list, &vars->len_list,
+                              &start_offset, &end_offset,
+                              &vars->contig_access_count);
+
+#ifdef RDCOLL_DEBUG
+        for (i = 0; i < vars->contig_access_count; i++) {
+            DBG_FPRINTF(stderr, "rank %d  off %lld  len %lld\n",
+                        myrank, vars->offset_list[i], vars->len_list[i]);
+        }
+#endif
+
+        /* each process communicates its start and end offsets to other
+           processes. The result is an array each of start and end offsets
+           stored in order of process rank. */
+
+        vars->st_offsets = (ADIO_Offset *)ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
+        vars->end_offsets = (ADIO_Offset *)ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
+
+        *error_code = MPI_Iallgather(&start_offset, 1, ADIO_OFFSET,
+                                     vars->st_offsets, 1, ADIO_OFFSET,
+                                     fd->comm, &vars->req_offset[0]);
+        if (*error_code != MPI_SUCCESS) return;
+        *error_code = MPI_Iallgather(&end_offset, 1, ADIO_OFFSET,
+                                     vars->end_offsets, 1, ADIO_OFFSET,
+                                     fd->comm, &vars->req_offset[1]);
+
+        nbc_req->data.rd.state = ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL;
+        return;
+    }
+
+    ADIOI_GEN_IreadStridedColl_indio(nbc_req, error_code);
+}
+
+static void ADIOI_GEN_IreadStridedColl_inter(ADIOI_NBC_Request *nbc_req,
+                                             int *error_code)
+{
+    ADIOI_GEN_IreadStridedColl_vars *vars = nbc_req->data.rd.rsc_vars;
+    int nprocs = vars->nprocs;
+    ADIO_Offset *st_offsets = vars->st_offsets;
+    ADIO_Offset *end_offsets = vars->end_offsets;
+    int i, interleave_count = 0;
+
+    /* are the accesses of different processes interleaved? */
+    for (i = 1; i < nprocs; i++)
+        if ((st_offsets[i] < end_offsets[i-1]) &&
+            (st_offsets[i] <= end_offsets[i]))
+            interleave_count++;
+    /* This is a rudimentary check for interleaving, but should suffice
+       for the moment. */
+
+    vars->interleave_count = interleave_count;
+
+    ADIOI_GEN_IreadStridedColl_indio(nbc_req, error_code);
+}
+
+static void ADIOI_GEN_IreadStridedColl_indio(ADIOI_NBC_Request *nbc_req,
+                                             int *error_code)
+{
+    ADIOI_GEN_IreadStridedColl_vars *vars = nbc_req->data.rd.rsc_vars;
+    ADIOI_Icalc_others_req_vars *cor_vars = NULL;
+    ADIO_File fd = vars->fd;
+    void *buf;
+    int count, file_ptr_type;
+    MPI_Datatype datatype = vars->datatype;
+    ADIO_Offset offset;
+    int filetype_is_contig;
+    ADIO_Offset off;
+    int nprocs;
+
+    ADIOI_Datatype_iscontig(datatype, &vars->buftype_is_contig);
+
+    if (fd->hints->cb_read == ADIOI_HINT_DISABLE
+    || (!vars->interleave_count && (fd->hints->cb_read == ADIOI_HINT_AUTO)))
+    {
+        buf = vars->buf;
+        count = vars->count;
+        file_ptr_type = vars->file_ptr_type;
+        offset = vars->offset;
+
+        /* don't do aggregation */
+        if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
+            ADIOI_Free(vars->offset_list);
+            ADIOI_Free(vars->len_list);
+            ADIOI_Free(vars->st_offsets);
+            ADIOI_Free(vars->end_offsets);
+        }
+
+        fd->fp_ind = vars->orig_fp;
+        ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
+
+        if (vars->buftype_is_contig && filetype_is_contig) {
+            if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
+                off = fd->disp + (fd->etype_size) * offset;
+                ADIO_IreadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
+                                 off, &vars->req_ind_io, error_code);
+            }
+            else ADIO_IreadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
+                                  0, &vars->req_ind_io, error_code);
+        }
+        else {
+            ADIO_IreadStrided(fd, buf, count, datatype, file_ptr_type,
+                              offset, &vars->req_ind_io, error_code);
+        }
+
+        nbc_req->data.rd.state = ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL_INDIO;
+        return;
+    }
+
+    nprocs = vars->nprocs;
+
+    /* We're going to perform aggregation of I/O.  Here we call
+     * ADIOI_Calc_file_domains() to determine what processes will handle I/O
+     * to what regions.  We pass nprocs_for_coll into this function; it is
+     * used to determine how many processes will perform I/O, which is also
+     * the number of regions into which the range of bytes must be divided.
+     * These regions are called "file domains", or FDs.
+     *
+     * When this function returns, fd_start, fd_end, fd_size, and
+     * min_st_offset will be filled in.  fd_start holds the starting byte
+     * location for each file domain.  fd_end holds the ending byte location.
+     * min_st_offset holds the minimum byte location that will be accessed.
+     *
+     * Both fd_start[] and fd_end[] are indexed by an aggregator number; this
+     * needs to be mapped to an actual rank in the communicator later.
+     *
+     */
+    ADIOI_Calc_file_domains(vars->st_offsets, vars->end_offsets, nprocs,
+                vars->nprocs_for_coll, &vars->min_st_offset,
+                &vars->fd_start, &vars->fd_end,
+                fd->hints->min_fdomain_size, &vars->fd_size,
+                fd->hints->striping_unit);
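+
+    /* Worked example: with two aggregators and accesses spanning bytes
+     * 0..1023 overall, an even split gives min_st_offset = 0,
+     * fd_size = 512, fd_start = {0, 512}, and fd_end = {511, 1023}.
+     * The actual computation may further align the domains to
+     * fd->hints->striping_unit and enforce min_fdomain_size. */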
+
+    /* calculate where the portions of the access requests of this process
+     * are located in terms of the file domains.  this could be on the same
+     * process or on other processes.  this function fills in:
+     * count_my_req_procs - number of processes (including this one) for which
+     *     this process has requests in their file domain
+     * count_my_req_per_proc - count of requests for each process, indexed
+     *     by rank of the process
+     * my_req[] - array of data structures describing the requests to be
+     *     performed by each process (including self).  indexed by rank.
+     * buf_idx[] - array of locations into which data can be directly moved;
+     *     this is only valid for contiguous buffer case
+     */
+    ADIOI_Calc_my_req(fd, vars->offset_list, vars->len_list,
+              vars->contig_access_count, vars->min_st_offset,
+              vars->fd_start, vars->fd_end, vars->fd_size,
+              nprocs, &vars->count_my_req_procs,
+              &vars->count_my_req_per_proc, &vars->my_req,
+              &vars->buf_idx);
+
+    /* perform a collective communication in order to distribute the
+     * data calculated above.  fills in the following:
+     * count_others_req_procs - number of processes (including this
+     *     one) which have requests in this process's file domain.
+     * count_others_req_per_proc[] - number of separate contiguous
+     *     requests from proc i that lie in this process's file domain.
+     */
+
+    cor_vars = (ADIOI_Icalc_others_req_vars *)ADIOI_Calloc(
+            1, sizeof(ADIOI_Icalc_others_req_vars));
+    nbc_req->cor_vars = cor_vars;
+    cor_vars->fd = vars->fd;
+    cor_vars->count_my_req_procs = vars->count_my_req_procs;
+    cor_vars->count_my_req_per_proc = vars->count_my_req_per_proc;
+    cor_vars->my_req = vars->my_req;
+    cor_vars->nprocs = vars->nprocs;
+    cor_vars->myrank = vars->myrank;
+    cor_vars->count_others_req_procs_ptr = &vars->count_others_req_procs;
+    cor_vars->others_req_ptr = &vars->others_req;
+    cor_vars->next_fn = ADIOI_GEN_IreadStridedColl_read;
+
+    ADIOI_Icalc_others_req(nbc_req, error_code);
+}
+
+static void ADIOI_GEN_IreadStridedColl_read(ADIOI_NBC_Request *nbc_req,
+                                            int *error_code)
+{
+    ADIOI_GEN_IreadStridedColl_vars *vars = nbc_req->data.rd.rsc_vars;
+    ADIOI_Iread_and_exch_vars *rae_vars = NULL;
+    ADIOI_Access *my_req = vars->my_req;
+    int nprocs = vars->nprocs;
+    int i;
+
+    /* my_req[] and count_my_req_per_proc aren't needed at this point, so
+     * let's free the memory
+     */
+    ADIOI_Free(vars->count_my_req_per_proc);
+    for (i = 0; i < nprocs; i++) {
+        if (my_req[i].count) {
+            ADIOI_Free(my_req[i].offsets);
+            ADIOI_Free(my_req[i].lens);
+        }
+    }
+    ADIOI_Free(my_req);
+
+    /* read data in sizes of no more than coll_bufsize,
+     * communicate, and fill user buf.
+     */
+    rae_vars = (ADIOI_Iread_and_exch_vars *)ADIOI_Calloc(
+            1, sizeof(ADIOI_Iread_and_exch_vars));
+    nbc_req->data.rd.rae_vars = rae_vars;
+    rae_vars->fd = vars->fd;
+    rae_vars->buf = vars->buf;
+    rae_vars->datatype = vars->datatype;
+    rae_vars->nprocs = vars->nprocs;
+    rae_vars->myrank = vars->myrank;
+    rae_vars->others_req = vars->others_req;
+    rae_vars->offset_list = vars->offset_list;
+    rae_vars->len_list = vars->len_list;
+    rae_vars->contig_access_count = vars->contig_access_count;
+    rae_vars->min_st_offset = vars->min_st_offset;
+    rae_vars->fd_size = vars->fd_size;
+    rae_vars->fd_start = vars->fd_start;
+    rae_vars->fd_end = vars->fd_end;
+    rae_vars->buf_idx = vars->buf_idx;
+    rae_vars->next_fn = ADIOI_GEN_IreadStridedColl_free;
+
+    ADIOI_Iread_and_exch(nbc_req, error_code);
+}
+
+static void ADIOI_GEN_IreadStridedColl_free(ADIOI_NBC_Request *nbc_req,
+                                            int *error_code)
+{
+    ADIOI_GEN_IreadStridedColl_vars *vars = nbc_req->data.rd.rsc_vars;
+    ADIO_File fd = vars->fd;
+    MPI_Datatype datatype = vars->datatype;
+    ADIOI_Access *others_req = vars->others_req;
+    int nprocs = vars->nprocs;
+    int i;
+
+    if (!vars->buftype_is_contig) ADIOI_Delete_flattened(datatype);
+
+    /* free all memory allocated for collective I/O */
+    for (i = 0; i < nprocs; i++) {
+        if (others_req[i].count) {
+            ADIOI_Free(others_req[i].offsets);
+            ADIOI_Free(others_req[i].lens);
+            ADIOI_Free(others_req[i].mem_ptrs);
+        }
+    }
+    ADIOI_Free(others_req);
+
+    ADIOI_Free(vars->buf_idx);
+    ADIOI_Free(vars->offset_list);
+    ADIOI_Free(vars->len_list);
+    ADIOI_Free(vars->st_offsets);
+    ADIOI_Free(vars->end_offsets);
+    ADIOI_Free(vars->fd_start);
+    ADIOI_Free(vars->fd_end);
+
+    fd->fp_sys_posn = -1;   /* mark it as invalid. */
+
+    ADIOI_GEN_IreadStridedColl_fini(nbc_req, error_code);
+}
+
+static void ADIOI_GEN_IreadStridedColl_fini(ADIOI_NBC_Request *nbc_req,
+                                            int *error_code)
+{
+    ADIOI_GEN_IreadStridedColl_vars *vars = nbc_req->data.rd.rsc_vars;
+    MPI_Count size;
+
+    /* This is a temporary way of filling in status. The right way is to
+       keep track of how much data was actually read and placed in buf
+       during collective I/O. */
+    MPI_Type_size_x(vars->datatype, &size);
+    nbc_req->nbytes = size * vars->count;
+
+    /* free the struct for parameters and variables */
+    if (nbc_req->data.rd.rsc_vars) {
+        ADIOI_Free(nbc_req->data.rd.rsc_vars);
+        nbc_req->data.rd.rsc_vars = NULL;
+    }
+
+    /* make the request complete */
+    *error_code = MPI_Grequest_complete(nbc_req->req);
+    nbc_req->data.rd.state = ADIOI_IRC_STATE_COMPLETE;
+}
+
+
+static void ADIOI_Iread_and_exch(ADIOI_NBC_Request *nbc_req, int *error_code)
+{
+    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
+    ADIO_File fd = vars->fd;
+    MPI_Datatype datatype = vars->datatype;
+    int nprocs = vars->nprocs;
+    ADIOI_Access *others_req = vars->others_req;
+
+    /* Read in sizes of no more than coll_bufsize, an info parameter.
+       Send data to appropriate processes.
+       Place recd. data in user buf.
+       The idea is to reduce the amount of extra memory required for
+       collective I/O. If all data were read all at once, which is much
+       easier, it would require temp space more than the size of user_buf,
+       which is often unacceptable. For example, to read a distributed
+       array from a file, where each local array is 8Mbytes, requiring
+       at least another 8Mbytes of temp space is unacceptable. */
+
+    int i, j;
+    ADIO_Offset st_loc = -1, end_loc = -1;
+    ADIOI_Flatlist_node *flat_buf = NULL;
+    int coll_bufsize;
+
+    *error_code = MPI_SUCCESS;  /* changed below if error */
+    /* only I/O errors are currently reported */
+
+    /* calculate the number of reads of size coll_bufsize
+       to be done by each process and the max among all processes.
+       That gives the no. of communication phases as well.
+       coll_bufsize is obtained from the hints object. */
+
+    coll_bufsize = fd->hints->cb_buffer_size;
+    vars->coll_bufsize = coll_bufsize;
+
+    /* grab some initial values for st_loc and end_loc */
+    for (i = 0; i < nprocs; i++) {
+        if (others_req[i].count) {
+            st_loc = others_req[i].offsets[0];
+            end_loc = others_req[i].offsets[0];
+            break;
+        }
+    }
+
+    /* now find the real values */
+    for (i = 0; i < nprocs; i++)
+        for (j = 0; j < others_req[i].count; j++) {
+            st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
+            end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j]
+                          + others_req[i].lens[j] - 1));
+        }
+
+    vars->st_loc = st_loc;
+    vars->end_loc = end_loc;
+
+    /* calculate ntimes, the number of times this process must perform I/O
+     * operations in order to complete all the requests it has received.
+     * the need for multiple I/O operations comes from the restriction that
+     * we only use coll_bufsize bytes of memory for internal buffering.
+     */
+    if ((st_loc == -1) && (end_loc == -1)) {
+        /* this process does no I/O. */
+        vars->ntimes = 0;
+    }
+    else {
+        /* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/
+        vars->ntimes = (int)((end_loc - st_loc + coll_bufsize) / coll_bufsize);
+    }
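+    /* For example, st_loc == 0 and end_loc == 10485759 (10 MB covered)
+       with a 4 MB coll_bufsize give ntimes = 3: two full buffer-loads
+       and one half-full one. */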
+
+    *error_code = MPI_Iallreduce(&vars->ntimes, &vars->max_ntimes, 1, MPI_INT,
+                                 MPI_MAX, fd->comm, &vars->req1);
+
+    vars->read_buf = fd->io_buf;  /* Allocated at open time */
+
+    vars->curr_offlen_ptr = (int *)ADIOI_Calloc(nprocs, sizeof(int));
+    /* its use is explained below. calloc initializes to 0. */
+
+    vars->count = (int *)ADIOI_Malloc(nprocs * sizeof(int));
+    /* to store count of how many off-len pairs per proc are satisfied
+       in an iteration. */
+
+    vars->partial_send = (int *)ADIOI_Calloc(nprocs, sizeof(int));
+    /* if only a portion of the last off-len pair is sent to a process
+       in a particular iteration, the length sent is stored here.
+       calloc initializes to 0. */
+
+    vars->send_size = (int *)ADIOI_Malloc(nprocs * sizeof(int));
+    /* total size of data to be sent to each proc. in an iteration */
+
+    vars->recv_size = (int *)ADIOI_Malloc(nprocs * sizeof(int));
+    /* total size of data to be recd. from each proc. in an iteration.
+       Of size nprocs so that I can use MPI_Alltoall later. */
+
+    vars->recd_from_proc = (int *)ADIOI_Calloc(nprocs, sizeof(int));
+    /* amount of data recd. so far from each proc. Used in
+       ADIOI_Fill_user_buffer. initialized to 0 here. */
+
+    vars->start_pos = (int *)ADIOI_Malloc(nprocs*sizeof(int));
+    /* used to store the starting value of curr_offlen_ptr[i] in
+       this iteration */
+
+    ADIOI_Datatype_iscontig(datatype, &vars->buftype_is_contig);
+    if (!vars->buftype_is_contig) {
+        ADIOI_Flatten_datatype(datatype);
+        flat_buf = ADIOI_Flatlist;
+        while (flat_buf->type != datatype) flat_buf = flat_buf->next;
+        vars->flat_buf = flat_buf;
+    }
+    MPI_Type_extent(datatype, &vars->buftype_extent);
+
+    vars->done = 0;
+    vars->off = st_loc;
+    vars->for_curr_iter = vars->for_next_iter = 0;
+
+    /* set the state to wait until MPI_Ialltoall finishes. */
+    nbc_req->data.rd.state = ADIOI_IRC_STATE_IREAD_AND_EXCH;
+}
+
+static void ADIOI_Iread_and_exch_l1_begin(ADIOI_NBC_Request *nbc_req,
+                                          int *error_code)
+{
+    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
+    ADIO_File fd;
+    int nprocs;
+    ADIOI_Access *others_req;
+
+    int i, j;
+    ADIO_Offset real_off, req_off;
+    char *read_buf;
+    int *curr_offlen_ptr, *count, *send_size;
+    int *partial_send, *start_pos;
+    ADIO_Offset size, real_size, for_next_iter;
+    int req_len, flag;
+
+    ADIOI_R_Iexchange_data_vars *red_vars = NULL;
+
+    /* loop exit condition */
+    if (vars->m >= vars->ntimes) {
+        ADIOI_Iread_and_exch_reset(nbc_req, error_code);
+        return;
+    }
+
+    fd = vars->fd;
+    nprocs = vars->nprocs;
+    others_req = vars->others_req;
+
+    read_buf = vars->read_buf;
+    curr_offlen_ptr = vars->curr_offlen_ptr;
+    count = vars->count;
+    send_size = vars->send_size;
+    partial_send = vars->partial_send;
+    start_pos = vars->start_pos;
+
+    /* read buf of size coll_bufsize (or less) */
+    /* go through all others_req and check if any are satisfied
+       by the current read */
+
+    /* since MPI guarantees that displacements in filetypes are in
+       monotonically nondecreasing order, I can maintain a pointer
+       (curr_offlen_ptr) to
+       current off-len pair for each process in others_req and scan
+       further only from there. There is still a problem of filetypes
+       such as:  (1, 2, 3 are not process nos. They are just numbers for
+       three chunks of data, specified by a filetype.)
+
+       1  -------!--
+       2    -----!----
+       3       --!-----
+
+       where ! indicates where the current read_size limitation cuts
+       through the filetype.  I resolve this by reading up to !, but
+       filling the communication buffer only for 1. I copy the portion
+       left over for 2 into a tmp_buf for use in the next
+       iteration. i.e., 2 and 3 will be satisfied in the next
+       iteration. This simplifies filling in the user's buf at the
+       other end, as only one off-len pair with incomplete data
+       will be sent. I also don't need to send the individual
+       offsets and lens along with the data, as the data is being
+       sent in a particular order. */
+
+    /* off = start offset in the file for the data actually read in
+             this iteration
+       size = size of data read corresponding to off
+       real_off = off minus whatever data was retained in memory from
+             previous iteration for cases like 2, 3 illustrated above
+       real_size = size plus the extra corresponding to real_off
+       req_off = off in file for a particular contiguous request
+                 minus what was satisfied in previous iteration
+       req_size = size corresponding to req_off */
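+
+    /* For example, if 100 bytes were retained from the previous
+       iteration (for_curr_iter == 100) and off == 5000, then
+       real_off == 4900 and real_size == size + 100: the file read below
+       starts at off, while the exchange also covers the 100 bytes that
+       already sit at the start of read_buf. */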
+
+    size = ADIOI_MIN((unsigned)vars->coll_bufsize,
+                     vars->end_loc - vars->st_loc + 1 - vars->done);
+    real_off = vars->off - vars->for_curr_iter;
+    real_size = size + vars->for_curr_iter;
+
+    vars->size = size;
+    vars->real_size = real_size;
+
+    for (i = 0; i < nprocs; i++) count[i] = send_size[i] = 0;
+    for_next_iter = 0;
+
+    for (i = 0; i < nprocs; i++) {
+#ifdef RDCOLL_DEBUG
+        DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n",
+                    vars->myrank, i, others_req[i].count);
+#endif
+        if (others_req[i].count) {
+            start_pos[i] = curr_offlen_ptr[i];
+            for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) {
+                if (partial_send[i]) {
+                    /* this request may have been partially
+                       satisfied in the previous iteration. */
+                    req_off = others_req[i].offsets[j] + partial_send[i];
+                    req_len = others_req[i].lens[j] - partial_send[i];
+                    partial_send[i] = 0;
+                    /* modify the off-len pair to reflect this change */
+                    others_req[i].offsets[j] = req_off;
+                    others_req[i].lens[j] = req_len;
+                }
+                else {
+                    req_off = others_req[i].offsets[j];
+                    req_len = others_req[i].lens[j];
+                }
+                if (req_off < real_off + real_size) {
+                    count[i]++;
+                    ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf) + req_off - real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf + req_off - real_off));
+                    MPI_Address(read_buf + req_off - real_off,
+                                &(others_req[i].mem_ptrs[j]));
+                    ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));
+                    send_size[i] += (int)(ADIOI_MIN(real_off + real_size - req_off,
+                                                    (ADIO_Offset)(unsigned)req_len));
+
+                    if (real_off + real_size - req_off < (ADIO_Offset)(unsigned)req_len) {
+                        partial_send[i] = (int)(real_off + real_size - req_off);
+                        if ((j+1 < others_req[i].count) &&
+                            (others_req[i].offsets[j+1] < real_off + real_size)) {
+                            /* this is the case illustrated in the
+                               figure above. */
+                            for_next_iter = ADIOI_MAX(for_next_iter,
+                                    real_off + real_size - others_req[i].offsets[j+1]);
+                            /* max because it must cover requests
+                               from different processes */
+                        }
+                        break;
+                    }
+                }
+                else break;
+            }
+            curr_offlen_ptr[i] = j;
+        }
+    }
+    vars->for_next_iter = for_next_iter;
+
+    flag = 0;
+    for (i = 0; i < nprocs; i++)
+        if (count[i]) flag = 1;
+
+    /* create a struct for ADIOI_R_Iexchange_data() */
+    red_vars = (ADIOI_R_Iexchange_data_vars *)ADIOI_Calloc(
+            1, sizeof(ADIOI_R_Iexchange_data_vars));
+    nbc_req->data.rd.red_vars = red_vars;
+    red_vars->fd = vars->fd;
+    red_vars->buf = vars->buf;
+    red_vars->flat_buf = vars->flat_buf;
+    red_vars->offset_list = vars->offset_list;
+    red_vars->len_list = vars->len_list;
+    red_vars->send_size = vars->send_size;
+    red_vars->recv_size = vars->recv_size;
+    red_vars->count = vars->count;
+    red_vars->start_pos = vars->start_pos;
+    red_vars->partial_send = vars->partial_send;
+    red_vars->recd_from_proc = vars->recd_from_proc;
+    red_vars->nprocs = vars->nprocs;
+    red_vars->myrank = vars->myrank;
+    red_vars->buftype_is_contig = vars->buftype_is_contig;
+    red_vars->contig_access_count = vars->contig_access_count;
+    red_vars->min_st_offset = vars->min_st_offset;
+    red_vars->fd_size = vars->fd_size;
+    red_vars->fd_start = vars->fd_start;
+    red_vars->fd_end = vars->fd_end;
+    red_vars->others_req = vars->others_req;
+    red_vars->iter = vars->m;
+    red_vars->buftype_extent = vars->buftype_extent;
+    red_vars->buf_idx = vars->buf_idx;
+    red_vars->next_fn = ADIOI_Iread_and_exch_l1_end;
+
+    if (flag) {
+        ADIOI_Assert(size == (int)size);
+        ADIO_IreadContig(fd, read_buf+vars->for_curr_iter, (int)size,
+                         MPI_BYTE, ADIO_EXPLICIT_OFFSET, vars->off,
+                         &vars->req2, error_code);
+
+        nbc_req->data.rd.state = ADIOI_IRC_STATE_IREAD_AND_EXCH_L1_BEGIN;
+        return;
+    }
+
+    ADIOI_R_Iexchange_data(nbc_req, error_code);
+}
+
+static void ADIOI_Iread_and_exch_l1_end(ADIOI_NBC_Request *nbc_req,
+                                        int *error_code)
+{
+    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
+    ADIO_File fd = vars->fd;
+    ADIO_Offset size = vars->size;
+    ADIO_Offset real_size = vars->real_size;
+    ADIO_Offset for_next_iter = vars->for_next_iter;
+    char *read_buf = vars->read_buf;
+    char *tmp_buf;
+
+    vars->for_curr_iter = for_next_iter;
+
+    if (for_next_iter) {
+        tmp_buf = (char *)ADIOI_Malloc(for_next_iter);
+        ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+real_size-for_next_iter) == (ADIO_Offset)(MPIR_Upint)(read_buf+real_size-for_next_iter));
+        ADIOI_Assert((for_next_iter+vars->coll_bufsize) == (size_t)(for_next_iter+vars->coll_bufsize));
+        memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter);
+        ADIOI_Free(fd->io_buf);
+        fd->io_buf = (char *)ADIOI_Malloc(for_next_iter+vars->coll_bufsize);
+        memcpy(fd->io_buf, tmp_buf, for_next_iter);
+        vars->read_buf = fd->io_buf;
+        ADIOI_Free(tmp_buf);
+    }
+
+    vars->off += size;
+    vars->done += size;
+
+    /* increment m and go back to the beginning of m loop */
+    vars->m++;
+    ADIOI_Iread_and_exch_l1_begin(nbc_req, error_code);
+}
+
+static void ADIOI_Iread_and_exch_reset(ADIOI_NBC_Request *nbc_req,
+                                       int *error_code)
+{
+    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
+    int nprocs = vars->nprocs;
+    int *count = vars->count;
+    int *send_size = vars->send_size;
+    int i;
+
+    for (i = 0; i < nprocs; i++) count[i] = send_size[i] = 0;
+
+    vars->m = vars->ntimes;
+    ADIOI_Iread_and_exch_l2_begin(nbc_req, error_code);
+}
+
+static void ADIOI_Iread_and_exch_l2_begin(ADIOI_NBC_Request *nbc_req,
+                                          int *error_code)
+{
+    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
+    ADIOI_R_Iexchange_data_vars *red_vars = NULL;
+
+    /* loop exit condition */
+    if (vars->m >= vars->max_ntimes) {
+        ADIOI_Iread_and_exch_fini(nbc_req, error_code);
+        return;
+    }
+
+    /* create a struct for ADIOI_R_Iexchange_data() */
+    red_vars = (ADIOI_R_Iexchange_data_vars *)ADIOI_Calloc(
+            1, sizeof(ADIOI_R_Iexchange_data_vars));
+    nbc_req->data.rd.red_vars = red_vars;
+    red_vars->fd = vars->fd;
+    red_vars->buf = vars->buf;
+    red_vars->flat_buf = vars->flat_buf;
+    red_vars->offset_list = vars->offset_list;
+    red_vars->len_list = vars->len_list;
+    red_vars->send_size = vars->send_size;
+    red_vars->recv_size = vars->recv_size;
+    red_vars->count = vars->count;
+    red_vars->start_pos = vars->start_pos;
+    red_vars->partial_send = vars->partial_send;
+    red_vars->recd_from_proc = vars->recd_from_proc;
+    red_vars->nprocs = vars->nprocs;
+    red_vars->myrank = vars->myrank;
+    red_vars->buftype_is_contig = vars->buftype_is_contig;
+    red_vars->contig_access_count = vars->contig_access_count;
+    red_vars->min_st_offset = vars->min_st_offset;
+    red_vars->fd_size = vars->fd_size;
+    red_vars->fd_start = vars->fd_start;
+    red_vars->fd_end = vars->fd_end;
+    red_vars->others_req = vars->others_req;
+    red_vars->iter = vars->m;
+    red_vars->buftype_extent = vars->buftype_extent;
+    red_vars->buf_idx = vars->buf_idx;
+    red_vars->next_fn = ADIOI_Iread_and_exch_l2_end;
+
+    ADIOI_R_Iexchange_data(nbc_req, error_code);
+}
+
+static void ADIOI_Iread_and_exch_l2_end(ADIOI_NBC_Request *nbc_req,
+                                        int *error_code)
+{
+    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
+
+    vars->m++;
+    ADIOI_Iread_and_exch_l2_begin(nbc_req, error_code);
+}
+
+static void ADIOI_Iread_and_exch_fini(ADIOI_NBC_Request *nbc_req, int *error_code)
+{
+    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
+    void (*next_fn)(ADIOI_NBC_Request *, int *);
+
+    ADIOI_Free(vars->curr_offlen_ptr);
+    ADIOI_Free(vars->count);
+    ADIOI_Free(vars->partial_send);
+    ADIOI_Free(vars->send_size);
+    ADIOI_Free(vars->recv_size);
+    ADIOI_Free(vars->recd_from_proc);
+    ADIOI_Free(vars->start_pos);
+
+    next_fn = vars->next_fn;
+
+    /* free the struct for parameters and variables */
+    ADIOI_Free(nbc_req->data.rd.rae_vars);
+    nbc_req->data.rd.rae_vars = NULL;
+
+    /* move to the next function */
+    next_fn(nbc_req, error_code);
+}
+
+
+static void ADIOI_R_Iexchange_data(ADIOI_NBC_Request *nbc_req, int *error_code)
+{
+    ADIOI_R_Iexchange_data_vars *vars = nbc_req->data.rd.red_vars;
+
+    /* exchange send_size info so that each process knows how much to
+       receive from whom and how much memory to allocate. */
+    *error_code = MPI_Ialltoall(vars->send_size, 1, MPI_INT, vars->recv_size, 1,
+                                MPI_INT, vars->fd->comm, &vars->req1);
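+    /* Example: with nprocs = 3, if rank 0 sends this process 1024
+       bytes, rank 1 sends nothing, and rank 2 sends 2048 bytes, then
+       recv_size = {1024, 0, 2048} once the alltoall completes;
+       recv_size[i] here is simply send_size[myrank] as seen on rank i. */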
+
+    nbc_req->data.rd.state = ADIOI_IRC_STATE_R_IEXCHANGE_DATA;
+}
+
+static void ADIOI_R_Iexchange_data_recv(ADIOI_NBC_Request *nbc_req,
+                                        int *error_code)
+{
+    ADIOI_R_Iexchange_data_vars *vars = nbc_req->data.rd.red_vars;
+    ADIO_File fd = vars->fd;
+    int *send_size = vars->send_size;
+    int *recv_size = vars->recv_size;
+    int *count = vars->count;
+    int *start_pos = vars->start_pos;
+    int *partial_send = vars->partial_send;
+    int nprocs = vars->nprocs;
+    int myrank = vars->myrank;
+    ADIOI_Access *others_req = vars->others_req;
+    int iter = vars->iter;
+    int *buf_idx = vars->buf_idx;
+
+    int i, j, k = 0, tmp = 0, nprocs_recv, nprocs_send;
+    char **recv_buf = NULL;
+    MPI_Datatype send_type;
+
+    nprocs_recv = 0;
+    for (i = 0; i < nprocs; i++) if (recv_size[i]) nprocs_recv++;
+    vars->nprocs_recv = nprocs_recv;
+
+    nprocs_send = 0;
+    for (i = 0; i < nprocs; i++) if (send_size[i]) nprocs_send++;
+    vars->nprocs_send = nprocs_send;
+
+    vars->req2 = (MPI_Request *)
+        ADIOI_Malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request));
+    /* +1 to avoid a 0-size malloc */
+
+    /* post recvs. if buftype_is_contig, data can be directly recd. into
+       user buf at location given by buf_idx. else use recv_buf. */
+
+#ifdef AGGREGATION_PROFILE
+    MPE_Log_event (5032, 0, NULL);
+#endif
+
+    if (vars->buftype_is_contig) {
+        j = 0;
+        for (i = 0; i < nprocs; i++)
+            if (recv_size[i]) {
+                MPI_Irecv(((char *)vars->buf) + buf_idx[i], recv_size[i],
+                          MPI_BYTE, i, myrank+i+100*iter, fd->comm,
+                          vars->req2 + j);
+                j++;
+                buf_idx[i] += recv_size[i];
+            }
+    }
+    else {
+        /* allocate memory for recv_buf and post receives */
+        recv_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char*));
+        vars->recv_buf = recv_buf;
+        for (i = 0; i < nprocs; i++)
+            if (recv_size[i]) recv_buf[i] = (char *)ADIOI_Malloc(recv_size[i]);
+
+        j = 0;
+        for (i = 0; i < nprocs; i++)
+            if (recv_size[i]) {
+                MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i,
+                          myrank+i+100*iter, fd->comm,
+                          vars->req2 + j);
+                j++;
+#ifdef RDCOLL_DEBUG
+                DBG_FPRINTF(stderr, "node %d, recv_size %d, tag %d \n",
+                            myrank, recv_size[i], myrank+i+100*iter);
+#endif
+            }
+    }
+
+    /* create derived datatypes and send data */
+
+    j = 0;
+    for (i = 0; i < nprocs; i++) {
+        if (send_size[i]) {
+            /* take care if the last off-len pair is a partial send */
+            if (partial_send[i]) {
+                k = start_pos[i] + count[i] - 1;
+                tmp = others_req[i].lens[k];
+                others_req[i].lens[k] = partial_send[i];
+            }
+            ADIOI_Type_create_hindexed_x(count[i],
+                    &(others_req[i].lens[start_pos[i]]),
+                    &(others_req[i].mem_ptrs[start_pos[i]]),
+                    MPI_BYTE, &send_type);
+            /* absolute displacement; use MPI_BOTTOM in send */
+            MPI_Type_commit(&send_type);
+            MPI_Isend(MPI_BOTTOM, 1, send_type, i, myrank+i+100*iter,
+                      fd->comm, vars->req2 + nprocs_recv + j);
+            MPI_Type_free(&send_type);
+            if (partial_send[i]) others_req[i].lens[k] = tmp;
+            j++;
+        }
+    }
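+    /* A note on the tags: each receive above was posted with tag
+       myrank+i+100*iter, where myrank is the receiver and i the sender;
+       the send side uses the same expression with the roles swapped.
+       Addition is commutative, so the two tags match, and the 100*iter
+       term separates successive iterations of the same pair. */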
+
+    /* wait on the receives */
+    if (nprocs_recv) {
+        nbc_req->data.rd.state = ADIOI_IRC_STATE_R_IEXCHANGE_DATA_RECV;
+        return;
+    }
+
+    ADIOI_R_Iexchange_data_fill(nbc_req, error_code);
+}
+
+static void ADIOI_R_Iexchange_data_fill(ADIOI_NBC_Request *nbc_req,
+                                        int *error_code)
+{
+    ADIOI_R_Iexchange_data_vars *vars = nbc_req->data.rd.red_vars;
+
+    if (vars->nprocs_recv) {
+        /* if noncontiguous, do the copies from the recv buffers */
+        if (!vars->buftype_is_contig)
+            ADIOI_Fill_user_buffer(vars->fd, vars->buf, vars->flat_buf,
+                    vars->recv_buf, vars->offset_list, vars->len_list,
+                    (unsigned*)vars->recv_size,
+                    vars->req2, NULL, vars->recd_from_proc,
+                    vars->nprocs, vars->contig_access_count,
+                    vars->min_st_offset, vars->fd_size, vars->fd_start,
+                    vars->fd_end, vars->buftype_extent);
+    }
+
+    nbc_req->data.rd.state = ADIOI_IRC_STATE_R_IEXCHANGE_DATA_FILL;
+}
+
+static void ADIOI_R_Iexchange_data_fini(ADIOI_NBC_Request *nbc_req, int *error_code)
+{
+    ADIOI_R_Iexchange_data_vars *vars = nbc_req->data.rd.red_vars;
+    void (*next_fn)(ADIOI_NBC_Request *, int *);
+    int i;
+
+    ADIOI_Free(vars->req2);
+
+    if (!vars->buftype_is_contig) {
+        for (i = 0; i < vars->nprocs; i++)
+            if (vars->recv_size[i]) ADIOI_Free(vars->recv_buf[i]);
+        ADIOI_Free(vars->recv_buf);
+    }
+#ifdef AGGREGATION_PROFILE
+    MPE_Log_event (5033, 0, NULL);
+#endif
+
+    next_fn = vars->next_fn;
+
+    /* free the structure for parameters and variables */
+    ADIOI_Free(vars);
+    nbc_req->data.rd.red_vars = NULL;
+
+    /* move to the next function */
+    next_fn(nbc_req, error_code);
+}
+
+
+static int ADIOI_GEN_irc_query_fn(void *extra_state, MPI_Status *status)
+{
+    ADIOI_NBC_Request *nbc_req;
+
+    nbc_req = (ADIOI_NBC_Request *)extra_state;
+
+    MPI_Status_set_elements_x(status, MPI_BYTE, nbc_req->nbytes);
+
+    /* this request can never be cancelled, so it is never marked cancelled */
+    MPI_Status_set_cancelled(status, 0);
+
+    /* choose not to return a value for this */
+    status->MPI_SOURCE = MPI_UNDEFINED;
+    /* tag has no meaning for this generalized request */
+    status->MPI_TAG = MPI_UNDEFINED;
+
+    /* this generalized request never fails */
+    return MPI_SUCCESS;
+}
+
+static int ADIOI_GEN_irc_free_fn(void *extra_state)
+{
+    ADIOI_NBC_Request *nbc_req;
+
+    nbc_req = (ADIOI_NBC_Request *)extra_state;
+    ADIOI_Free(nbc_req);
+
+    return MPI_SUCCESS;
+}
+
+static int ADIOI_GEN_irc_poll_fn(void *extra_state, MPI_Status *status)
+{
+    ADIOI_NBC_Request *nbc_req;
+    ADIOI_GEN_IreadStridedColl_vars *rsc_vars = NULL;
+    ADIOI_Icalc_others_req_vars     *cor_vars = NULL;
+    ADIOI_Iread_and_exch_vars       *rae_vars = NULL;
+    ADIOI_R_Iexchange_data_vars     *red_vars = NULL;
+    int errcode = MPI_SUCCESS;
+    int flag;
+
+    nbc_req = (ADIOI_NBC_Request *)extra_state;
+
+    switch (nbc_req->data.rd.state) {
+        case ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL:
+            rsc_vars = nbc_req->data.rd.rsc_vars;
+            errcode = MPI_Testall(2, rsc_vars->req_offset, &flag,
+                                  MPI_STATUSES_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                ADIOI_GEN_IreadStridedColl_inter(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL_INDIO:
+            rsc_vars = nbc_req->data.rd.rsc_vars;
+            errcode = MPI_Test(&rsc_vars->req_ind_io, &flag, MPI_STATUS_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                /* call the last function */
+                ADIOI_GEN_IreadStridedColl_fini(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IRC_STATE_ICALC_OTHERS_REQ:
+            cor_vars = nbc_req->cor_vars;
+            errcode = MPI_Test(&cor_vars->req1, &flag, MPI_STATUS_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                ADIOI_Icalc_others_req_main(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IRC_STATE_ICALC_OTHERS_REQ_MAIN:
+            cor_vars = nbc_req->cor_vars;
+            if (cor_vars->num_req2) {
+                errcode = MPI_Testall(cor_vars->num_req2, cor_vars->req2,
+                                      &flag, MPI_STATUSES_IGNORE);
+                if (errcode == MPI_SUCCESS && flag) {
+                    ADIOI_Icalc_others_req_fini(nbc_req, &errcode);
+                }
+            } else {
+                ADIOI_Icalc_others_req_fini(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IRC_STATE_IREAD_AND_EXCH:
+            rae_vars = nbc_req->data.rd.rae_vars;
+            errcode = MPI_Test(&rae_vars->req1, &flag, MPI_STATUS_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                rae_vars->m = 0;
+                ADIOI_Iread_and_exch_l1_begin(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IRC_STATE_IREAD_AND_EXCH_L1_BEGIN:
+            rae_vars = nbc_req->data.rd.rae_vars;
+            errcode = MPI_Test(&rae_vars->req2, &flag, MPI_STATUS_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                ADIOI_R_Iexchange_data(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IRC_STATE_R_IEXCHANGE_DATA:
+            red_vars = nbc_req->data.rd.red_vars;
+            errcode = MPI_Test(&red_vars->req1, &flag, MPI_STATUS_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                ADIOI_R_Iexchange_data_recv(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IRC_STATE_R_IEXCHANGE_DATA_RECV:
+            red_vars = nbc_req->data.rd.red_vars;
+            errcode = MPI_Testall(red_vars->nprocs_recv, red_vars->req2, &flag,
+                                  MPI_STATUSES_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                ADIOI_R_Iexchange_data_fill(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IRC_STATE_R_IEXCHANGE_DATA_FILL:
+            red_vars = nbc_req->data.rd.red_vars;
+            errcode = MPI_Testall(red_vars->nprocs_send,
+                                  red_vars->req2 + red_vars->nprocs_recv,
+                                  &flag, MPI_STATUSES_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                ADIOI_R_Iexchange_data_fini(nbc_req, &errcode);
+            }
+            break;
+
+        default:
+            break;
+    }
+
+    /* --BEGIN ERROR HANDLING-- */
+    if (errcode != MPI_SUCCESS) {
+        errcode = MPIO_Err_create_code(MPI_SUCCESS,
+                MPIR_ERR_RECOVERABLE,
+                "ADIOI_GEN_irc_poll_fn", __LINE__,
+                MPI_ERR_IO, "**mpi_grequest_complete",
+                0);
+    }
+    /* --END ERROR HANDLING-- */
+
+    return errcode;
+}
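+
+/* Summary of the state machine driven by the poll function above, on
+ * the aggregation path:
+ *
+ *   GEN_IREADSTRIDEDCOLL      offset allgathers done -> _inter, _indio
+ *   ICALC_OTHERS_REQ(_MAIN)   others_req exchanged   -> _read
+ *   IREAD_AND_EXCH            ntimes allreduce done  -> l1_begin
+ *   IREAD_AND_EXCH_L1_BEGIN   file read done         -> R_Iexchange_data
+ *   R_IEXCHANGE_DATA          size alltoall done     -> _recv
+ *   R_IEXCHANGE_DATA_RECV     receives done          -> _fill
+ *   R_IEXCHANGE_DATA_FILL     sends done             -> _fini
+ *
+ * Each pass through the l1/l2 loops re-enters R_IEXCHANGE_DATA; after
+ * max_ntimes passes the chain finishes via _free and _fini and reaches
+ * the COMPLETE state. */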
+
+/* wait for multiple requests to complete */
+static int ADIOI_GEN_irc_wait_fn(int count, void **array_of_states,
+                                 double timeout, MPI_Status *status)
+{
+    int i, errcode = MPI_SUCCESS;
+    double starttime;
+    ADIOI_NBC_Request **nbc_reqlist;
+
+    nbc_reqlist = (ADIOI_NBC_Request **)array_of_states;
+
+    starttime = MPI_Wtime();
+    for (i = 0; i < count ; i++) {
+        while (nbc_reqlist[i]->data.rd.state != ADIOI_IRC_STATE_COMPLETE) {
+            errcode = ADIOI_GEN_irc_poll_fn(nbc_reqlist[i], MPI_STATUS_IGNORE);
+            /* --BEGIN ERROR HANDLING-- */
+            if (errcode != MPI_SUCCESS) {
+                errcode = MPIO_Err_create_code(MPI_SUCCESS,
+                        MPIR_ERR_RECOVERABLE,
+                        "ADIOI_GEN_irc_wait_fn",
+                        __LINE__, MPI_ERR_IO,
+                        "**mpi_grequest_complete", 0);
+            }
+            /* --END ERROR HANDLING-- */
+
+            if ((timeout > 0) && (timeout < (MPI_Wtime() - starttime)))
+                goto fn_exit;
+        }
+    }
+
+  fn_exit:
+    return errcode;
+}
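+
+#if 0
+/* Minimal usage sketch of the path implemented above.  The user-visible
+ * binding is assumed to be MPIX_File_iread_all() in this MPICH version
+ * (MPI_File_iread_all() once standardized); progress happens inside
+ * MPI_Test()/MPI_Wait() through the poll and wait functions registered
+ * with the generalized request class. */
+static void example_iread_all(void)
+{
+    MPI_File fh;
+    MPI_Request req;
+    MPI_Status status;
+    int buf[1024];
+
+    MPI_File_open(MPI_COMM_WORLD, "testfile", MPI_MODE_RDONLY,
+                  MPI_INFO_NULL, &fh);
+    MPIX_File_iread_all(fh, buf, 1024, MPI_INT, &req);
+    /* ... overlap independent computation here ... */
+    MPI_Wait(&req, &status);
+    MPI_File_close(&fh);
+}
+#endif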
+
diff --git a/src/mpi/romio/adio/common/ad_iread_coll.pdf b/src/mpi/romio/adio/common/ad_iread_coll.pdf
new file mode 100644
index 0000000..2c646fb
Binary files /dev/null and b/src/mpi/romio/adio/common/ad_iread_coll.pdf differ
diff --git a/src/mpi/romio/adio/common/ad_iwrite_coll.c b/src/mpi/romio/adio/common/ad_iwrite_coll.c
new file mode 100644
index 0000000..71dd93a
--- /dev/null
+++ b/src/mpi/romio/adio/common/ad_iwrite_coll.c
@@ -0,0 +1,1535 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "adio.h"
+#include "adio_extern.h"
+#include "mpiu_greq.h"
+
+#ifdef AGGREGATION_PROFILE
+#include "mpe.h"
+#endif
+
+/* ADIOI_GEN_IwriteStridedColl */
+struct ADIOI_GEN_IwriteStridedColl_vars {
+    /* requests */
+    MPI_Request req_offset[2]; /* ADIOI_IWC_STATE_GEN_IWRITESTRIDEDCOLL */
+    MPI_Request req_ind_io;    /* ADIOI_IWC_STATE_GEN_IWRITESTRIDEDCOLL_INDIO */
+    MPI_Request req_err;       /* ADIOI_IWC_STATE_GEN_IWRITESTRIDEDCOLL_BCAST */
+
+    /* parameters */
+    ADIO_File fd;
+    const void *buf;
+    int count;
+    MPI_Datatype datatype;
+    int file_ptr_type;
+    ADIO_Offset offset;
+
+    /* stack variables */
+    ADIOI_Access *my_req;
+    /* array of nprocs access structures, one for each other process in
+       whose file domain this process's request lies */
+
+    ADIOI_Access *others_req;
+    /* array of nprocs access structures, one for each other process
+       whose request lies in this process's file domain. */
+
+    int nprocs;
+    int nprocs_for_coll;
+    int myrank;
+    int contig_access_count;
+    int interleave_count;
+    int buftype_is_contig;
+    int *count_my_req_per_proc;
+    int count_my_req_procs;
+    int count_others_req_procs;
+    ADIO_Offset orig_fp;
+    ADIO_Offset fd_size;
+    ADIO_Offset min_st_offset;
+    ADIO_Offset *offset_list;
+    ADIO_Offset *st_offsets;
+    ADIO_Offset *fd_start;
+    ADIO_Offset *fd_end;
+    ADIO_Offset *end_offsets;
+    int *buf_idx;
+    ADIO_Offset *len_list;
+    int old_error;
+    int tmp_error;
+    int error_code;
+};
+
+/* ADIOI_Iexch_and_write */
+struct ADIOI_Iexch_and_write_vars {
+    /* requests */
+    MPI_Request req1;       /* ADIOI_IWC_STATE_IEXCH_AND_WRITE */
+    MPI_Request req3;       /* ADIOI_IWC_STATE_IEXCH_AND_WRITE_L1_BODY */
+
+    /* parameters */
+    ADIO_File fd;
+    void *buf;
+    MPI_Datatype datatype;
+    int nprocs;
+    int myrank;
+    ADIOI_Access *others_req;
+    ADIO_Offset *offset_list;
+    ADIO_Offset *len_list;
+    int contig_access_count;
+    ADIO_Offset min_st_offset;
+    ADIO_Offset fd_size;
+    ADIO_Offset *fd_start;
+    ADIO_Offset *fd_end;
+    int *buf_idx;
+
+    /* stack variables */
+    /* end_loc - st_loc may not fit in an int, so make these offsets */
+    ADIO_Offset size;
+    int hole;
+    int m;
+    int ntimes;
+    int max_ntimes;
+    int buftype_is_contig;
+    ADIO_Offset st_loc;
+    ADIO_Offset end_loc;
+    ADIO_Offset off;
+    ADIO_Offset done;
+    char *write_buf;
+    int *curr_offlen_ptr;
+    int *count;
+    int *send_size;
+    int *recv_size;
+    int *partial_recv;
+    int *sent_to_proc;
+    int *start_pos;
+    int *send_buf_idx;
+    int *curr_to_proc;
+    int *done_to_proc;
+    ADIOI_Flatlist_node *flat_buf;
+    MPI_Aint buftype_extent;
+    int coll_bufsize;
+
+    /* next function to be called */
+    void (*next_fn)(ADIOI_NBC_Request *, int *);
+};
+
+/* ADIOI_W_Iexchange_data */
+struct ADIOI_W_Iexchange_data_vars {
+    /* requests */
+    MPI_Request req1;   /* ADIOI_IWC_STATE_W_IEXCHANGE_DATA */
+    MPI_Request req2;   /* ADIOI_IWC_STATE_W_IEXCHANGE_DATA_HOLE */
+    MPI_Request *req3;  /* ADIOI_IWC_STATE_W_IEXCHANGE_DATA_SEND */
+
+    /* parameters */
+    ADIO_File fd;
+    void *buf;
+    char *write_buf;
+    ADIOI_Flatlist_node *flat_buf;
+    ADIO_Offset *offset_list;
+    ADIO_Offset *len_list;
+    int *send_size;
+    int *recv_size;
+    ADIO_Offset off;
+    int size;
+    int *count;
+    int *start_pos;
+    int *partial_recv;
+    int *sent_to_proc;
+    int nprocs;
+    int myrank;
+    int buftype_is_contig;
+    int contig_access_count;
+    ADIO_Offset min_st_offset;
+    ADIO_Offset fd_size;
+    ADIO_Offset *fd_start;
+    ADIO_Offset *fd_end;
+    ADIOI_Access *others_req;
+    int *send_buf_idx;
+    int *curr_to_proc;
+    int *done_to_proc;
+    int *hole;
+    int iter;
+    MPI_Aint buftype_extent;
+    int *buf_idx;
+
+    /* stack variables */
+    int nprocs_recv;
+    int nprocs_send;
+    int err;
+    char **send_buf;
+    MPI_Request *requests;
+    MPI_Request *send_req;
+    MPI_Datatype *recv_types;
+    int sum;
+    ADIO_Offset *srt_off;
+
+    /* next function to be called */
+    void (*next_fn)(ADIOI_NBC_Request *, int *);
+};
+
+
+void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
+                           *flat_buf, char **send_buf, ADIO_Offset
+                           *offset_list, ADIO_Offset *len_list, int *send_size,
+                           MPI_Request *requests, int *sent_to_proc,
+                           int nprocs, int myrank,
+                           int contig_access_count, ADIO_Offset
+                           min_st_offset, ADIO_Offset fd_size,
+                           ADIO_Offset *fd_start, ADIO_Offset *fd_end,
+                           int *send_buf_idx, int *curr_to_proc,
+                           int *done_to_proc, int iter,
+                           MPI_Aint buftype_extent);
+void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
+                      ADIO_Offset *srt_off, int *srt_len, int *start_pos,
+                      int nprocs, int nprocs_recv, int total_elements);
+
+
+/* prototypes of functions used for nonblocking collective writes only. */
+static void ADIOI_GEN_IwriteStridedColl_inter(ADIOI_NBC_Request *, int *);
+static void ADIOI_GEN_IwriteStridedColl_indio(ADIOI_NBC_Request *, int *);
+static void ADIOI_GEN_IwriteStridedColl_exch(ADIOI_NBC_Request *, int *);
+static void ADIOI_GEN_IwriteStridedColl_bcast(ADIOI_NBC_Request *, int *);
+static void ADIOI_GEN_IwriteStridedColl_free(ADIOI_NBC_Request *, int *);
+static void ADIOI_GEN_IwriteStridedColl_fini(ADIOI_NBC_Request *, int *);
+
+static void ADIOI_Iexch_and_write(ADIOI_NBC_Request *, int *);
+static void ADIOI_Iexch_and_write_l1_begin(ADIOI_NBC_Request *, int *);
+static void ADIOI_Iexch_and_write_l1_body(ADIOI_NBC_Request *, int *);
+static void ADIOI_Iexch_and_write_l1_end(ADIOI_NBC_Request *, int *);
+static void ADIOI_Iexch_and_write_reset(ADIOI_NBC_Request *, int *);
+static void ADIOI_Iexch_and_write_l2_begin(ADIOI_NBC_Request *, int *);
+static void ADIOI_Iexch_and_write_l2_end(ADIOI_NBC_Request *, int *);
+static void ADIOI_Iexch_and_write_fini(ADIOI_NBC_Request *, int *);
+
+static void ADIOI_W_Iexchange_data(ADIOI_NBC_Request *, int *);
+static void ADIOI_W_Iexchange_data_hole(ADIOI_NBC_Request *, int *);
+static void ADIOI_W_Iexchange_data_send(ADIOI_NBC_Request *, int *);
+static void ADIOI_W_Iexchange_data_wait(ADIOI_NBC_Request *, int *);
+static void ADIOI_W_Iexchange_data_fini(ADIOI_NBC_Request *, int *);
+
+static MPIX_Grequest_class ADIOI_GEN_greq_class = 0;
+static int ADIOI_GEN_iwc_query_fn(void *extra_state, MPI_Status *status);
+static int ADIOI_GEN_iwc_free_fn(void *extra_state);
+static int ADIOI_GEN_iwc_poll_fn(void *extra_state, MPI_Status *status);
+static int ADIOI_GEN_iwc_wait_fn(int count, void **array_of_states,
+                                 double timeout, MPI_Status *status);
+
+
+/* Non-blocking version of ADIOI_GEN_WriteStridedColl() */
+void ADIOI_GEN_IwriteStridedColl(ADIO_File fd, const void *buf, int count,
+                       MPI_Datatype datatype, int file_ptr_type,
+                       ADIO_Offset offset, MPI_Request *request,
+                       int *error_code)
+{
+    /* Uses a generalized version of the extended two-phase method described
+       in "An Extended Two-Phase Method for Accessing Sections of
+       Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
+       Scientific Programming, (5)4:301--317, Winter 1996.
+       http://www.mcs.anl.gov/home/thakur/ext2ph.ps */
+
+    ADIOI_NBC_Request *nbc_req = NULL;
+    ADIOI_GEN_IwriteStridedColl_vars *vars = NULL;
+    int nprocs, myrank;
+    ADIO_Offset start_offset, end_offset;
+
+#if 0
+    /* FIXME: need an implementation of ADIOI_IOIstridedColl */
+    if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
+        /* Cast away const'ness as the below function is used for read
+         * and write */
+        ADIOI_IOIstridedColl(fd, (char *) buf, count, ADIOI_WRITE, datatype,
+                             file_ptr_type, offset, request, error_code);
+        return;
+    }
+#endif
+
+    /* top-level struct keeping the status of function progress */
+    nbc_req = (ADIOI_NBC_Request *)ADIOI_Calloc(1, sizeof(ADIOI_NBC_Request));
+    nbc_req->rdwr = ADIOI_WRITE;
+
+    /* create a generalized request */
+    if (ADIOI_GEN_greq_class == 0) {
+        MPIX_Grequest_class_create(ADIOI_GEN_iwc_query_fn,
+                ADIOI_GEN_iwc_free_fn, MPIU_Greq_cancel_fn,
+                ADIOI_GEN_iwc_poll_fn, ADIOI_GEN_iwc_wait_fn,
+                &ADIOI_GEN_greq_class);
+    }
+    MPIX_Grequest_class_allocate(ADIOI_GEN_greq_class, nbc_req, request);
+    memcpy(&nbc_req->req, request, sizeof(MPI_Request));
+
+    /* create a struct for parameters and variables */
+    vars = (ADIOI_GEN_IwriteStridedColl_vars *)ADIOI_Calloc(
+            1, sizeof(ADIOI_GEN_IwriteStridedColl_vars));
+    nbc_req->data.wr.wsc_vars = vars;
+
+    /* save the parameters */
+    vars->fd = fd;
+    vars->buf = buf;
+    vars->count = count;
+    vars->datatype = datatype;
+    vars->file_ptr_type = file_ptr_type;
+    vars->offset = offset;
+
+    MPI_Comm_size(fd->comm, &nprocs);
+    MPI_Comm_rank(fd->comm, &myrank);
+    vars->nprocs = nprocs;
+    vars->myrank = myrank;
+
+    /* the number of processes that actually perform I/O, nprocs_for_coll,
+     * is stored in the hints off the ADIO_File structure
+     */
+    vars->nprocs_for_coll = fd->hints->cb_nodes;
+    vars->orig_fp = fd->fp_ind;
+
+    /* only check for interleaving if cb_write isn't disabled */
+    if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
+        /* For this process's request, calculate the list of offsets and
+           lengths in the file and determine the start and end offsets. */
+
+        /* Note: end_offset points to the last byte-offset that will be accessed.
+           e.g., if start_offset=0 and 100 bytes to be written, end_offset=99*/
+
+        ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
+                              &vars->offset_list, &vars->len_list,
+                              &start_offset, &end_offset,
+                              &vars->contig_access_count);
+
+        /* each process communicates its start and end offsets to other
+           processes. The result is an array each of start and end offsets
+           stored in order of process rank. */
+
+        vars->st_offsets = (ADIO_Offset *)ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
+        vars->end_offsets = (ADIO_Offset *)ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
+
+        *error_code = MPI_Iallgather(&start_offset, 1, ADIO_OFFSET,
+                                     vars->st_offsets, 1, ADIO_OFFSET,
+                                     fd->comm, &vars->req_offset[0]);
+        if (*error_code != MPI_SUCCESS) return;
+        *error_code = MPI_Iallgather(&end_offset, 1, ADIO_OFFSET,
+                                     vars->end_offsets, 1, ADIO_OFFSET,
+                                     fd->comm, &vars->req_offset[1]);
+
+        nbc_req->data.wr.state = ADIOI_IWC_STATE_GEN_IWRITESTRIDEDCOLL;
+        return;
+    }
+
+    ADIOI_GEN_IwriteStridedColl_indio(nbc_req, error_code);
+}
+
+static void ADIOI_GEN_IwriteStridedColl_inter(ADIOI_NBC_Request *nbc_req,
+                                              int *error_code)
+{
+    ADIOI_GEN_IwriteStridedColl_vars *vars = nbc_req->data.wr.wsc_vars;
+    int nprocs = vars->nprocs;
+    ADIO_Offset *st_offsets = vars->st_offsets;
+    ADIO_Offset *end_offsets = vars->end_offsets;
+    int i, interleave_count = 0;
+
+    /* are the accesses of different processes interleaved? */
+    for (i = 1; i < nprocs; i++)
+        if ((st_offsets[i] < end_offsets[i-1]) &&
+            (st_offsets[i] <= end_offsets[i]))
+            interleave_count++;
+    /* This is a rudimentary check for interleaving, but should suffice
+       for the moment. */
+
+    vars->interleave_count = interleave_count;
+
+    ADIOI_GEN_IwriteStridedColl_indio(nbc_req, error_code);
+}
+
+static void ADIOI_GEN_IwriteStridedColl_indio(ADIOI_NBC_Request *nbc_req,
+                                              int *error_code)
+{
+    ADIOI_GEN_IwriteStridedColl_vars *vars = nbc_req->data.wr.wsc_vars;
+    ADIOI_Icalc_others_req_vars *cor_vars = NULL;
+    ADIO_File fd = vars->fd;
+    const void *buf;
+    int count, file_ptr_type;
+    MPI_Datatype datatype = vars->datatype;
+    ADIO_Offset offset;
+    int filetype_is_contig;
+    ADIO_Offset off;
+    int nprocs;
+
+    ADIOI_Datatype_iscontig(datatype, &vars->buftype_is_contig);
+
+    if (fd->hints->cb_write == ADIOI_HINT_DISABLE ||
+       (!vars->interleave_count && (fd->hints->cb_write == ADIOI_HINT_AUTO)))
+    {
+        buf = vars->buf;
+        count = vars->count;
+        file_ptr_type = vars->file_ptr_type;
+        offset = vars->offset;
+
+        /* use independent accesses */
+        if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
+            ADIOI_Free(vars->offset_list);
+            ADIOI_Free(vars->len_list);
+            ADIOI_Free(vars->st_offsets);
+            ADIOI_Free(vars->end_offsets);
+        }
+
+        fd->fp_ind = vars->orig_fp;
+        ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
+
+        if (vars->buftype_is_contig && filetype_is_contig) {
+            if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
+                off = fd->disp + (ADIO_Offset)(fd->etype_size) * offset;
+                ADIO_IwriteContig(fd, buf, count, datatype,
+                                  ADIO_EXPLICIT_OFFSET,
+                                  off, &vars->req_ind_io, error_code);
+            }
+            else ADIO_IwriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
+                                   0, &vars->req_ind_io, error_code);
+        }
+        else {
+            ADIO_IwriteStrided(fd, buf, count, datatype, file_ptr_type,
+                               offset, &vars->req_ind_io, error_code);
+        }
+
+        nbc_req->data.wr.state = ADIOI_IWC_STATE_GEN_IWRITESTRIDEDCOLL_INDIO;
+        return;
+    }
+
+    nprocs = vars->nprocs;
+
+    /* Divide the I/O workload among "nprocs_for_coll" processes. This is
+       done by (logically) dividing the file into file domains (FDs); each
+       process may directly access only its own file domain. */
+
+    ADIOI_Calc_file_domains(vars->st_offsets, vars->end_offsets, nprocs,
+            vars->nprocs_for_coll, &vars->min_st_offset,
+            &vars->fd_start, &vars->fd_end,
+            fd->hints->min_fdomain_size, &vars->fd_size,
+            fd->hints->striping_unit);
+
+    /* calculate what portions of the access requests of this process are
+       located in what file domains */
+
+    ADIOI_Calc_my_req(fd, vars->offset_list, vars->len_list,
+            vars->contig_access_count, vars->min_st_offset,
+            vars->fd_start, vars->fd_end, vars->fd_size,
+            nprocs, &vars->count_my_req_procs,
+            &vars->count_my_req_per_proc, &vars->my_req,
+            &vars->buf_idx);
+
+    /* based on everyone's my_req, calculate what requests of other
+       processes lie in this process's file domain.
+       count_others_req_procs = number of processes whose requests lie in
+       this process's file domain (including this process itself)
+       count_others_req_per_proc[i] indicates how many separate contiguous
+       requests of proc. i lie in this process's file domain. */
+
+    cor_vars = (ADIOI_Icalc_others_req_vars *)ADIOI_Calloc(
+            1, sizeof(ADIOI_Icalc_others_req_vars));
+    nbc_req->cor_vars = cor_vars;
+    cor_vars->fd = vars->fd;
+    cor_vars->count_my_req_procs = vars->count_my_req_procs;
+    cor_vars->count_my_req_per_proc = vars->count_my_req_per_proc;
+    cor_vars->my_req = vars->my_req;
+    cor_vars->nprocs = vars->nprocs;
+    cor_vars->myrank = vars->myrank;
+    cor_vars->count_others_req_procs_ptr = &vars->count_others_req_procs;
+    cor_vars->others_req_ptr = &vars->others_req;
+    cor_vars->next_fn = ADIOI_GEN_IwriteStridedColl_exch;
+
+    ADIOI_Icalc_others_req(nbc_req, error_code);
+}
+
+static void ADIOI_GEN_IwriteStridedColl_exch(ADIOI_NBC_Request *nbc_req,
+                                             int *error_code)
+{
+    ADIOI_GEN_IwriteStridedColl_vars *vars = nbc_req->data.wr.wsc_vars;
+    ADIOI_Iexch_and_write_vars *eaw_vars = NULL;
+    ADIOI_Access *my_req = vars->my_req;
+    int nprocs = vars->nprocs;
+    int i;
+
+    ADIOI_Free(vars->count_my_req_per_proc);
+    for (i = 0; i < nprocs; i++) {
+        if (my_req[i].count) {
+            ADIOI_Free(my_req[i].offsets);
+            ADIOI_Free(my_req[i].lens);
+        }
+    }
+    ADIOI_Free(my_req);
+
+    /* exchange data and write in sizes of no more than coll_bufsize. */
+    /* Cast away const'ness for the below function */
+    eaw_vars = (ADIOI_Iexch_and_write_vars *)ADIOI_Calloc(
+            1, sizeof(ADIOI_Iexch_and_write_vars));
+    nbc_req->data.wr.eaw_vars = eaw_vars;
+    eaw_vars->fd = vars->fd;
+    eaw_vars->buf = (char *)vars->buf;
+    eaw_vars->datatype = vars->datatype;
+    eaw_vars->nprocs = vars->nprocs;
+    eaw_vars->myrank = vars->myrank;
+    eaw_vars->others_req = vars->others_req;
+    eaw_vars->offset_list = vars->offset_list;
+    eaw_vars->len_list = vars->len_list;
+    eaw_vars->contig_access_count = vars->contig_access_count;
+    eaw_vars->min_st_offset = vars->min_st_offset;
+    eaw_vars->fd_size = vars->fd_size;
+    eaw_vars->fd_start = vars->fd_start;
+    eaw_vars->fd_end = vars->fd_end;
+    eaw_vars->buf_idx = vars->buf_idx;
+    eaw_vars->next_fn = ADIOI_GEN_IwriteStridedColl_bcast;
+
+    ADIOI_Iexch_and_write(nbc_req, error_code);
+}
+
+static void ADIOI_GEN_IwriteStridedColl_bcast(ADIOI_NBC_Request *nbc_req,
+                                              int *error_code)
+{
+    ADIOI_GEN_IwriteStridedColl_vars *vars = nbc_req->data.wr.wsc_vars;
+    ADIO_File fd = vars->fd;
+
+    /* If this collective write is followed by an independent write,
+     * it's possible to have those subsequent writes on other processes
+     * race ahead and sneak in before the read-modify-write completes.
+     * We carry out a collective communication at the end here so no one
+     * can start independent i/o before collective I/O completes.
+     *
+     * need to do some gymnastics with the error codes so that if something
+     * went wrong, all processes report error, but if a process has a more
+     * specific error code, we can still have that process report the
+     * additional information */
+
+    vars->old_error = *error_code;
+    if (*error_code != MPI_SUCCESS) *error_code = MPI_ERR_IO;
+
+    /* optimization: if only one process is performing i/o, we can use
+     * a less-expensive broadcast in place of the allreduce */
+#ifdef ADIOI_MPE_LOGGING
+    MPE_Log_event( ADIOI_MPE_postwrite_a, 0, NULL );
+#endif
+    vars->error_code = *error_code;
+    if (fd->hints->cb_nodes == 1) {
+        *error_code = MPI_Ibcast(&vars->error_code, 1, MPI_INT,
+                                 fd->hints->ranklist[0], fd->comm,
+                                 &vars->req_err);
+    } else {
+        vars->tmp_error = *error_code;
+        *error_code  = MPI_Iallreduce(&vars->tmp_error, &vars->error_code, 1,
+                                      MPI_INT, MPI_MAX, fd->comm,
+                                      &vars->req_err);
+    }
+
+    nbc_req->data.wr.state = ADIOI_IWC_STATE_GEN_IWRITESTRIDEDCOLL_BCAST;
+}
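+
+/* For example, if only rank 1 hit an I/O error, it contributes
+ * MPI_ERR_IO while every other rank contributes MPI_SUCCESS (== 0).
+ * Since MPI error codes are positive, MPI_MAX delivers a nonzero code
+ * to all ranks, so every process learns that the collective write
+ * failed even though only one of them saw the error. */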
+
+static void ADIOI_GEN_IwriteStridedColl_free(ADIOI_NBC_Request *nbc_req,
+                                             int *error_code)
+{
+    ADIOI_GEN_IwriteStridedColl_vars *vars = nbc_req->data.wr.wsc_vars;
+    ADIO_File fd = vars->fd;
+    MPI_Datatype datatype = vars->datatype;
+    ADIOI_Access *others_req = vars->others_req;
+    int nprocs = vars->nprocs;
+    int old_error = vars->old_error;
+    int i;
+
+#ifdef ADIOI_MPE_LOGGING
+    MPE_Log_event( ADIOI_MPE_postwrite_b, 0, NULL );
+#endif
+#ifdef AGGREGATION_PROFILE
+    MPE_Log_event(5012, 0, NULL);
+#endif
+
+    if ( (old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO) )
+        *error_code = old_error;
+
+
+    if (!vars->buftype_is_contig) ADIOI_Delete_flattened(datatype);
+
+    /* free all memory allocated for collective I/O */
+    for (i = 0; i < nprocs; i++) {
+        if (others_req[i].count) {
+            ADIOI_Free(others_req[i].offsets);
+            ADIOI_Free(others_req[i].lens);
+            ADIOI_Free(others_req[i].mem_ptrs);
+        }
+    }
+    ADIOI_Free(others_req);
+
+    ADIOI_Free(vars->buf_idx);
+    ADIOI_Free(vars->offset_list);
+    ADIOI_Free(vars->len_list);
+    ADIOI_Free(vars->st_offsets);
+    ADIOI_Free(vars->end_offsets);
+    ADIOI_Free(vars->fd_start);
+    ADIOI_Free(vars->fd_end);
+
+    fd->fp_sys_posn = -1;   /* mark it as invalid. */
+#ifdef AGGREGATION_PROFILE
+    MPE_Log_event (5013, 0, NULL);
+#endif
+
+    ADIOI_GEN_IwriteStridedColl_fini(nbc_req, error_code);
+}
+
+static void ADIOI_GEN_IwriteStridedColl_fini(ADIOI_NBC_Request *nbc_req,
+                                             int *error_code)
+{
+    ADIOI_GEN_IwriteStridedColl_vars *vars = nbc_req->data.wr.wsc_vars;
+    MPI_Count size;
+
+    /* This is a temporary way of filling in status. The right way is to
+       keep track of how much data was actually written during collective I/O. */
+    MPI_Type_size_x(vars->datatype, &size);
+    nbc_req->nbytes = size * vars->count;
+
+    /* free the struct for parameters and variables */
+    if (nbc_req->data.wr.wsc_vars) {
+        ADIOI_Free(nbc_req->data.wr.wsc_vars);
+        nbc_req->data.wr.wsc_vars = NULL;
+    }
+
+    /* make the request complete */
+    *error_code = MPI_Grequest_complete(nbc_req->req);
+    nbc_req->data.wr.state = ADIOI_IWC_STATE_COMPLETE;
+}
+
+
+static void ADIOI_Iexch_and_write(ADIOI_NBC_Request *nbc_req, int *error_code)
+{
+    ADIOI_Iexch_and_write_vars *vars = nbc_req->data.wr.eaw_vars;
+    ADIO_File fd = vars->fd;
+    MPI_Datatype datatype = vars->datatype;
+    int nprocs = vars->nprocs;
+    ADIOI_Access *others_req = vars->others_req;
+
+    /* Send data to appropriate processes and write in sizes of no more
+       than coll_bufsize.
+       The idea is to reduce the amount of extra memory required for
+       collective I/O. If all data were written all at once, which is much
+       easier, it would require temp space more than the size of user_buf,
+       which is often unacceptable. For example, to write a distributed
+       array to a file, where each local array is 8Mbytes, requiring
+       at least another 8Mbytes of temp space is unacceptable. */
+
+    int i, j;
+    ADIO_Offset st_loc = -1, end_loc = -1;
+    ADIOI_Flatlist_node *flat_buf = NULL;
+    int info_flag, coll_bufsize;
+    char *value;
+
+    *error_code = MPI_SUCCESS;  /* changed below if error */
+    /* only I/O errors are currently reported */
+
+    /* calculate the number of writes of size coll_bufsize
+       to be done by each process and the max among all processes.
+       That gives the no. of communication phases as well. */
+
+    value = (char *)ADIOI_Malloc((MPI_MAX_INFO_VAL+1) * sizeof(char));
+    ADIOI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value,
+                   &info_flag);
+    coll_bufsize = atoi(value);
+    vars->coll_bufsize = coll_bufsize;
+    ADIOI_Free(value);
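+
+    /* e.g. (illustrative): this hint can be set before MPI_File_open with
+       MPI_Info_set(info, "cb_buffer_size", "16777216") for 16 MB phases. */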
+
+    for (i = 0; i < nprocs; i++) {
+        if (others_req[i].count) {
+            st_loc = others_req[i].offsets[0];
+            end_loc = others_req[i].offsets[0];
+            break;
+        }
+    }
+
+    for (i = 0; i < nprocs; i++)
+        for (j = 0; j < others_req[i].count; j++) {
+            st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
+            end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j]
+                        + others_req[i].lens[j] - 1));
+        }
+
+    vars->st_loc = st_loc;
+    vars->end_loc = end_loc;
+
+    /* ntimes = ceiling_div(end_loc - st_loc + 1, coll_bufsize) */
+
+    vars->ntimes = (int)((end_loc - st_loc + coll_bufsize) / coll_bufsize);
+
+    if ((st_loc==-1) && (end_loc==-1)) {
+        vars->ntimes = 0; /* this process does no writing. */
+    }
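+
+    /* e.g. (illustrative): with end_loc - st_loc + 1 = 20 MB of file domain
+       and coll_bufsize = 16 MB, ntimes = (20 MB - 1 + 16 MB) / 16 MB = 2,
+       i.e. two write/communication phases on this process. */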
+
+    *error_code = MPI_Iallreduce(&vars->ntimes, &vars->max_ntimes, 1, MPI_INT,
+                                 MPI_MAX, fd->comm, &vars->req1);
+
+    vars->write_buf = fd->io_buf;
+
+    vars->curr_offlen_ptr = (int *)ADIOI_Calloc(nprocs, sizeof(int));
+    /* its use is explained below. calloc initializes to 0. */
+
+    vars->count = (int *)ADIOI_Malloc(nprocs*sizeof(int));
+    /* to store count of how many off-len pairs per proc are satisfied
+       in an iteration. */
+
+    vars->partial_recv = (int *)ADIOI_Calloc(nprocs, sizeof(int));
+    /* if only a portion of the last off-len pair is recd. from a process
+       in a particular iteration, the length recd. is stored here.
+       calloc initializes to 0. */
+
+    vars->send_size = (int *)ADIOI_Malloc(nprocs*sizeof(int));
+    /* total size of data to be sent to each proc. in an iteration.
+       Of size nprocs so that I can use MPI_Alltoall later. */
+
+    vars->recv_size = (int *)ADIOI_Malloc(nprocs*sizeof(int));
+    /* total size of data to be recd. from each proc. in an iteration.*/
+
+    vars->sent_to_proc = (int *)ADIOI_Calloc(nprocs, sizeof(int));
+    /* amount of data sent to each proc so far. Used in
+       ADIOI_Fill_send_buffer. initialized to 0 here. */
+
+    vars->send_buf_idx = (int *)ADIOI_Malloc(nprocs*sizeof(int));
+    vars->curr_to_proc = (int *)ADIOI_Malloc(nprocs*sizeof(int));
+    vars->done_to_proc = (int *)ADIOI_Malloc(nprocs*sizeof(int));
+    /* Above three are used in ADIOI_Fill_send_buffer*/
+
+    vars->start_pos = (int *)ADIOI_Malloc(nprocs*sizeof(int));
+    /* used to store the starting value of curr_offlen_ptr[i] in
+       this iteration */
+
+    ADIOI_Datatype_iscontig(datatype, &vars->buftype_is_contig);
+    if (!vars->buftype_is_contig) {
+        ADIOI_Flatten_datatype(datatype);
+        flat_buf = ADIOI_Flatlist;
+        while (flat_buf->type != datatype) flat_buf = flat_buf->next;
+        vars->flat_buf = flat_buf;
+    }
+    MPI_Type_extent(datatype, &vars->buftype_extent);
+
+
+    /* I need to check if there are any outstanding nonblocking writes to
+       the file, which could potentially interfere with the writes taking
+       place in this collective write call. Since this is not likely to be
+       common, let me do the simplest thing possible here: Each process
+       completes all pending nonblocking operations before proceeding. */
+
+    /*ADIOI_Complete_async(error_code);
+      if (*error_code != MPI_SUCCESS) return;
+      MPI_Barrier(fd->comm);
+     */
+
+    vars->done = 0;
+    vars->off = st_loc;
+
+    /* set the state to wait until MPI_Iallreduce finishes. */
+    nbc_req->data.wr.state = ADIOI_IWC_STATE_IEXCH_AND_WRITE;
+}
+
+static void ADIOI_Iexch_and_write_l1_begin(ADIOI_NBC_Request *nbc_req,
+                                           int *error_code)
+{
+    ADIOI_Iexch_and_write_vars *vars = nbc_req->data.wr.eaw_vars;
+    int nprocs;
+    ADIOI_Access *others_req;
+
+    int i, j;
+    ADIO_Offset off, req_off;
+    char *write_buf;
+    int *curr_offlen_ptr, *count, req_len, *recv_size;
+    int *partial_recv, *start_pos;
+    ADIO_Offset size;
+    static char myname[] = "ADIOI_IEXCH_AND_WRITE_L1_BEGIN";
+
+    ADIOI_W_Iexchange_data_vars *wed_vars = NULL;
+
+    /* loop exit condition */
+    if (vars->m >= vars->ntimes) {
+        ADIOI_Iexch_and_write_reset(nbc_req, error_code);
+        return;
+    }
+
+    nprocs = vars->nprocs;
+    others_req = vars->others_req;
+
+    off = vars->off;
+    write_buf = vars->write_buf;
+    curr_offlen_ptr = vars->curr_offlen_ptr;
+    count = vars->count;
+    recv_size = vars->recv_size;
+    partial_recv = vars->partial_recv;
+    start_pos = vars->start_pos;
+
+    /* go through all others_req and check which will be satisfied
+       by the current write */
+
+    /* Note that MPI guarantees that displacements in filetypes are in
+       monotonically nondecreasing order and that, for writes, the
+       filetypes cannot specify overlapping regions in the file. This
+       simplifies implementation a bit compared to reads. */
+
+    /* off = start offset in the file for the data to be written in
+       this iteration
+       size = size of data written (bytes) corresponding to off
+       req_off = off in file for a particular contiguous request
+       minus what was satisfied in previous iteration
+       req_size = size corresponding to req_off */
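+
+    /* Worked example (values assumed for illustration): with off = 0 and
+       size = 16 MB, an off-len pair (offset 10 MB, length 10 MB) is only
+       partially satisfied this iteration: recv_size[i] grows by 6 MB and
+       partial_recv[i] = 6 MB, so the next iteration sees the pair rewritten
+       as (offset 16 MB, length 4 MB). */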
+
+    /* first calculate what should be communicated */
+
+    for (i = 0; i < nprocs; i++) count[i] = recv_size[i] = 0;
+
+    size = ADIOI_MIN((unsigned)vars->coll_bufsize,
+                     vars->end_loc - vars->st_loc + 1 - vars->done);
+    vars->size = size;
+
+    for (i = 0; i < nprocs; i++) {
+        if (others_req[i].count) {
+            start_pos[i] = curr_offlen_ptr[i];
+            for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) {
+                if (partial_recv[i]) {
+                    /* this request may have been partially
+                       satisfied in the previous iteration. */
+                    req_off = others_req[i].offsets[j] +
+                        partial_recv[i];
+                    req_len = others_req[i].lens[j] -
+                        partial_recv[i];
+                    partial_recv[i] = 0;
+                    /* modify the off-len pair to reflect this change */
+                    others_req[i].offsets[j] = req_off;
+                    others_req[i].lens[j] = req_len;
+                }
+                else {
+                    req_off = others_req[i].offsets[j];
+                    req_len = others_req[i].lens[j];
+                }
+                if (req_off < off + size) {
+                    count[i]++;
+                    ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)write_buf)+req_off-off) == (ADIO_Offset)(MPIR_Upint)(write_buf+req_off-off));
+                    MPI_Address(write_buf + req_off - off,
+                                &(others_req[i].mem_ptrs[j]));
+                    ADIOI_Assert((off + size - req_off) == (int)(off + size - req_off));
+                    recv_size[i] += (int)(ADIOI_MIN(off + size - req_off,
+                                                    (unsigned)req_len));
+
+                    if (off+size-req_off < (unsigned)req_len)
+                    {
+                        partial_recv[i] = (int)(off + size - req_off);
+
+                        /* --BEGIN ERROR HANDLING-- */
+                        if ((j+1 < others_req[i].count) &&
+                            (others_req[i].offsets[j+1] < off+size))
+                        {
+                            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+                                    MPIR_ERR_RECOVERABLE,
+                                    myname,
+                                    __LINE__,
+                                    MPI_ERR_ARG,
+                                    "Filetype specifies overlapping write regions (which is illegal according to the MPI-2 specification)", 0);
+                            /* allow execution to continue since additional
+                             * communication might have to occur
+                             */
+                        }
+                        /* --END ERROR HANDLING-- */
+                        break;
+                    }
+                }
+                else break;
+            }
+            curr_offlen_ptr[i] = j;
+        }
+    }
+
+    /* create a struct for ADIOI_W_Iexchange_data() */
+    wed_vars = (ADIOI_W_Iexchange_data_vars *)ADIOI_Calloc(
+            1, sizeof(ADIOI_W_Iexchange_data_vars));
+    nbc_req->data.wr.wed_vars = wed_vars;
+
+    wed_vars->fd = vars->fd;
+    wed_vars->buf = vars->buf;
+    wed_vars->write_buf = vars->write_buf;
+    wed_vars->flat_buf = vars->flat_buf;
+    wed_vars->offset_list = vars->offset_list;
+    wed_vars->len_list = vars->len_list;
+    wed_vars->send_size = vars->send_size;
+    wed_vars->recv_size = vars->recv_size;
+    wed_vars->off = vars->off;
+    wed_vars->size = vars->size;
+    wed_vars->count = vars->count;
+    wed_vars->start_pos = vars->start_pos;
+    wed_vars->partial_recv = vars->partial_recv;
+    wed_vars->sent_to_proc = vars->sent_to_proc;
+    wed_vars->nprocs = vars->nprocs;
+    wed_vars->myrank = vars->myrank;
+    wed_vars->buftype_is_contig = vars->buftype_is_contig;
+    wed_vars->contig_access_count = vars->contig_access_count;
+    wed_vars->min_st_offset = vars->min_st_offset;
+    wed_vars->fd_size = vars->fd_size;
+    wed_vars->fd_start = vars->fd_start;
+    wed_vars->fd_end = vars->fd_end;
+    wed_vars->others_req = vars->others_req;
+    wed_vars->send_buf_idx = vars->send_buf_idx;
+    wed_vars->curr_to_proc = vars->curr_to_proc;
+    wed_vars->done_to_proc = vars->done_to_proc;
+    wed_vars->hole = &vars->hole;
+    wed_vars->iter = vars->m;
+    wed_vars->buftype_extent = vars->buftype_extent;
+    wed_vars->buf_idx = vars->buf_idx;
+    wed_vars->next_fn = ADIOI_Iexch_and_write_l1_body;
+
+    ADIOI_W_Iexchange_data(nbc_req, error_code);
+}
+
+static void ADIOI_Iexch_and_write_l1_body(ADIOI_NBC_Request *nbc_req,
+                                          int *error_code)
+{
+    ADIOI_Iexch_and_write_vars *vars = nbc_req->data.wr.eaw_vars;
+    ADIO_File fd = vars->fd;
+    int nprocs = vars->nprocs;
+    ADIO_Offset size = vars->size;
+    char *write_buf = vars->write_buf;
+    int *count = vars->count;
+    int flag, i;
+
+    flag = 0;
+    for (i = 0; i < nprocs; i++)
+        if (count[i]) flag = 1;
+
+    if (flag) {
+        ADIOI_Assert(size == (int)size);
+        ADIO_IwriteContig(fd, write_buf, (int)size, MPI_BYTE,
+                          ADIO_EXPLICIT_OFFSET, vars->off, &vars->req3,
+                          error_code);
+
+        nbc_req->data.wr.state = ADIOI_IWC_STATE_IEXCH_AND_WRITE_L1_BODY;
+        return;
+    }
+
+    ADIOI_Iexch_and_write_l1_end(nbc_req, error_code);
+}
+
+static void ADIOI_Iexch_and_write_l1_end(ADIOI_NBC_Request *nbc_req,
+                                         int *error_code)
+{
+    ADIOI_Iexch_and_write_vars *vars = nbc_req->data.wr.eaw_vars;
+    ADIO_Offset size = vars->size;
+
+    vars->off += size;
+    vars->done += size;
+
+    /* increment m and go back to the beginning of m loop */
+    vars->m++;
+    ADIOI_Iexch_and_write_l1_begin(nbc_req, error_code);
+}
+
+static void ADIOI_Iexch_and_write_reset(ADIOI_NBC_Request *nbc_req,
+                                        int *error_code)
+{
+    ADIOI_Iexch_and_write_vars *vars = nbc_req->data.wr.eaw_vars;
+    int nprocs = vars->nprocs;
+    int *count = vars->count;
+    int *recv_size = vars->recv_size;
+    int i;
+
+    for (i = 0; i < nprocs; i++) count[i] = recv_size[i] = 0;
+
+    vars->m = vars->ntimes;
+    ADIOI_Iexch_and_write_l2_begin(nbc_req, error_code);
+}
+
+static void ADIOI_Iexch_and_write_l2_begin(ADIOI_NBC_Request *nbc_req,
+                                           int *error_code)
+{
+    ADIOI_Iexch_and_write_vars *vars = nbc_req->data.wr.eaw_vars;
+    ADIO_Offset size = vars->size;
+    ADIOI_W_Iexchange_data_vars *wed_vars = NULL;
+
+    /* loop exit condition */
+    if (vars->m >= vars->max_ntimes) {
+        ADIOI_Iexch_and_write_fini(nbc_req, error_code);
+        return;
+    }
+
+    ADIOI_Assert(size == (int)size);
+
+    /* create a struct for ADIOI_W_Iexchange_data() */
+    wed_vars = (ADIOI_W_Iexchange_data_vars *)ADIOI_Calloc(
+            1, sizeof(ADIOI_W_Iexchange_data_vars));
+    nbc_req->data.wr.wed_vars = wed_vars;
+
+    wed_vars->fd = vars->fd;
+    wed_vars->buf = vars->buf;
+    wed_vars->write_buf = vars->write_buf;
+    wed_vars->flat_buf = vars->flat_buf;
+    wed_vars->offset_list = vars->offset_list;
+    wed_vars->len_list = vars->len_list;
+    wed_vars->send_size = vars->send_size;
+    wed_vars->recv_size = vars->recv_size;
+    wed_vars->off = vars->off;
+    wed_vars->size = (int)vars->size;
+    wed_vars->count = vars->count;
+    wed_vars->start_pos = vars->start_pos;
+    wed_vars->partial_recv = vars->partial_recv;
+    wed_vars->sent_to_proc = vars->sent_to_proc;
+    wed_vars->nprocs = vars->nprocs;
+    wed_vars->myrank = vars->myrank;
+    wed_vars->buftype_is_contig = vars->buftype_is_contig;
+    wed_vars->contig_access_count = vars->contig_access_count;
+    wed_vars->min_st_offset = vars->min_st_offset;
+    wed_vars->fd_size = vars->fd_size;
+    wed_vars->fd_start = vars->fd_start;
+    wed_vars->fd_end = vars->fd_end;
+    wed_vars->others_req = vars->others_req;
+    wed_vars->send_buf_idx = vars->send_buf_idx;
+    wed_vars->curr_to_proc = vars->curr_to_proc;
+    wed_vars->done_to_proc = vars->done_to_proc;
+    wed_vars->hole = &vars->hole;
+    wed_vars->iter = vars->m;
+    wed_vars->buftype_extent = vars->buftype_extent;
+    wed_vars->buf_idx = vars->buf_idx;
+    wed_vars->next_fn = ADIOI_Iexch_and_write_l2_end;
+
+    /* nothing to recv, but check for send. */
+    ADIOI_W_Iexchange_data(nbc_req, error_code);
+}
+
+static void ADIOI_Iexch_and_write_l2_end(ADIOI_NBC_Request *nbc_req,
+                                         int *error_code)
+{
+    ADIOI_Iexch_and_write_vars *vars = nbc_req->data.wr.eaw_vars;
+
+    vars->m++;
+    ADIOI_Iexch_and_write_l2_begin(nbc_req, error_code);
+}
+
+static void ADIOI_Iexch_and_write_fini(ADIOI_NBC_Request *nbc_req, int *error_code)
+{
+    ADIOI_Iexch_and_write_vars *vars = nbc_req->data.wr.eaw_vars;
+    void (*next_fn)(ADIOI_NBC_Request *, int *);
+
+    ADIOI_Free(vars->curr_offlen_ptr);
+    ADIOI_Free(vars->count);
+    ADIOI_Free(vars->partial_recv);
+    ADIOI_Free(vars->send_size);
+    ADIOI_Free(vars->recv_size);
+    ADIOI_Free(vars->sent_to_proc);
+    ADIOI_Free(vars->start_pos);
+    ADIOI_Free(vars->send_buf_idx);
+    ADIOI_Free(vars->curr_to_proc);
+    ADIOI_Free(vars->done_to_proc);
+
+    next_fn = vars->next_fn;
+
+    /* free the struct for parameters and variables */
+    ADIOI_Free(nbc_req->data.wr.eaw_vars);
+    nbc_req->data.wr.eaw_vars = NULL;
+
+    /* move to the next function */
+    next_fn(nbc_req, error_code);
+}
+
+
+static void ADIOI_W_Iexchange_data(ADIOI_NBC_Request *nbc_req, int *error_code)
+{
+    ADIOI_W_Iexchange_data_vars *vars = nbc_req->data.wr.wed_vars;
+
+    /* exchange recv_size info so that each process knows how much to
+       send to whom. */
+
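+    /* e.g. (illustrative): rank 0's recv_size[1] = 4096 arrives on rank 1
+       as send_size[0] = 4096, telling rank 1 to send 4 KB to rank 0. */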
+    *error_code = MPI_Ialltoall(vars->recv_size, 1, MPI_INT, vars->send_size, 1,
+                                MPI_INT, vars->fd->comm, &vars->req1);
+
+    nbc_req->data.wr.state = ADIOI_IWC_STATE_W_IEXCHANGE_DATA;
+}
+
+static void ADIOI_W_Iexchange_data_hole(ADIOI_NBC_Request *nbc_req,
+                                        int *error_code)
+{
+    ADIOI_W_Iexchange_data_vars *vars = nbc_req->data.wr.wed_vars;
+    ADIO_File fd = vars->fd;
+    int *recv_size = vars->recv_size;
+    ADIO_Offset off = vars->off;
+    int size = vars->size;
+    int *count = vars->count;
+    int *start_pos = vars->start_pos;
+    int *partial_recv = vars->partial_recv;
+    int nprocs = vars->nprocs;
+    ADIOI_Access *others_req = vars->others_req;
+    int *hole = vars->hole;
+
+    int i, j, k, *tmp_len, nprocs_recv;
+    MPI_Datatype *recv_types;
+    int *srt_len = NULL, sum;
+    ADIO_Offset *srt_off = NULL;
+
+    /* create derived datatypes for recv */
+
+    nprocs_recv = 0;
+    for (i = 0; i < nprocs; i++) if (recv_size[i]) nprocs_recv++;
+    vars->nprocs_recv = nprocs_recv;
+
+    recv_types = (MPI_Datatype *)
+        ADIOI_Malloc((nprocs_recv+1)*sizeof(MPI_Datatype));
+    vars->recv_types = recv_types;
+    /* +1 to avoid a 0-size malloc */
+
+    tmp_len = (int *)ADIOI_Malloc(nprocs*sizeof(int));
+    j = 0;
+    for (i = 0; i < nprocs; i++) {
+        if (recv_size[i]) {
+            /* take care if the last off-len pair is a partial recv */
+            if (partial_recv[i]) {
+                k = start_pos[i] + count[i] - 1;
+                tmp_len[i] = others_req[i].lens[k];
+                others_req[i].lens[k] = partial_recv[i];
+            }
+            ADIOI_Type_create_hindexed_x(count[i],
+                    &(others_req[i].lens[start_pos[i]]),
+                    &(others_req[i].mem_ptrs[start_pos[i]]),
+                    MPI_BYTE, recv_types+j);
+            /* absolute displacements; use MPI_BOTTOM in recv */
+            MPI_Type_commit(recv_types+j);
+            j++;
+        }
+    }
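+
+    /* Because mem_ptrs[] hold absolute addresses (from MPI_Address), the
+       matching receives are later posted as MPI_Irecv(MPI_BOTTOM, 1,
+       recv_types[j], ...), so incoming data lands directly in write_buf
+       with no intermediate unpack. */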
+
+    /* To avoid a read-modify-write, check if there are holes in the
+       data to be written. For this, merge the (sorted) offset lists
+       others_req using a heap-merge. */
+
+    sum = 0;
+    for (i = 0; i < nprocs; i++) sum += count[i];
+    /* valgrind-detected optimization: if there is no work on this process we
+     * do not need to search for holes */
+    if (sum) {
+        srt_off = (ADIO_Offset *)ADIOI_Malloc(sum*sizeof(ADIO_Offset));
+        srt_len = (int *)ADIOI_Malloc(sum*sizeof(int));
+
+        ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
+                         nprocs, nprocs_recv, sum);
+    }
+
+    /* for partial recvs, restore original lengths */
+    for (i = 0; i < nprocs; i++)
+        if (partial_recv[i]) {
+            k = start_pos[i] + count[i] - 1;
+            others_req[i].lens[k] = tmp_len[i];
+        }
+    ADIOI_Free(tmp_len);
+
+    /* check if there are any holes. If yes, must do read-modify-write.
+     * holes can be in three places.  'middle' is what you'd expect: the
+     * processes are operating on noncontiguous data.  But holes can also show
+     * up at the beginning or end of the file domain (see John Bent ROMIO REQ
+     * #835). Missing these holes would result in us writing more data than
+     * received by everyone else. */
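+
+    /* e.g. (illustrative): with off = 0 and size = 10, merged pairs (0,5)
+       and (6,4) leave byte 5 uncovered; the coalescing loop below stops at
+       the gap and *hole is set, forcing the read-modify-write. */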
+
+    *hole = 0;
+    if (sum) {
+        if (off != srt_off[0]) /* hole at the front */
+            *hole = 1;
+        else { /* coalesce the sorted offset-length pairs */
+            for (i = 1; i < sum; i++) {
+                if (srt_off[i] <= srt_off[0] + srt_len[0]) {
+                    /* ok to cast: operating on cb_buffer_size chunks */
+                    int new_len = (int)(srt_off[i] + srt_len[i] - srt_off[0]);
+                    if (new_len > srt_len[0]) srt_len[0] = new_len;
+                }
+                else
+                    break;
+            }
+            if (i < sum || size != srt_len[0]) /* hole in middle or end */
+                *hole = 1;
+        }
+
+        ADIOI_Free(srt_off);
+        ADIOI_Free(srt_len);
+    }
+
+    if (nprocs_recv) {
+        if (*hole) {
+            ADIO_IreadContig(fd, vars->write_buf, size, MPI_BYTE,
+                             ADIO_EXPLICIT_OFFSET, off, &vars->req2,
+                             &vars->err);
+            nbc_req->data.wr.state = ADIOI_IWC_STATE_W_IEXCHANGE_DATA_HOLE;
+            return;
+        }
+    }
+
+    ADIOI_W_Iexchange_data_send(nbc_req, error_code);
+}
+
+static void ADIOI_W_Iexchange_data_send(ADIOI_NBC_Request *nbc_req,
+                                        int *error_code)
+{
+    ADIOI_W_Iexchange_data_vars *vars = nbc_req->data.wr.wed_vars;
+    ADIO_File fd = vars->fd;
+    void *buf = vars->buf;
+    int *send_size = vars->send_size;
+    int *recv_size = vars->recv_size;
+    int nprocs = vars->nprocs;
+    int myrank = vars->myrank;
+    int iter = vars->iter;
+    int *buf_idx = vars->buf_idx;
+
+    int nprocs_recv = vars->nprocs_recv;
+    MPI_Datatype *recv_types = vars->recv_types;
+
+    int i, j;
+    int nprocs_send;
+    char **send_buf = NULL;
+
+    nprocs_send = 0;
+    for (i = 0; i < nprocs; i++) if (send_size[i]) nprocs_send++;
+    vars->nprocs_send = nprocs_send;
+
+    if (fd->atomicity) {
+        /* bug fix from Wei-keng Liao and Kenin Coloma */
+        vars->requests = (MPI_Request *)
+            ADIOI_Malloc((nprocs_send+1)*sizeof(MPI_Request));
+        vars->send_req = vars->requests;
+    }
+    else {
+        vars->requests = (MPI_Request *)
+            ADIOI_Malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request));
+        /* +1 to avoid a 0-size malloc */
+
+        /* post receives */
+        j = 0;
+        for (i = 0; i < nprocs; i++) {
+            if (recv_size[i]) {
+                MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i, myrank+i+100*iter,
+                          fd->comm, vars->requests+j);
+                j++;
+            }
+        }
+        vars->send_req = vars->requests + nprocs_recv;
+    }
+
+    /* post sends. if buftype_is_contig, data can be directly sent from
+       user buf at location given by buf_idx. else use send_buf. */
+
+#ifdef AGGREGATION_PROFILE
+    MPE_Log_event (5032, 0, NULL);
+#endif
+    if (vars->buftype_is_contig) {
+        j = 0;
+        for (i = 0; i < nprocs; i++)
+            if (send_size[i]) {
+                MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
+                          MPI_BYTE, i,  myrank+i+100*iter, fd->comm,
+                          vars->send_req+j);
+                j++;
+                buf_idx[i] += send_size[i];
+            }
+    }
+    else if (nprocs_send) {
+        /* buftype is not contig */
+        send_buf = (char **)ADIOI_Malloc(nprocs*sizeof(char*));
+        vars->send_buf = send_buf;
+        for (i = 0; i < nprocs; i++)
+            if (send_size[i])
+                send_buf[i] = (char *)ADIOI_Malloc(send_size[i]);
+
+        ADIOI_Fill_send_buffer(fd, buf, vars->flat_buf, send_buf,
+                               vars->offset_list, vars->len_list, send_size,
+                               vars->send_req,
+                               vars->sent_to_proc, nprocs, myrank,
+                               vars->contig_access_count,
+                               vars->min_st_offset, vars->fd_size,
+                               vars->fd_start, vars->fd_end,
+                               vars->send_buf_idx, vars->curr_to_proc,
+                               vars->done_to_proc, iter,
+                               vars->buftype_extent);
+        /* the send is done in ADIOI_Fill_send_buffer */
+    }
+
+    if (fd->atomicity) {
+        vars->req3 = (MPI_Request *)
+            ADIOI_Malloc((nprocs_recv+1)*sizeof(MPI_Request));
+        /* +1 to avoid a 0-size malloc */
+
+        /* bug fix from Wei-keng Liao and Kenin Coloma */
+        j = 0;
+        for (i = 0; i < nprocs; i++) {
+            if (recv_size[i]) {
+                MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i, myrank+i+100*iter,
+                          fd->comm, vars->req3+j);
+                j++;
+            }
+        }
+
+        nbc_req->data.wr.state = ADIOI_IWC_STATE_W_IEXCHANGE_DATA_SEND;
+        return;
+    }
+
+    ADIOI_W_Iexchange_data_wait(nbc_req, error_code);
+}
+
+static void ADIOI_W_Iexchange_data_wait(ADIOI_NBC_Request *nbc_req,
+                                        int *error_code)
+{
+    ADIOI_W_Iexchange_data_vars *vars = nbc_req->data.wr.wed_vars;
+    ADIO_File fd = vars->fd;
+    int nprocs_send = vars->nprocs_send;
+    int nprocs_recv = vars->nprocs_recv;
+    MPI_Datatype *recv_types = vars->recv_types;
+    int i;
+
+    for (i = 0; i < nprocs_recv; i++) MPI_Type_free(recv_types+i);
+    ADIOI_Free(recv_types);
+
+    i = 0;  /* reused below as the completion flag for MPI_Testall */
+    if (fd->atomicity) {
+        /* bug fix from Wei-keng Liao and Kenin Coloma */
+        MPI_Testall(nprocs_send, vars->send_req, &i, MPI_STATUSES_IGNORE);
+    }
+    else {
+        MPI_Testall(nprocs_send+nprocs_recv, vars->requests, &i,
+                    MPI_STATUSES_IGNORE);
+    }
+
+    if (i) {
+        ADIOI_W_Iexchange_data_fini(nbc_req, error_code);
+    } else {
+        nbc_req->data.wr.state = ADIOI_IWC_STATE_W_IEXCHANGE_DATA_WAIT;
+    }
+}
+
+static void ADIOI_W_Iexchange_data_fini(ADIOI_NBC_Request *nbc_req, int *error_code)
+{
+    ADIOI_W_Iexchange_data_vars *vars = nbc_req->data.wr.wed_vars;
+    void (*next_fn)(ADIOI_NBC_Request *, int *);
+    ADIO_File fd = vars->fd;
+    int *send_size = vars->send_size;
+    int nprocs = vars->nprocs;
+    char **send_buf = vars->send_buf;
+    int i;
+
+    if (fd->atomicity) ADIOI_Free(vars->req3);
+
+#ifdef AGGREGATION_PROFILE
+    MPE_Log_event (5033, 0, NULL);
+#endif
+    ADIOI_Free(vars->requests);
+    if (!vars->buftype_is_contig && vars->nprocs_send) {
+        for (i = 0; i < nprocs; i++)
+            if (send_size[i]) ADIOI_Free(send_buf[i]);
+        ADIOI_Free(send_buf);
+    }
+
+    next_fn = vars->next_fn;
+
+    /* free the structure for parameters and variables */
+    ADIOI_Free(vars);
+    nbc_req->data.wr.wed_vars = NULL;
+
+    /* move to the next function */
+    next_fn(nbc_req, error_code);
+}
+
+
+static int ADIOI_GEN_iwc_query_fn(void *extra_state, MPI_Status *status)
+{
+    ADIOI_NBC_Request *nbc_req;
+
+    nbc_req = (ADIOI_NBC_Request *)extra_state;
+
+    MPI_Status_set_elements_x(status, MPI_BYTE, nbc_req->nbytes);
+
+    /* this request can never be cancelled, so the cancelled flag is always false */
+    MPI_Status_set_cancelled(status, 0);
+
+    /* choose not to return a value for this */
+    status->MPI_SOURCE = MPI_UNDEFINED;
+    /* tag has no meaning for this generalized request */
+    status->MPI_TAG = MPI_UNDEFINED;
+
+    /* this generalized request never fails */
+    return MPI_SUCCESS;
+}
+
+static int ADIOI_GEN_iwc_free_fn(void *extra_state)
+{
+    ADIOI_NBC_Request *nbc_req;
+
+    nbc_req = (ADIOI_NBC_Request *)extra_state;
+    ADIOI_Free(nbc_req);
+
+    return MPI_SUCCESS;
+}
+
+static int ADIOI_GEN_iwc_poll_fn(void *extra_state, MPI_Status *status)
+{
+    ADIOI_NBC_Request *nbc_req;
+    ADIOI_GEN_IwriteStridedColl_vars *wsc_vars = NULL;
+    ADIOI_Icalc_others_req_vars      *cor_vars = NULL;
+    ADIOI_Iexch_and_write_vars       *eaw_vars = NULL;
+    ADIOI_W_Iexchange_data_vars      *wed_vars = NULL;
+    int errcode = MPI_SUCCESS;
+    int flag;
+
+    nbc_req = (ADIOI_NBC_Request *)extra_state;
+
+    switch (nbc_req->data.wr.state) {
+        case ADIOI_IWC_STATE_GEN_IWRITESTRIDEDCOLL:
+            wsc_vars = nbc_req->data.wr.wsc_vars;
+            errcode = MPI_Testall(2, wsc_vars->req_offset, &flag,
+                                  MPI_STATUSES_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                ADIOI_GEN_IwriteStridedColl_inter(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IWC_STATE_GEN_IWRITESTRIDEDCOLL_INDIO:
+            wsc_vars = nbc_req->data.wr.wsc_vars;
+            errcode = MPI_Test(&wsc_vars->req_ind_io, &flag, MPI_STATUS_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                /* call the last function */
+                ADIOI_GEN_IwriteStridedColl_fini(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IWC_STATE_GEN_IWRITESTRIDEDCOLL_BCAST:
+            wsc_vars = nbc_req->data.wr.wsc_vars;
+            errcode = MPI_Test(&wsc_vars->req_err, &flag, MPI_STATUS_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                errcode = wsc_vars->error_code;
+                ADIOI_GEN_IwriteStridedColl_free(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IWC_STATE_ICALC_OTHERS_REQ:
+            cor_vars = nbc_req->cor_vars;
+            errcode = MPI_Test(&cor_vars->req1, &flag, MPI_STATUS_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                ADIOI_Icalc_others_req_main(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IWC_STATE_ICALC_OTHERS_REQ_MAIN:
+            cor_vars = nbc_req->cor_vars;
+            if (cor_vars->num_req2) {
+                errcode = MPI_Testall(cor_vars->num_req2, cor_vars->req2,
+                                      &flag, MPI_STATUSES_IGNORE);
+                if (errcode == MPI_SUCCESS && flag) {
+                    ADIOI_Icalc_others_req_fini(nbc_req, &errcode);
+                }
+            } else {
+                ADIOI_Icalc_others_req_fini(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IWC_STATE_IEXCH_AND_WRITE:
+            eaw_vars = nbc_req->data.wr.eaw_vars;
+            errcode = MPI_Test(&eaw_vars->req1, &flag, MPI_STATUS_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                eaw_vars->m = 0;
+                ADIOI_Iexch_and_write_l1_begin(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IWC_STATE_IEXCH_AND_WRITE_L1_BODY:
+            eaw_vars = nbc_req->data.wr.eaw_vars;
+            errcode = MPI_Test(&eaw_vars->req3, &flag, MPI_STATUS_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                ADIOI_Iexch_and_write_l1_end(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IWC_STATE_W_IEXCHANGE_DATA:
+            wed_vars = nbc_req->data.wr.wed_vars;
+            errcode = MPI_Test(&wed_vars->req1, &flag, MPI_STATUS_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                ADIOI_W_Iexchange_data_hole(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IWC_STATE_W_IEXCHANGE_DATA_HOLE:
+            wed_vars = nbc_req->data.wr.wed_vars;
+            errcode = MPI_Test(&wed_vars->req2, &flag, MPI_STATUS_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                /* --BEGIN ERROR HANDLING-- */
+                if (wed_vars->err != MPI_SUCCESS) {
+                    errcode = MPIO_Err_create_code(wed_vars->err,
+                            MPIR_ERR_RECOVERABLE,
+                            "ADIOI_W_EXCHANGE_DATA",
+                            __LINE__, MPI_ERR_IO,
+                            "**ioRMWrdwr", 0);
+                    break;
+                }
+                /* --END ERROR HANDLING-- */
+                ADIOI_W_Iexchange_data_send(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IWC_STATE_W_IEXCHANGE_DATA_SEND:
+            wed_vars = nbc_req->data.wr.wed_vars;
+            errcode = MPI_Testall(wed_vars->nprocs_recv, wed_vars->req3,
+                                  &flag, MPI_STATUSES_IGNORE);
+            if (errcode == MPI_SUCCESS && flag) {
+                ADIOI_W_Iexchange_data_wait(nbc_req, &errcode);
+            }
+            break;
+
+        case ADIOI_IWC_STATE_W_IEXCHANGE_DATA_WAIT:
+            wed_vars = nbc_req->data.wr.wed_vars;
+            if (wed_vars->fd->atomicity) {
+                /* bug fix from Wei-keng Liao and Kenin Coloma */
+                errcode = MPI_Testall(wed_vars->nprocs_send, wed_vars->send_req,
+                                      &flag, MPI_STATUSES_IGNORE);
+            } else {
+                errcode = MPI_Testall(wed_vars->nprocs_send +
+                                      wed_vars->nprocs_recv,
+                                      wed_vars->requests,
+                                      &flag, MPI_STATUSES_IGNORE);
+            }
+            if (errcode == MPI_SUCCESS && flag) {
+                ADIOI_W_Iexchange_data_fini(nbc_req, &errcode);
+            }
+            break;
+
+        default:
+            break;
+    }
+
+    /* --BEGIN ERROR HANDLING-- */
+    if (errcode != MPI_SUCCESS) {
+        errcode = MPIO_Err_create_code(MPI_SUCCESS,
+                MPIR_ERR_RECOVERABLE,
+                "ADIOI_GEN_iwc_poll_fn", __LINE__,
+                MPI_ERR_IO, "**mpi_grequest_complete",
+                0);
+    }
+    /* --END ERROR HANDLING-- */
+
+    return errcode;
+}
+
+/* wait for multiple requests to complete */
+static int ADIOI_GEN_iwc_wait_fn(int count, void **array_of_states,
+                                 double timeout, MPI_Status *status)
+{
+    int i, errcode = MPI_SUCCESS;
+    double starttime;
+    ADIOI_NBC_Request **nbc_reqlist;
+
+    nbc_reqlist = (ADIOI_NBC_Request **)array_of_states;
+
+    starttime = MPI_Wtime();
+    for (i = 0; i < count ; i++) {
+        while (nbc_reqlist[i]->data.wr.state != ADIOI_IWC_STATE_COMPLETE) {
+            errcode = ADIOI_GEN_iwc_poll_fn(nbc_reqlist[i], MPI_STATUS_IGNORE);
+            /* --BEGIN ERROR HANDLING-- */
+            if (errcode != MPI_SUCCESS) {
+                errcode = MPIO_Err_create_code(MPI_SUCCESS,
+                        MPIR_ERR_RECOVERABLE,
+                        "ADIOI_GEN_iwc_wait_fn",
+                        __LINE__, MPI_ERR_IO,
+                        "**mpi_grequest_complete", 0);
+            }
+            /* --END ERROR HANDLING-- */
+
+            if ((timeout > 0) && (timeout < (MPI_Wtime() - starttime)))
+                goto fn_exit;
+        }
+    }
+
+  fn_exit:
+    return errcode;
+}
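+
+/* How the four callbacks above fit together, as a minimal sketch; the
+   registration happens elsewhere in this patch, and the cancel callback
+   name is assumed from ROMIO's mpiu_greq.h:
+
+       MPIX_Grequest_class iwc_class;
+       MPIX_Grequest_class_create(ADIOI_GEN_iwc_query_fn,
+                                  ADIOI_GEN_iwc_free_fn,
+                                  MPIU_Greq_cancel_fn,
+                                  ADIOI_GEN_iwc_poll_fn,
+                                  ADIOI_GEN_iwc_wait_fn,
+                                  &iwc_class);
+       MPIX_Grequest_class_allocate(iwc_class, nbc_req, &nbc_req->req);
+*/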
+
diff --git a/src/mpi/romio/adio/common/ad_iwrite_coll.pdf b/src/mpi/romio/adio/common/ad_iwrite_coll.pdf
new file mode 100644
index 0000000..8efe138
Binary files /dev/null and b/src/mpi/romio/adio/common/ad_iwrite_coll.pdf differ
diff --git a/src/mpi/romio/adio/common/ad_read_coll.c b/src/mpi/romio/adio/common/ad_read_coll.c
index c5202a3..6577637 100644
--- a/src/mpi/romio/adio/common/ad_read_coll.c
+++ b/src/mpi/romio/adio/common/ad_read_coll.c
@@ -39,7 +39,7 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
 				  ADIOI_Access *others_req, 
 				  int iter, 
 				  MPI_Aint buftype_extent, int *buf_idx);
-static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
+void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
 				   *flat_buf, char **recv_buf, ADIO_Offset 
 				   *offset_list, ADIO_Offset *len_list, 
 				   unsigned *recv_size, 
@@ -954,7 +954,7 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
     ADIOI_BUF_INCR \
 }
 
-static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
+void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
 				   *flat_buf, char **recv_buf, ADIO_Offset 
 				   *offset_list, ADIO_Offset *len_list, 
 				   unsigned *recv_size, 
diff --git a/src/mpi/romio/adio/common/ad_write_coll.c b/src/mpi/romio/adio/common/ad_write_coll.c
index d585f9e..56dfdd2 100644
--- a/src/mpi/romio/adio/common/ad_write_coll.c
+++ b/src/mpi/romio/adio/common/ad_write_coll.c
@@ -35,7 +35,7 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
                          int *send_buf_idx, int *curr_to_proc,
                          int *done_to_proc, int *hole, int iter, 
                          MPI_Aint buftype_extent, int *buf_idx, int *error_code);
-static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
+void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
                            *flat_buf, char **send_buf, ADIO_Offset 
                            *offset_list, ADIO_Offset *len_list, int *send_size, 
                            MPI_Request *requests, int *sent_to_proc, 
@@ -860,7 +860,7 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
 
 
 
-static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
+void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
                            *flat_buf, char **send_buf, ADIO_Offset 
                            *offset_list, ADIO_Offset *len_list, int *send_size, 
                            MPI_Request *requests, int *sent_to_proc, 
diff --git a/src/mpi/romio/adio/include/adio.h b/src/mpi/romio/adio/include/adio.h
index 7ad3ced..20ceb30 100644
--- a/src/mpi/romio/adio/include/adio.h
+++ b/src/mpi/romio/adio/include/adio.h
@@ -398,6 +398,14 @@ void ADIO_IwriteStrided(ADIO_File fd, void *buf, int count,
 		       MPI_Datatype datatype, int file_ptr_type,
 		       ADIO_Offset offset, ADIO_Request *request, int
 		       *error_code);
+void ADIO_IreadStridedColl(ADIO_File fd, void *buf, int count,
+               MPI_Datatype datatype, int file_ptr_type,
+               ADIO_Offset offset, ADIO_Request *request,
+               int *error_code);
+void ADIO_IwriteStridedColl(ADIO_File fd, void *buf, int count,
+               MPI_Datatype datatype, int file_ptr_type,
+               ADIO_Offset offset, ADIO_Request *request,
+               int *error_code);
 ADIO_Offset ADIO_SeekIndividual(ADIO_File fd, ADIO_Offset offset, 
                        int whence, int *error_code);
 void ADIO_Delete(char *filename, int *error_code);
diff --git a/src/mpi/romio/adio/include/adioi.h b/src/mpi/romio/adio/include/adioi.h
index e3f9a16..02cea30 100644
--- a/src/mpi/romio/adio/include/adioi.h
+++ b/src/mpi/romio/adio/include/adioi.h
@@ -197,6 +197,12 @@ struct ADIOI_Fns_struct {
     void (*ADIOI_xxx_Delete) (const char *filename, int *error_code);
     int  (*ADIOI_xxx_Feature) (ADIO_File fd, int flag);
     const char *fsname;
+    void (*ADIOI_xxx_IreadStridedColl) (ADIO_File fd, void *buf, int count,
+           MPI_Datatype datatype, int file_ptr_type,
+           ADIO_Offset offset, ADIO_Request *request, int *error_code);
+    void (*ADIOI_xxx_IwriteStridedColl) (ADIO_File fd, const void *buf,
+           int count, MPI_Datatype datatype, int file_ptr_type,
+           ADIO_Offset offset, ADIO_Request *request, int *error_code);
 };
 
 /* optypes for ADIO_RequestD */
@@ -287,6 +293,12 @@ struct ADIOI_Fns_struct {
 #define ADIO_IwriteStrided(fd,buf,count,datatype,file_ptr_type,offset,request,error_code) \
         (*(fd->fns->ADIOI_xxx_IwriteStrided))(fd,buf,count,datatype,file_ptr_type,offset,request,error_code)
 
+#define ADIO_IreadStridedColl(fd,buf,count,datatype,file_ptr_type,offset,request,error_code) \
+        (*(fd->fns->ADIOI_xxx_IreadStridedColl))(fd,buf,count,datatype,file_ptr_type,offset,request,error_code)
+
+#define ADIO_IwriteStridedColl(fd,buf,count,datatype,file_ptr_type,offset,request,error_code) \
+        (*(fd->fns->ADIOI_xxx_IwriteStridedColl))(fd,buf,count,datatype,file_ptr_type,offset,request,error_code)
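+
+/* e.g. (illustrative): ADIO_IwriteStridedColl(fd, buf, ...) dispatches
+   through the file system's function table, so a file system whose
+   ADIOI_Fns_struct installs ADIOI_GEN_IwriteStridedColl ends up in the
+   generic implementation added by this patch. */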
+
 #define ADIO_Flush(fd,error_code) (*(fd->fns->ADIOI_xxx_Flush))(fd,error_code)
 
 #define ADIO_Resize(fd,size,error_code) \
@@ -417,10 +429,18 @@ void ADIOI_GEN_ReadStridedColl(ADIO_File fd, void *buf, int count,
                        MPI_Datatype datatype, int file_ptr_type,
                        ADIO_Offset offset, ADIO_Status *status, int
                        *error_code);
+void ADIOI_GEN_IreadStridedColl(ADIO_File fd, void *buf, int count,
+                       MPI_Datatype datatype, int file_ptr_type,
+                       ADIO_Offset offset, MPI_Request *request,
+                       int *error_code);
 void ADIOI_GEN_WriteStridedColl(ADIO_File fd, const void *buf, int count,
                        MPI_Datatype datatype, int file_ptr_type,
                        ADIO_Offset offset, ADIO_Status *status, int
                        *error_code);
+void ADIOI_GEN_IwriteStridedColl(ADIO_File fd, const void *buf, int count,
+                       MPI_Datatype datatype, int file_ptr_type,
+                       ADIO_Offset offset, MPI_Request *request,
+                       int *error_code);
 void ADIOI_Calc_my_off_len(ADIO_File fd, int bufcount, MPI_Datatype
 			    datatype, int file_ptr_type, ADIO_Offset 
 			    offset, ADIO_Offset **offset_list_ptr, ADIO_Offset
@@ -458,6 +478,98 @@ void ADIOI_Calc_others_req(ADIO_File fd, int count_my_req_procs,
 				int *count_others_req_procs_ptr,
 				ADIOI_Access **others_req_ptr);  
 
+
+/* Nonblocking Collective I/O internals */
+typedef enum {
+    ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL,
+    ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL_INDIO,
+    ADIOI_IRC_STATE_ICALC_OTHERS_REQ,
+    ADIOI_IRC_STATE_ICALC_OTHERS_REQ_MAIN,
+    ADIOI_IRC_STATE_IREAD_AND_EXCH,
+    ADIOI_IRC_STATE_IREAD_AND_EXCH_L1_BEGIN,
+    ADIOI_IRC_STATE_R_IEXCHANGE_DATA,
+    ADIOI_IRC_STATE_R_IEXCHANGE_DATA_RECV,
+    ADIOI_IRC_STATE_R_IEXCHANGE_DATA_FILL,
+    ADIOI_IRC_STATE_COMPLETE
+} ADIOI_IRC_State;
+
+typedef enum {
+    ADIOI_IWC_STATE_GEN_IWRITESTRIDEDCOLL,
+    ADIOI_IWC_STATE_GEN_IWRITESTRIDEDCOLL_INDIO,
+    ADIOI_IWC_STATE_GEN_IWRITESTRIDEDCOLL_BCAST,
+    ADIOI_IWC_STATE_ICALC_OTHERS_REQ,
+    ADIOI_IWC_STATE_ICALC_OTHERS_REQ_MAIN,
+    ADIOI_IWC_STATE_IEXCH_AND_WRITE,
+    ADIOI_IWC_STATE_IEXCH_AND_WRITE_L1_BODY,
+    ADIOI_IWC_STATE_W_IEXCHANGE_DATA,
+    ADIOI_IWC_STATE_W_IEXCHANGE_DATA_HOLE,
+    ADIOI_IWC_STATE_W_IEXCHANGE_DATA_SEND,
+    ADIOI_IWC_STATE_W_IEXCHANGE_DATA_WAIT,
+    ADIOI_IWC_STATE_COMPLETE
+} ADIOI_IWC_State;
+
+typedef struct ADIOI_NBC_Request                ADIOI_NBC_Request;
+
+typedef struct ADIOI_GEN_IreadStridedColl_vars  ADIOI_GEN_IreadStridedColl_vars;
+typedef struct ADIOI_Iread_and_exch_vars        ADIOI_Iread_and_exch_vars;
+typedef struct ADIOI_R_Iexchange_data_vars      ADIOI_R_Iexchange_data_vars;
+
+typedef struct ADIOI_GEN_IwriteStridedColl_vars ADIOI_GEN_IwriteStridedColl_vars;
+typedef struct ADIOI_Iexch_and_write_vars       ADIOI_Iexch_and_write_vars;
+typedef struct ADIOI_W_Iexchange_data_vars      ADIOI_W_Iexchange_data_vars;
+
+typedef struct ADIOI_Icalc_others_req_vars {
+    /* requests */
+    MPI_Request req1;
+    MPI_Request *req2;
+    int num_req2;
+
+    /* parameters */
+    ADIO_File fd;
+    int count_my_req_procs;
+    int *count_my_req_per_proc;
+    ADIOI_Access *my_req;
+    int nprocs;
+    int myrank;
+    int *count_others_req_procs_ptr;
+    ADIOI_Access **others_req_ptr;
+
+    /* stack variables */
+    int *count_others_req_per_proc;
+    int count_others_req_procs;
+    ADIOI_Access *others_req;
+
+    /* next function to be called */
+    void (*next_fn)(ADIOI_NBC_Request *, int *);
+} ADIOI_Icalc_others_req_vars;
+
+struct ADIOI_NBC_Request {
+    int rdwr;           /* ADIOI_READ or ADIOI_WRITE */
+    MPI_Request req;    /* MPIX_Grequest */
+    MPI_Count nbytes;   /* data read or written */
+
+    union {
+        struct {
+            ADIOI_IRC_State state;      /* progress state */
+            ADIOI_GEN_IreadStridedColl_vars *rsc_vars;
+            ADIOI_Iread_and_exch_vars       *rae_vars;
+            ADIOI_R_Iexchange_data_vars     *red_vars;
+        } rd;
+        struct {
+            ADIOI_IWC_State state;      /* progress state */
+            ADIOI_GEN_IwriteStridedColl_vars *wsc_vars;
+            ADIOI_Iexch_and_write_vars       *eaw_vars;
+            ADIOI_W_Iexchange_data_vars      *wed_vars;
+        } wr;
+    } data;
+    ADIOI_Icalc_others_req_vars *cor_vars;
+};
+
+void ADIOI_Icalc_others_req(ADIOI_NBC_Request *nbc_req, int *error_code);
+void ADIOI_Icalc_others_req_main(ADIOI_NBC_Request *nbc_req, int *error_code);
+void ADIOI_Icalc_others_req_fini(ADIOI_NBC_Request *nbc_req, int *error_code);
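+
+/* The routines above run in continuation-passing style: each phase posts a
+   nonblocking MPI operation, stores the function to resume in a next_fn
+   field, and records its progress state in the ADIOI_NBC_Request; the
+   generalized request's poll function then tests the pending operation and,
+   on completion, invokes the stored continuation.  E.g. (illustrative flow):
+
+       ADIOI_Icalc_others_req()       posts a nonblocking count exchange
+       ADIOI_Icalc_others_req_main()  runs once poll sees it complete
+       ADIOI_Icalc_others_req_fini()  cleans up, then calls cor_vars->next_fn
+*/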
+
+
 /* KC && AC - New Collective I/O internals*/
 
 #define TEMP_OFF 0
@@ -688,6 +800,22 @@ int MPIOI_File_iread(MPI_File fh,
 		     MPI_Datatype datatype,
 		     char *myname,
 		     MPI_Request *request);
+int MPIOI_File_iwrite_all(MPI_File fh,
+            MPI_Offset offset,
+            int file_ptr_type,
+            const void *buf,
+            int count,
+            MPI_Datatype datatype,
+            char *myname,
+            MPI_Request *request);
+int MPIOI_File_iread_all(MPI_File fh,
+            MPI_Offset offset,
+            int file_ptr_type,
+            void *buf,
+            int count,
+            MPI_Datatype datatype,
+            char *myname,
+            MPI_Request *request);
 
 
 
diff --git a/src/mpi/romio/include/mpio.h.in b/src/mpi/romio/include/mpio.h.in
index d4c7ed7..2238f4b 100644
--- a/src/mpi/romio/include/mpio.h.in
+++ b/src/mpi/romio/include/mpio.h.in
@@ -237,6 +237,20 @@ int MPI_File_sync(MPI_File fh);
 int MPI_File_set_errhandler(MPI_File file, MPI_Errhandler errhandler);
 int MPI_File_get_errhandler(MPI_File file, MPI_Errhandler *errhandler);
 #endif
+
+/* For MPI 3.1 */
+int MPIX_File_iread_at_all(MPI_File fh, MPI_Offset offset, void *buf, int count,
+                           MPI_Datatype datatype, MPI_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5);
+int MPIX_File_iwrite_at_all(MPI_File fh, MPI_Offset offset, const void *buf, int count,
+                            MPI_Datatype datatype, MPI_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5);
+int MPIX_File_iread_all(MPI_File fh, void *buf, int count, MPI_Datatype datatype,
+                        MPI_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4);
+int MPIX_File_iwrite_all(MPI_File fh, const void *buf, int count, MPI_Datatype datatype,
+                         MPI_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4);
 /* End Prototypes */
 
 #ifndef HAVE_MPI_DARRAY_SUBARRAY
@@ -453,6 +467,20 @@ int PMPI_File_set_errhandler( MPI_File, MPI_Errhandler );
 int PMPI_File_get_errhandler( MPI_File, MPI_Errhandler * );
 #endif
 
+/* For MPI 3.1 */
+int PMPIX_File_iread_at_all(MPI_File fh, MPI_Offset offset, void *buf, int count,
+                            MPI_Datatype datatype, MPI_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5);
+int PMPIX_File_iwrite_at_all(MPI_File fh, MPI_Offset offset, const void *buf, int count,
+                             MPI_Datatype datatype, MPI_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5);
+int PMPIX_File_iread_all(MPI_File fh, void *buf, int count, MPI_Datatype datatype,
+                         MPI_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4);
+int PMPIX_File_iwrite_all(MPI_File fh, const void *buf, int count, MPI_Datatype datatype,
+                          MPI_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4);
+
 #ifndef HAVE_MPI_DARRAY_SUBARRAY
 /* Section 4.14.4 */
 int PMPI_Type_create_subarray(int, int *, int *, int *, int, 
diff --git a/src/mpi/romio/mpi-io/Makefile.mk b/src/mpi/romio/mpi-io/Makefile.mk
index b917fad..d4d5a29 100644
--- a/src/mpi/romio/mpi-io/Makefile.mk
+++ b/src/mpi/romio/mpi-io/Makefile.mk
@@ -28,10 +28,14 @@ romio_mpi_sources +=          \
     mpi-io/get_size.c         \
     mpi-io/get_view.c         \
     mpi-io/iread.c            \
+    mpi-io/iread_all.c        \
     mpi-io/iread_at.c         \
+    mpi-io/iread_atall.c      \
     mpi-io/iread_sh.c         \
     mpi-io/iwrite.c           \
+    mpi-io/iwrite_all.c       \
     mpi-io/iwrite_at.c        \
+    mpi-io/iwrite_atall.c     \
     mpi-io/iwrite_sh.c        \
     mpi-io/open.c             \
     mpi-io/prealloc.c         \
diff --git a/src/mpi/romio/mpi-io/iread_all.c b/src/mpi/romio/mpi-io/iread_all.c
new file mode 100644
index 0000000..ba36161
--- /dev/null
+++ b/src/mpi/romio/mpi-io/iread_all.c
@@ -0,0 +1,149 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "mpioimpl.h"
+
+#ifdef HAVE_WEAK_SYMBOLS
+
+#if defined(HAVE_PRAGMA_WEAK)
+#pragma weak MPIX_File_iread_all = PMPIX_File_iread_all
+#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
+#pragma _HP_SECONDARY_DEF PMPIX_File_iread_all MPIX_File_iread_all
+#elif defined(HAVE_PRAGMA_CRI_DUP)
+#pragma _CRI duplicate MPIX_File_iread_all as PMPIX_File_iread_all
+/* end of weak pragmas */
+#elif defined(HAVE_WEAK_ATTRIBUTE)
+int MPIX_File_iread_all(MPI_File fh, void *buf, int count, MPI_Datatype datatype,
+                       MPI_Request *request)
+    __attribute__((weak,alias("PMPIX_File_iread_all")));
+#endif
+
+/* Include mapping from MPI->PMPI */
+#define MPIO_BUILD_PROFILING
+#include "mpioprof.h"
+#endif
+
+#ifdef HAVE_MPI_GREQUEST
+#include "mpiu_greq.h"
+#endif
+
+/*@
+    MPIX_File_iread_all - Nonblocking collective read using individual file pointer
+
+Input Parameters:
+. fh - file handle (handle)
+. count - number of elements in buffer (nonnegative integer)
+. datatype - datatype of each buffer element (handle)
+
+Output Parameters:
+. buf - initial address of buffer (choice)
+. request - request object (handle)
+
+.N fortran
+@*/
+int MPIX_File_iread_all(MPI_File fh, void *buf, int count,
+                       MPI_Datatype datatype, MPI_Request *request)
+{
+    int error_code;
+    static char myname[] = "MPIX_FILE_IREAD_ALL";
+#ifdef MPI_hpux
+    int fl_xmpi;
+
+    HPMP_IO_START(fl_xmpi, BLKMPIFILEIREADALL, TRDTBLOCK, fh, datatype, count);
+#endif /* MPI_hpux */
+
+    error_code = MPIOI_File_iread_all(fh, (MPI_Offset)0,
+                     ADIO_INDIVIDUAL, buf,
+                     count, datatype, myname, request);
+
+    /* --BEGIN ERROR HANDLING-- */
+    if (error_code != MPI_SUCCESS) {
+        error_code = MPIO_Err_return_file(fh, error_code);
+    }
+    /* --END ERROR HANDLING-- */
+
+#ifdef MPI_hpux
+    HPMP_IO_END(fl_xmpi, fh, datatype, count);
+#endif /* MPI_hpux */
+
+    return error_code;
+}
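+
+/* Typical usage (illustrative): the call returns immediately and the request
+   completes like any other MPI request, e.g.
+
+       MPI_Request req;
+       MPIX_File_iread_all(fh, buf, count, MPI_INT, &req);
+       ... overlap computation or independent I/O here ...
+       MPI_Wait(&req, MPI_STATUS_IGNORE);
+*/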
+
+/* Note: MPIOI_File_iread_all also used by MPIX_File_iread_at_all */
+/* prevent multiple definitions of this routine */
+#ifdef MPIO_BUILD_PROFILING
+int MPIOI_File_iread_all(MPI_File fh,
+            MPI_Offset offset,
+            int file_ptr_type,
+            void *buf,
+            int count,
+            MPI_Datatype datatype,
+            char *myname,
+            MPI_Request *request)
+{
+    int error_code;
+    MPI_Count datatype_size;
+    ADIO_File adio_fh;
+    void *xbuf=NULL, *e32_buf=NULL;
+
+    MPIU_THREAD_CS_ENTER(ALLFUNC,);
+
+    adio_fh = MPIO_File_resolve(fh);
+
+    /* --BEGIN ERROR HANDLING-- */
+    MPIO_CHECK_FILE_HANDLE(adio_fh, myname, error_code);
+    MPIO_CHECK_COUNT(adio_fh, count, myname, error_code);
+    MPIO_CHECK_DATATYPE(adio_fh, datatype, myname, error_code);
+
+    if (file_ptr_type == ADIO_EXPLICIT_OFFSET && offset < 0) {
+        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+                                          myname, __LINE__, MPI_ERR_ARG,
+                                          "**iobadoffset", 0);
+        error_code = MPIO_Err_return_file(adio_fh, error_code);
+        goto fn_exit;
+    }
+    /* --END ERROR HANDLING-- */
+
+    MPI_Type_size_x(datatype, &datatype_size);
+
+    /* --BEGIN ERROR HANDLING-- */
+    MPIO_CHECK_INTEGRAL_ETYPE(adio_fh, count, datatype_size, myname, error_code);
+    MPIO_CHECK_READABLE(adio_fh, myname, error_code);
+    MPIO_CHECK_NOT_SEQUENTIAL_MODE(adio_fh, myname, error_code);
+    MPIO_CHECK_COUNT_SIZE(adio_fh, count, datatype_size, myname, error_code);
+    /* --END ERROR HANDLING-- */
+
+    xbuf = buf;
+    if (adio_fh->is_external32) {
+        MPI_Aint e32_size = 0;
+        error_code = MPIU_datatype_full_size(datatype, &e32_size);
+        if (error_code != MPI_SUCCESS)
+            goto fn_exit;
+
+        e32_buf = ADIOI_Malloc(e32_size*count);
+        xbuf = e32_buf;
+    }
+
+    ADIO_IreadStridedColl(adio_fh, xbuf, count, datatype, file_ptr_type,
+                          offset, request, &error_code);
+
+    /* --BEGIN ERROR HANDLING-- */
+    if (error_code != MPI_SUCCESS)
+        error_code = MPIO_Err_return_file(adio_fh, error_code);
+    /* --END ERROR HANDLING-- */
+
+    if (e32_buf != NULL) {
+        error_code = MPIU_read_external32_conversion_fn(xbuf, datatype,
+                                                        count, e32_buf);
+        ADIOI_Free(e32_buf);
+    }
+
+fn_exit:
+    MPIU_THREAD_CS_EXIT(ALLFUNC,);
+
+    return error_code;
+}
+#endif
diff --git a/src/mpi/romio/mpi-io/iread_atall.c b/src/mpi/romio/mpi-io/iread_atall.c
new file mode 100644
index 0000000..ec70b2d
--- /dev/null
+++ b/src/mpi/romio/mpi-io/iread_atall.c
@@ -0,0 +1,74 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "mpioimpl.h"
+
+#ifdef HAVE_WEAK_SYMBOLS
+
+#if defined(HAVE_PRAGMA_WEAK)
+#pragma weak MPIX_File_iread_at_all = PMPIX_File_iread_at_all
+#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
+#pragma _HP_SECONDARY_DEF PMPIX_File_iread_at_all MPIX_File_iread_at_all
+#elif defined(HAVE_PRAGMA_CRI_DUP)
+#pragma _CRI duplicate MPIX_File_iread_at_all as PMPIX_File_iread_at_all
+/* end of weak pragmas */
+#elif defined(HAVE_WEAK_ATTRIBUTE)
+int MPIX_File_iread_at_all(MPI_File fh, MPI_Offset offset, void *buf, int count,
+                          MPI_Datatype datatype, MPI_Request *request)
+    __attribute__((weak,alias("PMPIX_File_iread_at_all")));
+#endif
+
+/* Include mapping from MPI->PMPI */
+#define MPIO_BUILD_PROFILING
+#include "mpioprof.h"
+#endif
+
+#ifdef HAVE_MPI_GREQUEST
+#include "mpiu_greq.h"
+#endif
+
+/*@
+    MPIX_File_iread_at_all - Nonblocking collective read using explicit offset
+
+Input Parameters:
+. fh - file handle (handle)
+. offset - file offset (nonnegative integer)
+. count - number of elements in buffer (nonnegative integer)
+. datatype - datatype of each buffer element (handle)
+
+Output Parameters:
+. buf - initial address of buffer (choice)
+. request - request object (handle)
+
+.N fortran
+@*/
+int MPIX_File_iread_at_all(MPI_File fh, MPI_Offset offset, void *buf,
+                          int count, MPI_Datatype datatype,
+                          MPI_Request *request)
+{
+    int error_code;
+    static char myname[] = "MPIX_FILE_IREAD_AT_ALL";
+#ifdef MPI_hpux
+    int fl_xmpi;
+
+    HPMP_IO_START(fl_xmpi, BLKMPIFILEIREADATALL, TRDTBLOCK, fh, datatype,
+		  count);
+#endif /* MPI_hpux */
+
+    error_code = MPIOI_File_iread_all(fh, offset, ADIO_EXPLICIT_OFFSET, buf,
+				     count, datatype, myname, request);
+
+    /* --BEGIN ERROR HANDLING-- */
+    if (error_code != MPI_SUCCESS)
+	error_code = MPIO_Err_return_file(fh, error_code);
+    /* --END ERROR HANDLING-- */
+
+#ifdef MPI_hpux
+    HPMP_IO_END(fl_xmpi, fh, datatype, count);
+#endif /* MPI_hpux */
+
+    return error_code;
+}
diff --git a/src/mpi/romio/mpi-io/iwrite_all.c b/src/mpi/romio/mpi-io/iwrite_all.c
new file mode 100644
index 0000000..aeed90d
--- /dev/null
+++ b/src/mpi/romio/mpi-io/iwrite_all.c
@@ -0,0 +1,138 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "mpioimpl.h"
+
+#ifdef HAVE_WEAK_SYMBOLS
+
+#if defined(HAVE_PRAGMA_WEAK)
+#pragma weak MPIX_File_iwrite_all = PMPIX_File_iwrite_all
+#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
+#pragma _HP_SECONDARY_DEF PMPIX_File_iwrite_all MPIX_File_iwrite_all
+#elif defined(HAVE_PRAGMA_CRI_DUP)
+#pragma _CRI duplicate MPIX_File_iwrite_all as PMPIX_File_iwrite_all
+/* end of weak pragmas */
+#elif defined(HAVE_WEAK_ATTRIBUTE)
+int MPIX_File_iwrite_all(MPI_File fh, const void *buf, int count, MPI_Datatype datatype,
+                        MPI_Request *request)
+    __attribute__((weak,alias("PMPIX_File_iwrite_all")));
+#endif
+
+/* Include mapping from MPI->PMPI */
+#define MPIO_BUILD_PROFILING
+#include "mpioprof.h"
+#endif
+
+#ifdef HAVE_MPI_GREQUEST
+#include "mpiu_greq.h"
+#endif
+
+/*@
+    MPIX_File_iwrite_all - Nonblocking collective write using individual file pointer
+
+Input Parameters:
+. fh - file handle (handle)
+. buf - initial address of buffer (choice)
+. count - number of elements in buffer (nonnegative integer)
+. datatype - datatype of each buffer element (handle)
+
+Output Parameters:
+. request - request object (handle)
+
+.N fortran
+@*/
+int MPIX_File_iwrite_all(MPI_File fh, ROMIO_CONST void *buf, int count,
+                        MPI_Datatype datatype, MPI_Request *request)
+{
+    int error_code;
+    static char myname[] = "MPIX_FILE_IWRITE_ALL";
+#ifdef MPI_hpux
+    int fl_xmpi;
+
+    HPMP_IO_START(fl_xmpi, BLKMPIFILEIWRITEALL, TRDTBLOCK, fh, datatype, count);
+#endif /* MPI_hpux */
+
+    error_code = MPIOI_File_iwrite_all(fh, (MPI_Offset) 0,
+                      ADIO_INDIVIDUAL, buf,
+                      count, datatype, myname, request);
+
+#ifdef MPI_hpux
+    HPMP_IO_END(fl_xmpi, fh, datatype, count);
+#endif /* MPI_hpux */
+
+    return error_code;
+}
+
+/* Note: MPIOI_File_iwrite_all is also used by MPIX_File_iwrite_at_all. */
+/* Guard against multiple definitions of this routine. */
+#ifdef MPIO_BUILD_PROFILING
+int MPIOI_File_iwrite_all(MPI_File fh,
+            MPI_Offset offset,
+            int file_ptr_type,
+            const void *buf,
+            int count,
+            MPI_Datatype datatype,
+            char *myname,
+            MPI_Request *request)
+{
+    int error_code;
+    MPI_Count datatype_size;
+    ADIO_File adio_fh;
+    void *e32buf=NULL;
+    const void *xbuf=NULL;
+
+    MPIU_THREAD_CS_ENTER(ALLFUNC,);
+
+    adio_fh = MPIO_File_resolve(fh);
+
+    /* --BEGIN ERROR HANDLING-- */
+    MPIO_CHECK_FILE_HANDLE(adio_fh, myname, error_code);
+    MPIO_CHECK_COUNT(adio_fh, count, myname, error_code);
+    MPIO_CHECK_DATATYPE(adio_fh, datatype, myname, error_code);
+
+    if (file_ptr_type == ADIO_EXPLICIT_OFFSET && offset < 0)
+    {
+        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+                          myname, __LINE__, MPI_ERR_ARG,
+                          "**iobadoffset", 0);
+        error_code = MPIO_Err_return_file(adio_fh, error_code);
+        goto fn_exit;
+    }
+    /* --END ERROR HANDLING-- */
+
+    MPI_Type_size_x(datatype, &datatype_size);
+
+    /* --BEGIN ERROR HANDLING-- */
+    MPIO_CHECK_INTEGRAL_ETYPE(adio_fh, count, datatype_size, myname, error_code);
+    MPIO_CHECK_WRITABLE(adio_fh, myname, error_code);
+    MPIO_CHECK_NOT_SEQUENTIAL_MODE(adio_fh, myname, error_code);
+    MPIO_CHECK_COUNT_SIZE(adio_fh, count, datatype_size, myname, error_code);
+    /* --END ERROR HANDLING-- */
+
+    xbuf = buf;
+    if (adio_fh->is_external32) {
+        error_code = MPIU_external32_buffer_setup(buf, count, datatype, &e32buf);
+        if (error_code != MPI_SUCCESS)
+            goto fn_exit;
+
+        xbuf = e32buf;
+    }
+
+    ADIO_IwriteStridedColl(adio_fh, xbuf, count, datatype, file_ptr_type,
+                           offset, request, &error_code);
+
+    /* --BEGIN ERROR HANDLING-- */
+    if (error_code != MPI_SUCCESS)
+        error_code = MPIO_Err_return_file(adio_fh, error_code);
+    /* --END ERROR HANDLING-- */
+
+fn_exit:
+    if (e32buf != NULL) ADIOI_Free(e32buf);
+    MPIU_THREAD_CS_EXIT(ALLFUNC,);
+
+    return error_code;
+}
+#endif
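
A hedged usage sketch for the individual-file-pointer write (hypothetical
caller, not part of this commit; assumes fh was opened for writing and the
file pointer is already positioned):

    #include <mpi.h>

    /* Write count ints with the new nonblocking collective; per MPI
     * semantics the buffer must not be reused until the request
     * completes, so this helper waits before returning. */
    static int write_block(MPI_File fh, const int *buf, int count)
    {
        MPI_Request req;
        int err = MPIX_File_iwrite_all(fh, buf, count, MPI_INT, &req);
        if (err != MPI_SUCCESS)
            return err;
        return MPI_Wait(&req, MPI_STATUS_IGNORE);
    }
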
diff --git a/src/mpi/romio/mpi-io/iwrite_atall.c b/src/mpi/romio/mpi-io/iwrite_atall.c
new file mode 100644
index 0000000..c5eba79
--- /dev/null
+++ b/src/mpi/romio/mpi-io/iwrite_atall.c
@@ -0,0 +1,68 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "mpioimpl.h"
+
+#ifdef HAVE_WEAK_SYMBOLS
+
+#if defined(HAVE_PRAGMA_WEAK)
+#pragma weak MPIX_File_iwrite_at_all = PMPIX_File_iwrite_at_all
+#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
+#pragma _HP_SECONDARY_DEF PMPIX_File_iwrite_at_all MPIX_File_iwrite_at_all
+#elif defined(HAVE_PRAGMA_CRI_DUP)
+#pragma _CRI duplicate MPIX_File_iwrite_at_all as PMPIX_File_iwrite_at_all
+/* end of weak pragmas */
+#elif defined(HAVE_WEAK_ATTRIBUTE)
+int MPIX_File_iwrite_at_all(MPI_File fh, MPI_Offset offset, const void *buf, int count,
+                           MPI_Datatype datatype, MPI_Request *request)
+    __attribute__((weak,alias("PMPIX_File_iwrite_at_all")));
+#endif
+
+/* Include mapping from MPI->PMPI */
+#define MPIO_BUILD_PROFILING
+#include "mpioprof.h"
+#endif
+
+#ifdef HAVE_MPI_GREQUEST
+#include "mpiu_greq.h"
+#endif
+
+/*@
+    MPIX_File_iwrite_at_all - Nonblocking collective write using explicit offset
+
+Input Parameters:
+. fh - file handle (handle)
+. offset - file offset (nonnegative integer)
+. buf - initial address of buffer (choice)
+. count - number of elements in buffer (nonnegative integer)
+. datatype - datatype of each buffer element (handle)
+
+Output Parameters:
+. request - request object (handle)
+
+.N fortran
+@*/
+int MPIX_File_iwrite_at_all(MPI_File fh, MPI_Offset offset, ROMIO_CONST void *buf,
+                           int count, MPI_Datatype datatype,
+                           MPI_Request *request)
+{
+    int error_code;
+    static char myname[] = "MPIX_FILE_IWRITE_AT_ALL";
+#ifdef MPI_hpux
+    int fl_xmpi;
+
+    HPMP_IO_START(fl_xmpi, BLKMPIFILEIWRITEATALL, TRDTBLOCK, fh, datatype, count);
+#endif /* MPI_hpux */
+
+    error_code = MPIOI_File_iwrite_all(fh, offset, ADIO_EXPLICIT_OFFSET,
+				      buf, count, datatype, myname, request);
+
+#ifdef MPI_hpux
+    HPMP_IO_END(fl_xmpi, fh, datatype, count);
+#endif /* MPI_hpux */
+    return error_code;
+}
+
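
Since the at-variant carries its offset explicitly, a caller can post the
collective and overlap unrelated local work before completing it. A sketch
under the same assumptions as above (COUNT and do_local_work are
hypothetical stand-ins, not part of this commit):

    #include <mpi.h>

    #define COUNT 1024

    /* Each rank writes its own COUNT-int block at an explicit offset,
     * overlapping local work with the pending collective. */
    static void write_my_block(MPI_File fh, int rank, const int *buf,
                               void (*do_local_work)(void))
    {
        MPI_Request req;
        MPI_Offset off = (MPI_Offset)rank * COUNT * sizeof(int);

        MPIX_File_iwrite_at_all(fh, off, buf, COUNT, MPI_INT, &req);
        do_local_work();                   /* overlap while I/O progresses */
        MPI_Wait(&req, MPI_STATUS_IGNORE); /* buf stays live until here */
    }
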
diff --git a/src/mpi/romio/mpi-io/mpioprof.h b/src/mpi/romio/mpi-io/mpioprof.h
index 32e3c7c..15654ac 100644
--- a/src/mpi/romio/mpi-io/mpioprof.h
+++ b/src/mpi/romio/mpi-io/mpioprof.h
@@ -70,6 +70,15 @@
 #undef MPI_File_get_byte_offset
 #define MPI_File_get_byte_offset PMPI_File_get_byte_offset
 
+#undef MPIX_File_iread_at_all
+#define MPIX_File_iread_at_all PMPIX_File_iread_at_all
+#undef MPIX_File_iwrite_at_all
+#define MPIX_File_iwrite_at_all PMPIX_File_iwrite_at_all
+#undef MPIX_File_iread_all
+#define MPIX_File_iread_all  PMPIX_File_iread_all
+#undef MPIX_File_iwrite_all
+#define MPIX_File_iwrite_all PMPIX_File_iwrite_all
+
 #undef MPI_File_read_shared
 #define MPI_File_read_shared PMPI_File_read_shared
 #undef MPI_File_write_shared

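
The PMPIX_ mappings above are what let the usual PMPI-style profiling
interception cover the new routines as well. A hedged sketch of a
user-level timing wrapper (hypothetical, not part of this commit; assumes
a build where ROMIO_CONST expands to const and weak symbols are in use):

    #include <stdio.h>
    #include <mpi.h>

    /* Intercept MPIX_File_iwrite_all at link time and forward to the
     * PMPIX_ entry point exported by ROMIO. */
    int MPIX_File_iwrite_all(MPI_File fh, const void *buf, int count,
                             MPI_Datatype datatype, MPI_Request *request)
    {
        double t0 = MPI_Wtime();
        int err = PMPIX_File_iwrite_all(fh, buf, count, datatype, request);
        printf("MPIX_File_iwrite_all posted in %g s\n", MPI_Wtime() - t0);
        return err;
    }
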
-----------------------------------------------------------------------

Summary of changes:
 .../fortran/use_mpi_f08/wrappers_c/buildiface      |    6 +-
 src/mpi/romio/adio/ad_gpfs/ad_gpfs.c               |    4 +-
 src/mpi/romio/adio/ad_gridftp/ad_gridftp.c         |    2 +
 src/mpi/romio/adio/ad_hfs/ad_hfs.c                 |    2 +
 src/mpi/romio/adio/ad_lustre/ad_lustre.c           |    2 +
 src/mpi/romio/adio/ad_nfs/ad_nfs.c                 |    4 +-
 src/mpi/romio/adio/ad_ntfs/ad_ntfs.c               |    4 +-
 src/mpi/romio/adio/ad_panfs/ad_panfs.c             |    4 +-
 src/mpi/romio/adio/ad_pfs/ad_pfs.c                 |    2 +
 src/mpi/romio/adio/ad_piofs/ad_piofs.c             |    2 +
 src/mpi/romio/adio/ad_pvfs/ad_pvfs.c               |    2 +
 src/mpi/romio/adio/ad_pvfs2/ad_pvfs2.c             |    4 +-
 src/mpi/romio/adio/ad_sfs/ad_sfs.c                 |    2 +
 src/mpi/romio/adio/ad_testfs/ad_testfs.c           |    4 +-
 src/mpi/romio/adio/ad_ufs/ad_ufs.c                 |    2 +
 src/mpi/romio/adio/ad_xfs/ad_xfs.c                 |    4 +-
 src/mpi/romio/adio/ad_zoidfs/ad_zoidfs.c           |    2 +
 src/mpi/romio/adio/common/Makefile.mk              |    2 +
 src/mpi/romio/adio/common/ad_aggregate.c           |  138 ++
 src/mpi/romio/adio/common/ad_iread_coll.c          | 1311 +++++++++++++++++
 src/mpi/romio/adio/common/ad_iread_coll.pdf        |  Bin 0 -> 25911 bytes
 src/mpi/romio/adio/common/ad_iwrite_coll.c         | 1535 ++++++++++++++++++++
 src/mpi/romio/adio/common/ad_iwrite_coll.pdf       |  Bin 0 -> 27758 bytes
 src/mpi/romio/adio/common/ad_read_coll.c           |    4 +-
 src/mpi/romio/adio/common/ad_write_coll.c          |    4 +-
 src/mpi/romio/adio/include/adio.h                  |    8 +
 src/mpi/romio/adio/include/adioi.h                 |  128 ++
 src/mpi/romio/include/mpio.h.in                    |   28 +
 src/mpi/romio/mpi-io/Makefile.mk                   |    4 +
 src/mpi/romio/mpi-io/iread_all.c                   |  149 ++
 src/mpi/romio/mpi-io/iread_atall.c                 |   74 +
 src/mpi/romio/mpi-io/iwrite_all.c                  |  138 ++
 src/mpi/romio/mpi-io/iwrite_atall.c                |   68 +
 src/mpi/romio/mpi-io/mpioprof.h                    |    9 +
 test/mpi/configure.ac                              |    1 +
 test/mpi/io/Makefile.am                            |   16 +
 test/mpi/io/i_aggregation1.c                       |  304 ++++
 test/mpi/io/i_aggregation2.c                       |   97 ++
 test/mpi/io/i_bigtype.c                            |  145 ++
 test/mpi/io/i_coll_test.c                          |  198 +++
 test/mpi/io/i_darray_read.c                        |  137 ++
 test/mpi/io/i_hindexed.c                           |  277 ++++
 test/mpi/io/i_hindexed_io.c                        |  118 ++
 test/mpi/io/i_noncontig_coll.c                     |  240 +++
 test/mpi/io/i_noncontig_coll2.c                    |  560 +++++++
 test/mpi/io/i_rdwrord.c                            |   73 +
 test/mpi/io/i_setviewcur.c                         |  129 ++
 test/mpi/io/i_types_with_zeros.c                   |  155 ++
 test/mpi/io/testlist                               |   13 -
 test/mpi/io/testlist.in                            |   25 +
 50 files changed, 6115 insertions(+), 25 deletions(-)
 create mode 100644 src/mpi/romio/adio/common/ad_iread_coll.c
 create mode 100644 src/mpi/romio/adio/common/ad_iread_coll.pdf
 create mode 100644 src/mpi/romio/adio/common/ad_iwrite_coll.c
 create mode 100644 src/mpi/romio/adio/common/ad_iwrite_coll.pdf
 create mode 100644 src/mpi/romio/mpi-io/iread_all.c
 create mode 100644 src/mpi/romio/mpi-io/iread_atall.c
 create mode 100644 src/mpi/romio/mpi-io/iwrite_all.c
 create mode 100644 src/mpi/romio/mpi-io/iwrite_atall.c
 create mode 100644 test/mpi/io/i_aggregation1.c
 create mode 100644 test/mpi/io/i_aggregation2.c
 create mode 100644 test/mpi/io/i_bigtype.c
 create mode 100644 test/mpi/io/i_coll_test.c
 create mode 100644 test/mpi/io/i_darray_read.c
 create mode 100644 test/mpi/io/i_hindexed.c
 create mode 100644 test/mpi/io/i_hindexed_io.c
 create mode 100644 test/mpi/io/i_noncontig_coll.c
 create mode 100644 test/mpi/io/i_noncontig_coll2.c
 create mode 100644 test/mpi/io/i_rdwrord.c
 create mode 100644 test/mpi/io/i_setviewcur.c
 create mode 100644 test/mpi/io/i_types_with_zeros.c
 delete mode 100644 test/mpi/io/testlist
 create mode 100644 test/mpi/io/testlist.in


hooks/post-receive
-- 
MPICH primary repository

