[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.2b1-37-g36ac1a9

Service Account noreply at mpich.org
Fri Apr 3 16:19:42 CDT 2015


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  36ac1a9da2f1ee4d8204d4a93952d71273d96ae3 (commit)
       via  5c4f21d038c2975d968af232447b25aaf6d1442d (commit)
       via  18516bafa3a3ee7a9e0343ad1bda9876fa52a49a (commit)
       via  41ab3461e8200920c0fb9a65f8e1921bb428fe87 (commit)
       via  398aeb4fa597ab55aff917066583e5e665631037 (commit)
       via  e6849ac4f0a85a58afefc7a0ee9643dbf225852f (commit)
       via  147329674e7989e29c441efa73ba2c4462db5683 (commit)
       via  0e4dcc43f3ef0be5fb035ceef310aeaf71e19df7 (commit)
      from  c6c0d6f6e67fcf8fb213044f551c6a577ae05dd6 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/36ac1a9da2f1ee4d8204d4a93952d71273d96ae3

commit 36ac1a9da2f1ee4d8204d4a93952d71273d96ae3
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Fri Apr 3 10:16:21 2015 -0500

    move window creation out of common code
    
    Instead of creating window at open time (depending on hints), let's
    deferr the window creation until we need it.
    
    Signed-off-by: Paul Coffman <pkcoff at us.ibm.com>

diff --git a/src/mpi/romio/adio/common/ad_close.c b/src/mpi/romio/adio/common/ad_close.c
index ff0049e..01cf1ba 100644
--- a/src/mpi/romio/adio/common/ad_close.c
+++ b/src/mpi/romio/adio/common/ad_close.c
@@ -7,9 +7,6 @@
 
 #include "adio.h"
 #include "adio_extern.h"
-extern int      gpfsmpio_aggmethod;
-extern int      gpfsmpio_onesided_no_rmw;
-
 #ifdef HAVE_UNISTD_H
 #include <unistd.h>
 #endif
@@ -117,15 +114,7 @@ void ADIO_Close(ADIO_File fd, int *error_code)
     MPI_Info_free(&(fd->info));
 
     if (fd->io_buf != NULL) ADIOI_Free(fd->io_buf);
-    /* If one-sided aggregation is chosen then free the window over the io_buf.
-     */
-    if ((gpfsmpio_aggmethod == 1) || (gpfsmpio_aggmethod == 2)) {
-	  MPI_Win_free(&fd->io_buf_window);
-	  if (!gpfsmpio_onesided_no_rmw) {
-	    MPI_Win_free(&fd->io_buf_put_amounts_window);
-	    ADIOI_Free(fd->io_buf_put_amounts);
-	  }
-	}
+    ADIOI_OneSidedCleanup(fd);
 
     /* memory for fd is freed in MPI_File_close */
 }
diff --git a/src/mpi/romio/adio/common/ad_open.c b/src/mpi/romio/adio/common/ad_open.c
index 222cb55..5c95b47 100644
--- a/src/mpi/romio/adio/common/ad_open.c
+++ b/src/mpi/romio/adio/common/ad_open.c
@@ -10,8 +10,6 @@
 #include "adio_cb_config_list.h"
 
 #include "mpio.h"
-extern int      gpfsmpio_aggmethod;
-extern int      gpfsmpio_onesided_no_rmw;
 static int is_aggregator(int rank, ADIO_File fd);
 static int uses_generic_read(ADIO_File fd);
 static int uses_generic_write(ADIO_File fd);
@@ -72,6 +70,9 @@ MPI_File ADIO_Open(MPI_Comm orig_comm,
 
     fd->err_handler = ADIOI_DFLT_ERR_HANDLER;
 
+    fd->io_buf_window = MPI_WIN_NULL;
+    fd->io_buf_put_amounts_window = MPI_WIN_NULL;
+
     MPI_Comm_rank(comm, &rank);
     MPI_Comm_size(comm, &procs);
 /* create and initialize info object */
@@ -127,15 +128,6 @@ MPI_File ADIO_Open(MPI_Comm orig_comm,
      * (e.g. Blue Gene) more efficent */
 
     fd->io_buf = ADIOI_Malloc(fd->hints->cb_buffer_size);
-    /* If one-sided aggregation is chosen then create the window over the io_buf.
-     */
-    if ((gpfsmpio_aggmethod == 1) || (gpfsmpio_aggmethod == 2)) {
-      MPI_Win_create(fd->io_buf,fd->hints->cb_buffer_size,1,MPI_INFO_NULL,fd->comm, &fd->io_buf_window);
-      if (!gpfsmpio_onesided_no_rmw) {
-        fd->io_buf_put_amounts = (int *) ADIOI_Malloc(procs*sizeof(int));
-        MPI_Win_create(fd->io_buf_put_amounts,procs*sizeof(int),sizeof(int),MPI_INFO_NULL,fd->comm, &fd->io_buf_put_amounts_window);
-      }
-    }
      /* deferred open: 
      * we can only do this optimization if 'fd->hints->deferred_open' is set
      * (which means the user hinted 'no_indep_rw' and collective buffering).
diff --git a/src/mpi/romio/adio/common/onesided_aggregation.c b/src/mpi/romio/adio/common/onesided_aggregation.c
index 559ea40..84d8333 100644
--- a/src/mpi/romio/adio/common/onesided_aggregation.c
+++ b/src/mpi/romio/adio/common/onesided_aggregation.c
@@ -1,6 +1,12 @@
 #include "adio.h"
 #include "adio_extern.h"
+#ifdef ROMIO_GPFS
+/* right now this is GPFS only but TODO: extend this to all file systems */
 #include "../ad_gpfs/ad_gpfs_tuning.h"
+#else
+int gpfsmpio_onesided_no_rmw = 0;
+int gpfsmpio_aggmethod = 0;
+#endif
 
 #include <pthread.h>
 
@@ -20,6 +26,32 @@ typedef struct NonContigSourceBufOffset {
   ADIO_Offset indiceOffset;
 } NonContigSourceBufOffset;
 
+static int ADIOI_OneSidedSetup(ADIO_File fd, int procs) {
+    int ret = MPI_SUCCESS;
+
+    ret = MPI_Win_create(fd->io_buf,fd->hints->cb_buffer_size,1,
+	    MPI_INFO_NULL,fd->comm, &fd->io_buf_window);
+    if (ret != MPI_SUCCESS) goto fn_exit;
+    fd->io_buf_put_amounts = (int *) ADIOI_Malloc(procs*sizeof(int));
+    ret =MPI_Win_create(fd->io_buf_put_amounts,procs*sizeof(int),sizeof(int),
+	    MPI_INFO_NULL,fd->comm, &fd->io_buf_put_amounts_window);
+fn_exit:
+    return ret;
+}
+
+int ADIOI_OneSidedCleanup(ADIO_File fd)
+{
+    int ret = MPI_SUCCESS;
+    if (fd->io_buf_window != MPI_WIN_NULL)
+	ret = MPI_Win_free(&fd->io_buf_window);
+    if (fd->io_buf_put_amounts_window != MPI_WIN_NULL)
+	ret = MPI_Win_free(&fd->io_buf_put_amounts_window);
+    if (fd->io_buf_put_amounts != NULL)
+	ADIOI_Free(fd->io_buf_put_amounts);
+
+    return ret;
+}
+
 void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
     ADIO_Offset *offset_list,
     ADIO_Offset *len_list,
@@ -51,6 +83,12 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
     MPI_Comm_size(fd->comm, &nprocs);
     MPI_Comm_rank(fd->comm, &myrank);
 
+    if (fd->io_buf_window == MPI_WIN_NULL ||
+	    fd->io_buf_put_amounts_window == MPI_WIN_NULL)
+    {
+	ADIOI_OneSidedSetup(fd, nprocs);
+    }
+
     /* This flag denotes whether the source datatype is contiguous, which is referenced throughout the algorithm
      * and defines how the source buffer offsets and data chunks are determined.  If the value is 1 (true - contiguous data)
      * things are profoundly simpler in that the source buffer offset for a given target offset simply linearly increases
@@ -1063,6 +1101,11 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
     MPI_Comm_size(fd->comm, &nprocs);
     MPI_Comm_rank(fd->comm, &myrank);
 
+    if (fd->io_buf_window == MPI_WIN_NULL ||
+	    fd->io_buf_put_amounts_window == MPI_WIN_NULL)
+    {
+	ADIOI_OneSidedSetup(fd, nprocs);
+    }
     /* This flag denotes whether the source datatype is contiguus, which is referenced throughout the algorithm
      * and defines how the source buffer offsets and data chunks are determined.  If the value is 1 (true - contiguous data)
      * things are profoundly simpler in that the source buffer offset for a given source offset simply linearly increases
diff --git a/src/mpi/romio/adio/include/adioi.h b/src/mpi/romio/adio/include/adioi.h
index 0290cb4..d2dbc94 100644
--- a/src/mpi/romio/adio/include/adioi.h
+++ b/src/mpi/romio/adio/include/adioi.h
@@ -686,6 +686,7 @@ void ADIOI_P2PContigReadAggregation(ADIO_File fd,
 				     ADIO_Offset *fd_start,
 				     ADIO_Offset *fd_end);
 
+int ADIOI_OneSidedCleanup(ADIO_File fd);
 void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
         ADIO_Offset *offset_list,
         ADIO_Offset *len_list,

http://git.mpich.org/mpich.git/commitdiff/5c4f21d038c2975d968af232447b25aaf6d1442d

commit 5c4f21d038c2975d968af232447b25aaf6d1442d
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Sat Feb 28 01:33:02 2015 +0000

    PAMI_Rput_typed / PAMI_Rget_typed utilization for derived types
    
    Optimization to use the PAMI_Rput_typed / PAMI_Rget_typed call in the case where PAMID
    MPI_Put / MPI_Get is called with a derived (non-contiguous) datatype.  Instead of breaking
    the MPI datatype up into contiguous chunks on the MPICH side and repeatedly calling
    PAMI_Rput / PAMI_Rget for each chunk with the associated overhead, create a PAMI datatype
    to represent the MPI derived type and make just 1 call to
    PAMI_Rput_typed / PAMI_Rget_typed.
    
    We deal with non-contiguous buffers by avoiding packing and using origin
    buffers (as in PAMI)
    
    Guarded by the PAMID_TYPED_ONESIDED environment variable.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpid/common/datatype/mpid_type_commit.c b/src/mpid/common/datatype/mpid_type_commit.c
index d990503..28be988 100644
--- a/src/mpid/common/datatype/mpid_type_commit.c
+++ b/src/mpid/common/datatype/mpid_type_commit.c
@@ -30,7 +30,7 @@ int MPID_Type_commit(MPI_Datatype *datatype_p)
     MPID_Datatype_get_ptr(*datatype_p, datatype_ptr);
 
     if (datatype_ptr->is_committed == 0) {
-	datatype_ptr->is_committed = 1;
+       datatype_ptr->is_committed = 1;
 
 #ifdef MPID_NEEDS_DLOOP_ALL_BYTES
         /* If MPID implementation needs use to reduce everything to
@@ -62,8 +62,11 @@ int MPID_Type_commit(MPI_Datatype *datatype_p)
         MPIDI_Dataloop_dot_printf(datatype_ptr->dataloop, 0, 1);
 #endif
 
-    }
+#ifdef MPID_Dev_datatype_commit_hook
+       MPID_Dev_datatype_commit_hook(datatype_p);
+#endif /* MPID_Dev_datatype_commit_hook */
 
+    }
     return mpi_errno;
 }
 
diff --git a/src/mpid/common/datatype/mpid_type_dup.c b/src/mpid/common/datatype/mpid_type_dup.c
index ae24dad..fddabea 100644
--- a/src/mpid/common/datatype/mpid_type_dup.c
+++ b/src/mpid/common/datatype/mpid_type_dup.c
@@ -82,7 +82,6 @@ int MPID_Type_dup(MPI_Datatype oldtype,
 	new_dtp->hetero_dloop       = NULL;
 	new_dtp->hetero_dloop_size  = old_dtp->hetero_dloop_size;
 	new_dtp->hetero_dloop_depth = old_dtp->hetero_dloop_depth;
-
 	*newtype = new_dtp->handle;
 
 	if (old_dtp->is_committed) {
@@ -98,7 +97,11 @@ int MPID_Type_dup(MPI_Datatype oldtype,
 				  old_dtp->hetero_dloop_size,
 				  &new_dtp->hetero_dloop);
 	    }
-	}
+
+#ifdef MPID_Dev_datatype_commit_hook
+            MPID_Dev_datatype_dup_hook(new_dtp);
+#endif /* MPID_Dev_datatype_commit_hook */
+      }
     }
 
     MPIU_DBG_MSG_D(DATATYPE,VERBOSE, "dup type %x created.", *newtype);
diff --git a/src/mpid/pamid/include/mpidi_datatypes.h b/src/mpid/pamid/include/mpidi_datatypes.h
index 14e8dc8..d3e4797 100644
--- a/src/mpid/pamid/include/mpidi_datatypes.h
+++ b/src/mpid/pamid/include/mpidi_datatypes.h
@@ -148,6 +148,7 @@ typedef struct
 
   unsigned mpir_nbc;         /**< Enable MPIR_* non-blocking collectives implementations. */
   int  numTasks;             /* total number of tasks on a job                            */
+  unsigned typed_onesided;       /**< Enable typed PAMI calls for derived types within MPID_Put and MPID_Get. */
 #ifdef DYNAMIC_TASKING
   struct MPIDI_PG_t * my_pg; /**< Process group I belong to */
   int                 my_pg_rank; /**< Rank in process group */
diff --git a/src/mpid/pamid/include/mpidpre.h b/src/mpid/pamid/include/mpidpre.h
index 29039e2..a10e1ea 100644
--- a/src/mpid/pamid/include/mpidpre.h
+++ b/src/mpid/pamid/include/mpidpre.h
@@ -70,5 +70,13 @@
 #define MPID_MAX_SMP_BCAST_MSG_SIZE (16384)
 #define MPID_MAX_SMP_REDUCE_MSG_SIZE (16384)
 #define MPID_MAX_SMP_ALLREDUCE_MSG_SIZE (16384)
+#ifdef MPID_DEV_DATATYPE_DECL
+#error 'Conflicting definitions of MPID_DEV_DATATYPE_DECL'
+#else
+#define MPID_DEV_DATATYPE_DECL void *device_datatype;
+#endif
+#define MPID_Dev_datatype_commit_hook(ptr) MPIDI_PAMI_datatype_commit_hook(ptr)
+#define MPID_Dev_datatype_destroy_hook(ptr) MPIDI_PAMI_datatype_destroy_hook(ptr)
+#define MPID_Dev_datatype_dup_hook(ptr) MPIDI_PAMI_datatype_dup_hook(ptr)
 
 #endif
diff --git a/src/mpid/pamid/src/Makefile.mk b/src/mpid/pamid/src/Makefile.mk
index a8d98e1..e2468f3 100644
--- a/src/mpid/pamid/src/Makefile.mk
+++ b/src/mpid/pamid/src/Makefile.mk
@@ -59,7 +59,8 @@ mpi_core_sources +=               \
     src/mpid/pamid/src/mpid_imrecv.c            \
     src/mpid/pamid/src/mpid_improbe.c           \
     src/mpid/pamid/src/mpid_aint.c              \
-    src/mpid/pamid/src/mpidi_nbc_sched.c
+    src/mpid/pamid/src/mpidi_nbc_sched.c        \
+    src/mpid/pamid/src/mpidi_pami_datatype.c
 
 if QUEUE_BINARY_SEARCH_SUPPORT
 mpi_core_sources +=                             \
diff --git a/src/mpid/pamid/src/mpid_init.c b/src/mpid/pamid/src/mpid_init.c
index e768119..a2bc89d 100644
--- a/src/mpid/pamid/src/mpid_init.c
+++ b/src/mpid/pamid/src/mpid_init.c
@@ -135,6 +135,7 @@ MPIDI_Process_t  MPIDI_Process = {
 
   .mpir_nbc              = 1,
   .numTasks              = 0,
+  .typed_onesided        = 0,
 };
 
 
@@ -1016,6 +1017,7 @@ MPIDI_PAMI_init(int* rank, int* size, int* threading)
              "  optimized.num_requests: %u\n"
              "  mpir_nbc              : %u\n" 
              "  numTasks              : %u\n",
+             "  typed_onesided        : %u\n",
              MPIDI_Process.verbose,
              MPIDI_Process.statistics,
              MPIDI_Process.avail_contexts,
@@ -1052,7 +1054,8 @@ MPIDI_PAMI_init(int* rank, int* size, int* threading)
              MPIDI_Process.optimized.memory,
              MPIDI_Process.optimized.num_requests,
              MPIDI_Process.mpir_nbc, 
-             MPIDI_Process.numTasks);
+             MPIDI_Process.numTasks,
+             MPIDI_Process.typed_onesided);
       switch (*threading)
         {
           case MPI_THREAD_MULTIPLE:
diff --git a/src/mpid/pamid/src/mpidi_env.c b/src/mpid/pamid/src/mpidi_env.c
index 3eb1aac..f6934c6 100644
--- a/src/mpid/pamid/src/mpidi_env.c
+++ b/src/mpid/pamid/src/mpidi_env.c
@@ -938,6 +938,11 @@ MPIDI_Env_setup(int rank, int requested)
     ENV_Unsigned(names, &MPIDI_Process.mpir_nbc, 1, &found_deprecated_env_var, rank);
   }
 
+  /* Enable typed PAMI calls for derived types within MPID_Put and MPID_Get. */
+  {
+    char* names[] = {"PAMID_TYPED_ONESIDED", NULL};
+    ENV_Unsigned(names, &MPIDI_Process.typed_onesided, 1, &found_deprecated_env_var, rank);
+  }
   /* Check for deprecated collectives environment variables. These variables are
    * used in src/mpid/pamid/src/comm/mpid_selectcolls.c */
   {
diff --git a/src/mpid/pamid/src/mpidi_pami_datatype.c b/src/mpid/pamid/src/mpidi_pami_datatype.c
new file mode 100644
index 0000000..b700ea0
--- /dev/null
+++ b/src/mpid/pamid/src/mpidi_pami_datatype.c
@@ -0,0 +1,155 @@
+/* begin_generated_IBM_copyright_prolog                             */
+/*                                                                  */
+/* This is an automatically generated copyright prolog.             */
+/* After initializing,  DO NOT MODIFY OR MOVE                       */
+/*  --------------------------------------------------------------- */
+/* Licensed Materials - Property of IBM                             */
+/* Blue Gene/Q 5765-PER 5765-PRP                                    */
+/*                                                                  */
+/* (C) Copyright IBM Corp. 2011, 2012 All Rights Reserved           */
+/* US Government Users Restricted Rights -                          */
+/* Use, duplication, or disclosure restricted                       */
+/* by GSA ADP Schedule Contract with IBM Corp.                      */
+/*                                                                  */
+/*  --------------------------------------------------------------- */
+/*                                                                  */
+/* end_generated_IBM_copyright_prolog                               */
+/*  (C)Copyright IBM Corp.  2007, 2011  */
+/**
+ * \file src/mpidi_pami_datatype.c
+ * \brief pami_type_t datatype hooks
+ */
+
+#include <pami.h>
+#include <mpidimpl.h>
+
+/**
+ * \brief Create PAMI datatype representation of MPI Datatype during commit.
+ *
+ * Signifcant performance improvements can be realized for one-sided communication
+ * utilizing the PAMI_Rput_typed and PAMI_Rget_typed interface which requires a
+ * PAMI representation of the MPI Datatype.
+ */
+void MPIDI_PAMI_datatype_commit_hook (MPI_Datatype *ptr)
+{
+
+    /* If the PAMID optimization to utilize the PAMI_Rput_typed / PAMI_Rget_typed call for
+     * one-sided comm for derived types is enabled then we need to create the PAMI datatype.
+     */
+    if (MPIDI_Process.typed_onesided == 1) {
+
+      MPID_Datatype *datatype_ptr;
+      MPID_Datatype_get_ptr(*ptr, datatype_ptr);
+
+      pami_result_t pami_dtop_result;
+      datatype_ptr->device_datatype = (pami_type_t *) MPIU_Malloc(sizeof(pami_type_t));
+      pami_dtop_result = PAMI_Type_create ((pami_type_t *)datatype_ptr->device_datatype);
+      MPIU_Assert(pami_dtop_result == PAMI_SUCCESS);
+
+      /* Flatten the non-contiguous data type into arrays describing the contiguous chunks.
+       */
+      MPI_Aint *dt_offset_array = (MPI_Aint *) MPIU_Malloc(datatype_ptr->max_contig_blocks * sizeof(MPI_Aint));
+      MPI_Aint *dt_size_array = (MPI_Aint *) MPIU_Malloc(datatype_ptr->max_contig_blocks * sizeof(MPI_Aint));
+      MPI_Aint dt_array_len = datatype_ptr->max_contig_blocks;
+      int rc = MPIR_Type_flatten(*ptr, dt_offset_array, dt_size_array, &dt_array_len);
+
+      /* Build the PAMI datatype adding one contiguous chunk at a time with the PAMI_Type_add_simple
+       * interface.
+       */
+      int i;
+
+      for (i=0;i<dt_array_len;i++) {
+        size_t num_bytes_this_entry = dt_size_array[i];
+        size_t cursor_offset;
+        if (i == 0)
+          cursor_offset = (size_t) dt_offset_array[i];
+        else
+          cursor_offset = (size_t) dt_offset_array[i] - (size_t)dt_offset_array[i-1];
+        pami_dtop_result = PAMI_Type_add_simple (*(pami_type_t*)(datatype_ptr->device_datatype), num_bytes_this_entry, cursor_offset,  1, 0);
+        MPIU_Assert(pami_dtop_result == PAMI_SUCCESS);
+      }
+
+      /* Complete the PAMI datatype and free arrays.
+       */
+      pami_dtop_result = PAMI_Type_complete (*(pami_type_t*)(datatype_ptr->device_datatype),1);
+      MPIU_Assert(pami_dtop_result == PAMI_SUCCESS);
+      MPIU_Free(dt_offset_array);
+      MPIU_Free(dt_size_array);
+    }
+  return;
+}
+
+/**
+ * \brief Destroy PAMI datatype representation of MPI Datatype.
+ *
+ */
+void MPIDI_PAMI_datatype_destroy_hook (MPID_Datatype *ptr)
+{
+    /* If a PAMI datatype was created, destroy it if this is the
+     * last reference to the MPID_Datatype ptr.
+     */
+    if ((MPIDI_Process.typed_onesided == 1) && (ptr->is_committed)) {
+      if (ptr->device_datatype) {
+        pami_result_t pami_dtop_result;
+        pami_dtop_result = PAMI_Type_destroy ((pami_type_t *)ptr->device_datatype);
+        MPIU_Assert(pami_dtop_result == PAMI_SUCCESS);
+        MPIU_Free(ptr->device_datatype);
+      }
+    }
+}
+
+/**
+ * \brief Create PAMI datatype representation of MPI Datatype during dup.
+ *
+ * Signifcant performance improvements can be realized for one-sided communication
+ * utilizing the PAMI_Rput_typed and PAMI_Rget_typed interface which requires a
+ * PAMI representation of the MPI Datatype.
+ */
+void MPIDI_PAMI_datatype_dup_hook (MPI_Datatype *ptr)
+{
+
+    /* If the PAMID optimization to utilize the PAMI_Rput_typed / PAMI_Rget_typed call for
+     * one-sided comm for derived types is enabled then we need to create the PAMI datatype.
+     */
+    if (MPIDI_Process.typed_onesided == 1) {
+
+      MPID_Datatype *datatype_ptr;
+      MPID_Datatype_get_ptr(*ptr, datatype_ptr);
+
+      pami_result_t pami_dtop_result;
+      datatype_ptr->device_datatype = (pami_type_t *) MPIU_Malloc(sizeof(pami_type_t));
+      pami_dtop_result = PAMI_Type_create ((pami_type_t *)datatype_ptr->device_datatype);
+      MPIU_Assert(pami_dtop_result == PAMI_SUCCESS);
+
+      /* Flatten the non-contiguous data type into arrays describing the contiguous chunks.
+       */
+      MPI_Aint *dt_offset_array = (MPI_Aint *) MPIU_Malloc(datatype_ptr->max_contig_blocks * sizeof(MPI_Aint));
+      MPI_Aint *dt_size_array = (MPI_Aint *) MPIU_Malloc(datatype_ptr->max_contig_blocks * sizeof(MPI_Aint));
+      MPI_Aint dt_array_len = datatype_ptr->max_contig_blocks;
+      int rc = MPIR_Type_flatten(*ptr, dt_offset_array, dt_size_array, &dt_array_len);
+
+      /* Build the PAMI datatype adding one contiguous chunk at a time with the PAMI_Type_add_simple
+       * interface.
+       */
+      int i;
+
+      for (i=0;i<dt_array_len;i++) {
+        size_t num_bytes_this_entry = dt_size_array[i];
+        size_t cursor_offset;
+        if (i == 0)
+          cursor_offset = (size_t) dt_offset_array[i];
+        else
+          cursor_offset = (size_t) dt_offset_array[i] - (size_t)dt_offset_array[i-1];
+        pami_dtop_result = PAMI_Type_add_simple (*(pami_type_t*)(datatype_ptr->device_datatype), num_bytes_this_entry, cursor_offset,  1, 0);
+        MPIU_Assert(pami_dtop_result == PAMI_SUCCESS);
+      }
+
+      /* Complete the PAMI datatype and free arrays.
+       */
+      pami_dtop_result = PAMI_Type_complete (*(pami_type_t*)(datatype_ptr->device_datatype),1);
+      MPIU_Assert(pami_dtop_result == PAMI_SUCCESS);
+      MPIU_Free(dt_offset_array);
+      MPIU_Free(dt_size_array);
+    }
+  return;
+}
diff --git a/src/mpid/pamid/src/onesided/mpid_win_get.c b/src/mpid/pamid/src/onesided/mpid_win_get.c
index 872f62f..ba318af 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_get.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_get.c
@@ -22,7 +22,6 @@
 #include "mpidi_onesided.h"
 #include "mpidi_util.h"
 
-
 static inline int
 MPIDI_Get_use_pami_rget(pami_context_t context, MPIDI_Win_request * req)
 __attribute__((__always_inline__));
@@ -60,6 +59,39 @@ MPIDI_Get(pami_context_t   context,
 static inline int
 MPIDI_Get_use_pami_rget(pami_context_t context, MPIDI_Win_request * req)
 {
+  int use_typed_rdma = 0;
+
+  if (!req->target.dt.contig || !req->origin.dt.contig) {
+    use_typed_rdma = 0;
+    if (MPIDI_Process.typed_onesided == 1)
+      use_typed_rdma = 1;
+  }
+
+  if (use_typed_rdma) {
+    pami_result_t rc;
+    pami_rget_typed_t params;
+    /* params need to zero out to avoid passing garbage to PAMI */
+    params=zero_rget_typed_parms;
+
+    params.rma.dest=req->dest;
+    params.rma.hints.buffer_registered = PAMI_HINT_ENABLE;
+    params.rma.hints.use_rdma          = PAMI_HINT_ENABLE;
+    params.rma.bytes   = req->target.dt.size;
+    params.rma.cookie  = req;
+    params.rma.done_fn = MPIDI_Win_DoneCB;
+    params.rdma.local.mr=&req->origin.memregion;
+    params.rdma.remote.mr=&req->win->mpid.info[req->target.rank].memregion;
+    params.rdma.remote.offset= req->offset;
+    params.rdma.local.offset  = req->state.local_offset;
+
+    params.type.local = *(pami_type_t *)(req->origin.dt.pointer->device_datatype);
+    params.type.remote = *(pami_type_t *)(req->target.dt.pointer->device_datatype);
+
+
+    rc = PAMI_Rget_typed(context, &params);
+    MPID_assert(rc == PAMI_SUCCESS);
+  }
+  else {
   pami_result_t rc;
   pami_rget_simple_t  params;
 
@@ -110,6 +142,7 @@ MPIDI_Get_use_pami_rget(pami_context_t context, MPIDI_Win_request * req)
           ++req->state.index;
       }
   }
+  }
   return PAMI_SUCCESS;
 }
 
@@ -203,6 +236,7 @@ MPID_Get(void         *origin_addr,
          MPI_Datatype  target_datatype,
          MPID_Win     *win)
 {
+
   int mpi_errno = MPI_SUCCESS;
   int shm_locked=0;
   void *target_addr;
@@ -293,10 +327,17 @@ MPID_Get(void         *origin_addr,
   req->target.rank = target_rank;
 
 
-  if (req->origin.dt.contig)
+  /* Only pack the origin data if the origin is non-contiguous and we are using the simple PAMI_Rget.
+   * If we are using the typed PAMI_Rget_typed use the origin address as is, if we are using the simple
+   * PAMI_Rget with contiguous data use the origin address with the lower-bound adjustment.
+   */
+  if (req->origin.dt.contig || (!req->origin.dt.contig && (MPIDI_Process.typed_onesided == 1)))
     {
       req->buffer_free = 0;
-      req->buffer      = origin_addr + req->origin.dt.true_lb;
+      if ((req->origin.dt.contig && req->target.dt.contig && (MPIDI_Process.typed_onesided == 1)) || (!(MPIDI_Process.typed_onesided == 1))) // use simple rput
+        req->buffer      = (void *) ((uintptr_t) origin_addr + req->origin.dt.true_lb);
+      else
+        req->buffer      = (void *) ((uintptr_t) origin_addr);
     }
   else
     {
@@ -356,8 +397,14 @@ MPID_Get(void         *origin_addr,
 
 
   MPIDI_Win_datatype_map(&req->target.dt);
-  win->mpid.sync.total += req->target.dt.num_contig;
 
+  if ((!req->target.dt.contig || !req->origin.dt.contig) && (MPIDI_Process.typed_onesided == 1))
+    /* If the datatype is non-contiguous and the PAMID typed_onesided optimization
+     * is enabled then we will be using the typed interface and will only make 1 call.
+     */
+    win->mpid.sync.total = 1;
+  else
+    win->mpid.sync.total += req->target.dt.num_contig;
 
   /* The pamid one-sided design requires context post in order to handle the
    * case where the number of pending rma operation exceeds the
diff --git a/src/mpid/pamid/src/onesided/mpid_win_put.c b/src/mpid/pamid/src/onesided/mpid_win_put.c
index 9cbcc05..a7e7ce5 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_put.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_put.c
@@ -61,6 +61,39 @@ MPIDI_Put(pami_context_t   context,
 static inline int
 MPIDI_Put_use_pami_rput(pami_context_t context, MPIDI_Win_request * req)
 {
+  int use_typed_rdma = 0;
+  if (!req->target.dt.contig || !req->origin.dt.contig) {
+    use_typed_rdma = 0;
+    if (MPIDI_Process.typed_onesided == 1)
+      use_typed_rdma = 1;
+  }
+
+  if (use_typed_rdma) {
+    pami_result_t rc;
+    pami_rput_typed_t params;
+    /* params need to zero out to avoid passing garbage to PAMI */
+    params=zero_rput_typed_parms;
+
+    params.rma.dest=req->dest;
+    params.rma.hints.buffer_registered = PAMI_HINT_ENABLE;
+    params.rma.hints.use_rdma          = PAMI_HINT_ENABLE;
+    params.rma.bytes   = req->target.dt.size;
+    params.rma.cookie  = req;
+    params.rma.done_fn = NULL;
+    params.rdma.local.mr=&req->origin.memregion;
+    params.rdma.remote.mr=&req->win->mpid.info[req->target.rank].memregion;
+    params.rdma.remote.offset= req->offset;
+    params.rdma.local.offset  = req->state.local_offset;
+    params.put.rdone_fn= MPIDI_Win_DoneCB;
+
+    params.type.local = *(pami_type_t *)(req->origin.dt.pointer->device_datatype);
+    params.type.remote = *(pami_type_t *)(req->target.dt.pointer->device_datatype);
+
+    rc = PAMI_Rput_typed(context, &params);
+    MPID_assert(rc == PAMI_SUCCESS);
+
+  }
+  else {
   pami_result_t rc;
   pami_rput_simple_t params;
   /* params need to zero out to avoid passing garbage to PAMI */
@@ -113,6 +146,7 @@ MPIDI_Put_use_pami_rput(pami_context_t context, MPIDI_Win_request * req)
           ++req->state.index;
     }
   }
+  }
   return PAMI_SUCCESS;
 }
 
@@ -296,10 +330,17 @@ MPID_Put(const void   *origin_addr,
   req->target.rank = target_rank;
 
 
-  if (req->origin.dt.contig)
+  /* Only pack the origin data if the origin is non-contiguous and we are using the simple PAMI_Rput.
+   * If we are using the typed PAMI_Rput_typed use the origin address as-is, if we are using the simple
+   * PAMI_Rput with contiguous data use the origin address with the lower-bound adjustment.
+   */
+  if (req->origin.dt.contig || (!req->origin.dt.contig && (MPIDI_Process.typed_onesided == 1)))
     {
       req->buffer_free = 0;
-      req->buffer      = (void *) ((uintptr_t) origin_addr + req->origin.dt.true_lb);
+      if ((req->origin.dt.contig && req->target.dt.contig && (MPIDI_Process.typed_onesided == 1)) || (!(MPIDI_Process.typed_onesided == 1)))
+        req->buffer      = (void *) ((uintptr_t) origin_addr + req->origin.dt.true_lb);
+      else
+        req->buffer      = (void *) ((uintptr_t) origin_addr);
     }
   else
     {
@@ -357,7 +398,13 @@ MPID_Put(const void   *origin_addr,
 
 
   MPIDI_Win_datatype_map(&req->target.dt);
-  win->mpid.sync.total += req->target.dt.num_contig;
+  if ((!req->target.dt.contig || !req->origin.dt.contig) && (MPIDI_Process.typed_onesided == 1))
+    /* If the datatype is non-contiguous and the PAMID typed_onesided optimization
+     * is enabled then we will be using the typed interface and will only make 1 call.
+     */
+    win->mpid.sync.total = 1;
+  else
+    win->mpid.sync.total += req->target.dt.num_contig;
 
   /* The pamid one-sided design requires context post in order to handle the
    * case where the number of pending rma operation exceeds the
diff --git a/src/mpid/pamid/src/onesided/mpidi_onesided.h b/src/mpid/pamid/src/onesided/mpidi_onesided.h
index 0b324cc..23e2cf8 100644
--- a/src/mpid/pamid/src/onesided/mpidi_onesided.h
+++ b/src/mpid/pamid/src/onesided/mpidi_onesided.h
@@ -29,6 +29,8 @@ pami_rget_simple_t zero_rget_parms;
 pami_get_simple_t zero_get_parms;
 pami_rput_simple_t zero_rput_parms;
 pami_put_simple_t zero_put_parms;
+pami_rput_typed_t zero_rput_typed_parms;
+pami_rget_typed_t zero_rget_typed_parms;
 pami_send_t   zero_send_parms;
 pami_send_immediate_t   zero_send_immediate_parms;
 pami_recv_t   zero_recv_parms;

http://git.mpich.org/mpich.git/commitdiff/18516bafa3a3ee7a9e0343ad1bda9876fa52a49a

commit 18516bafa3a3ee7a9e0343ad1bda9876fa52a49a
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Mon Mar 23 18:39:49 2015 -0500

    ROMIO Collective IO One-sided aggregation algorithm support read-modify-write
    
    Added support to additionally run two-phase aggregation which has
    the read-modify-write capability in cases where the one-sided
    write aggregation encounters holes in the data.  Additon of two new
    environment variables (GPFSMPIO_ONESIDED_NO_RMW,
    GPFSMPIO_ONESIDED_INFORM_RMW) to control this behavior and inform the
    user.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
index 192f747..0d44176 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
@@ -41,6 +41,8 @@ int     gpfsmpio_aggmethod;
 int	gpfsmpio_balancecontig;
 int     gpfsmpio_devnullio;
 int     gpfsmpio_bridgeringagg;
+int     gpfsmpio_onesided_no_rmw;
+int     gpfsmpio_onesided_inform_rmw;
 
 double	gpfsmpio_prof_cw    [GPFSMPIO_CIO_LAST+1];
 double	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST+1];
@@ -122,6 +124,22 @@ double	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST+1];
  *         optimal performance for this is achieved when paired with PAMID_TYPED_ONESIDED=1.
  *   - Default is 0
  *
+ * - GPFSMPIO_ONESIDED_NO_RMW - For one-sided aggregation (GPFSMPIO_AGGMETHOD = 1 or 2)
+ *   disable the detection of holes in the data when writing to a pre-existing
+ *   file requiring a read-modify-write, thereby avoiding the communication
+ *   overhead for this detection.
+ *   - 0 (hole detection enabled) or 1 (hole detection disabled)
+ *   - Default is 0
+ *
+ * - GPFSMPIO_ONESIDED_INFORM_RMW - For one-sided aggregation
+ *   (GPFSMPIO_AGGMETHOD = 1 or 2) generate an informational message informing
+ *   the user whether holes exist in the data when writing to a pre-existing
+ *   file requiring a read-modify-write, thereby educating the user to set
+ *   GPFSMPIO_ONESIDED_NO_RMW=1 on a future run to avoid the communication
+ *   overhead for this detection.
+ *   - 0 (disabled) or 1 (enabled)
+ *   - Default is 0
+ *
  * - GPFSMPIO_BALANCECONTIG -  Relevant only to BGQ.  File domain blocks are assigned
  *   to aggregators in a breadth-first fashion relative to the ions - additionally,
  *   file domains on the aggregators sharing the same bridgeset and ion have contiguous
@@ -197,6 +215,14 @@ void ad_gpfs_get_env_vars() {
     gpfsmpio_bridgeringagg = 0;
     x = getenv( "GPFSMPIO_BRIDGERINGAGG" );
     if (x) gpfsmpio_bridgeringagg = atoi(x);
+
+    gpfsmpio_onesided_no_rmw = 0;
+    x = getenv( "GPFSMPIO_ONESIDED_NO_RMW" );
+    if (x) gpfsmpio_onesided_no_rmw = atoi(x);
+
+    gpfsmpio_onesided_inform_rmw = 0;
+    x = getenv( "GPFSMPIO_ONESIDED_INFORM_RMW" );
+    if (x) gpfsmpio_onesided_inform_rmw = atoi(x);
 }
 
 /* report timing breakdown for MPI I/O collective call */
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
index 87aefce..56e1588 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
@@ -23,6 +23,7 @@
  *  Global variables for the control of
  *  1.  timing
  *  2.  select specific optimizations
+ *  3.  global flags for certain optimizations
  *-----------------------------------------*/
 
 /* timing fields */
@@ -56,7 +57,6 @@ enum {
 extern double 	gpfsmpio_prof_cw    [GPFSMPIO_CIO_LAST+1];
 extern double 	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST+1];
 
-
 /* corresponds to environment variables to select optimizations and timing level */
 extern int 	gpfsmpio_timing;
 extern int      gpfsmpio_timing_cw_level;
@@ -70,6 +70,8 @@ extern int      gpfsmpio_aggmethod;
 extern int  gpfsmpio_balancecontig;
 extern int      gpfsmpio_devnullio;
 extern int      gpfsmpio_bridgeringagg;
+extern int      gpfsmpio_onesided_no_rmw;
+extern int      gpfsmpio_onesided_inform_rmw;
 
 /* Default is, well, kind of complicated. Blue Gene /L and /P had "psets": one
  * i/o node and all compute nodes wired to it.  On Blue Gene /Q that
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
index 9ce68fa..4fcb14c 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
@@ -267,7 +267,12 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
     /* If the user has specified to use a one-sided aggregation method then do that at
      * this point instead of the two-phase I/O.
      */
-      ADIOI_OneSidedWriteAggregation(fd, offset_list, len_list, contig_access_count, buf, datatype, error_code, st_offsets, end_offsets, fd_start, fd_end);
+      int holeFound = 0;
+      ADIOI_OneSidedWriteAggregation(fd, offset_list, len_list, contig_access_count, buf, datatype, error_code, st_offsets, end_offsets, fd_start, fd_end, &holeFound);
+      int anyHolesFound = 0;
+      if (!gpfsmpio_onesided_no_rmw)
+        MPI_Allreduce(&holeFound, &anyHolesFound, 1, MPI_INT, MPI_MAX, fd->comm);
+      if (anyHolesFound == 0) {
       GPFSMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
       ADIOI_Free(offset_list);
       ADIOI_Free(len_list);
@@ -276,6 +281,15 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
       ADIOI_Free(fd_start);
       ADIOI_Free(fd_end);
 	  goto fn_exit;
+	  }
+      else {
+        /* Holes are found in the data and the user has not set gpfsmpio_onesided_no_rmw ---
+         * fall thru and perform the two-phase aggregation and if the user has gpfsmpio_onesided_inform_rmw
+         * set then inform him of this condition and behavior.
+         */
+        if (gpfsmpio_onesided_inform_rmw && (myrank ==0))
+          FPRINTF(stderr,"Information: Holes found during one-sided write aggregation algorithm --- additionally performing default two-phase aggregation algorithm\n");
+      }
     }
     if (gpfsmpio_p2pcontig==1) {
 	/* For some simple yet common(?) workloads, full-on two-phase I/O is overkill.  We can establish sub-groups of processes and their aggregator, and then these sub-groups will carry out a simplified two-phase over that sub-group.
diff --git a/src/mpi/romio/adio/common/ad_close.c b/src/mpi/romio/adio/common/ad_close.c
index d51d75c..ff0049e 100644
--- a/src/mpi/romio/adio/common/ad_close.c
+++ b/src/mpi/romio/adio/common/ad_close.c
@@ -8,6 +8,7 @@
 #include "adio.h"
 #include "adio_extern.h"
 extern int      gpfsmpio_aggmethod;
+extern int      gpfsmpio_onesided_no_rmw;
 
 #ifdef HAVE_UNISTD_H
 #include <unistd.h>
@@ -118,8 +119,13 @@ void ADIO_Close(ADIO_File fd, int *error_code)
     if (fd->io_buf != NULL) ADIOI_Free(fd->io_buf);
     /* If one-sided aggregation is chosen then free the window over the io_buf.
      */
-    if ((gpfsmpio_aggmethod == 1) || (gpfsmpio_aggmethod == 2))
+    if ((gpfsmpio_aggmethod == 1) || (gpfsmpio_aggmethod == 2)) {
 	  MPI_Win_free(&fd->io_buf_window);
+	  if (!gpfsmpio_onesided_no_rmw) {
+	    MPI_Win_free(&fd->io_buf_put_amounts_window);
+	    ADIOI_Free(fd->io_buf_put_amounts);
+	  }
+	}
 
     /* memory for fd is freed in MPI_File_close */
 }
diff --git a/src/mpi/romio/adio/common/ad_open.c b/src/mpi/romio/adio/common/ad_open.c
index c0d9331..222cb55 100644
--- a/src/mpi/romio/adio/common/ad_open.c
+++ b/src/mpi/romio/adio/common/ad_open.c
@@ -11,6 +11,7 @@
 
 #include "mpio.h"
 extern int      gpfsmpio_aggmethod;
+extern int      gpfsmpio_onesided_no_rmw;
 static int is_aggregator(int rank, ADIO_File fd);
 static int uses_generic_read(ADIO_File fd);
 static int uses_generic_write(ADIO_File fd);
@@ -124,12 +125,16 @@ MPI_File ADIO_Open(MPI_Comm orig_comm,
     /* Instead of repeatedly allocating this buffer in collective read/write,
      * allocating up-front might make memory management on small platforms
      * (e.g. Blue Gene) more efficent */
-    fd->io_buf = ADIOI_Malloc(fd->hints->cb_buffer_size);
 
+    fd->io_buf = ADIOI_Malloc(fd->hints->cb_buffer_size);
     /* If one-sided aggregation is chosen then create the window over the io_buf.
      */
     if ((gpfsmpio_aggmethod == 1) || (gpfsmpio_aggmethod == 2)) {
       MPI_Win_create(fd->io_buf,fd->hints->cb_buffer_size,1,MPI_INFO_NULL,fd->comm, &fd->io_buf_window);
+      if (!gpfsmpio_onesided_no_rmw) {
+        fd->io_buf_put_amounts = (int *) ADIOI_Malloc(procs*sizeof(int));
+        MPI_Win_create(fd->io_buf_put_amounts,procs*sizeof(int),sizeof(int),MPI_INFO_NULL,fd->comm, &fd->io_buf_put_amounts_window);
+      }
     }
      /* deferred open: 
      * we can only do this optimization if 'fd->hints->deferred_open' is set
diff --git a/src/mpi/romio/adio/common/onesided_aggregation.c b/src/mpi/romio/adio/common/onesided_aggregation.c
index 50d9315..559ea40 100644
--- a/src/mpi/romio/adio/common/onesided_aggregation.c
+++ b/src/mpi/romio/adio/common/onesided_aggregation.c
@@ -30,9 +30,10 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
     ADIO_Offset *st_offsets,
     ADIO_Offset *end_offsets,
     ADIO_Offset *fd_start,
-    ADIO_Offset* fd_end)
-{
+    ADIO_Offset* fd_end,
+    int *hole_found)
 
+{
     *error_code = MPI_SUCCESS; /* initialize to success */
 
 #ifdef ROMIO_GPFS
@@ -119,10 +120,6 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
     ADIO_Offset greatestFileDomainOffset = 0;
     ADIO_Offset smallestFileDomainOffset = lastFileOffset;
     for (j=0;j<naggs;j++) {
-      /* Find the actual lowest and highest offsets to be written.
-         The non-aggs need to know this too to adjust the mpi_put
-         window displacement accordingly.
-       */
       if (fd_end[j] > greatestFileDomainOffset) {
         greatestFileDomainOffset = fd_end[j];
         greatestFileDomainAggRank = j;
@@ -346,6 +343,12 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
             targetAggsForMyDataFDStart[numTargetAggs] = firstFileOffset;
         }
         targetAggsForMyDataFDEnd[numTargetAggs] = fd_end[currentAggRankListIndex];
+        /* Round down file domain to the last actual offset used if this is the last file domain.
+         */
+        if (currentAggRankListIndex == greatestFileDomainAggRank) {
+          if (targetAggsForMyDataFDEnd[numTargetAggs] > lastFileOffset)
+            targetAggsForMyDataFDEnd[numTargetAggs] = lastFileOffset;
+        }
         targetAggsForMyDataFirstOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
         if (bufTypeIsContig)
           baseSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = currentSourceBufferOffset;
@@ -417,6 +420,12 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
                 targetAggsForMyDataFDStart[numTargetAggs] = firstFileOffset;
             }
             targetAggsForMyDataFDEnd[numTargetAggs] = fd_end[currentAggRankListIndex];
+            /* Round down file domain to the last actual offset used if this is the last file domain.
+             */
+            if (currentAggRankListIndex == greatestFileDomainAggRank) {
+              if (targetAggsForMyDataFDEnd[numTargetAggs] > lastFileOffset)
+                targetAggsForMyDataFDEnd[numTargetAggs] = lastFileOffset;
+            }
             targetAggsForMyDataFirstOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
             if (bufTypeIsContig)
               baseSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = currentSourceBufferOffset;
@@ -493,8 +502,16 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
     char *write_buf = write_buf0;
     MPI_Win write_buf_window = fd->io_buf_window;
 
+    int *write_buf_put_amounts = fd->io_buf_put_amounts;
+    if(!gpfsmpio_onesided_no_rmw) {
+      *hole_found = 0;
+      for (i=0;i<nprocs;i++)
+        write_buf_put_amounts[i] = 0;
+    }
 #ifdef ACTIVE_TARGET
     MPI_Win_fence(0, write_buf_window);
+    if (!gpfsmpio_onesided_no_rmw)
+      MPI_Win_fence(0, fd->io_buf_put_amounts_window);
 #endif
 
     ADIO_Offset currentRoundFDStart = 0;
@@ -506,6 +523,10 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
         if (currentRoundFDStart < firstFileOffset)
           currentRoundFDStart = firstFileOffset;
       }
+      else if (myAggRank == greatestFileDomainAggRank) {
+        if (currentRoundFDEnd > lastFileOffset)
+          currentRoundFDEnd = lastFileOffset;
+      }
     }
 
 #ifdef ROMIO_GPFS
@@ -537,6 +558,7 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
     int aggIter;
     for (aggIter=0;aggIter<numTargetAggs;aggIter++) {
 
+    int numBytesPutThisAggRound = 0;
     /* If we have data for the round/agg process it.
      */
     if ((bufTypeIsContig && (baseSourceBufferOffset[roundIter][aggIter] != -1)) || (!bufTypeIsContig && (baseNonContigSourceBufferOffset[roundIter][aggIter].flatBufIndice != -1))) {
@@ -661,6 +683,7 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
             offsetStart = currentRoundFDStartForMyTargetAgg;
         }
 
+        numBytesPutThisAggRound += bufferAmountToSend;
 #ifdef onesidedtrace
         printf("bufferAmountToSend is %d\n",bufferAmountToSend);
 #endif
@@ -848,6 +871,16 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
         MPI_Type_free(&targetBufferDerivedDataType);
         }
       }
+      if (!gpfsmpio_onesided_no_rmw) {
+#ifndef ACTIVE_TARGET
+        MPI_Win_lock(MPI_LOCK_SHARED, targetAggsForMyData[aggIter], 0, fd->io_buf_put_amounts_window);
+#endif
+        MPI_Put(&numBytesPutThisAggRound,1, MPI_INT,targetAggsForMyData[aggIter],myrank, 1,MPI_INT,fd->io_buf_put_amounts_window);
+
+#ifndef ACTIVE_TARGET
+        MPI_Win_unlock(targetAggsForMyData[aggIter], fd->io_buf_put_amounts_window);
+#endif
+      }
       } // baseoffset != -1
     } // target aggs
 
@@ -855,6 +888,8 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
 
 #ifdef ACTIVE_TARGET
     MPI_Win_fence(0, write_buf_window);
+    if (!gpfsmpio_onesided_no_rmw)
+      MPI_Win_fence(0, fd->io_buf_put_amounts_window);
 #else
     MPI_Barrier(fd->comm);
 #endif
@@ -879,20 +914,24 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
         printf("currentRoundFDStart is %ld currentRoundFDEnd is %ld within file domeain %ld to %ld\n",currentRoundFDStart,currentRoundFDEnd,fd_start[myAggRank],fd_end[myAggRank]);
 #endif
 
+        int doWriteContig = 1;
+        if (!gpfsmpio_onesided_no_rmw) {
+          int numBytesPutIntoBuf = 0;
+          for (i=0;i<nprocs;i++) {
+            numBytesPutIntoBuf += write_buf_put_amounts[i];
+            write_buf_put_amounts[i] = 0;
+          }
+          if (numBytesPutIntoBuf != ((int)(currentRoundFDEnd - currentRoundFDStart)+1)) {
+            doWriteContig = 0;
+            *hole_found = 1;
+          }
+        }
+
         if (!useIOBuffer) {
-          ADIO_WriteContig(fd, write_buf, (int)(currentRoundFDEnd - currentRoundFDStart)+1,
-            MPI_BYTE, ADIO_EXPLICIT_OFFSET,currentRoundFDStart, &status, error_code);
-
-/* For now this algorithm cannot handle holes in the source data and does not do any data sieving.
- * One possible approach would be to initialize the write buffer with some value and then check to
- * see if the mpi_put operations changed the values, if they were unchanged then retry with some other
- * default initialized value and if that was still unchanged then you would know where a hole was.
- * Here is some initial sample code for that:
- *       if (roundIter<(numberOfRounds-1)) {
- *         for (i=0;i<((currentRoundFDEnd - currentRoundFDStart)+1);i++)
- *           write_buf[i] = '\0';
- *       }
-*/
+          if (doWriteContig)
+            ADIO_WriteContig(fd, write_buf, (int)(currentRoundFDEnd - currentRoundFDStart)+1,
+              MPI_BYTE, ADIO_EXPLICIT_OFFSET,currentRoundFDStart, &status, error_code);
+
         } else { /* use the thread writer */
 
         if(!pthread_equal(io_thread, pthread_self())) {
@@ -916,6 +955,7 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
             currentWriteBuf = 0;
             write_buf = write_buf0;
         }
+        if (doWriteContig) {
         io_thread_args.io_kind = ADIOI_WRITE;
         io_thread_args.size = (currentRoundFDEnd-currentRoundFDStart) + 1;
         io_thread_args.offset = currentRoundFDStart;
@@ -925,7 +965,7 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
         if ( (pthread_create(&io_thread, NULL,
                 ADIOI_IO_Thread_Func, &(io_thread_args))) != 0)
             io_thread = pthread_self();
-
+        }
         } // useIOBuffer
 
     } // iAmUsedAgg
@@ -1087,10 +1127,6 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
     ADIO_Offset greatestFileDomainOffset = 0;
     ADIO_Offset smallestFileDomainOffset = lastFileOffset;
     for (j=0;j<naggs;j++) {
-      /* Find the actual lowest and highest offsets to be written.
-         The non-aggs need to know this too to adjust the mpi_get
-         window displacement accordingly.
-       */
       if (fd_end[j] > greatestFileDomainOffset) {
         greatestFileDomainOffset = fd_end[j];
         greatestFileDomainAggRank = j;
@@ -1306,6 +1342,12 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
             sourceAggsForMyDataFDStart[numSourceAggs] = firstFileOffset;
         }
         sourceAggsForMyDataFDEnd[numSourceAggs] = fd_end[currentAggRankListIndex];
+        /* Round down file domain to the last actual offset used if this is the last file domain.
+         */
+        if (currentAggRankListIndex == greatestFileDomainAggRank) {
+          if (sourceAggsForMyDataFDEnd[numSourceAggs] > lastFileOffset)
+            sourceAggsForMyDataFDEnd[numSourceAggs] = lastFileOffset;
+        }
         sourceAggsForMyDataFirstOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
         if (bufTypeIsContig)
           baseRecvBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = currentRecvBufferOffset;
@@ -1377,6 +1419,12 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
                 sourceAggsForMyDataFDStart[numSourceAggs] = firstFileOffset;
             }
             sourceAggsForMyDataFDEnd[numSourceAggs] = fd_end[currentAggRankListIndex];
+            /* Round down file domain to the last actual offset used if this is the last file domain.
+             */
+            if (currentAggRankListIndex == greatestFileDomainAggRank) {
+              if (sourceAggsForMyDataFDEnd[numSourceAggs] > lastFileOffset)
+                sourceAggsForMyDataFDEnd[numSourceAggs] = lastFileOffset;
+            }
             sourceAggsForMyDataFirstOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
             if (bufTypeIsContig)
               baseRecvBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = currentRecvBufferOffset;
@@ -1470,6 +1518,12 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
         if (nextRoundFDStart < firstFileOffset)
           nextRoundFDStart = firstFileOffset;
       }
+      else if (myAggRank == greatestFileDomainAggRank) {
+        if (currentRoundFDEnd > lastFileOffset)
+          currentRoundFDEnd = lastFileOffset;
+        if (nextRoundFDEnd > lastFileOffset)
+          nextRoundFDEnd = lastFileOffset;
+      }
     }
 
 #ifdef ROMIO_GPFS
diff --git a/src/mpi/romio/adio/include/adio.h b/src/mpi/romio/adio/include/adio.h
index f1d4ac3..da77015 100644
--- a/src/mpi/romio/adio/include/adio.h
+++ b/src/mpi/romio/adio/include/adio.h
@@ -237,6 +237,9 @@ typedef struct ADIOI_FileD {
     int my_cb_nodes_index; /* my index into cb_config_list. -1 if N/A */
     char *io_buf;          /* two-phase buffer allocated out of i/o path */
     MPI_Win io_buf_window; /* Window over the io_buf to support one-sided aggregation */
+    int *io_buf_put_amounts; /* array tracking the amount of data mpi_put into the io_buf
+                                during the same round of one-sided write aggregation */
+    MPI_Win io_buf_put_amounts_window; /* Window over the io_buf_put_amounts */
     /* External32 */
     int is_external32;      /* bool:  0 means native view */
 
diff --git a/src/mpi/romio/adio/include/adioi.h b/src/mpi/romio/adio/include/adioi.h
index 6320a63..0290cb4 100644
--- a/src/mpi/romio/adio/include/adioi.h
+++ b/src/mpi/romio/adio/include/adioi.h
@@ -696,7 +696,8 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
         ADIO_Offset *st_offsets,
         ADIO_Offset *end_offsets,
         ADIO_Offset *fd_start,
-        ADIO_Offset* fd_end);
+        ADIO_Offset* fd_end,
+        int *hole_found);
 void ADIOI_OneSidedReadAggregation(ADIO_File fd,
         ADIO_Offset *offset_list,
         ADIO_Offset *len_list,

http://git.mpich.org/mpich.git/commitdiff/41ab3461e8200920c0fb9a65f8e1921bb428fe87

commit 41ab3461e8200920c0fb9a65f8e1921bb428fe87
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Thu Mar 19 19:05:42 2015 -0500

    ROMIO Collective IO One-sided aggregation algorithm avoid
    read-modify-write for holes at the beginning
    
    Added support to correctly handle a data pattern that has a hole
    only at the beginning of the file offset range to essentially ignore
    the hole and begin writing at the first offset with actual data,
    thereby avoiding the need for a read-modify-write.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/common/onesided_aggregation.c b/src/mpi/romio/adio/common/onesided_aggregation.c
index 1068fab..50d9315 100644
--- a/src/mpi/romio/adio/common/onesided_aggregation.c
+++ b/src/mpi/romio/adio/common/onesided_aggregation.c
@@ -119,16 +119,20 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
     ADIO_Offset greatestFileDomainOffset = 0;
     ADIO_Offset smallestFileDomainOffset = lastFileOffset;
     for (j=0;j<naggs;j++) {
+      /* Find the actual lowest and highest offsets to be written.
+         The non-aggs need to know this too to adjust the mpi_put
+         window displacement accordingly.
+       */
+      if (fd_end[j] > greatestFileDomainOffset) {
+        greatestFileDomainOffset = fd_end[j];
+        greatestFileDomainAggRank = j;
+      }
+      if (fd_start[j] < smallestFileDomainOffset) {
+        smallestFileDomainOffset = fd_start[j];
+        smallestFileDomainAggRank = j;
+      }
       if (fd->hints->ranklist[j] == myrank) {
         myAggRank = j;
-        if (fd_end[j] > greatestFileDomainOffset) {
-          greatestFileDomainOffset = fd_end[j];
-          greatestFileDomainAggRank = j;
-        }
-        if (fd_start[j] < smallestFileDomainOffset) {
-          smallestFileDomainOffset = fd_start[j];
-          smallestFileDomainAggRank = j;
-        }
         if (fd_end[j] > fd_start[j]) {
           iAmUsedAgg = 1;
         }
@@ -333,8 +337,14 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
       /* Initialize the data structures if this is the first offset in the round/target agg.
        */
       if (targetAggsForMyDataFirstOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] == -1) {
-         targetAggsForMyData[numTargetAggs] = fd->hints->ranklist[currentAggRankListIndex];
+        targetAggsForMyData[numTargetAggs] = fd->hints->ranklist[currentAggRankListIndex];
         targetAggsForMyDataFDStart[numTargetAggs] = fd_start[currentAggRankListIndex];
+        /* Round up file domain to the first actual offset used if this is the first file domain.
+         */
+        if (currentAggRankListIndex == smallestFileDomainAggRank) {
+          if (targetAggsForMyDataFDStart[numTargetAggs] < firstFileOffset)
+            targetAggsForMyDataFDStart[numTargetAggs] = firstFileOffset;
+        }
         targetAggsForMyDataFDEnd[numTargetAggs] = fd_end[currentAggRankListIndex];
         targetAggsForMyDataFirstOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
         if (bufTypeIsContig)
@@ -398,8 +408,14 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
            */
           if (blockEnd >= fd_start[currentAggRankListIndex]) {
             numTargetAggs++;
-             targetAggsForMyData[numTargetAggs] = fd->hints->ranklist[currentAggRankListIndex];
+            targetAggsForMyData[numTargetAggs] = fd->hints->ranklist[currentAggRankListIndex];
             targetAggsForMyDataFDStart[numTargetAggs] = fd_start[currentAggRankListIndex];
+            /* Round up file domain to the first actual offset used if this is the first file domain.
+             */
+            if (currentAggRankListIndex == smallestFileDomainAggRank) {
+              if (targetAggsForMyDataFDStart[numTargetAggs] < firstFileOffset)
+                targetAggsForMyDataFDStart[numTargetAggs] = firstFileOffset;
+            }
             targetAggsForMyDataFDEnd[numTargetAggs] = fd_end[currentAggRankListIndex];
             targetAggsForMyDataFirstOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
             if (bufTypeIsContig)
@@ -1071,16 +1087,20 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
     ADIO_Offset greatestFileDomainOffset = 0;
     ADIO_Offset smallestFileDomainOffset = lastFileOffset;
     for (j=0;j<naggs;j++) {
+      /* Find the actual lowest and highest offsets to be written.
+         The non-aggs need to know this too to adjust the mpi_get
+         window displacement accordingly.
+       */
+      if (fd_end[j] > greatestFileDomainOffset) {
+        greatestFileDomainOffset = fd_end[j];
+        greatestFileDomainAggRank = j;
+      }
+      if (fd_start[j] < smallestFileDomainOffset) {
+        smallestFileDomainOffset = fd_start[j];
+        smallestFileDomainAggRank = j;
+      }
       if (fd->hints->ranklist[j] == myrank) {
         myAggRank = j;
-        if (fd_end[j] > greatestFileDomainOffset) {
-          greatestFileDomainOffset = fd_end[j];
-          greatestFileDomainAggRank = j;
-        }
-        if (fd_start[j] < smallestFileDomainOffset) {
-          smallestFileDomainOffset = fd_start[j];
-          smallestFileDomainAggRank = j;
-        }
         if (fd_end[j] > fd_start[j]) {
           iAmUsedAgg = 1;
         }
@@ -1277,8 +1297,14 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
       /* Initialize the data structures if this is the first offset in the round/source agg.
        */
       if (sourceAggsForMyDataFirstOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] == -1) {
-         sourceAggsForMyData[numSourceAggs] = fd->hints->ranklist[currentAggRankListIndex];
+        sourceAggsForMyData[numSourceAggs] = fd->hints->ranklist[currentAggRankListIndex];
         sourceAggsForMyDataFDStart[numSourceAggs] = fd_start[currentAggRankListIndex];
+        /* Round up file domain to the first actual offset used if this is the first file domain.
+         */
+        if (currentAggRankListIndex == smallestFileDomainAggRank) {
+          if (sourceAggsForMyDataFDStart[numSourceAggs] < firstFileOffset)
+            sourceAggsForMyDataFDStart[numSourceAggs] = firstFileOffset;
+        }
         sourceAggsForMyDataFDEnd[numSourceAggs] = fd_end[currentAggRankListIndex];
         sourceAggsForMyDataFirstOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
         if (bufTypeIsContig)
@@ -1342,8 +1368,14 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
            */
           if (blockEnd >= fd_start[currentAggRankListIndex]) {
             numSourceAggs++;
-             sourceAggsForMyData[numSourceAggs] = fd->hints->ranklist[currentAggRankListIndex];
+            sourceAggsForMyData[numSourceAggs] = fd->hints->ranklist[currentAggRankListIndex];
             sourceAggsForMyDataFDStart[numSourceAggs] = fd_start[currentAggRankListIndex];
+            /* Round up file domain to the first actual offset used if this is the first file domain.
+             */
+            if (currentAggRankListIndex == smallestFileDomainAggRank) {
+              if (sourceAggsForMyDataFDStart[numSourceAggs] < firstFileOffset)
+                sourceAggsForMyDataFDStart[numSourceAggs] = firstFileOffset;
+            }
             sourceAggsForMyDataFDEnd[numSourceAggs] = fd_end[currentAggRankListIndex];
             sourceAggsForMyDataFirstOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
             if (bufTypeIsContig)

http://git.mpich.org/mpich.git/commitdiff/398aeb4fa597ab55aff917066583e5e665631037

commit 398aeb4fa597ab55aff917066583e5e665631037
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Wed Mar 11 21:41:04 2015 -0500

    ROMIO Collective IO One-sided aggregation algorithm non-contiguous
    source buffer bug fixes
    
    The CESM climate model decomps for fill-value support exposed several
    bugs in the algorithm related to non-contiguous source buffers which
    have been fixed.  Those issues include:
    Mishandling of ranks with no data.
    Miscalculations of the source buffer offsets utilizing the flattened
    buffer mechanisms.
    Mishandling of negative source buffer offsets.
    Inefficient and inaccurate memory management of temporary buffers used
    to collect non-contigous chunks for a given file offset.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/common/onesided_aggregation.c b/src/mpi/romio/adio/common/onesided_aggregation.c
index 695edde..1068fab 100644
--- a/src/mpi/romio/adio/common/onesided_aggregation.c
+++ b/src/mpi/romio/adio/common/onesided_aggregation.c
@@ -17,7 +17,7 @@
 typedef struct NonContigSourceBufOffset {
   int dataTypeExtent;
   int flatBufIndice;
-  ADIO_Offset remLen;
+  ADIO_Offset indiceOffset;
 } NonContigSourceBufOffset;
 
 void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
@@ -74,20 +74,30 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
       flatBuf = ADIOI_Flatlist;
       while (flatBuf->type != datatype) flatBuf = flatBuf->next;
       MPI_Type_extent(datatype, &bufTypeExtent);
+#ifdef onesidedtrace
+      printf("flatBuf->count is %d bufTypeExtent is %d\n", flatBuf->count,bufTypeExtent);
+      for (i=0;i<flatBuf->count;i++)
+        printf("flatBuf->blocklens[%d] is %d flatBuf->indices[%d] is %ld\n",i,flatBuf->blocklens[i],i,flatBuf->indices[i]);
+#endif
     }
 
 #ifdef onesidedtrace
-    printf(" ADIOI_OneSidedWriteAggregation bufTypeIsContig is %d maxNumContigOperations is %d\n",bufTypeIsContig,maxNumContigOperations);
+    printf(" ADIOI_OneSidedWriteAggregation bufTypeIsContig is %d contig_access_count is %d\n",bufTypeIsContig,contig_access_count);
 #endif
 
-    ADIO_Offset myOffsetStart = st_offsets[myrank], myOffsetEnd = end_offsets[myrank], currentRoundOffsetStart = st_offsets[myrank];
-
-    ADIO_Offset lastFileOffset = 0, firstFileOffset = myOffsetEnd;
+    ADIO_Offset lastFileOffset = 0, firstFileOffset = -1;
     /* Get the total range being written.
      */
     for (j=0;j<nprocs;j++) {
-      lastFileOffset = ADIOI_MAX(lastFileOffset,end_offsets[j]);
-      firstFileOffset = ADIOI_MIN(firstFileOffset,st_offsets[j]);
+      if (end_offsets[j] > st_offsets[j]) {
+        /* Guard against ranks with empty data.
+         */
+        lastFileOffset = ADIOI_MAX(lastFileOffset,end_offsets[j]);
+        if (firstFileOffset == -1)
+          firstFileOffset = st_offsets[j];
+        else
+          firstFileOffset = ADIOI_MIN(firstFileOffset,st_offsets[j]);
+      }
     }
 
     int myAggRank = -1; /* if I am an aggregor this is my index into fd->hints->ranklist */
@@ -195,7 +205,7 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
 
     int currentDataTypeExtent = 0;
     int currentFlatBufIndice=0;
-    ADIO_Offset currentRemLen = 0;
+    ADIO_Offset currentIndiceOffset = 0;
 
 #ifdef onesidedtrace
    printf("NumberOfRounds is %d\n",numberOfRounds);
@@ -206,6 +216,7 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
 #endif
 
     int currentAggRankListIndex = 0;
+    int maxNumNonContigSourceChunks = 0;
 
     /* This denotes the coll_bufsize boundaries within the source buffer for writing for the same round.
      */
@@ -232,31 +243,44 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
           /* Non-contiguous source datatype, count up the extents and indices to this point
            * in the blocks for use in computing the source buffer offset.
            */
-          ADIO_Offset sourceBlockTotal = 0-currentRemLen;
+          ADIO_Offset sourceBlockTotal = 0;
+          int lastIndiceUsed = currentFlatBufIndice;
+          int numNonContigSourceChunks = 0;
+#ifdef onesidedtrace
+          printf("blockIter %d len_list[blockIter-1] is %d currentIndiceOffset is %ld currentFlatBufIndice is %d\n",blockIter,len_list[blockIter-1],currentIndiceOffset,currentFlatBufIndice);
+#endif
           while (sourceBlockTotal < len_list[blockIter-1]) {
-            maxNumContigOperations++;
-            sourceBlockTotal += flatBuf->blocklens[currentFlatBufIndice];
+            numNonContigSourceChunks++;
+            sourceBlockTotal += (flatBuf->blocklens[currentFlatBufIndice] - currentIndiceOffset);
+            lastIndiceUsed = currentFlatBufIndice;
             currentFlatBufIndice++;
             if (currentFlatBufIndice == flatBuf->count) {
               currentFlatBufIndice = 0;
               currentDataTypeExtent++;
             }
+            currentIndiceOffset = 0;
           }
           if (sourceBlockTotal > len_list[blockIter-1]) {
             currentFlatBufIndice--;
             if (currentFlatBufIndice < 0 ) {
               currentDataTypeExtent--;
-              currentFlatBufIndice = flatBuf->count;
+              currentFlatBufIndice = flatBuf->count-1;
             }
+            currentIndiceOffset =  len_list[blockIter-1] - (sourceBlockTotal - flatBuf->blocklens[lastIndiceUsed]);
+            ADIOI_Assert((currentIndiceOffset >= 0) && (currentIndiceOffset < flatBuf->blocklens[currentFlatBufIndice]));
           }
-          currentRemLen = len_list[blockIter-1] - sourceBlockTotal;
+          else
+            currentIndiceOffset = 0;
+          maxNumContigOperations += numNonContigSourceChunks;
+          if (numNonContigSourceChunks > maxNumNonContigSourceChunks)
+            maxNumNonContigSourceChunks = numNonContigSourceChunks;
 #ifdef onesidedtrace
-          printf("contig_access_count iter %d currentFlatBufIndice is now %d currentDataTypeExtent is now %d currentRemLen is now %ld\n",blockIter,currentFlatBufIndice,currentDataTypeExtent,currentRemLen);
+          printf("blockiter %d currentFlatBufIndice is now %d currentDataTypeExtent is now %d currentIndiceOffset is now %ld maxNumContigOperations is now %d\n",blockIter,currentFlatBufIndice,currentDataTypeExtent,currentIndiceOffset,maxNumContigOperations);
 #endif
         } // !bufTypeIsContig
       } // blockIter > 0
 
-      /* For the first iteration we need to include these maxNumContigOperations
+      /* For the first iteration we need to include these maxNumContigOperations and maxNumNonContigSourceChunks
        * for non-contig case even though we did not need to compute the starting offset.
        */
       if ((blockIter == 0) && (!bufTypeIsContig)) {
@@ -264,6 +288,7 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
         int tmpCurrentFlatBufIndice = currentFlatBufIndice;
         while (sourceBlockTotal < len_list[0]) {
           maxNumContigOperations++;
+          maxNumNonContigSourceChunks++;
           sourceBlockTotal += flatBuf->blocklens[tmpCurrentFlatBufIndice];
           tmpCurrentFlatBufIndice++;
           if (tmpCurrentFlatBufIndice == flatBuf->count) {
@@ -272,7 +297,7 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
         }
       }
 
-      ADIO_Offset blockStart = offset_list[blockIter], blockLen = len_list[blockIter], blockEnd = offset_list[blockIter]+len_list[blockIter]-(ADIO_Offset)1;
+      ADIO_Offset blockStart = offset_list[blockIter], blockEnd = offset_list[blockIter]+len_list[blockIter]-(ADIO_Offset)1;
 
       /* Find the starting target agg for this block - normally it will be the current agg so guard the expensive
        * while loop with a cheap if-check which for large numbers of small blocks will usually be false.
@@ -317,7 +342,7 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
         else {
           baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].flatBufIndice = currentFlatBufIndice;
           baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].dataTypeExtent = currentDataTypeExtent;
-          baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].remLen = currentRemLen;
+          baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].indiceOffset = currentIndiceOffset;
         }
 
         intraRoundCollBufsizeOffset = fd_start[currentAggRankListIndex] + ((targetAggsForMyDataCurrentRoundIter[numTargetAggs]+1) * coll_bufsize);
@@ -349,7 +374,7 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
               else {
                 baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].flatBufIndice = currentFlatBufIndice;
                 baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].dataTypeExtent = currentDataTypeExtent;
-                baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].remLen = currentRemLen;
+                baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].indiceOffset = currentIndiceOffset;
               }
 
               targetAggsForMyDataLastOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
@@ -382,7 +407,7 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
             else {
               baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].flatBufIndice = currentFlatBufIndice;
               baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].dataTypeExtent = currentDataTypeExtent;
-              baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].remLen = currentRemLen;
+              baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].indiceOffset = currentIndiceOffset;
             }
 #ifdef onesidedtrace
             printf("large block init settings numTargetAggs %d offset_list[%d] with value %ld past fd border %ld with len %ld\n",numTargetAggs,i,offset_list[blockIter],fd_start[currentAggRankListIndex],len_list[blockIter]);
@@ -407,7 +432,7 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
           else {
             baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].flatBufIndice = currentFlatBufIndice;
             baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].dataTypeExtent = currentDataTypeExtent;
-            baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].remLen = currentRemLen;
+            baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].indiceOffset = currentIndiceOffset;
           }
           targetAggsForMyDataLastOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
 #ifdef onesidedtrace
@@ -475,6 +500,16 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
 
     ADIO_Offset currentBaseSourceBufferOffset = 0;
 
+    /* These data structures are used to track the offset/len pairs for non-contiguous source buffers that are
+     * to be used for each block of data in the offset list.  Allocate them once with the maximum size and then 
+     * reuse the space throughout the algoritm below.
+     */
+    ADIO_Offset *nonContigSourceOffsets;
+    int *nonContigSourceLens;
+    if (!bufTypeIsContig) {
+      nonContigSourceOffsets = (ADIO_Offset *)ADIOI_Malloc((maxNumNonContigSourceChunks+2) * sizeof(ADIO_Offset));
+      nonContigSourceLens = (int *)ADIOI_Malloc((maxNumNonContigSourceChunks+2) * sizeof(int));
+    }
     /* This is the second main loop of the algorithm, actually nested loop of target aggs within rounds.  There are 2 flavors of this.
      * For gpfsmpio_aggmethod of 1 each nested iteration for the target agg does an mpi_put on a contiguous chunk using a primative datatype
      * determined using the data structures from the first main loop.  For gpfsmpio_aggmethod of 2 each nested iteration for the target agg
@@ -513,7 +548,7 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
        */
       int offsetIter;
       int startingOffLenIndex = targetAggsForMyDataFirstOffLenIndex[roundIter][aggIter], endingOffLenIndex = targetAggsForMyDataLastOffLenIndex[roundIter][aggIter];
-         for (offsetIter=startingOffLenIndex;offsetIter<=endingOffLenIndex;offsetIter++) {
+      for (offsetIter=startingOffLenIndex;offsetIter<=endingOffLenIndex;offsetIter++) {
         if (currentRoundFDEndForMyTargetAgg > targetAggsForMyDataFDEnd[aggIter])
             currentRoundFDEndForMyTargetAgg = targetAggsForMyDataFDEnd[aggIter];
 
@@ -527,7 +562,7 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
           if (bufTypeIsContig)
             currentBaseSourceBufferOffset = baseSourceBufferOffset[roundIter][aggIter];
           else {
-            currentRemLen = baseNonContigSourceBufferOffset[roundIter][aggIter].remLen;
+            currentIndiceOffset = baseNonContigSourceBufferOffset[roundIter][aggIter].indiceOffset;
             currentDataTypeExtent = baseNonContigSourceBufferOffset[roundIter][aggIter].dataTypeExtent;
             currentFlatBufIndice = baseNonContigSourceBufferOffset[roundIter][aggIter].flatBufIndice;
           }
@@ -537,22 +572,35 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
             currentBaseSourceBufferOffset += len_list[offsetIter-1];
           else {
 
-          /* For non-contiguous source datatype count up the extents and indices to this point.
+          /* For non-contiguous source datatype advance the flattened buffer machinery to this offset.
+           * Note that currentDataTypeExtent, currentFlatBufIndice and currentIndiceOffset are used and
+           * advanced across the offsetIters.
            */
-          ADIO_Offset sourceBlockTotal = 0-currentRemLen;
+          ADIO_Offset sourceBlockTotal = 0;
+          int lastIndiceUsed = currentFlatBufIndice;
           while (sourceBlockTotal < len_list[offsetIter-1]) {
-            sourceBlockTotal += flatBuf->blocklens[currentFlatBufIndice];
+            sourceBlockTotal += (flatBuf->blocklens[currentFlatBufIndice] - currentIndiceOffset);
+            lastIndiceUsed = currentFlatBufIndice;
             currentFlatBufIndice++;
             if (currentFlatBufIndice == flatBuf->count) {
               currentFlatBufIndice = 0;
               currentDataTypeExtent++;
             }
+            currentIndiceOffset = 0;
           } // while
-          if (sourceBlockTotal > len_list[offsetIter-1])
+          if (sourceBlockTotal > len_list[offsetIter-1]) {
             currentFlatBufIndice--;
-          currentRemLen = len_list[offsetIter-1] - sourceBlockTotal;
+            if (currentFlatBufIndice < 0 ) {
+              currentDataTypeExtent--;
+              currentFlatBufIndice = flatBuf->count-1;
+            }
+            currentIndiceOffset =  len_list[offsetIter-1] - (sourceBlockTotal - flatBuf->blocklens[lastIndiceUsed]);
+            ADIOI_Assert((currentIndiceOffset >= 0) && (currentIndiceOffset < flatBuf->blocklens[currentFlatBufIndice]));
+          }
+          else
+            currentIndiceOffset = 0;
 #ifdef onesidedtrace
-          printf("contig_access_count target agg %d currentFlatBufIndice is now %d currentDataTypeExtent is now %d currentRemLen is now %ld\n",aggIter,currentFlatBufIndice,currentDataTypeExtent,currentRemLen);
+          printf("offsetIter %d contig_access_count target agg %d currentFlatBufIndice is now %d currentDataTypeExtent is now %d currentIndiceOffset is now %ld\n",offsetIter,aggIter,currentFlatBufIndice,currentDataTypeExtent,currentIndiceOffset);
 #endif
           }
         }
@@ -597,6 +645,9 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
             offsetStart = currentRoundFDStartForMyTargetAgg;
         }
 
+#ifdef onesidedtrace
+        printf("bufferAmountToSend is %d\n",bufferAmountToSend);
+#endif
         if (bufferAmountToSend > 0) { /* we have data to send this round */
           if (gpfsmpio_aggmethod == 2) {
             /* Only allocate these arrays if we are using method 2 and only do it once for this round/target agg.
@@ -613,73 +664,55 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
            * from the source buffer to be sent to this contiguous chunk defined by the round/agg/iter in the target.
            */
           int numNonContigSourceChunks = 0;
-          ADIO_Offset *nonContigSourceOffsets;
-          int *nonContigSourceLens;
           ADIO_Offset baseDatatypeInstanceOffset = 0;
 
           if (!bufTypeIsContig) {
 
-            currentSourceBufferOffset = (ADIO_Offset)((ADIO_Offset)currentDataTypeExtent * (ADIO_Offset)bufTypeExtent) + flatBuf->indices[currentFlatBufIndice] + currentRemLen;
+            currentSourceBufferOffset = (ADIO_Offset)((ADIO_Offset)currentDataTypeExtent * (ADIO_Offset)bufTypeExtent) + flatBuf->indices[currentFlatBufIndice] + currentIndiceOffset;
 #ifdef onesidedtrace
-            printf("!bufTypeIsContig currentSourceBufferOffset set to %ld for roundIter %d target %d currentDataTypeExtent %d flatBuf->indices[currentFlatBufIndice] %ld currentRemLen %ld currentFlatBufIndice %d\n",currentSourceBufferOffset,roundIter,aggIter,currentDataTypeExtent,flatBuf->indices[currentFlatBufIndice],currentRemLen,currentFlatBufIndice);
+            printf("!bufTypeIsContig currentSourceBufferOffset set to %ld for roundIter %d target %d currentDataTypeExtent %d flatBuf->indices[currentFlatBufIndice] %ld currentIndiceOffset %ld currentFlatBufIndice %d\n",currentSourceBufferOffset,roundIter,aggIter,currentDataTypeExtent,flatBuf->indices[currentFlatBufIndice],currentIndiceOffset,currentFlatBufIndice);
 #endif
 
-            /* Count the chunks first to see how much to malloc from the ending point from above code.
+            /* Use a tmp variable for the currentFlatBufIndice and currentIndiceOffset as they are used across the offsetIters
+             * to compute the starting point for this iteration and will be modified now to compute the data chunks for
+             * this iteration.
              */
-            int sendBytesCounted = 0;
             int tmpFlatBufIndice = currentFlatBufIndice;
-            int maxNumNonContigSourceChunks = 2; // over-initialize for potential remnants on both ends
-
-            while (sendBytesCounted < bufferAmountToSend) {
-              maxNumNonContigSourceChunks++;
-              if (tmpFlatBufIndice == flatBuf->count) {
-                tmpFlatBufIndice = 0;
-              }
-              sendBytesCounted += flatBuf->blocklens[tmpFlatBufIndice];
-              tmpFlatBufIndice++;
-            }
-
-            nonContigSourceOffsets = (ADIO_Offset *)ADIOI_Malloc(maxNumNonContigSourceChunks * sizeof(ADIO_Offset));
-            nonContigSourceLens = (int *)ADIOI_Malloc(maxNumNonContigSourceChunks * sizeof(int));
+            ADIO_Offset tmpIndiceOffset = currentIndiceOffset;
 
             /* now populate the nonContigSourceOffsets and nonContigSourceLens arrays for use in the one-sided operations.
              */
             int ncArrayIndex = 0;
             int remainingBytesToLoadedIntoNCArrays = bufferAmountToSend;
-            ADIO_Offset indexIntoCurrentIndice = 0;
-            if (currentRemLen > 0)
-              indexIntoCurrentIndice = flatBuf->blocklens[currentFlatBufIndice] - currentRemLen;
-
             int datatypeInstances = currentDataTypeExtent;
-            tmpFlatBufIndice = currentFlatBufIndice;
             while (remainingBytesToLoadedIntoNCArrays > 0) {
-             nonContigSourceOffsets[ncArrayIndex] = currentSourceBufferOffset;
+              ADIOI_Assert(ncArrayIndex < (maxNumNonContigSourceChunks+2));
+              nonContigSourceOffsets[ncArrayIndex] = currentSourceBufferOffset;
 
-              if ((flatBuf->blocklens[tmpFlatBufIndice] - indexIntoCurrentIndice) > remainingBytesToLoadedIntoNCArrays) {
+              if ((flatBuf->blocklens[tmpFlatBufIndice] - tmpIndiceOffset) >= remainingBytesToLoadedIntoNCArrays) {
                 nonContigSourceLens[ncArrayIndex] = remainingBytesToLoadedIntoNCArrays;
                 remainingBytesToLoadedIntoNCArrays = 0;
               }
               else {
-                nonContigSourceLens[ncArrayIndex] = (int)(flatBuf->blocklens[tmpFlatBufIndice] - indexIntoCurrentIndice);
-                remainingBytesToLoadedIntoNCArrays -= (flatBuf->blocklens[tmpFlatBufIndice] - indexIntoCurrentIndice);
-              }
-              indexIntoCurrentIndice = 0; // only worry about beginning remnant for first iter
-
-              tmpFlatBufIndice++;
-              if (tmpFlatBufIndice == flatBuf->count) {
-                tmpFlatBufIndice = 0;
-                datatypeInstances++;
-                baseDatatypeInstanceOffset = datatypeInstances * bufTypeExtent;
+                nonContigSourceLens[ncArrayIndex] = (int)(flatBuf->blocklens[tmpFlatBufIndice] - tmpIndiceOffset);
+                remainingBytesToLoadedIntoNCArrays -= (flatBuf->blocklens[tmpFlatBufIndice] - tmpIndiceOffset);
+                tmpIndiceOffset = 0;
+                tmpFlatBufIndice++;
+                if (tmpFlatBufIndice == flatBuf->count) {
+                  tmpFlatBufIndice = 0;
+                  datatypeInstances++;
+                  baseDatatypeInstanceOffset = datatypeInstances * bufTypeExtent;
+                }
+                currentSourceBufferOffset = baseDatatypeInstanceOffset + flatBuf->indices[tmpFlatBufIndice];
               }
-              currentSourceBufferOffset = baseDatatypeInstanceOffset + flatBuf->indices[tmpFlatBufIndice];
+#ifdef onesidedtrace
+              printf("currentSourceBufferOffset set to %ld off of baseDatatypeInstanceOffset of %ld + tmpFlatBufIndice %d with value %ld\n ncArrayIndex is %d nonContigSourceOffsets[ncArrayIndex] is %ld nonContigSourceLens[ncArrayIndex] is %ld\n",currentSourceBufferOffset,baseDatatypeInstanceOffset, tmpFlatBufIndice, flatBuf->indices[tmpFlatBufIndice],ncArrayIndex,nonContigSourceOffsets[ncArrayIndex],nonContigSourceLens[ncArrayIndex]);
+#endif
               ncArrayIndex++;
               numNonContigSourceChunks++;
             } // while
-#ifdef onesidedtrace
-            printf("CurrentSourceBufferOffset finally set to %ld\n",currentSourceBufferOffset);
-#endif
-          } // !bufTypeIsContig
 
+          } // !bufTypeIsContig
 
           /* Determine the offset into the target window.
            */
@@ -698,13 +731,14 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
             MPI_Win_lock(MPI_LOCK_SHARED, targetAggsForMyData[aggIter], 0, write_buf_window);
 #endif
             if (bufTypeIsContig) {
-              MPI_Put(&((char*)buf)[sourceBufferOffset],bufferAmountToSend, MPI_BYTE,targetAggsForMyData[aggIter],targetDisplacementToUseThisRound, bufferAmountToSend,MPI_BYTE,write_buf_window);
+              MPI_Put(((char*)buf) + sourceBufferOffset,bufferAmountToSend, MPI_BYTE,targetAggsForMyData[aggIter],targetDisplacementToUseThisRound, bufferAmountToSend,MPI_BYTE,write_buf_window);
             }
             else {
               for (i=0;i<numNonContigSourceChunks;i++) {
-                MPI_Put(&((char*)buf)[nonContigSourceOffsets[i]],nonContigSourceLens[i], MPI_BYTE,targetAggsForMyData[aggIter],targetDisplacementToUseThisRound, nonContigSourceLens[i],MPI_BYTE,write_buf_window);
+                MPI_Put(((char*)buf) + nonContigSourceOffsets[i],nonContigSourceLens[i], MPI_BYTE,targetAggsForMyData[aggIter],targetDisplacementToUseThisRound, nonContigSourceLens[i],MPI_BYTE,write_buf_window);
+
 #ifdef onesidedtrace
-                printf("mpi_put[%d] nonContigSourceOffsets is %d of nonContigSourceLens %d to target disp %d\n",i,nonContigSourceOffsets[i],nonContigSourceLens[i],targetDisplacementToUseThisRound);
+                printf("mpi_put[%d] nonContigSourceOffsets is %d of nonContigSourceLens %d to target disp %d first int of data: %d\n ",i,nonContigSourceOffsets[i],nonContigSourceLens[i],targetDisplacementToUseThisRound, ((int*)(((char*)buf) + nonContigSourceOffsets[i]))[0]);
 #endif
                 targetDisplacementToUseThisRound += nonContigSourceLens[i];
               }
@@ -729,6 +763,7 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
             }
             else {
               for (i=0;i<numNonContigSourceChunks;i++) {
+              if (nonContigSourceLens[i] > 0) {
                 targetAggBlockLengths[targetAggContigAccessCount]= nonContigSourceLens[i];
                 targetAggDataTypes[targetAggContigAccessCount] = MPI_BYTE;
                 targetAggDisplacements[targetAggContigAccessCount] = targetDisplacementToUseThisRound;
@@ -739,16 +774,13 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
                 targetAggContigAccessCount++;
                 targetDisplacementToUseThisRound += nonContigSourceLens[i];
               }
+              }
             }
           }
 #ifdef onesidedtrace
         printf("roundIter %d bufferAmountToSend is %d sourceBufferOffset is %d offsetStart is %ld currentRoundFDStartForMyTargetAgg is %ld targetDisplacementToUseThisRound is %ld targetAggsForMyDataFDStart[aggIter] is %ld\n",roundIter, bufferAmountToSend,sourceBufferOffset, offsetStart,currentRoundFDStartForMyTargetAgg,targetDisplacementToUseThisRound,targetAggsForMyDataFDStart[aggIter]);
 #endif
 
-          if (!bufTypeIsContig) {
-            ADIOI_Free(nonContigSourceOffsets);
-            ADIOI_Free(nonContigSourceLens);
-          }
         } // bufferAmountToSend > 0
       } // contig list
 
@@ -756,6 +788,20 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
        */
       if (gpfsmpio_aggmethod == 2) {
         MPI_Datatype sourceBufferDerivedDataType, targetBufferDerivedDataType;
+        if (targetAggContigAccessCount > 0) {
+        /* Rebase source buffer offsets to 0 if there are any negative offsets for safety
+         * when iteracting with PAMI.
+         */
+        MPI_Aint lowestDisplacement = 0;
+        for (i=0;i<targetAggContigAccessCount;i++) {
+          if (sourceBufferDisplacements[i] < lowestDisplacement)
+            lowestDisplacement = sourceBufferDisplacements[i];
+        }
+        if (lowestDisplacement  < 0) {
+          lowestDisplacement *= -1;
+          for (i=0;i<targetAggContigAccessCount;i++)
+            sourceBufferDisplacements[i] += lowestDisplacement;
+        }
         MPI_Type_create_struct(targetAggContigAccessCount, targetAggBlockLengths, sourceBufferDisplacements, targetAggDataTypes, &sourceBufferDerivedDataType);
         MPI_Type_commit(&sourceBufferDerivedDataType);
         MPI_Type_create_struct(targetAggContigAccessCount, targetAggBlockLengths, targetAggDisplacements, targetAggDataTypes, &targetBufferDerivedDataType);
@@ -768,20 +814,23 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
         MPI_Win_lock(MPI_LOCK_SHARED, targetAggsForMyData[aggIter], 0, write_buf_window);
 #endif
 
-        MPI_Put(((char*)buf),1, sourceBufferDerivedDataType,targetAggsForMyData[aggIter],0, 1,targetBufferDerivedDataType,write_buf_window);
+        MPI_Put((((char*)buf) - lowestDisplacement),1, sourceBufferDerivedDataType,targetAggsForMyData[aggIter],0, 1,targetBufferDerivedDataType,write_buf_window);
 
 #ifndef ACTIVE_TARGET
         MPI_Win_unlock(targetAggsForMyData[aggIter], write_buf_window);
 #endif
 
+        }
         if (allocatedDerivedTypeArrays) {
           ADIOI_Free(targetAggBlockLengths);
           ADIOI_Free(targetAggDisplacements);
           ADIOI_Free(targetAggDataTypes);
           ADIOI_Free(sourceBufferDisplacements);
         }
+        if (targetAggContigAccessCount > 0) {
         MPI_Type_free(&sourceBufferDerivedDataType);
         MPI_Type_free(&targetBufferDerivedDataType);
+        }
       }
       } // baseoffset != -1
     } // target aggs
@@ -817,7 +866,6 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
         if (!useIOBuffer) {
           ADIO_WriteContig(fd, write_buf, (int)(currentRoundFDEnd - currentRoundFDStart)+1,
             MPI_BYTE, ADIO_EXPLICIT_OFFSET,currentRoundFDStart, &status, error_code);
-          int numBytesWritten= (int)(currentRoundFDEnd - currentRoundFDStart)+1;
 
 /* For now this algorithm cannot handle holes in the source data and does not do any data sieving.
  * One possible approach would be to initialize the write buffer with some value and then check to
@@ -825,7 +873,7 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
  * default initialized value and if that was still unchanged then you would know where a hole was.
  * Here is some initial sample code for that:
  *       if (roundIter<(numberOfRounds-1)) {
- *         for (i=0;i<numBytesWritten;i++)
+ *         for (i=0;i<((currentRoundFDEnd - currentRoundFDStart)+1);i++)
  *           write_buf[i] = '\0';
  *       }
 */
@@ -885,6 +933,11 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
 
     } /* for-loop roundIter */
 
+    if (!bufTypeIsContig) {
+      ADIOI_Free(nonContigSourceOffsets);
+      ADIOI_Free(nonContigSourceLens);
+    }
+
 #ifdef ROMIO_GPFS
     endTimeBase = MPI_Wtime();
     gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH] += (endTimeBase-startTimeBase);
@@ -918,6 +971,8 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
     else
       ADIOI_Free(baseNonContigSourceBufferOffset);
 
+    if (!bufTypeIsContig)
+      ADIOI_Delete_flattened(datatype);
     return;
 }
 
@@ -978,18 +1033,23 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
       MPI_Type_extent(datatype, &bufTypeExtent);
     }
 #ifdef onesidedtrace
-      printf("ADIOI_OneSidedReadAggregation bufTypeIsContig is %d maxNumContigOperations is %d\n",bufTypeIsContig,maxNumContigOperations);
+      printf("ADIOI_OneSidedReadAggregation bufTypeIsContig is %d contig_access_count is %d\n",bufTypeIsContig,contig_access_count);
 #endif
 
-    ADIO_Offset myOffsetStart = st_offsets[myrank], myOffsetEnd = end_offsets[myrank], currentRoundOffsetStart = st_offsets[myrank];
-
-    ADIO_Offset lastFileOffset = 0, firstFileOffset = myOffsetEnd;
+    ADIO_Offset lastFileOffset = 0, firstFileOffset = -1;
 
     /* Get the total range being read.
      */
     for (j=0;j<nprocs;j++) {
-      lastFileOffset = ADIOI_MAX(lastFileOffset,end_offsets[j]);
-      firstFileOffset = ADIOI_MIN(firstFileOffset,st_offsets[j]);
+      if (end_offsets[j] > st_offsets[j]) {
+        /* Guard against ranks with empty data.
+         */
+        lastFileOffset = ADIOI_MAX(lastFileOffset,end_offsets[j]);
+        if (firstFileOffset == -1)
+          firstFileOffset = st_offsets[j];
+        else
+          firstFileOffset = ADIOI_MIN(firstFileOffset,st_offsets[j]);
+      }
     }
 
     int myAggRank = -1; /* if I am an aggregor this is my index into fd->hints->ranklist */
@@ -1095,7 +1155,7 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
 
     int currentDataTypeExtent = 0;
     int currentFlatBufIndice=0;
-    ADIO_Offset currentRemLen = 0;
+    ADIO_Offset currentIndiceOffset = 0;
 
 #ifdef onesidedtrace
     printf("NumberOfRounds is %d\n",numberOfRounds);
@@ -1106,6 +1166,7 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
 #endif
 
     int currentAggRankListIndex = 0;
+    int maxNumNonContigSourceChunks = 0;
 
     /* This denotes the coll_bufsize boundaries within the source buffer for reading for 1 round.
      */
@@ -1133,31 +1194,41 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
           /* Non-contiguous source datatype, count up the extents and indices to this point
            * in the blocks.
            */
-          ADIO_Offset sourceBlockTotal = 0-currentRemLen;
+          ADIO_Offset sourceBlockTotal = 0;
+          int lastIndiceUsed;
+          int numNonContigSourceChunks = 0;
           while (sourceBlockTotal < len_list[blockIter-1]) {
-            maxNumContigOperations++;
-            sourceBlockTotal += flatBuf->blocklens[currentFlatBufIndice];
+            numNonContigSourceChunks++;
+            sourceBlockTotal += (flatBuf->blocklens[currentFlatBufIndice] - currentIndiceOffset);
+            lastIndiceUsed = currentFlatBufIndice;
             currentFlatBufIndice++;
             if (currentFlatBufIndice == flatBuf->count) {
               currentFlatBufIndice = 0;
               currentDataTypeExtent++;
             }
+            currentIndiceOffset = 0;
           }
           if (sourceBlockTotal > len_list[blockIter-1]) {
             currentFlatBufIndice--;
             if (currentFlatBufIndice < 0 ) {
               currentDataTypeExtent--;
-              currentFlatBufIndice = flatBuf->count;
+              currentFlatBufIndice = flatBuf->count-1;
             }
+            currentIndiceOffset =  len_list[blockIter-1] - (sourceBlockTotal - flatBuf->blocklens[lastIndiceUsed]);
+            ADIOI_Assert((currentIndiceOffset >= 0) && (currentIndiceOffset < flatBuf->blocklens[currentFlatBufIndice]));
           }
-          currentRemLen = len_list[blockIter-1] - sourceBlockTotal;
+          else
+            currentIndiceOffset = 0;
+          maxNumContigOperations += numNonContigSourceChunks;
+          if (numNonContigSourceChunks > maxNumNonContigSourceChunks)
+            maxNumNonContigSourceChunks = numNonContigSourceChunks;
 #ifdef onesidedtrace
-          printf("contig_access_count iter %d currentFlatBufIndice is now %d currentDataTypeExtent is now %d currentRemLen is now %ld\n",blockIter,currentFlatBufIndice,currentDataTypeExtent,currentRemLen);
+          printf("block iter %d currentFlatBufIndice is now %d currentDataTypeExtent is now %d currentIndiceOffset is now %ld maxNumContigOperations is now %d\n",blockIter,currentFlatBufIndice,currentDataTypeExtent,currentIndiceOffset,maxNumContigOperations);
 #endif
         } // !bufTypeIsContig
       } // blockIter > 0
 
-      /* For the first iteration we need to include these maxNumContigOperations
+      /* For the first iteration we need to include these maxNumContigOperations and maxNumNonContigSourceChunks
        * for non-contig case even though we did not need to compute the starting offset.
        */
       if ((blockIter == 0) && (!bufTypeIsContig)) {
@@ -1165,6 +1236,7 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
         int tmpCurrentFlatBufIndice = currentFlatBufIndice;
         while (sourceBlockTotal < len_list[0]) {
           maxNumContigOperations++;
+          maxNumNonContigSourceChunks++;
           sourceBlockTotal += flatBuf->blocklens[tmpCurrentFlatBufIndice];
           tmpCurrentFlatBufIndice++;
           if (tmpCurrentFlatBufIndice == flatBuf->count) {
@@ -1173,7 +1245,7 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
         }
       }
 
-      ADIO_Offset blockStart = offset_list[blockIter], blockLen = len_list[blockIter], blockEnd = offset_list[blockIter]+len_list[blockIter]-(ADIO_Offset)1;
+      ADIO_Offset blockStart = offset_list[blockIter], blockEnd = offset_list[blockIter]+len_list[blockIter]-(ADIO_Offset)1;
 
       /* Find the starting source agg for this block - normally it will be the current agg so guard the expensive
        * while loop with a cheap if-check which for large numbers of small blocks will usually be false.
@@ -1214,7 +1286,7 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
         else {
           baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].flatBufIndice = currentFlatBufIndice;
           baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].dataTypeExtent = currentDataTypeExtent;
-          baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].remLen = currentRemLen;
+          baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].indiceOffset = currentIndiceOffset;
         }
 
         intraRoundCollBufsizeOffset = fd_start[currentAggRankListIndex] + ((sourceAggsForMyDataCurrentRoundIter[numSourceAggs]+1) * coll_bufsize);
@@ -1246,7 +1318,7 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
               else {
                 baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].flatBufIndice = currentFlatBufIndice;
                 baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].dataTypeExtent = currentDataTypeExtent;
-                baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].remLen = currentRemLen;
+                baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].indiceOffset = currentIndiceOffset;
               }
 
               sourceAggsForMyDataLastOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
@@ -1279,7 +1351,7 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
             else {
               baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].flatBufIndice = currentFlatBufIndice;
               baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].dataTypeExtent = currentDataTypeExtent;
-              baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].remLen = currentRemLen;
+              baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].indiceOffset = currentIndiceOffset;
             }
 
 #ifdef onesidedtrace
@@ -1305,7 +1377,7 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
           else {
             baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].flatBufIndice = currentFlatBufIndice;
             baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].dataTypeExtent = currentDataTypeExtent;
-            baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].remLen = currentRemLen;
+            baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].indiceOffset = currentIndiceOffset;
           }
           sourceAggsForMyDataLastOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
 #ifdef onesidedtrace
@@ -1377,6 +1449,16 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
 
     ADIO_Offset currentBaseRecvBufferOffset = 0;
 
+    /* These data structures are used to track the offset/len pairs for non-contiguous source buffers that are
+     * to be used for each block of data in the offset list.  Allocate them once with the maximum size and then 
+     * reuse the space throughout the algoritm below.
+     */
+    ADIO_Offset *nonContigSourceOffsets;
+    int *nonContigSourceLens;
+    if (!bufTypeIsContig) {
+      nonContigSourceOffsets = (ADIO_Offset *)ADIOI_Malloc((maxNumNonContigSourceChunks+2) * sizeof(ADIO_Offset));
+      nonContigSourceLens = (int *)ADIOI_Malloc((maxNumNonContigSourceChunks+2) * sizeof(int));
+    }
     /* This is the second main loop of the algorithm, actually nested loop of source aggs within rounds.  There are 2 flavors of this.
      * For gpfsmpio_aggmethod of 1 each nested iteration for the source agg does an mpi_put on a contiguous chunk using a primative datatype
      * determined using the data structures from the first main loop.  For gpfsmpio_aggmethod of 2 each nested iteration for the source agg
@@ -1542,7 +1624,7 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
           else {
             currentFlatBufIndice = baseNonContigSourceBufferOffset[roundIter][aggIter].flatBufIndice;
             currentDataTypeExtent = baseNonContigSourceBufferOffset[roundIter][aggIter].dataTypeExtent;
-            currentRemLen = baseNonContigSourceBufferOffset[roundIter][aggIter].remLen;
+            currentIndiceOffset = baseNonContigSourceBufferOffset[roundIter][aggIter].indiceOffset;
 #ifdef onesidedtrace
             printf("currentFlatBufIndice initially set to %d starting this round/agg %d/%d\n",currentFlatBufIndice,roundIter,aggIter);
 #endif
@@ -1553,22 +1635,35 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
             currentBaseRecvBufferOffset += len_list[offsetIter-1];
           else {
 
-          /* For non-contiguous source datatype count up the extents and indices to this point.
+          /* For non-contiguous source datatype advance the flattened buffer machinery to this offset.
+           * Note that currentDataTypeExtent, currentFlatBufIndice and currentIndiceOffset are used and
+           * advanced across the offsetIters.
            */
-          ADIO_Offset sourceBlockTotal = 0-currentRemLen;
+          ADIO_Offset sourceBlockTotal = 0;
+          int lastIndiceUsed;
           while (sourceBlockTotal < len_list[offsetIter-1]) {
-            sourceBlockTotal += flatBuf->blocklens[currentFlatBufIndice];
+            sourceBlockTotal += (flatBuf->blocklens[currentFlatBufIndice] - currentIndiceOffset);
+            lastIndiceUsed = currentFlatBufIndice;
             currentFlatBufIndice++;
             if (currentFlatBufIndice == flatBuf->count) {
               currentFlatBufIndice = 0;
               currentDataTypeExtent++;
             }
+            currentIndiceOffset = 0;
           } // while
-          if (sourceBlockTotal > len_list[offsetIter-1])
+          if (sourceBlockTotal > len_list[offsetIter-1]) {
             currentFlatBufIndice--;
-          currentRemLen = len_list[offsetIter-1] - sourceBlockTotal;
+            if (currentFlatBufIndice < 0 ) {
+              currentDataTypeExtent--;
+              currentFlatBufIndice = flatBuf->count-1;
+            }
+            currentIndiceOffset =  len_list[offsetIter-1] - (sourceBlockTotal - flatBuf->blocklens[lastIndiceUsed]);
+            ADIOI_Assert((currentIndiceOffset >= 0) && (currentIndiceOffset < flatBuf->blocklens[currentFlatBufIndice]));
+          }
+          else
+            currentIndiceOffset = 0;
 #ifdef onesidedtrace
-          printf("contig_access_count source agg %d currentFlatBufIndice is now %d currentDataTypeExtent is now %d currentRemLen is now %ld\n",aggIter,currentFlatBufIndice,currentDataTypeExtent,currentRemLen);
+          printf("contig_access_count source agg %d currentFlatBufIndice is now %d currentDataTypeExtent is now %d currentIndiceOffset is now %ld\n",aggIter,currentFlatBufIndice,currentDataTypeExtent,currentIndiceOffset);
 #endif
           }
 
@@ -1632,65 +1727,48 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
            * from the source buffer to be sent to this contiguous chunk defined by the round/agg/iter in the source.
            */
           int numNonContigSourceChunks = 0;
-          ADIO_Offset *nonContigSourceOffsets;
-          int *nonContigSourceLens;
           ADIO_Offset baseDatatypeInstanceOffset = 0;
 
           if (!bufTypeIsContig) {
 
-            currentRecvBufferOffset = (ADIO_Offset)((ADIO_Offset)currentDataTypeExtent * (ADIO_Offset)bufTypeExtent) + flatBuf->indices[currentFlatBufIndice] + currentRemLen;
+            currentRecvBufferOffset = (ADIO_Offset)((ADIO_Offset)currentDataTypeExtent * (ADIO_Offset)bufTypeExtent) + flatBuf->indices[currentFlatBufIndice] + currentIndiceOffset;
 #ifdef onesidedtrace
-            printf("!bufTypeIsContig currentRecvBufferOffset set to %ld for roundIter %d source %d currentDataTypeExtent %d flatBuf->indices[currentFlatBufIndice] %ld currentRemLen %ld currentFlatBufIndice %d\n",currentRecvBufferOffset,roundIter,aggIter,currentDataTypeExtent,flatBuf->indices[currentFlatBufIndice],currentRemLen,currentFlatBufIndice);
+            printf("!bufTypeIsContig currentRecvBufferOffset set to %ld for roundIter %d source %d currentDataTypeExtent %d flatBuf->indices[currentFlatBufIndice] %ld currentIndiceOffset %ld currentFlatBufIndice %d\n",currentRecvBufferOffset,roundIter,aggIter,currentDataTypeExtent,flatBuf->indices[currentFlatBufIndice],currentIndiceOffset,currentFlatBufIndice);
 #endif
 
-            /* Count the chunks first to see how much to malloc from the ending point from above code.
+            /* Use a tmp variable for the currentFlatBufIndice and currentIndiceOffset as they are used across the offsetIters
+             * to compute the starting point for this iteration and will be modified now to compute the data chunks for
+             * this iteration.
              */
-            int recvBytesCounted = 0;
             int tmpFlatBufIndice = currentFlatBufIndice;
-            int maxNumNonContigSourceChunks = 2; // over-initialize for potential remnants on both ends
-
-            while (recvBytesCounted < bufferAmountToRecv) {
-              maxNumNonContigSourceChunks++;
-              if (tmpFlatBufIndice == flatBuf->count) {
-                tmpFlatBufIndice = 0;
-              }
-              recvBytesCounted += flatBuf->blocklens[tmpFlatBufIndice];
-              tmpFlatBufIndice++;
-            }
-
-            nonContigSourceOffsets = (ADIO_Offset *)ADIOI_Malloc(maxNumNonContigSourceChunks * sizeof(ADIO_Offset));
-            nonContigSourceLens = (int *)ADIOI_Malloc(maxNumNonContigSourceChunks * sizeof(int));
+            ADIO_Offset tmpIndiceOffset = currentIndiceOffset;
 
             /* now populate the nonContigSourceOffsets and nonContigSourceLens arrays for use in the one-sided operations.
              */
             int ncArrayIndex = 0;
             int remainingBytesToLoadedIntoNCArrays = bufferAmountToRecv;
-            ADIO_Offset indexIntoCurrentIndice = 0;
-            if (currentRemLen > 0)
-              indexIntoCurrentIndice = flatBuf->blocklens[currentFlatBufIndice] - currentRemLen;
 
             int datatypeInstances = currentDataTypeExtent;
-            tmpFlatBufIndice = currentFlatBufIndice;
             while (remainingBytesToLoadedIntoNCArrays > 0) {
-             nonContigSourceOffsets[ncArrayIndex] = currentRecvBufferOffset;
+              ADIOI_Assert(ncArrayIndex < (maxNumNonContigSourceChunks+2));
+              nonContigSourceOffsets[ncArrayIndex] = currentRecvBufferOffset;
 
-              if ((flatBuf->blocklens[tmpFlatBufIndice] - indexIntoCurrentIndice) > remainingBytesToLoadedIntoNCArrays) {
+              if ((flatBuf->blocklens[tmpFlatBufIndice] - tmpIndiceOffset) >= remainingBytesToLoadedIntoNCArrays) {
                 nonContigSourceLens[ncArrayIndex] = remainingBytesToLoadedIntoNCArrays;
                 remainingBytesToLoadedIntoNCArrays = 0;
               }
               else {
-                nonContigSourceLens[ncArrayIndex] = (int)(flatBuf->blocklens[tmpFlatBufIndice] - indexIntoCurrentIndice);
-                remainingBytesToLoadedIntoNCArrays -= (flatBuf->blocklens[tmpFlatBufIndice] - indexIntoCurrentIndice);
-              }
-              indexIntoCurrentIndice = 0; // only worry about beginning remnant for first iter
-
-              tmpFlatBufIndice++;
-              if (tmpFlatBufIndice == flatBuf->count) {
-                tmpFlatBufIndice = 0;
-                datatypeInstances++;
-                baseDatatypeInstanceOffset = datatypeInstances * bufTypeExtent;
+                nonContigSourceLens[ncArrayIndex] = (int)(flatBuf->blocklens[tmpFlatBufIndice] - tmpIndiceOffset);
+                remainingBytesToLoadedIntoNCArrays -= (flatBuf->blocklens[tmpFlatBufIndice] - tmpIndiceOffset);
+                tmpIndiceOffset = 0;
+                tmpFlatBufIndice++;
+                if (tmpFlatBufIndice == flatBuf->count) {
+                  tmpFlatBufIndice = 0;
+                  datatypeInstances++;
+                  baseDatatypeInstanceOffset = datatypeInstances * bufTypeExtent;
+                }
+                currentRecvBufferOffset = baseDatatypeInstanceOffset + flatBuf->indices[tmpFlatBufIndice];
               }
-              currentRecvBufferOffset = baseDatatypeInstanceOffset + flatBuf->indices[tmpFlatBufIndice];
               ncArrayIndex++;
               numNonContigSourceChunks++;
             } // while
@@ -1703,7 +1781,7 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
            */
           MPI_Aint sourceDisplacementToUseThisRound = (MPI_Aint) ((ADIO_Offset)offsetStart - currentRoundFDStartForMySourceAgg);
 
-          /* If using the thread readr select the appropriate side of the split window.
+          /* If using the thread reader select the appropriate side of the split window.
            */
           if (useIOBuffer && (read_buf == read_buf1)) {
             sourceDisplacementToUseThisRound += coll_bufsize;
@@ -1716,11 +1794,11 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
             MPI_Win_lock(MPI_LOCK_SHARED, sourceAggsForMyData[aggIter], 0, read_buf_window);
 #endif
             if (bufTypeIsContig) {
-              MPI_Get(&((char*)buf)[recvBufferOffset],bufferAmountToRecv, MPI_BYTE,sourceAggsForMyData[aggIter],sourceDisplacementToUseThisRound, bufferAmountToRecv,MPI_BYTE,read_buf_window);
+              MPI_Get(((char*)buf) + recvBufferOffset,bufferAmountToRecv, MPI_BYTE,sourceAggsForMyData[aggIter],sourceDisplacementToUseThisRound, bufferAmountToRecv,MPI_BYTE,read_buf_window);
             }
             else {
               for (i=0;i<numNonContigSourceChunks;i++) {
-                MPI_Get(&((char*)buf)[nonContigSourceOffsets[i]],nonContigSourceLens[i], MPI_BYTE,sourceAggsForMyData[aggIter],sourceDisplacementToUseThisRound, nonContigSourceLens[i],MPI_BYTE,read_buf_window);
+                MPI_Get(((char*)buf) + nonContigSourceOffsets[i],nonContigSourceLens[i], MPI_BYTE,sourceAggsForMyData[aggIter],sourceDisplacementToUseThisRound, nonContigSourceLens[i],MPI_BYTE,read_buf_window);
 #ifdef onesidedtrace
                 printf("mpi_put[%d] nonContigSourceOffsets is %d of nonContigSourceLens %d to source disp %d\n",i,nonContigSourceOffsets[i],nonContigSourceLens[i],sourceDisplacementToUseThisRound);
 #endif
@@ -1764,10 +1842,6 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
         printf("roundIter %d bufferAmountToRecv is %d recvBufferOffset is %d offsetStart is %ld currentRoundFDStartForMySourceAgg is %ld sourceDisplacementToUseThisRound is %ld sourceAggsForMyDataFDStart[aggIter] is %ld\n",roundIter, bufferAmountToRecv,recvBufferOffset, offsetStart,currentRoundFDStartForMySourceAgg,sourceDisplacementToUseThisRound,sourceAggsForMyDataFDStart[aggIter]);
 #endif
 
-          if (!bufTypeIsContig) {
-            ADIOI_Free(nonContigSourceOffsets);
-            ADIOI_Free(nonContigSourceLens);
-          }
           } // bufferAmountToRecv > 0
       } // contig list
 
@@ -1775,19 +1849,33 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
        */
       if (gpfsmpio_aggmethod == 2) {
         MPI_Datatype recvBufferDerivedDataType, sourceBufferDerivedDataType;
+        if (sourceAggContigAccessCount > 0) {
+        /* Rebase source buffer offsets to 0 if there are any negative offsets for safety
+         * when iteracting with PAMI.
+         */
+        MPI_Aint lowestDisplacement = 0;
+        for (i=0;i<sourceAggContigAccessCount;i++) {
+          if (recvBufferDisplacements[i] < lowestDisplacement)
+            lowestDisplacement = recvBufferDisplacements[i];
+        }
+        if (lowestDisplacement  < 0) {
+          lowestDisplacement *= -1;
+          for (i=0;i<sourceAggContigAccessCount;i++)
+           recvBufferDisplacements[i] += lowestDisplacement;
+        }
         MPI_Type_create_struct(sourceAggContigAccessCount, sourceAggBlockLengths, recvBufferDisplacements, sourceAggDataTypes, &recvBufferDerivedDataType);
         MPI_Type_commit(&recvBufferDerivedDataType);
         MPI_Type_create_struct(sourceAggContigAccessCount, sourceAggBlockLengths, sourceAggDisplacements, sourceAggDataTypes, &sourceBufferDerivedDataType);
         MPI_Type_commit(&sourceBufferDerivedDataType);
-// printf("round %d mpi_put of derived type to agg %d sourceAggContigAccessCount is %d\n",roundIter, sourceAggsForMyData[aggIter],sourceAggContigAccessCount);
 #ifndef ACTIVE_TARGET
         MPI_Win_lock(MPI_LOCK_SHARED, sourceAggsForMyData[aggIter], 0, read_buf_window);
 #endif
-        MPI_Get(((char*)buf),1, recvBufferDerivedDataType,sourceAggsForMyData[aggIter],0, 1,sourceBufferDerivedDataType,read_buf_window);
+        MPI_Get((((char*)buf) - lowestDisplacement),1, recvBufferDerivedDataType,sourceAggsForMyData[aggIter],0, 1,sourceBufferDerivedDataType,read_buf_window);
 
 #ifndef ACTIVE_TARGET
         MPI_Win_unlock(sourceAggsForMyData[aggIter], read_buf_window);
 #endif
+        }
 
         if (allocatedDerivedTypeArrays) {
           ADIOI_Free(sourceAggBlockLengths);
@@ -1795,8 +1883,10 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
           ADIOI_Free(sourceAggDataTypes);
           ADIOI_Free(recvBufferDisplacements);
         }
+        if (sourceAggContigAccessCount > 0) {
         MPI_Type_free(&recvBufferDerivedDataType);
         MPI_Type_free(&sourceBufferDerivedDataType);
+        }
       }
       } // baseoffset != -1
     } // source aggs
@@ -1813,6 +1903,10 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
 
     } /* for-loop roundIter */
 
+    if (!bufTypeIsContig) {
+      ADIOI_Free(nonContigSourceOffsets);
+      ADIOI_Free(nonContigSourceLens);
+    }
 #ifdef ROMIO_GPFS
     endTimeBase = MPI_Wtime();
     gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH] += (endTimeBase-startTimeBase);
@@ -1846,5 +1940,7 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd,
     else
       ADIOI_Free(baseNonContigSourceBufferOffset);
 
+    if (!bufTypeIsContig)
+      ADIOI_Delete_flattened(datatype);
     return;
 }

http://git.mpich.org/mpich.git/commitdiff/e6849ac4f0a85a58afefc7a0ee9643dbf225852f

commit e6849ac4f0a85a58afefc7a0ee9643dbf225852f
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Sat Feb 28 00:59:54 2015 +0000

    ROMIO Collective IO One-sided aggregation algorithm GPFS enablement
    
    Code to enable the usage of the optimized one-sided collective
    IO aggregation algorithm from the ADIOI_GPFS_WriteStridedColl and
    ADIOI_GPFS_ReadStridedColl functions.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
index c2cad8b..18d92d1 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
@@ -271,6 +271,21 @@ void ADIOI_GPFS_ReadStridedColl(ADIO_File fd, void *buf, int count,
 			    fd->hints->striping_unit);
 
     GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_MYREQ, GPFSMPIO_CIO_T_FD_PART );
+    if ((gpfsmpio_aggmethod == 1) || (gpfsmpio_aggmethod == 2)) {
+    /* If the user has specified to use a one-sided aggregation method then do that at
+     * this point instead of the two-phase I/O.
+     */
+      ADIOI_OneSidedReadAggregation(fd, offset_list, len_list, contig_access_count, buf,
+		    datatype,error_code, st_offsets, end_offsets, fd_start, fd_end);
+      GPFSMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
+      ADIOI_Free(offset_list);
+      ADIOI_Free(len_list);
+      ADIOI_Free(st_offsets);
+      ADIOI_Free(end_offsets);
+      ADIOI_Free(fd_start);
+      ADIOI_Free(fd_end);
+	  goto fn_exit;
+    }
     if (gpfsmpio_p2pcontig==1) {
 	/* For some simple yet common(?) workloads, full-on two-phase I/O is
 	 * overkill.  We can establish sub-groups of processes and their
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
index 968e6e6..9ce68fa 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
@@ -263,6 +263,20 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 
     GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_MYREQ, GPFSMPIO_CIO_T_FD_PART );
 
+    if ((gpfsmpio_aggmethod == 1) || (gpfsmpio_aggmethod == 2)) {
+    /* If the user has specified to use a one-sided aggregation method then do that at
+     * this point instead of the two-phase I/O.
+     */
+      ADIOI_OneSidedWriteAggregation(fd, offset_list, len_list, contig_access_count, buf, datatype, error_code, st_offsets, end_offsets, fd_start, fd_end);
+      GPFSMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
+      ADIOI_Free(offset_list);
+      ADIOI_Free(len_list);
+      ADIOI_Free(st_offsets);
+      ADIOI_Free(end_offsets);
+      ADIOI_Free(fd_start);
+      ADIOI_Free(fd_end);
+	  goto fn_exit;
+    }
     if (gpfsmpio_p2pcontig==1) {
 	/* For some simple yet common(?) workloads, full-on two-phase I/O is overkill.  We can establish sub-groups of processes and their aggregator, and then these sub-groups will carry out a simplified two-phase over that sub-group.
 	 *
diff --git a/src/mpi/romio/adio/common/ad_close.c b/src/mpi/romio/adio/common/ad_close.c
index 7aa5ce0..d51d75c 100644
--- a/src/mpi/romio/adio/common/ad_close.c
+++ b/src/mpi/romio/adio/common/ad_close.c
@@ -7,6 +7,7 @@
 
 #include "adio.h"
 #include "adio_extern.h"
+extern int      gpfsmpio_aggmethod;
 
 #ifdef HAVE_UNISTD_H
 #include <unistd.h>
@@ -115,6 +116,10 @@ void ADIO_Close(ADIO_File fd, int *error_code)
     MPI_Info_free(&(fd->info));
 
     if (fd->io_buf != NULL) ADIOI_Free(fd->io_buf);
+    /* If one-sided aggregation is chosen then free the window over the io_buf.
+     */
+    if ((gpfsmpio_aggmethod == 1) || (gpfsmpio_aggmethod == 2))
+	  MPI_Win_free(&fd->io_buf_window);
 
     /* memory for fd is freed in MPI_File_close */
 }
diff --git a/src/mpi/romio/adio/common/ad_open.c b/src/mpi/romio/adio/common/ad_open.c
index 6558f76..c0d9331 100644
--- a/src/mpi/romio/adio/common/ad_open.c
+++ b/src/mpi/romio/adio/common/ad_open.c
@@ -126,6 +126,11 @@ MPI_File ADIO_Open(MPI_Comm orig_comm,
      * (e.g. Blue Gene) more efficent */
     fd->io_buf = ADIOI_Malloc(fd->hints->cb_buffer_size);
 
+    /* If one-sided aggregation is chosen then create the window over the io_buf.
+     */
+    if ((gpfsmpio_aggmethod == 1) || (gpfsmpio_aggmethod == 2)) {
+      MPI_Win_create(fd->io_buf,fd->hints->cb_buffer_size,1,MPI_INFO_NULL,fd->comm, &fd->io_buf_window);
+    }
      /* deferred open: 
      * we can only do this optimization if 'fd->hints->deferred_open' is set
      * (which means the user hinted 'no_indep_rw' and collective buffering).

http://git.mpich.org/mpich.git/commitdiff/147329674e7989e29c441efa73ba2c4462db5683

commit 147329674e7989e29c441efa73ba2c4462db5683
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Sat Feb 28 00:45:32 2015 +0000

    ROMIO Collective IO One-sided aggregation algorithm base code
    
    Optimized collective IO algorithm for GPFS to replace the existing
    two-phase algorithm with one utilizing one-sided MPI_Put and MPI_Get.
    Significant performance and memory optimization possible for certain
    workloads.  Guarded by GPFSMPIO_AGGMETHOD environment variable, see
    ad_gpfs_tuning.c for details.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
index c993021..192f747 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
@@ -37,6 +37,7 @@ long    bglocklessmpio_f_type;
 int     gpfsmpio_bg_nagg_pset;
 int     gpfsmpio_pthreadio;
 int     gpfsmpio_p2pcontig;
+int     gpfsmpio_aggmethod;
 int	gpfsmpio_balancecontig;
 int     gpfsmpio_devnullio;
 int     gpfsmpio_bridgeringagg;
@@ -105,6 +106,22 @@ double	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST+1];
  * 3.) There are no gaps between the offsets.
  * 4.) No single rank has a data size which spans multiple file domains.
  *
+ * - GPFSMPIO_AGGMETHOD -  Replaces the two-phase collective IO aggregation with a one-
+ *   sided algorithm, significantly reducing communication and memory overhead.  Fully
+ *   supports all datasets and datatypes, the only caveat is that any holes in the data
+ *   when writing to a pre-existing file are ignored -- there is no read-modify-write
+ *   support to maintain the correctness of regions of pre-existing data so every byte
+ *   must be explicitly written to maintain correctness.  Users must beware of middle-ware
+ *   libraries like PNETCDF which may count on read-modify-write functionality for certain
+ *   features (like fill values).  Possible values:
+ *   - 0 - Normal two-phase collective IO is used.
+ *   - 1 - A separate one-sided MPI_Put or MPI_Get is used for each contigous chunk of data
+ *         for a compute to write to or read from the collective buffer on the aggregator.
+ *   - 2 - An MPI derived datatype is created using all the contigous chunks and just one
+ *         call to MPI_Put or MPI_Get is done with the derived datatype.  On Blue Gene /Q
+ *         optimal performance for this is achieved when paired with PAMID_TYPED_ONESIDED=1.
+ *   - Default is 0
+ *
  * - GPFSMPIO_BALANCECONTIG -  Relevant only to BGQ.  File domain blocks are assigned
  *   to aggregators in a breadth-first fashion relative to the ions - additionally,
  *   file domains on the aggregators sharing the same bridgeset and ion have contiguous
@@ -165,6 +182,10 @@ void ad_gpfs_get_env_vars() {
     x = getenv( "GPFSMPIO_P2PCONTIG" );
     if (x) gpfsmpio_p2pcontig = atoi(x);
 
+    gpfsmpio_aggmethod = 0;
+    x = getenv( "GPFSMPIO_AGGMETHOD" );
+    if (x) gpfsmpio_aggmethod = atoi(x);
+
     gpfsmpio_balancecontig = 0;
     x = getenv( "GPFSMPIO_BALANCECONTIG" );
     if (x) gpfsmpio_balancecontig = atoi(x);
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
index b82c642..87aefce 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
@@ -66,6 +66,7 @@ extern int 	gpfsmpio_tuneblocking;
 extern long bglocklessmpio_f_type;
 extern int      gpfsmpio_pthreadio;
 extern int      gpfsmpio_p2pcontig;
+extern int      gpfsmpio_aggmethod;
 extern int  gpfsmpio_balancecontig;
 extern int      gpfsmpio_devnullio;
 extern int      gpfsmpio_bridgeringagg;
diff --git a/src/mpi/romio/adio/common/Makefile.mk b/src/mpi/romio/adio/common/Makefile.mk
index c2c379c..80194ef 100644
--- a/src/mpi/romio/adio/common/Makefile.mk
+++ b/src/mpi/romio/adio/common/Makefile.mk
@@ -71,5 +71,6 @@ romio_other_sources +=                  \
     adio/common/hint_fns.c              \
     adio/common/ad_threaded_io.c        \
     adio/common/p2p_aggregation.c       \
+    adio/common/onesided_aggregation.c  \
     adio/common/utils.c
 
diff --git a/src/mpi/romio/adio/common/ad_open.c b/src/mpi/romio/adio/common/ad_open.c
index 7d69280..6558f76 100644
--- a/src/mpi/romio/adio/common/ad_open.c
+++ b/src/mpi/romio/adio/common/ad_open.c
@@ -10,7 +10,7 @@
 #include "adio_cb_config_list.h"
 
 #include "mpio.h"
-
+extern int      gpfsmpio_aggmethod;
 static int is_aggregator(int rank, ADIO_File fd);
 static int uses_generic_read(ADIO_File fd);
 static int uses_generic_write(ADIO_File fd);
diff --git a/src/mpi/romio/adio/include/adio.h b/src/mpi/romio/adio/include/adio.h
index 20ceb30..f1d4ac3 100644
--- a/src/mpi/romio/adio/include/adio.h
+++ b/src/mpi/romio/adio/include/adio.h
@@ -236,6 +236,7 @@ typedef struct ADIOI_FileD {
     MPI_Datatype *file_realm_types;  /* file realm datatypes */
     int my_cb_nodes_index; /* my index into cb_config_list. -1 if N/A */
     char *io_buf;          /* two-phase buffer allocated out of i/o path */
+    MPI_Win io_buf_window; /* Window over the io_buf to support one-sided aggregation */
     /* External32 */
     int is_external32;      /* bool:  0 means native view */
 
diff --git a/src/mpi/romio/adio/include/adioi.h b/src/mpi/romio/adio/include/adioi.h
index b20ca82..6320a63 100644
--- a/src/mpi/romio/adio/include/adioi.h
+++ b/src/mpi/romio/adio/include/adioi.h
@@ -686,6 +686,28 @@ void ADIOI_P2PContigReadAggregation(ADIO_File fd,
 				     ADIO_Offset *fd_start,
 				     ADIO_Offset *fd_end);
 
+void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
+        ADIO_Offset *offset_list,
+        ADIO_Offset *len_list,
+        int contig_access_count,
+        const void *buf,
+        MPI_Datatype datatype,
+        int *error_code,
+        ADIO_Offset *st_offsets,
+        ADIO_Offset *end_offsets,
+        ADIO_Offset *fd_start,
+        ADIO_Offset* fd_end);
+void ADIOI_OneSidedReadAggregation(ADIO_File fd,
+        ADIO_Offset *offset_list,
+        ADIO_Offset *len_list,
+        int contig_access_count,
+        const void *buf,
+        MPI_Datatype datatype,
+        int *error_code,
+        ADIO_Offset *st_offsets,
+        ADIO_Offset *end_offsets,
+        ADIO_Offset *fd_start,
+        ADIO_Offset* fd_end);
 ADIO_Offset ADIOI_GEN_SeekIndividual(ADIO_File fd, ADIO_Offset offset, 
 				     int whence, int *error_code);
 void ADIOI_GEN_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);

http://git.mpich.org/mpich.git/commitdiff/0e4dcc43f3ef0be5fb035ceef310aeaf71e19df7

commit 0e4dcc43f3ef0be5fb035ceef310aeaf71e19df7
Author: Paul Coffman <pkcoff at vestalac1.ftd.alcf.anl.gov>
Date:   Sat Feb 28 00:35:43 2015 +0000

    ROMIO One-sided Collective IO Aggregation base code
    
    Optimized collective IO aggregation algorithm added for GPFS which replaces the
    existing two-phase aggregation with one-sided MPI_Put and MPI_Get for writing and
    reading respectively.  Significant performance and memory optimization possible
    for many workloads.  Guarded by the GPFSMPIO_AGGMETHOD environment variable --
    see ad_gpfs_tuning.c for details.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/common/onesided_aggregation.c b/src/mpi/romio/adio/common/onesided_aggregation.c
new file mode 100644
index 0000000..695edde
--- /dev/null
+++ b/src/mpi/romio/adio/common/onesided_aggregation.c
@@ -0,0 +1,1850 @@
+#include "adio.h"
+#include "adio_extern.h"
+#include "../ad_gpfs/ad_gpfs_tuning.h"
+
+#include <pthread.h>
+
+//  #define onesidedtrace 1
+
+/* Uncomment this to use fences for the one-sided communication.
+ */
+// #define ACTIVE_TARGET 1
+
+/* This data structure holds the number of extents, the index into the flattened buffer and the remnant length
+ * beyond the flattened buffer index corresponding to the base buffer offset for non-contiguous source data
+ * for the range to be written coresponding to the round and target agg.
+ */
+typedef struct NonContigSourceBufOffset {
+  int dataTypeExtent;
+  int flatBufIndice;
+  ADIO_Offset remLen;
+} NonContigSourceBufOffset;
+
+void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
+    ADIO_Offset *offset_list,
+    ADIO_Offset *len_list,
+    int contig_access_count,
+    const void *buf,
+    MPI_Datatype datatype,
+    int *error_code,
+    ADIO_Offset *st_offsets,
+    ADIO_Offset *end_offsets,
+    ADIO_Offset *fd_start,
+    ADIO_Offset* fd_end)
+{
+
+    *error_code = MPI_SUCCESS; /* initialize to success */
+
+#ifdef ROMIO_GPFS
+    double startTimeBase,endTimeBase;
+    startTimeBase = MPI_Wtime();
+#endif
+
+    int i,j; /* generic iterators */
+    MPI_Status status;
+    pthread_t io_thread;
+    void *thread_ret;
+    ADIOI_IO_ThreadFuncData io_thread_args;
+
+    int nprocs,myrank;
+    MPI_Comm_size(fd->comm, &nprocs);
+    MPI_Comm_rank(fd->comm, &myrank);
+
+    /* This flag denotes whether the source datatype is contiguous, which is referenced throughout the algorithm
+     * and defines how the source buffer offsets and data chunks are determined.  If the value is 1 (true - contiguous data)
+     * things are profoundly simpler in that the source buffer offset for a given target offset simply linearly increases
+     * by the chunk sizes being written.  If the value is 0 (non-contiguous) then these values are based on calculations
+     * from the flattened source datatype.
+     */
+    int bufTypeIsContig;
+
+    MPI_Aint bufTypeExtent;
+    ADIOI_Flatlist_node *flatBuf=NULL;
+    ADIOI_Datatype_iscontig(datatype, &bufTypeIsContig);
+
+    /* maxNumContigOperations keeps track of how many different chunks we will need to send
+     * for the purpose of pre-allocating the data structures to hold them.
+     */
+    int maxNumContigOperations = contig_access_count;
+
+    if (!bufTypeIsContig) {
+    /* Flatten the non-contiguous source datatype.
+     */
+      ADIOI_Flatten_datatype(datatype);
+      flatBuf = ADIOI_Flatlist;
+      while (flatBuf->type != datatype) flatBuf = flatBuf->next;
+      MPI_Type_extent(datatype, &bufTypeExtent);
+    }
+
+#ifdef onesidedtrace
+    printf(" ADIOI_OneSidedWriteAggregation bufTypeIsContig is %d maxNumContigOperations is %d\n",bufTypeIsContig,maxNumContigOperations);
+#endif
+
+    ADIO_Offset myOffsetStart = st_offsets[myrank], myOffsetEnd = end_offsets[myrank], currentRoundOffsetStart = st_offsets[myrank];
+
+    ADIO_Offset lastFileOffset = 0, firstFileOffset = myOffsetEnd;
+    /* Get the total range being written.
+     */
+    for (j=0;j<nprocs;j++) {
+      lastFileOffset = ADIOI_MAX(lastFileOffset,end_offsets[j]);
+      firstFileOffset = ADIOI_MIN(firstFileOffset,st_offsets[j]);
+    }
+
+    int myAggRank = -1; /* if I am an aggregor this is my index into fd->hints->ranklist */
+    int iAmUsedAgg = 0; /* whether or not this rank is used as an aggregator. */
+
+    int naggs = fd->hints->cb_nodes;
+    int coll_bufsize = fd->hints->cb_buffer_size;
+#ifdef ROMIO_GPFS
+    if (gpfsmpio_pthreadio == 1) {
+      /* split buffer in half for a kind of double buffering with the threads*/
+      coll_bufsize = fd->hints->cb_buffer_size/2;
+    }
+#endif
+
+    /* This logic defines values that are used later to determine what offsets define the portion
+     * of the file domain the agg is writing this round.
+     */
+    int greatestFileDomainAggRank = -1,smallestFileDomainAggRank = -1;
+    ADIO_Offset greatestFileDomainOffset = 0;
+    ADIO_Offset smallestFileDomainOffset = lastFileOffset;
+    for (j=0;j<naggs;j++) {
+      if (fd->hints->ranklist[j] == myrank) {
+        myAggRank = j;
+        if (fd_end[j] > greatestFileDomainOffset) {
+          greatestFileDomainOffset = fd_end[j];
+          greatestFileDomainAggRank = j;
+        }
+        if (fd_start[j] < smallestFileDomainOffset) {
+          smallestFileDomainOffset = fd_start[j];
+          smallestFileDomainAggRank = j;
+        }
+        if (fd_end[j] > fd_start[j]) {
+          iAmUsedAgg = 1;
+        }
+      }
+    }
+
+#ifdef onesidedtrace
+    printf("contig_access_count is %d lastFileOffset is %ld firstFileOffset is %ld\n",contig_access_count,lastFileOffset,firstFileOffset);
+    for (j=0;j<contig_access_count;j++) {
+      printf("offset_list[%d]: %ld , len_list[%d]: %ld\n",j,offset_list[j],j,len_list[j]);
+    }
+#endif
+
+    /* Determine how much data and to whom I need to send.  For source proc
+     * targets, also determine the target file domain offsets locally to
+     * reduce communication overhead.
+     */
+    int *targetAggsForMyData = (int *)ADIOI_Malloc(naggs * sizeof(int));
+    ADIO_Offset *targetAggsForMyDataFDStart = (ADIO_Offset *)ADIOI_Malloc(naggs * sizeof(ADIO_Offset));
+    ADIO_Offset *targetAggsForMyDataFDEnd = (ADIO_Offset *)ADIOI_Malloc(naggs * sizeof(ADIO_Offset));
+    int numTargetAggs = 0;
+
+    /* Compute number of rounds.
+     */
+    ADIO_Offset numberOfRounds = (ADIO_Offset)((((ADIO_Offset)(end_offsets[nprocs-1]-st_offsets[0]))/((ADIO_Offset)((ADIO_Offset)coll_bufsize*(ADIO_Offset)naggs)))) + 1;
+
+    /* This data structure holds the beginning offset and len list index for the range to be written
+     * coresponding to the round and target agg.  Initialize to -1 to denote being unset.
+     */
+    int **targetAggsForMyDataFirstOffLenIndex = (int **)ADIOI_Malloc(numberOfRounds * sizeof(int *));
+    for (i=0;i<numberOfRounds;i++) {
+      targetAggsForMyDataFirstOffLenIndex[i] = (int *)ADIOI_Malloc(naggs * sizeof(int));
+      for (j=0;j<naggs;j++)
+        targetAggsForMyDataFirstOffLenIndex[i][j] = -1;
+    }
+
+    /* This data structure holds the ending offset and len list index for the range to be written
+     * coresponding to the round and target agg.
+     */
+    int **targetAggsForMyDataLastOffLenIndex = (int **)ADIOI_Malloc(numberOfRounds * sizeof(int *));
+    for (i=0;i<numberOfRounds;i++)
+      targetAggsForMyDataLastOffLenIndex[i] = (int *)ADIOI_Malloc(naggs * sizeof(int));
+
+    /* This data structure holds the base buffer offset for contiguous source data for the range to be written
+     * coresponding to the round and target agg.  Initialize to -1 to denote being unset.yeah
+     */
+    ADIO_Offset **baseSourceBufferOffset;
+
+    if (bufTypeIsContig) {
+      baseSourceBufferOffset= (ADIO_Offset **)ADIOI_Malloc(numberOfRounds * sizeof(ADIO_Offset *));
+      for (i=0;i<numberOfRounds;i++) {
+        baseSourceBufferOffset[i] = (ADIO_Offset *)ADIOI_Malloc(naggs * sizeof(ADIO_Offset));
+        for (j=0;j<naggs;j++)
+          baseSourceBufferOffset[i][j] = -1;
+      }
+    }
+    ADIO_Offset currentSourceBufferOffset = 0;
+
+    /* This data structure holds the number of extents, the index into the flattened buffer and the remnant length
+     * beyond the flattened buffer indice corresponding to the base buffer offset for non-contiguous source data
+     * for the range to be written coresponding to the round and target agg.
+     */
+    NonContigSourceBufOffset **baseNonContigSourceBufferOffset;
+    if (!bufTypeIsContig) {
+      baseNonContigSourceBufferOffset = (NonContigSourceBufOffset **) ADIOI_Malloc(numberOfRounds * sizeof(NonContigSourceBufOffset *));
+      for (i=0;i<numberOfRounds;i++) {
+        baseNonContigSourceBufferOffset[i] = (NonContigSourceBufOffset *)ADIOI_Malloc(naggs * sizeof(NonContigSourceBufOffset));
+        /* initialize flatBufIndice to -1 to indicate that it is unset.
+         */
+        for (j=0;j<naggs;j++)
+          baseNonContigSourceBufferOffset[i][j].flatBufIndice = -1;
+      }
+    }
+
+    int currentDataTypeExtent = 0;
+    int currentFlatBufIndice=0;
+    ADIO_Offset currentRemLen = 0;
+
+#ifdef onesidedtrace
+   printf("NumberOfRounds is %d\n",numberOfRounds);
+   for (i=0;i<naggs;i++)
+     printf("fd->hints->ranklist[%d]is %d fd_start is %ld fd_end is %ld\n",i,fd->hints->ranklist[i],fd_start[i],fd_end[i]);
+   for (j=0;j<contig_access_count;j++)
+     printf("offset_list[%d] is %ld len_list is %ld\n",j,offset_list[j],len_list[j]);
+#endif
+
+    int currentAggRankListIndex = 0;
+
+    /* This denotes the coll_bufsize boundaries within the source buffer for writing for the same round.
+     */
+    ADIO_Offset intraRoundCollBufsizeOffset = 0;
+
+    /* This data structure tracks what target aggs need to be written to on what rounds.
+     */
+    int *targetAggsForMyDataCurrentRoundIter = (int *)ADIOI_Malloc(naggs * sizeof(int));
+    for (i=0;i<naggs;i++)
+      targetAggsForMyDataCurrentRoundIter[i] = 0;
+
+    /* This is the first of the two main loops in this algorithm.  The purpose of this loop is essentially to populate
+     * the data structures defined above for what source data blocks needs to go where (target agg) and when (round iter).
+     */
+    int blockIter;
+    for (blockIter=0;blockIter<contig_access_count;blockIter++) {
+
+      /* Determine the starting source buffer offset for this block - for iter 0 skip it since that value is 0.
+       */
+      if (blockIter>0) {
+        if (bufTypeIsContig)
+          currentSourceBufferOffset += len_list[blockIter-1];
+        else {
+          /* Non-contiguous source datatype, count up the extents and indices to this point
+           * in the blocks for use in computing the source buffer offset.
+           */
+          ADIO_Offset sourceBlockTotal = 0-currentRemLen;
+          while (sourceBlockTotal < len_list[blockIter-1]) {
+            maxNumContigOperations++;
+            sourceBlockTotal += flatBuf->blocklens[currentFlatBufIndice];
+            currentFlatBufIndice++;
+            if (currentFlatBufIndice == flatBuf->count) {
+              currentFlatBufIndice = 0;
+              currentDataTypeExtent++;
+            }
+          }
+          if (sourceBlockTotal > len_list[blockIter-1]) {
+            currentFlatBufIndice--;
+            if (currentFlatBufIndice < 0 ) {
+              currentDataTypeExtent--;
+              currentFlatBufIndice = flatBuf->count;
+            }
+          }
+          currentRemLen = len_list[blockIter-1] - sourceBlockTotal;
+#ifdef onesidedtrace
+          printf("contig_access_count iter %d currentFlatBufIndice is now %d currentDataTypeExtent is now %d currentRemLen is now %ld\n",blockIter,currentFlatBufIndice,currentDataTypeExtent,currentRemLen);
+#endif
+        } // !bufTypeIsContig
+      } // blockIter > 0
+
+      /* For the first iteration we need to include these maxNumContigOperations
+       * for non-contig case even though we did not need to compute the starting offset.
+       */
+      if ((blockIter == 0) && (!bufTypeIsContig)) {
+        ADIO_Offset sourceBlockTotal = 0;
+        int tmpCurrentFlatBufIndice = currentFlatBufIndice;
+        while (sourceBlockTotal < len_list[0]) {
+          maxNumContigOperations++;
+          sourceBlockTotal += flatBuf->blocklens[tmpCurrentFlatBufIndice];
+          tmpCurrentFlatBufIndice++;
+          if (tmpCurrentFlatBufIndice == flatBuf->count) {
+            tmpCurrentFlatBufIndice = 0;
+          }
+        }
+      }
+
+      ADIO_Offset blockStart = offset_list[blockIter], blockLen = len_list[blockIter], blockEnd = offset_list[blockIter]+len_list[blockIter]-(ADIO_Offset)1;
+
+      /* Find the starting target agg for this block - normally it will be the current agg so guard the expensive
+       * while loop with a cheap if-check which for large numbers of small blocks will usually be false.
+       */
+      if (!((blockStart >= fd_start[currentAggRankListIndex]) && (blockStart <= fd_end[currentAggRankListIndex]))) {
+        while (!((blockStart >= fd_start[currentAggRankListIndex]) && (blockStart <= fd_end[currentAggRankListIndex])))
+          currentAggRankListIndex++;
+      };
+
+#ifdef onesidedtrace
+      printf("currentAggRankListIndex is %d blockStart %ld blockEnd %ld fd_start[currentAggRankListIndex] %ld fd_end[currentAggRankListIndex] %ld\n",currentAggRankListIndex,blockStart,blockEnd,fd_start[currentAggRankListIndex],fd_end[currentAggRankListIndex]);
+#endif
+
+      /* Determine if this is a new target agg.
+       */
+      if (blockIter>0) {
+        if ((offset_list[blockIter-1]+len_list[blockIter-1]-(ADIO_Offset)1) < fd_start[currentAggRankListIndex])
+          numTargetAggs++;
+      }
+
+       /* Determine which round to start writing.
+        */
+      if ((blockStart - fd_start[currentAggRankListIndex]) >= coll_bufsize) {
+        ADIO_Offset currentRoundBlockStart = fd_start[currentAggRankListIndex];
+        int startingRound = 0;
+        while (blockStart > (currentRoundBlockStart + coll_bufsize - (ADIO_Offset)1)) {
+          currentRoundBlockStart+=coll_bufsize;
+          startingRound++;
+        }
+        targetAggsForMyDataCurrentRoundIter[numTargetAggs] = startingRound;
+      }
+
+      /* Initialize the data structures if this is the first offset in the round/target agg.
+       */
+      if (targetAggsForMyDataFirstOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] == -1) {
+         targetAggsForMyData[numTargetAggs] = fd->hints->ranklist[currentAggRankListIndex];
+        targetAggsForMyDataFDStart[numTargetAggs] = fd_start[currentAggRankListIndex];
+        targetAggsForMyDataFDEnd[numTargetAggs] = fd_end[currentAggRankListIndex];
+        targetAggsForMyDataFirstOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+        if (bufTypeIsContig)
+          baseSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = currentSourceBufferOffset;
+        else {
+          baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].flatBufIndice = currentFlatBufIndice;
+          baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].dataTypeExtent = currentDataTypeExtent;
+          baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].remLen = currentRemLen;
+        }
+
+        intraRoundCollBufsizeOffset = fd_start[currentAggRankListIndex] + ((targetAggsForMyDataCurrentRoundIter[numTargetAggs]+1) * coll_bufsize);
+
+#ifdef onesidedtrace
+        printf("Initial settings numTargetAggs %d offset_list[%d] with value %ld past fd border %ld with len %ld currentSourceBufferOffset set to %ld intraRoundCollBufsizeOffset set to %ld\n",numTargetAggs,blockIter,offset_list[blockIter],fd_start[currentAggRankListIndex],len_list[blockIter],currentSourceBufferOffset,intraRoundCollBufsizeOffset);
+#endif
+      }
+
+      /* Replace the last offset block iter with this one.
+       */
+      targetAggsForMyDataLastOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+
+      /* If this blocks extends into the next file domain handle this situation.
+       */
+      if (blockEnd > fd_end[currentAggRankListIndex]) {
+#ifdef onesidedtrace
+      printf("large block, blockEnd %ld >= fd_end[currentAggRankListIndex] %ld\n",blockEnd,fd_end[currentAggRankListIndex]);
+#endif
+        while (blockEnd >= fd_end[currentAggRankListIndex]) {
+          ADIO_Offset thisAggBlockEnd = fd_end[currentAggRankListIndex];
+          if (thisAggBlockEnd >= intraRoundCollBufsizeOffset) {
+            while (thisAggBlockEnd >= intraRoundCollBufsizeOffset) {
+              targetAggsForMyDataCurrentRoundIter[numTargetAggs]++;
+              intraRoundCollBufsizeOffset += coll_bufsize;
+              targetAggsForMyDataFirstOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+              if (bufTypeIsContig)
+                baseSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = currentSourceBufferOffset;
+              else {
+                baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].flatBufIndice = currentFlatBufIndice;
+                baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].dataTypeExtent = currentDataTypeExtent;
+                baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].remLen = currentRemLen;
+              }
+
+              targetAggsForMyDataLastOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+#ifdef onesidedtrace
+              printf("targetAggsForMyDataCurrentRoundI%d] is now %d intraRoundCollBufsizeOffset is now %ld\n",numTargetAggs,targetAggsForMyDataCurrentRoundIter[numTargetAggs],intraRoundCollBufsizeOffset);
+#endif
+            } // while (thisAggBlockEnd >= intraRoundCollBufsizeOffset)
+          } // if (thisAggBlockEnd >= intraRoundCollBufsizeOffset)
+
+          currentAggRankListIndex++;
+
+          /* Skip over unused aggs.
+           */
+          if (fd_start[currentAggRankListIndex] > fd_end[currentAggRankListIndex]) {
+            while (fd_start[currentAggRankListIndex] > fd_end[currentAggRankListIndex])
+              currentAggRankListIndex++;
+
+          } // (fd_start[currentAggRankListIndex] > fd_end[currentAggRankListIndex])
+
+          /* Start new target agg.
+           */
+          if (blockEnd >= fd_start[currentAggRankListIndex]) {
+            numTargetAggs++;
+             targetAggsForMyData[numTargetAggs] = fd->hints->ranklist[currentAggRankListIndex];
+            targetAggsForMyDataFDStart[numTargetAggs] = fd_start[currentAggRankListIndex];
+            targetAggsForMyDataFDEnd[numTargetAggs] = fd_end[currentAggRankListIndex];
+            targetAggsForMyDataFirstOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+            if (bufTypeIsContig)
+              baseSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = currentSourceBufferOffset;
+            else {
+              baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].flatBufIndice = currentFlatBufIndice;
+              baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].dataTypeExtent = currentDataTypeExtent;
+              baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].remLen = currentRemLen;
+            }
+#ifdef onesidedtrace
+            printf("large block init settings numTargetAggs %d offset_list[%d] with value %ld past fd border %ld with len %ld\n",numTargetAggs,i,offset_list[blockIter],fd_start[currentAggRankListIndex],len_list[blockIter]);
+#endif
+            intraRoundCollBufsizeOffset = fd_start[currentAggRankListIndex] + coll_bufsize;
+            targetAggsForMyDataLastOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+          } // if (blockEnd >= fd_start[currentAggRankListIndex])
+        } // while (blockEnd >= fd_end[currentAggRankListIndex])
+      } // if (blockEnd > fd_end[currentAggRankListIndex])
+
+      /* Else if we are still in the same file domain / target agg but have gone past the coll_bufsize and need
+       * to advance to the next round handle this situation.
+       */
+      else if (blockEnd >= intraRoundCollBufsizeOffset) {
+        ADIO_Offset currentBlockEnd = blockEnd;
+        while (currentBlockEnd >= intraRoundCollBufsizeOffset) {
+          targetAggsForMyDataCurrentRoundIter[numTargetAggs]++;
+          intraRoundCollBufsizeOffset += coll_bufsize;
+          targetAggsForMyDataFirstOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+          if (bufTypeIsContig)
+            baseSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = currentSourceBufferOffset;
+          else {
+            baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].flatBufIndice = currentFlatBufIndice;
+            baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].dataTypeExtent = currentDataTypeExtent;
+            baseNonContigSourceBufferOffset[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs].remLen = currentRemLen;
+          }
+          targetAggsForMyDataLastOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+#ifdef onesidedtrace
+        printf("smaller than fd currentBlockEnd is now %ld intraRoundCollBufsizeOffset is now %ld targetAggsForMyDataCurrentRoundIter[%d] is now %d\n",currentBlockEnd, intraRoundCollBufsizeOffset, numTargetAggs,targetAggsForMyDataCurrentRoundIter[numTargetAggs]);
+#endif
+        } // while (currentBlockEnd >= intraRoundCollBufsizeOffset)
+      } // else if (blockEnd >= intraRoundCollBufsizeOffset)
+
+      /* Need to advance numTargetAggs if this is the last target offset to
+       * include this one.
+       */
+      if (blockIter == (contig_access_count-1))
+        numTargetAggs++;
+    }
+
+#ifdef onesidedtrace
+    printf("numTargetAggs is %d\n",numTargetAggs);
+    for (i=0;i<numTargetAggs;i++) {
+      for (j=0;j<=targetAggsForMyDataCurrentRoundIter[i];j++)
+        printf("targetAggsForMyData[%d] is %d targetAggsForMyDataFDStart[%d] is %ld targetAggsForMyDataFDEnd is %ld targetAggsForMyDataFirstOffLenIndex is %d with value %ld targetAggsForMyDataLastOffLenIndex is %d with value %ld\n",i,targetAggsForMyData[i],i,targetAggsForMyDataFDStart[i],targetAggsForMyDataFDEnd[i],targetAggsForMyDataFirstOffLenIndex[j][i],offset_list[targetAggsForMyDataFirstOffLenIndex[j][i]],targetAggsForMyDataLastOffLenIndex[j][i],offset_list[targetAggsForMyDataLastOffLenIndex[j][i]]);
+
+    }
+#endif
+
+    ADIOI_Free(targetAggsForMyDataCurrentRoundIter);
+
+    int currentWriteBuf = 0;
+    int useIOBuffer = 0;
+#ifdef ROMIO_GPFS
+    if (gpfsmpio_pthreadio && (numberOfRounds>1)) {
+    useIOBuffer = 1;
+    io_thread = pthread_self();
+    }
+#endif
+
+    /* use the write buffer allocated in the file_open */
+    char *write_buf0 = fd->io_buf;
+    char *write_buf1 = fd->io_buf + coll_bufsize;
+
+    /* start off pointing to the first buffer. If we use the 2nd buffer (threaded
+     * case) we'll swap later */
+    char *write_buf = write_buf0;
+    MPI_Win write_buf_window = fd->io_buf_window;
+
+#ifdef ACTIVE_TARGET
+    MPI_Win_fence(0, write_buf_window);
+#endif
+
+    ADIO_Offset currentRoundFDStart = 0;
+    ADIO_Offset currentRoundFDEnd = 0;
+
+    if (iAmUsedAgg) {
+      currentRoundFDStart = fd_start[myAggRank];
+      if (myAggRank == smallestFileDomainAggRank) {
+        if (currentRoundFDStart < firstFileOffset)
+          currentRoundFDStart = firstFileOffset;
+      }
+    }
+
+#ifdef ROMIO_GPFS
+    endTimeBase = MPI_Wtime();
+    gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SETUP] += (endTimeBase-startTimeBase);
+    startTimeBase = MPI_Wtime();
+#endif
+
+    ADIO_Offset currentBaseSourceBufferOffset = 0;
+
+    /* This is the second main loop of the algorithm, actually nested loop of target aggs within rounds.  There are 2 flavors of this.
+     * For gpfsmpio_aggmethod of 1 each nested iteration for the target agg does an mpi_put on a contiguous chunk using a primative datatype
+     * determined using the data structures from the first main loop.  For gpfsmpio_aggmethod of 2 each nested iteration for the target agg
+     * builds up data to use in created a derived data type for 1 mpi_put that is done for the target agg for each round.
+     */
+    int roundIter;
+    for (roundIter=0;roundIter<numberOfRounds;roundIter++) {
+
+    int aggIter;
+    for (aggIter=0;aggIter<numTargetAggs;aggIter++) {
+
+    /* If we have data for the round/agg process it.
+     */
+    if ((bufTypeIsContig && (baseSourceBufferOffset[roundIter][aggIter] != -1)) || (!bufTypeIsContig && (baseNonContigSourceBufferOffset[roundIter][aggIter].flatBufIndice != -1))) {
+
+      ADIO_Offset currentRoundFDStartForMyTargetAgg = (ADIO_Offset)((ADIO_Offset)targetAggsForMyDataFDStart[aggIter] + (ADIO_Offset)((ADIO_Offset)roundIter*(ADIO_Offset)coll_bufsize));
+      ADIO_Offset currentRoundFDEndForMyTargetAgg = (ADIO_Offset)((ADIO_Offset)targetAggsForMyDataFDStart[aggIter] + (ADIO_Offset)((ADIO_Offset)(roundIter+1)*(ADIO_Offset)coll_bufsize) - (ADIO_Offset)1);
+
+      int targetAggContigAccessCount = 0;
+
+      /* These data structures are used for the derived datatype mpi_put
+       * in the gpfsmpio_aggmethod of 2 case.
+       */
+      int *targetAggBlockLengths;
+      MPI_Aint *targetAggDisplacements, *sourceBufferDisplacements;
+      MPI_Datatype *targetAggDataTypes;
+
+      int allocatedDerivedTypeArrays = 0;
+#ifdef onesidedtrace
+      printf("roundIter %d processing targetAggsForMyData %d \n",roundIter,targetAggsForMyData[aggIter]);
+#endif
+
+      ADIO_Offset sourceBufferOffset = 0;
+
+      /* Process the range of offsets for this target agg.
+       */
+      int offsetIter;
+      int startingOffLenIndex = targetAggsForMyDataFirstOffLenIndex[roundIter][aggIter], endingOffLenIndex = targetAggsForMyDataLastOffLenIndex[roundIter][aggIter];
+         for (offsetIter=startingOffLenIndex;offsetIter<=endingOffLenIndex;offsetIter++) {
+        if (currentRoundFDEndForMyTargetAgg > targetAggsForMyDataFDEnd[aggIter])
+            currentRoundFDEndForMyTargetAgg = targetAggsForMyDataFDEnd[aggIter];
+
+        ADIO_Offset offsetStart = offset_list[offsetIter], offsetEnd = (offset_list[offsetIter]+len_list[offsetIter]-(ADIO_Offset)1);
+
+        /* Get the base source buffer offset for this iterataion of the target agg in the round.
+         * For the first one just get the predetermined base value, for subsequent ones keep track
+         * of the value for each iteration and increment it.
+         */
+        if (offsetIter == startingOffLenIndex) {
+          if (bufTypeIsContig)
+            currentBaseSourceBufferOffset = baseSourceBufferOffset[roundIter][aggIter];
+          else {
+            currentRemLen = baseNonContigSourceBufferOffset[roundIter][aggIter].remLen;
+            currentDataTypeExtent = baseNonContigSourceBufferOffset[roundIter][aggIter].dataTypeExtent;
+            currentFlatBufIndice = baseNonContigSourceBufferOffset[roundIter][aggIter].flatBufIndice;
+          }
+        }
+        else {
+          if (bufTypeIsContig)
+            currentBaseSourceBufferOffset += len_list[offsetIter-1];
+          else {
+
+          /* For non-contiguous source datatype count up the extents and indices to this point.
+           */
+          ADIO_Offset sourceBlockTotal = 0-currentRemLen;
+          while (sourceBlockTotal < len_list[offsetIter-1]) {
+            sourceBlockTotal += flatBuf->blocklens[currentFlatBufIndice];
+            currentFlatBufIndice++;
+            if (currentFlatBufIndice == flatBuf->count) {
+              currentFlatBufIndice = 0;
+              currentDataTypeExtent++;
+            }
+          } // while
+          if (sourceBlockTotal > len_list[offsetIter-1])
+            currentFlatBufIndice--;
+          currentRemLen = len_list[offsetIter-1] - sourceBlockTotal;
+#ifdef onesidedtrace
+          printf("contig_access_count target agg %d currentFlatBufIndice is now %d currentDataTypeExtent is now %d currentRemLen is now %ld\n",aggIter,currentFlatBufIndice,currentDataTypeExtent,currentRemLen);
+#endif
+          }
+        }
+
+        /* For source buffer contiguous here is the offset we will use for this iteration.
+         */
+        if (bufTypeIsContig)
+          sourceBufferOffset = currentBaseSourceBufferOffset;
+
+#ifdef onesidedtrace
+        printf("roundIter %d target iter %d targetAggsForMyData is %d offset_list[%d] is %ld len_list[%d] is %ld targetAggsForMyDataFDStart is %ld targetAggsForMyDataFDEnd is %ld currentRoundFDStartForMyTargetAgg is %ld currentRoundFDEndForMyTargetAgg is %ld targetAggsForMyDataFirstOffLenIndex is %ld baseSourceBufferOffset is %ld\n",
+            roundIter,aggIter,targetAggsForMyData[aggIter],offsetIter,offset_list[offsetIter],offsetIter,len_list[offsetIter],
+            targetAggsForMyDataFDStart[aggIter],targetAggsForMyDataFDEnd[aggIter],
+            currentRoundFDStartForMyTargetAgg,currentRoundFDEndForMyTargetAgg, targetAggsForMyDataFirstOffLenIndex[roundIter][aggIter], baseSourceBufferOffset[roundIter][aggIter]);
+#endif
+
+        /* Determine the amount of data and exact source buffer offsets to use.
+         */
+        int bufferAmountToSend = 0;
+
+        if ((offsetStart >= currentRoundFDStartForMyTargetAgg) && (offsetStart <= currentRoundFDEndForMyTargetAgg)) {
+            if (offsetEnd > currentRoundFDEndForMyTargetAgg)
+            bufferAmountToSend = (currentRoundFDEndForMyTargetAgg - offsetStart) +1;
+            else
+            bufferAmountToSend = (offsetEnd - offsetStart) +1;
+        }
+        else if ((offsetEnd >= currentRoundFDStartForMyTargetAgg) && (offsetEnd <= currentRoundFDEndForMyTargetAgg)) {
+            if (offsetEnd > currentRoundFDEndForMyTargetAgg)
+            bufferAmountToSend = (currentRoundFDEndForMyTargetAgg - currentRoundFDStartForMyTargetAgg) +1;
+            else
+            bufferAmountToSend = (offsetEnd - currentRoundFDStartForMyTargetAgg) +1;
+            if (offsetStart < currentRoundFDStartForMyTargetAgg) {
+              if (bufTypeIsContig)
+                sourceBufferOffset += (currentRoundFDStartForMyTargetAgg-offsetStart);
+              offsetStart = currentRoundFDStartForMyTargetAgg;
+            }
+        }
+        else if ((offsetStart <= currentRoundFDStartForMyTargetAgg) && (offsetEnd >= currentRoundFDEndForMyTargetAgg)) {
+            bufferAmountToSend = (currentRoundFDEndForMyTargetAgg - currentRoundFDStartForMyTargetAgg) +1;
+            if (bufTypeIsContig)
+              sourceBufferOffset += (currentRoundFDStartForMyTargetAgg-offsetStart);
+            offsetStart = currentRoundFDStartForMyTargetAgg;
+        }
+
+        if (bufferAmountToSend > 0) { /* we have data to send this round */
+          if (gpfsmpio_aggmethod == 2) {
+            /* Only allocate these arrays if we are using method 2 and only do it once for this round/target agg.
+             */
+            if (!allocatedDerivedTypeArrays) {
+              targetAggBlockLengths = (int *)ADIOI_Malloc(maxNumContigOperations * sizeof(int));
+              targetAggDisplacements = (MPI_Aint *)ADIOI_Malloc(maxNumContigOperations * sizeof(MPI_Aint));
+              sourceBufferDisplacements = (MPI_Aint *)ADIOI_Malloc(maxNumContigOperations * sizeof(MPI_Aint));
+              targetAggDataTypes = (MPI_Datatype *)ADIOI_Malloc(maxNumContigOperations * sizeof(MPI_Datatype));
+              allocatedDerivedTypeArrays = 1;
+            }
+          }
+          /* For the non-contiguous source datatype we need to determine the number,size and offsets of the chunks
+           * from the source buffer to be sent to this contiguous chunk defined by the round/agg/iter in the target.
+           */
+          int numNonContigSourceChunks = 0;
+          ADIO_Offset *nonContigSourceOffsets;
+          int *nonContigSourceLens;
+          ADIO_Offset baseDatatypeInstanceOffset = 0;
+
+          if (!bufTypeIsContig) {
+
+            currentSourceBufferOffset = (ADIO_Offset)((ADIO_Offset)currentDataTypeExtent * (ADIO_Offset)bufTypeExtent) + flatBuf->indices[currentFlatBufIndice] + currentRemLen;
+#ifdef onesidedtrace
+            printf("!bufTypeIsContig currentSourceBufferOffset set to %ld for roundIter %d target %d currentDataTypeExtent %d flatBuf->indices[currentFlatBufIndice] %ld currentRemLen %ld currentFlatBufIndice %d\n",currentSourceBufferOffset,roundIter,aggIter,currentDataTypeExtent,flatBuf->indices[currentFlatBufIndice],currentRemLen,currentFlatBufIndice);
+#endif
+
+            /* Count the chunks first to see how much to malloc from the ending point from above code.
+             */
+            int sendBytesCounted = 0;
+            int tmpFlatBufIndice = currentFlatBufIndice;
+            int maxNumNonContigSourceChunks = 2; // over-initialize for potential remnants on both ends
+
+            while (sendBytesCounted < bufferAmountToSend) {
+              maxNumNonContigSourceChunks++;
+              if (tmpFlatBufIndice == flatBuf->count) {
+                tmpFlatBufIndice = 0;
+              }
+              sendBytesCounted += flatBuf->blocklens[tmpFlatBufIndice];
+              tmpFlatBufIndice++;
+            }
+
+            nonContigSourceOffsets = (ADIO_Offset *)ADIOI_Malloc(maxNumNonContigSourceChunks * sizeof(ADIO_Offset));
+            nonContigSourceLens = (int *)ADIOI_Malloc(maxNumNonContigSourceChunks * sizeof(int));
+
+            /* now populate the nonContigSourceOffsets and nonContigSourceLens arrays for use in the one-sided operations.
+             */
+            int ncArrayIndex = 0;
+            int remainingBytesToLoadedIntoNCArrays = bufferAmountToSend;
+            ADIO_Offset indexIntoCurrentIndice = 0;
+            if (currentRemLen > 0)
+              indexIntoCurrentIndice = flatBuf->blocklens[currentFlatBufIndice] - currentRemLen;
+
+            int datatypeInstances = currentDataTypeExtent;
+            tmpFlatBufIndice = currentFlatBufIndice;
+            while (remainingBytesToLoadedIntoNCArrays > 0) {
+             nonContigSourceOffsets[ncArrayIndex] = currentSourceBufferOffset;
+
+              if ((flatBuf->blocklens[tmpFlatBufIndice] - indexIntoCurrentIndice) > remainingBytesToLoadedIntoNCArrays) {
+                nonContigSourceLens[ncArrayIndex] = remainingBytesToLoadedIntoNCArrays;
+                remainingBytesToLoadedIntoNCArrays = 0;
+              }
+              else {
+                nonContigSourceLens[ncArrayIndex] = (int)(flatBuf->blocklens[tmpFlatBufIndice] - indexIntoCurrentIndice);
+                remainingBytesToLoadedIntoNCArrays -= (flatBuf->blocklens[tmpFlatBufIndice] - indexIntoCurrentIndice);
+              }
+              indexIntoCurrentIndice = 0; // only worry about beginning remnant for first iter
+
+              tmpFlatBufIndice++;
+              if (tmpFlatBufIndice == flatBuf->count) {
+                tmpFlatBufIndice = 0;
+                datatypeInstances++;
+                baseDatatypeInstanceOffset = datatypeInstances * bufTypeExtent;
+              }
+              currentSourceBufferOffset = baseDatatypeInstanceOffset + flatBuf->indices[tmpFlatBufIndice];
+              ncArrayIndex++;
+              numNonContigSourceChunks++;
+            } // while
+#ifdef onesidedtrace
+            printf("CurrentSourceBufferOffset finally set to %ld\n",currentSourceBufferOffset);
+#endif
+          } // !bufTypeIsContig
+
+
+          /* Determine the offset into the target window.
+           */
+          MPI_Aint targetDisplacementToUseThisRound = (MPI_Aint) ((ADIO_Offset)offsetStart - currentRoundFDStartForMyTargetAgg);
+
+          /* If using the thread writer select the appropriate side of the split window.
+           */
+          if (useIOBuffer && (write_buf == write_buf1)) {
+            targetDisplacementToUseThisRound += coll_bufsize;
+          }
+
+          /* For gpfsmpio_aggmethod of 1 do the mpi_put using the primitive MPI_BYTE type on each contiguous chunk of source data.
+           */
+          if (gpfsmpio_aggmethod == 1) {
+#ifndef ACTIVE_TARGET
+            MPI_Win_lock(MPI_LOCK_SHARED, targetAggsForMyData[aggIter], 0, write_buf_window);
+#endif
+            if (bufTypeIsContig) {
+              MPI_Put(&((char*)buf)[sourceBufferOffset],bufferAmountToSend, MPI_BYTE,targetAggsForMyData[aggIter],targetDisplacementToUseThisRound, bufferAmountToSend,MPI_BYTE,write_buf_window);
+            }
+            else {
+              for (i=0;i<numNonContigSourceChunks;i++) {
+                MPI_Put(&((char*)buf)[nonContigSourceOffsets[i]],nonContigSourceLens[i], MPI_BYTE,targetAggsForMyData[aggIter],targetDisplacementToUseThisRound, nonContigSourceLens[i],MPI_BYTE,write_buf_window);
+#ifdef onesidedtrace
+                printf("mpi_put[%d] nonContigSourceOffsets is %d of nonContigSourceLens %d to target disp %d\n",i,nonContigSourceOffsets[i],nonContigSourceLens[i],targetDisplacementToUseThisRound);
+#endif
+                targetDisplacementToUseThisRound += nonContigSourceLens[i];
+              }
+            }
+#ifndef ACTIVE_TARGET
+            MPI_Win_unlock(targetAggsForMyData[aggIter], write_buf_window);
+#endif
+          }
+
+          /* For gpfsmpio_aggmethod of 2 populate the data structures for this round/agg for this offset iter
+           * to be used subsequently when building the derived type for 1 mpi_put for all the data for this
+           * round/agg.
+           */
+          else if (gpfsmpio_aggmethod == 2) {
+
+            if (bufTypeIsContig) {
+              targetAggBlockLengths[targetAggContigAccessCount]= bufferAmountToSend;
+              targetAggDataTypes[targetAggContigAccessCount] = MPI_BYTE;
+              targetAggDisplacements[targetAggContigAccessCount] = targetDisplacementToUseThisRound;
+              sourceBufferDisplacements[targetAggContigAccessCount] = (MPI_Aint)sourceBufferOffset;
+              targetAggContigAccessCount++;
+            }
+            else {
+              for (i=0;i<numNonContigSourceChunks;i++) {
+                targetAggBlockLengths[targetAggContigAccessCount]= nonContigSourceLens[i];
+                targetAggDataTypes[targetAggContigAccessCount] = MPI_BYTE;
+                targetAggDisplacements[targetAggContigAccessCount] = targetDisplacementToUseThisRound;
+                sourceBufferDisplacements[targetAggContigAccessCount] = (MPI_Aint)nonContigSourceOffsets[i];
+#ifdef onesidedtrace
+                printf("mpi_put building arrays for iter %d - sourceBufferDisplacements is %d of nonContigSourceLens %d to target disp %d targetAggContigAccessCount is %d targetAggBlockLengths[targetAggContigAccessCount] is %d\n",i,sourceBufferDisplacements[targetAggContigAccessCount],nonContigSourceLens[i],targetAggDisplacements[targetAggContigAccessCount],targetAggContigAccessCount, targetAggBlockLengths[targetAggContigAccessCount]);
+#endif
+                targetAggContigAccessCount++;
+                targetDisplacementToUseThisRound += nonContigSourceLens[i];
+              }
+            }
+          }
+#ifdef onesidedtrace
+        printf("roundIter %d bufferAmountToSend is %d sourceBufferOffset is %d offsetStart is %ld currentRoundFDStartForMyTargetAgg is %ld targetDisplacementToUseThisRound is %ld targetAggsForMyDataFDStart[aggIter] is %ld\n",roundIter, bufferAmountToSend,sourceBufferOffset, offsetStart,currentRoundFDStartForMyTargetAgg,targetDisplacementToUseThisRound,targetAggsForMyDataFDStart[aggIter]);
+#endif
+
+          if (!bufTypeIsContig) {
+            ADIOI_Free(nonContigSourceOffsets);
+            ADIOI_Free(nonContigSourceLens);
+          }
+        } // bufferAmountToSend > 0
+      } // contig list
+
+      /* For gpfsmpio_aggmethod of 2 now build the derived type using the data from this round/agg and do 1 single mpi_put.
+       */
+      if (gpfsmpio_aggmethod == 2) {
+        MPI_Datatype sourceBufferDerivedDataType, targetBufferDerivedDataType;
+        MPI_Type_create_struct(targetAggContigAccessCount, targetAggBlockLengths, sourceBufferDisplacements, targetAggDataTypes, &sourceBufferDerivedDataType);
+        MPI_Type_commit(&sourceBufferDerivedDataType);
+        MPI_Type_create_struct(targetAggContigAccessCount, targetAggBlockLengths, targetAggDisplacements, targetAggDataTypes, &targetBufferDerivedDataType);
+        MPI_Type_commit(&targetBufferDerivedDataType);
+
+#ifdef onesidedtrace
+        printf("mpi_put of derived type to agg %d targetAggContigAccessCount is %d\n",targetAggsForMyData[aggIter],targetAggContigAccessCount);
+#endif
+#ifndef ACTIVE_TARGET
+        MPI_Win_lock(MPI_LOCK_SHARED, targetAggsForMyData[aggIter], 0, write_buf_window);
+#endif
+
+        MPI_Put(((char*)buf),1, sourceBufferDerivedDataType,targetAggsForMyData[aggIter],0, 1,targetBufferDerivedDataType,write_buf_window);
+
+#ifndef ACTIVE_TARGET
+        MPI_Win_unlock(targetAggsForMyData[aggIter], write_buf_window);
+#endif
+
+        if (allocatedDerivedTypeArrays) {
+          ADIOI_Free(targetAggBlockLengths);
+          ADIOI_Free(targetAggDisplacements);
+          ADIOI_Free(targetAggDataTypes);
+          ADIOI_Free(sourceBufferDisplacements);
+        }
+        MPI_Type_free(&sourceBufferDerivedDataType);
+        MPI_Type_free(&targetBufferDerivedDataType);
+      }
+      } // baseoffset != -1
+    } // target aggs
+
+    /* the source procs send the requested data to the aggs */
+
+#ifdef ACTIVE_TARGET
+    MPI_Win_fence(0, write_buf_window);
+#else
+    MPI_Barrier(fd->comm);
+#endif
+
+    if (iAmUsedAgg) {
+    /* Determine what offsets define the portion of the file domain the agg is writing this round.
+     */
+        if ((fd_end[myAggRank] - currentRoundFDStart) < coll_bufsize) {
+          if (myAggRank == greatestFileDomainAggRank) {
+            if (fd_end[myAggRank] > lastFileOffset)
+              currentRoundFDEnd = lastFileOffset;
+            else
+              currentRoundFDEnd = fd_end[myAggRank];
+          }
+          else
+            currentRoundFDEnd = fd_end[myAggRank];
+        }
+        else
+        currentRoundFDEnd = currentRoundFDStart + coll_bufsize - 1;
+
+#ifdef onesidedtrace
+        printf("currentRoundFDStart is %ld currentRoundFDEnd is %ld within file domeain %ld to %ld\n",currentRoundFDStart,currentRoundFDEnd,fd_start[myAggRank],fd_end[myAggRank]);
+#endif
+
+        if (!useIOBuffer) {
+          ADIO_WriteContig(fd, write_buf, (int)(currentRoundFDEnd - currentRoundFDStart)+1,
+            MPI_BYTE, ADIO_EXPLICIT_OFFSET,currentRoundFDStart, &status, error_code);
+          int numBytesWritten= (int)(currentRoundFDEnd - currentRoundFDStart)+1;
+
+/* For now this algorithm cannot handle holes in the source data and does not do any data sieving.
+ * One possible approach would be to initialize the write buffer with some value and then check to
+ * see if the mpi_put operations changed the values, if they were unchanged then retry with some other
+ * default initialized value and if that was still unchanged then you would know where a hole was.
+ * Here is some initial sample code for that:
+ *       if (roundIter<(numberOfRounds-1)) {
+ *         for (i=0;i<numBytesWritten;i++)
+ *           write_buf[i] = '\0';
+ *       }
+*/
+        } else { /* use the thread writer */
+
+        if(!pthread_equal(io_thread, pthread_self())) {
+            pthread_join(io_thread, &thread_ret);
+            *error_code = *(int *)thread_ret;
+            if (*error_code != MPI_SUCCESS) return;
+            io_thread = pthread_self();
+
+        }
+        io_thread_args.fd = fd;
+        /* do a little pointer shuffling: background I/O works from one
+         * buffer while two-phase machinery fills up another */
+
+        if (currentWriteBuf == 0) {
+            io_thread_args.buf = write_buf0;
+            currentWriteBuf = 1;
+            write_buf = write_buf1;
+        }
+        else {
+            io_thread_args.buf = write_buf1;
+            currentWriteBuf = 0;
+            write_buf = write_buf0;
+        }
+        io_thread_args.io_kind = ADIOI_WRITE;
+        io_thread_args.size = (currentRoundFDEnd-currentRoundFDStart) + 1;
+        io_thread_args.offset = currentRoundFDStart;
+        io_thread_args.status = status;
+        io_thread_args.error_code = *error_code;
+
+        if ( (pthread_create(&io_thread, NULL,
+                ADIOI_IO_Thread_Func, &(io_thread_args))) != 0)
+            io_thread = pthread_self();
+
+        } // useIOBuffer
+
+    } // iAmUsedAgg
+
+    if (!iAmUsedAgg && useIOBuffer) {
+        if (currentWriteBuf == 0) {
+            currentWriteBuf = 1;
+            write_buf = write_buf1;
+        }
+        else {
+            currentWriteBuf = 0;
+            write_buf = write_buf0;
+        }
+    }
+
+    if (iAmUsedAgg)
+        currentRoundFDStart += coll_bufsize;
+
+    if (roundIter<(numberOfRounds-1))
+      MPI_Barrier(fd->comm);
+
+    } /* for-loop roundIter */
+
+#ifdef ROMIO_GPFS
+    endTimeBase = MPI_Wtime();
+    gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH] += (endTimeBase-startTimeBase);
+#endif
+
+    if (useIOBuffer) { /* thread writer cleanup */
+
+    if ( !pthread_equal(io_thread, pthread_self()) ) {
+        pthread_join(io_thread, &thread_ret);
+        *error_code = *(int *)thread_ret;
+    }
+
+    }
+
+    ADIOI_Free(targetAggsForMyData);
+    ADIOI_Free(targetAggsForMyDataFDStart);
+    ADIOI_Free(targetAggsForMyDataFDEnd);
+
+    for (i=0;i<numberOfRounds;i++) {
+      ADIOI_Free(targetAggsForMyDataFirstOffLenIndex[i]);
+      ADIOI_Free(targetAggsForMyDataLastOffLenIndex[i]);
+      if (bufTypeIsContig)
+        ADIOI_Free(baseSourceBufferOffset[i]);
+      else
+        ADIOI_Free(baseNonContigSourceBufferOffset[i]);
+    }
+    ADIOI_Free(targetAggsForMyDataFirstOffLenIndex);
+    ADIOI_Free(targetAggsForMyDataLastOffLenIndex);
+    if (bufTypeIsContig)
+      ADIOI_Free(baseSourceBufferOffset);
+    else
+      ADIOI_Free(baseNonContigSourceBufferOffset);
+
+    return;
+}
+
+
+void ADIOI_OneSidedReadAggregation(ADIO_File fd,
+    ADIO_Offset *offset_list,
+    ADIO_Offset *len_list,
+    int contig_access_count,
+    const void *buf,
+    MPI_Datatype datatype,
+    int *error_code,
+    ADIO_Offset *st_offsets,
+    ADIO_Offset *end_offsets,
+    ADIO_Offset *fd_start,
+    ADIO_Offset* fd_end)
+{
+
+    *error_code = MPI_SUCCESS; /* initialize to success */
+
+#ifdef ROMIO_GPFS
+    double startTimeBase,endTimeBase;
+    startTimeBase = MPI_Wtime();
+#endif
+
+    MPI_Status status;
+    pthread_t io_thread;
+    void *thread_ret;
+    ADIOI_IO_ThreadFuncData io_thread_args;
+    int i,j; /* generic iterators */
+
+    int nprocs,myrank;
+    MPI_Comm_size(fd->comm, &nprocs);
+    MPI_Comm_rank(fd->comm, &myrank);
+
+    /* This flag denotes whether the source datatype is contiguus, which is referenced throughout the algorithm
+     * and defines how the source buffer offsets and data chunks are determined.  If the value is 1 (true - contiguous data)
+     * things are profoundly simpler in that the source buffer offset for a given source offset simply linearly increases
+     * by the chunk sizes being read.  If the value is 0 (non-contiguous) then these values are based on calculations
+     * from the flattened source datatype.
+     */
+    int bufTypeIsContig;
+
+    MPI_Aint bufTypeExtent;
+    ADIOI_Flatlist_node *flatBuf=NULL;
+    ADIOI_Datatype_iscontig(datatype, &bufTypeIsContig);
+
+    /* maxNumContigOperations keeps track of how many different chunks we will need to recv
+     * for the purpose of pre-allocating the data structures to hold them.
+     */
+    int maxNumContigOperations = contig_access_count;
+
+    if (!bufTypeIsContig) {
+    /* Flatten the non-contiguous source datatype.
+     */
+      ADIOI_Flatten_datatype(datatype);
+      flatBuf = ADIOI_Flatlist;
+      while (flatBuf->type != datatype) flatBuf = flatBuf->next;
+      MPI_Type_extent(datatype, &bufTypeExtent);
+    }
+#ifdef onesidedtrace
+      printf("ADIOI_OneSidedReadAggregation bufTypeIsContig is %d maxNumContigOperations is %d\n",bufTypeIsContig,maxNumContigOperations);
+#endif
+
+    ADIO_Offset myOffsetStart = st_offsets[myrank], myOffsetEnd = end_offsets[myrank], currentRoundOffsetStart = st_offsets[myrank];
+
+    ADIO_Offset lastFileOffset = 0, firstFileOffset = myOffsetEnd;
+
+    /* Get the total range being read.
+     */
+    for (j=0;j<nprocs;j++) {
+      lastFileOffset = ADIOI_MAX(lastFileOffset,end_offsets[j]);
+      firstFileOffset = ADIOI_MIN(firstFileOffset,st_offsets[j]);
+    }
+
+    int myAggRank = -1; /* if I am an aggregor this is my index into fd->hints->ranklist */
+    int iAmUsedAgg = 0; /* whether or not this rank is used as an aggregator. */
+
+    int naggs = fd->hints->cb_nodes;
+    int coll_bufsize = fd->hints->cb_buffer_size;
+#ifdef ROMIO_GPFS
+    if (gpfsmpio_pthreadio == 1) {
+    /* split buffer in half for a kind of double buffering with the threads*/
+    coll_bufsize = fd->hints->cb_buffer_size/2;
+    }
+#endif
+
+    /* This logic defines values that are used later to determine what offsets define the portion
+     * of the file domain the agg is reading this round.
+     */
+    int greatestFileDomainAggRank = -1,smallestFileDomainAggRank = -1;
+    ADIO_Offset greatestFileDomainOffset = 0;
+    ADIO_Offset smallestFileDomainOffset = lastFileOffset;
+    for (j=0;j<naggs;j++) {
+      if (fd->hints->ranklist[j] == myrank) {
+        myAggRank = j;
+        if (fd_end[j] > greatestFileDomainOffset) {
+          greatestFileDomainOffset = fd_end[j];
+          greatestFileDomainAggRank = j;
+        }
+        if (fd_start[j] < smallestFileDomainOffset) {
+          smallestFileDomainOffset = fd_start[j];
+          smallestFileDomainAggRank = j;
+        }
+        if (fd_end[j] > fd_start[j]) {
+          iAmUsedAgg = 1;
+        }
+      }
+    }
+
+#ifdef onesidedtrace
+    printf("contig_access_count is %d lastFileOffset is %ld firstFileOffset is %ld\n",contig_access_count,lastFileOffset,firstFileOffset);
+    for (j=0;j<contig_access_count;j++) {
+      printf("offset_list[%d]: %ld , len_list[%d]: %ld\n",j,offset_list[j],j,len_list[j]);
+    }
+#endif
+
+
+    /* for my offset range determine how much data and from whom I need to get
+     * it.  For source ag sources, also determine the source file domain
+     * offsets locally to reduce communication overhead */
+    int *sourceAggsForMyData = (int *)ADIOI_Malloc(naggs * sizeof(int));
+    ADIO_Offset *sourceAggsForMyDataFDStart = (ADIO_Offset *)ADIOI_Malloc(naggs * sizeof(ADIO_Offset));
+    ADIO_Offset *sourceAggsForMyDataFDEnd = (ADIO_Offset *)ADIOI_Malloc(naggs * sizeof(ADIO_Offset));
+    int numSourceAggs = 0;
+
+    /* compute number of rounds */
+    ADIO_Offset numberOfRounds = (ADIO_Offset)((((ADIO_Offset)(end_offsets[nprocs-1]-st_offsets[0]))/((ADIO_Offset)((ADIO_Offset)coll_bufsize*(ADIO_Offset)naggs)))) + 1;
+
+    /* This data structure holds the beginning offset and len list index for the range to be read
+     * coresponding to the round and source agg.
+    */
+    int **sourceAggsForMyDataFirstOffLenIndex = (int **)ADIOI_Malloc(numberOfRounds * sizeof(int *));
+    for (i=0;i<numberOfRounds;i++) {
+      sourceAggsForMyDataFirstOffLenIndex[i] = (int *)ADIOI_Malloc(naggs * sizeof(int));
+      for (j=0;j<naggs;j++)
+        sourceAggsForMyDataFirstOffLenIndex[i][j] = -1;
+    }
+
+    /* This data structure holds the ending offset and len list index for the range to be read
+     * coresponding to the round and source agg.
+    */
+    int **sourceAggsForMyDataLastOffLenIndex = (int **)ADIOI_Malloc(numberOfRounds * sizeof(int *));
+    for (i=0;i<numberOfRounds;i++)
+      sourceAggsForMyDataLastOffLenIndex[i] = (int *)ADIOI_Malloc(naggs * sizeof(int));
+
+    /* This data structure holds the base buffer offset for contiguous source data for the range to be read
+     * coresponding to the round and source agg.
+    */
+    ADIO_Offset **baseRecvBufferOffset;
+    if (bufTypeIsContig) {
+      baseRecvBufferOffset = (ADIO_Offset **)ADIOI_Malloc(numberOfRounds * sizeof(ADIO_Offset *));
+      for (i=0;i<numberOfRounds;i++) {
+        baseRecvBufferOffset[i] = (ADIO_Offset *)ADIOI_Malloc(naggs * sizeof(ADIO_Offset));
+        for (j=0;j<naggs;j++)
+          baseRecvBufferOffset[i][j] = -1;
+      }
+    }
+    ADIO_Offset currentRecvBufferOffset = 0;
+
+    /* This data structure holds the number of extents, the index into the flattened buffer and the remnant length
+     * beyond the flattened buffer indice corresponding to the base buffer offset for non-contiguous source data
+     * for the range to be written coresponding to the round and target agg.
+     */
+    NonContigSourceBufOffset **baseNonContigSourceBufferOffset;
+    if (!bufTypeIsContig) {
+      baseNonContigSourceBufferOffset = (NonContigSourceBufOffset **) ADIOI_Malloc(numberOfRounds * sizeof(NonContigSourceBufOffset *));
+      for (i=0;i<numberOfRounds;i++) {
+        baseNonContigSourceBufferOffset[i] = (NonContigSourceBufOffset *)ADIOI_Malloc(naggs * sizeof(NonContigSourceBufOffset));
+        /* initialize flatBufIndice to -1 to indicate that it is unset.
+         */
+        for (j=0;j<naggs;j++)
+          baseNonContigSourceBufferOffset[i][j].flatBufIndice = -1;
+      }
+    }
+
+    int currentDataTypeExtent = 0;
+    int currentFlatBufIndice=0;
+    ADIO_Offset currentRemLen = 0;
+
+#ifdef onesidedtrace
+    printf("NumberOfRounds is %d\n",numberOfRounds);
+    for (i=0;i<naggs;i++)
+      printf("fd->hints->ranklist[%d]is %d fd_start is %ld fd_end is %ld\n",i,fd->hints->ranklist[i],fd_start[i],fd_end[i]);
+    for (j=0;j<contig_access_count;j++)
+      printf("offset_list[%d] is %ld len_list is %ld\n",j,offset_list[j],len_list[j]);
+#endif
+
+    int currentAggRankListIndex = 0;
+
+    /* This denotes the coll_bufsize boundaries within the source buffer for reading for 1 round.
+     */
+    ADIO_Offset intraRoundCollBufsizeOffset = 0;
+
+    /* This data structure tracks what source aggs need to be read to on what rounds.
+     */
+    int *sourceAggsForMyDataCurrentRoundIter = (int *)ADIOI_Malloc(naggs * sizeof(int));
+    for (i=0;i<naggs;i++)
+      sourceAggsForMyDataCurrentRoundIter[i] = 0;
+
+    /* This is the first of the two main loops in this algorithm.  The purpose of this loop is essentially to populate
+     * the data structures defined above for what source data blocks needs to go where (source agg) and when (round iter).
+     */
+    int blockIter;
+    for (blockIter=0;blockIter<contig_access_count;blockIter++) {
+
+      /* Determine the starting source buffer offset for this block - for iter 0 skip it since that value is 0.
+       */
+      if (blockIter>0) {
+        if (bufTypeIsContig) {
+          currentRecvBufferOffset += len_list[blockIter-1];
+        }
+        else {
+          /* Non-contiguous source datatype, count up the extents and indices to this point
+           * in the blocks.
+           */
+          ADIO_Offset sourceBlockTotal = 0-currentRemLen;
+          while (sourceBlockTotal < len_list[blockIter-1]) {
+            maxNumContigOperations++;
+            sourceBlockTotal += flatBuf->blocklens[currentFlatBufIndice];
+            currentFlatBufIndice++;
+            if (currentFlatBufIndice == flatBuf->count) {
+              currentFlatBufIndice = 0;
+              currentDataTypeExtent++;
+            }
+          }
+          if (sourceBlockTotal > len_list[blockIter-1]) {
+            currentFlatBufIndice--;
+            if (currentFlatBufIndice < 0 ) {
+              currentDataTypeExtent--;
+              currentFlatBufIndice = flatBuf->count;
+            }
+          }
+          currentRemLen = len_list[blockIter-1] - sourceBlockTotal;
+#ifdef onesidedtrace
+          printf("contig_access_count iter %d currentFlatBufIndice is now %d currentDataTypeExtent is now %d currentRemLen is now %ld\n",blockIter,currentFlatBufIndice,currentDataTypeExtent,currentRemLen);
+#endif
+        } // !bufTypeIsContig
+      } // blockIter > 0
+
+      /* For the first iteration we need to include these maxNumContigOperations
+       * for non-contig case even though we did not need to compute the starting offset.
+       */
+      if ((blockIter == 0) && (!bufTypeIsContig)) {
+        ADIO_Offset sourceBlockTotal = 0;
+        int tmpCurrentFlatBufIndice = currentFlatBufIndice;
+        while (sourceBlockTotal < len_list[0]) {
+          maxNumContigOperations++;
+          sourceBlockTotal += flatBuf->blocklens[tmpCurrentFlatBufIndice];
+          tmpCurrentFlatBufIndice++;
+          if (tmpCurrentFlatBufIndice == flatBuf->count) {
+            tmpCurrentFlatBufIndice = 0;
+          }
+        }
+      }
+
+      ADIO_Offset blockStart = offset_list[blockIter], blockLen = len_list[blockIter], blockEnd = offset_list[blockIter]+len_list[blockIter]-(ADIO_Offset)1;
+
+      /* Find the starting source agg for this block - normally it will be the current agg so guard the expensive
+       * while loop with a cheap if-check which for large numbers of small blocks will usually be false.
+       */
+      if (!((blockStart >= fd_start[currentAggRankListIndex]) && (blockStart <= fd_end[currentAggRankListIndex]))) {
+        while (!((blockStart >= fd_start[currentAggRankListIndex]) && (blockStart <= fd_end[currentAggRankListIndex])))
+          currentAggRankListIndex++;
+      };
+
+      /* Determine if this is a new source agg.
+       */
+      if (blockIter>0) {
+        if ((offset_list[blockIter-1]+len_list[blockIter-1]-(ADIO_Offset)1) < fd_start[currentAggRankListIndex])
+          numSourceAggs++;
+      }
+
+       /* Determine which round to start reading.
+        */
+      if ((blockStart - fd_start[currentAggRankListIndex]) >= coll_bufsize) {
+        ADIO_Offset currentRoundBlockStart = fd_start[currentAggRankListIndex];
+        int startingRound = 0;
+        while (blockStart > (currentRoundBlockStart + coll_bufsize - (ADIO_Offset)1)) {
+          currentRoundBlockStart+=coll_bufsize;
+          startingRound++;
+        }
+        sourceAggsForMyDataCurrentRoundIter[numSourceAggs] = startingRound;
+      }
+
+      /* Initialize the data structures if this is the first offset in the round/source agg.
+       */
+      if (sourceAggsForMyDataFirstOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] == -1) {
+         sourceAggsForMyData[numSourceAggs] = fd->hints->ranklist[currentAggRankListIndex];
+        sourceAggsForMyDataFDStart[numSourceAggs] = fd_start[currentAggRankListIndex];
+        sourceAggsForMyDataFDEnd[numSourceAggs] = fd_end[currentAggRankListIndex];
+        sourceAggsForMyDataFirstOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
+        if (bufTypeIsContig)
+          baseRecvBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = currentRecvBufferOffset;
+        else {
+          baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].flatBufIndice = currentFlatBufIndice;
+          baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].dataTypeExtent = currentDataTypeExtent;
+          baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].remLen = currentRemLen;
+        }
+
+        intraRoundCollBufsizeOffset = fd_start[currentAggRankListIndex] + ((sourceAggsForMyDataCurrentRoundIter[numSourceAggs]+1) * coll_bufsize);
+#ifdef onesidedtrace
+        printf("init settings numSourceAggs %d offset_list[%d] with value %ld past fd border %ld with len %ld currentRecvBufferOffset set to %ld intraRoundCollBufsizeOffset set to %ld\n",numSourceAggs,blockIter,offset_list[blockIter],fd_start[currentAggRankListIndex],len_list[blockIter],currentRecvBufferOffset,intraRoundCollBufsizeOffset);
+#endif
+
+      }
+
+      /* Replace the last offset block iter with this one.
+       */
+      sourceAggsForMyDataLastOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
+
+      /* If this blocks extends into the next file domain handle this situation.
+       */
+      if (blockEnd > fd_end[currentAggRankListIndex]) {
+#ifdef onesidedtrace
+        printf("large block, blockEnd %ld >= fd_end[currentAggRankListIndex] %ld\n",blockEnd,fd_end[currentAggRankListIndex]);
+#endif
+        while (blockEnd >= fd_end[currentAggRankListIndex]) {
+          ADIO_Offset thisAggBlockEnd = fd_end[currentAggRankListIndex];
+          if (thisAggBlockEnd >= intraRoundCollBufsizeOffset) {
+            while (thisAggBlockEnd >= intraRoundCollBufsizeOffset) {
+              sourceAggsForMyDataCurrentRoundIter[numSourceAggs]++;
+              intraRoundCollBufsizeOffset += coll_bufsize;
+              sourceAggsForMyDataFirstOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
+              if (bufTypeIsContig)
+                baseRecvBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = currentRecvBufferOffset;
+              else {
+                baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].flatBufIndice = currentFlatBufIndice;
+                baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].dataTypeExtent = currentDataTypeExtent;
+                baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].remLen = currentRemLen;
+              }
+
+              sourceAggsForMyDataLastOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
+#ifdef onesidedtrace
+              printf("sourceAggsForMyDataCurrentRoundI%d] is now %d intraRoundCollBufsizeOffset is now %ld\n",numSourceAggs,sourceAggsForMyDataCurrentRoundIter[numSourceAggs],intraRoundCollBufsizeOffset);
+#endif
+            } // while (thisAggBlockEnd >= intraRoundCollBufsizeOffset)
+          } // if (thisAggBlockEnd >= intraRoundCollBufsizeOffset)
+
+          currentAggRankListIndex++;
+
+          /* Skip over unused aggs.
+           */
+          if (fd_start[currentAggRankListIndex] > fd_end[currentAggRankListIndex]) {
+            while (fd_start[currentAggRankListIndex] > fd_end[currentAggRankListIndex])
+              currentAggRankListIndex++;
+
+          } // (fd_start[currentAggRankListIndex] > fd_end[currentAggRankListIndex])
+
+          /* Start new source agg.
+           */
+          if (blockEnd >= fd_start[currentAggRankListIndex]) {
+            numSourceAggs++;
+             sourceAggsForMyData[numSourceAggs] = fd->hints->ranklist[currentAggRankListIndex];
+            sourceAggsForMyDataFDStart[numSourceAggs] = fd_start[currentAggRankListIndex];
+            sourceAggsForMyDataFDEnd[numSourceAggs] = fd_end[currentAggRankListIndex];
+            sourceAggsForMyDataFirstOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
+            if (bufTypeIsContig)
+              baseRecvBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = currentRecvBufferOffset;
+            else {
+              baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].flatBufIndice = currentFlatBufIndice;
+              baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].dataTypeExtent = currentDataTypeExtent;
+              baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].remLen = currentRemLen;
+            }
+
+#ifdef onesidedtrace
+            printf("large block init settings numSourceAggs %d offset_list[%d] with value %ld past fd border %ld with len %ld\n",numSourceAggs,i,offset_list[blockIter],fd_start[currentAggRankListIndex],len_list[blockIter]);
+#endif
+            intraRoundCollBufsizeOffset = fd_start[currentAggRankListIndex] + coll_bufsize;
+            sourceAggsForMyDataLastOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
+          } // if (blockEnd >= fd_start[currentAggRankListIndex])
+        } // while (blockEnd >= fd_end[currentAggRankListIndex])
+      } // if (blockEnd > fd_end[currentAggRankListIndex])
+
+      /* Else if we are still in the same file domain / source agg but have gone past the coll_bufsize and need
+       * to advance to the next round handle this situation.
+       */
+      else if (blockEnd >= intraRoundCollBufsizeOffset) {
+        ADIO_Offset currentBlockEnd = blockEnd;
+        while (currentBlockEnd >= intraRoundCollBufsizeOffset) {
+          sourceAggsForMyDataCurrentRoundIter[numSourceAggs]++;
+          intraRoundCollBufsizeOffset += coll_bufsize;
+          sourceAggsForMyDataFirstOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
+          if (bufTypeIsContig)
+            baseRecvBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = currentRecvBufferOffset;
+          else {
+            baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].flatBufIndice = currentFlatBufIndice;
+            baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].dataTypeExtent = currentDataTypeExtent;
+            baseNonContigSourceBufferOffset[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs].remLen = currentRemLen;
+          }
+          sourceAggsForMyDataLastOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
+#ifdef onesidedtrace
+          printf("block less than fd currentBlockEnd is now %ld intraRoundCollBufsizeOffset is now %ld sourceAggsForMyDataCurrentRoundIter[%d] is now %d\n",currentBlockEnd, intraRoundCollBufsizeOffset, numSourceAggs,sourceAggsForMyDataCurrentRoundIter[numSourceAggs]);
+#endif
+        } // while (currentBlockEnd >= intraRoundCollBufsizeOffset)
+      } // else if (blockEnd >= intraRoundCollBufsizeOffset)
+
+      /* Need to advance numSourceAggs if this is the last source offset to
+       * include this one.
+       */
+      if (blockIter == (contig_access_count-1))
+        numSourceAggs++;
+    }
+
+#ifdef onesidedtrace
+    printf("numSourceAggs is %d\n",numSourceAggs);
+    for (i=0;i<numSourceAggs;i++) {
+      for (j=0;j<=sourceAggsForMyDataCurrentRoundIter[i];j++)
+        printf("sourceAggsForMyData[%d] is %d sourceAggsForMyDataFDStart[%d] is %ld sourceAggsForMyDataFDEnd is %ld sourceAggsForMyDataFirstOffLenIndex is %d with value %ld sourceAggsForMyDataLastOffLenIndex is %d with value %ld\n",i,sourceAggsForMyData[i],i,sourceAggsForMyDataFDStart[i],sourceAggsForMyDataFDEnd[i],sourceAggsForMyDataFirstOffLenIndex[j][i],offset_list[sourceAggsForMyDataFirstOffLenIndex[j][i]],sourceAggsForMyDataLastOffLenIndex[j][i],offset_list[sourceAggsForMyDataLastOffLenIndex[j][i]]);
+
+    }
+#endif
+
+    ADIOI_Free(sourceAggsForMyDataCurrentRoundIter);
+
+    /* use the two-phase buffer allocated in the file_open - no app should ever
+     * be both reading and reading at the same time */
+    char *read_buf0 = fd->io_buf;
+    char *read_buf1 = fd->io_buf + coll_bufsize;
+    /* if threaded i/o selected, we'll do a kind of double buffering */
+    char *read_buf = read_buf0;
+
+    int currentReadBuf = 0;
+    int useIOBuffer = 0;
+#ifdef ROMIO_GPFS
+    if (gpfsmpio_pthreadio && (numberOfRounds>1)) {
+    useIOBuffer = 1;
+    io_thread = pthread_self();
+    }
+#endif
+
+    MPI_Win read_buf_window = fd->io_buf_window;
+
+#ifdef ACTIVE_TARGET
+    MPI_Win_fence(0, read_buf_window);
+#endif
+
+    ADIO_Offset currentRoundFDStart = 0, nextRoundFDStart = 0;
+    ADIO_Offset currentRoundFDEnd = 0, nextRoundFDEnd = 0;
+
+    if (iAmUsedAgg) {
+      currentRoundFDStart = fd_start[myAggRank];
+      nextRoundFDStart = fd_start[myAggRank];
+      if (myAggRank == smallestFileDomainAggRank) {
+        if (currentRoundFDStart < firstFileOffset)
+          currentRoundFDStart = firstFileOffset;
+        if (nextRoundFDStart < firstFileOffset)
+          nextRoundFDStart = firstFileOffset;
+      }
+    }
+
+#ifdef ROMIO_GPFS
+    endTimeBase = MPI_Wtime();
+    gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SETUP] += (endTimeBase-startTimeBase);
+    startTimeBase = MPI_Wtime();
+#endif
+
+
+    ADIO_Offset currentBaseRecvBufferOffset = 0;
+
+    /* This is the second main loop of the algorithm, actually nested loop of source aggs within rounds.  There are 2 flavors of this.
+     * For gpfsmpio_aggmethod of 1 each nested iteration for the source agg does an mpi_put on a contiguous chunk using a primative datatype
+     * determined using the data structures from the first main loop.  For gpfsmpio_aggmethod of 2 each nested iteration for the source agg
+     * builds up data to use in created a derived data type for 1 mpi_put that is done for the source agg for each round.
+     */
+    int roundIter;
+    for (roundIter=0;roundIter<numberOfRounds;roundIter++) {
+
+    /* determine what offsets define the portion of the file domain the agg is reading this round */
+    if (iAmUsedAgg) {
+
+        currentRoundFDStart = nextRoundFDStart;
+
+        if (!useIOBuffer || (roundIter == 0)) {
+        int amountDataToReadThisRound;
+        if ((fd_end[myAggRank] - currentRoundFDStart) < coll_bufsize) {
+            currentRoundFDEnd = fd_end[myAggRank];
+            amountDataToReadThisRound = ((currentRoundFDEnd-currentRoundFDStart)+1);
+        }
+        else {
+            currentRoundFDEnd = currentRoundFDStart + coll_bufsize - 1;
+            amountDataToReadThisRound = coll_bufsize;
+        }
+
+        /* read currentRoundFDEnd bytes */
+        ADIO_ReadContig(fd, read_buf,amountDataToReadThisRound,
+            MPI_BYTE, ADIO_EXPLICIT_OFFSET, currentRoundFDStart,
+            &status, error_code);
+        currentReadBuf = 1;
+
+        }
+        if (useIOBuffer) { /* use the thread reader for the next round */
+        /* switch back and forth between the read buffers so that the data aggregation code is diseminating 1 buffer while the thread is reading into the other */
+
+        if (roundIter > 0)
+            currentRoundFDEnd = nextRoundFDEnd;
+
+        if (roundIter < (numberOfRounds-1)) {
+            nextRoundFDStart += coll_bufsize;
+            int amountDataToReadNextRound;
+            if ((fd_end[myAggRank] - nextRoundFDStart) < coll_bufsize) {
+            nextRoundFDEnd = fd_end[myAggRank];
+            amountDataToReadNextRound = ((nextRoundFDEnd-nextRoundFDStart)+1);
+            }
+            else {
+            nextRoundFDEnd = nextRoundFDStart + coll_bufsize - 1;
+            amountDataToReadNextRound = coll_bufsize;
+            }
+
+            if(!pthread_equal(io_thread, pthread_self())) {
+            pthread_join(io_thread, &thread_ret);
+            *error_code = *(int *)thread_ret;
+            if (*error_code != MPI_SUCCESS) return;
+            io_thread = pthread_self();
+
+            }
+            io_thread_args.fd = fd;
+            /* do a little pointer shuffling: background I/O works from one
+             * buffer while two-phase machinery fills up another */
+
+            if (currentReadBuf == 0) {
+            io_thread_args.buf = read_buf0;
+            currentReadBuf = 1;
+            read_buf = read_buf1;
+            }
+            else {
+            io_thread_args.buf = read_buf1;
+            currentReadBuf = 0;
+            read_buf = read_buf0;
+            }
+            io_thread_args.io_kind = ADIOI_READ;
+            io_thread_args.size = amountDataToReadNextRound;
+            io_thread_args.offset = nextRoundFDStart;
+            io_thread_args.status = status;
+            io_thread_args.error_code = *error_code;
+            if ( (pthread_create(&io_thread, NULL,
+                    ADIOI_IO_Thread_Func, &(io_thread_args))) != 0)
+            io_thread = pthread_self();
+
+        }
+        else { /* last round */
+
+            if(!pthread_equal(io_thread, pthread_self())) {
+            pthread_join(io_thread, &thread_ret);
+            *error_code = *(int *)thread_ret;
+            if (*error_code != MPI_SUCCESS) return;
+            io_thread = pthread_self();
+
+            }
+            if (currentReadBuf == 0) {
+            read_buf = read_buf0;
+            }
+            else {
+            read_buf = read_buf1;
+            }
+
+        }
+        } /* useIOBuffer */
+    } /* IAmUsedAgg */
+    else if (useIOBuffer) {
+      if (roundIter < (numberOfRounds-1)) {
+            if (currentReadBuf == 0) {
+            currentReadBuf = 1;
+            read_buf = read_buf1;
+            }
+            else {
+            currentReadBuf = 0;
+            read_buf = read_buf0;
+            }
+      }
+      else {
+            if (currentReadBuf == 0) {
+            read_buf = read_buf0;
+            }
+            else {
+            read_buf = read_buf1;
+            }
+      }
+
+    }
+    // wait until the read buffers are full before we start pulling from the source procs
+    MPI_Barrier(fd->comm);
+
+    int aggIter;
+    for (aggIter=0;aggIter<numSourceAggs;aggIter++) {
+
+    /* If we have data for the round/agg process it.
+     */
+    if ((bufTypeIsContig && (baseRecvBufferOffset[roundIter][aggIter] != -1)) || (!bufTypeIsContig && (baseNonContigSourceBufferOffset[roundIter][aggIter].flatBufIndice != -1))) {
+
+      ADIO_Offset currentRoundFDStartForMySourceAgg = (ADIO_Offset)((ADIO_Offset)sourceAggsForMyDataFDStart[aggIter] + (ADIO_Offset)((ADIO_Offset)roundIter*(ADIO_Offset)coll_bufsize));
+      ADIO_Offset currentRoundFDEndForMySourceAgg = (ADIO_Offset)((ADIO_Offset)sourceAggsForMyDataFDStart[aggIter] + (ADIO_Offset)((ADIO_Offset)(roundIter+1)*(ADIO_Offset)coll_bufsize) - (ADIO_Offset)1);
+
+      int sourceAggContigAccessCount = 0;
+
+      /* These data structures are used for the derived datatype mpi_put
+       * in the gpfsmpio_aggmethod of 2 case.
+       */
+      int *sourceAggBlockLengths;
+      MPI_Aint *sourceAggDisplacements, *recvBufferDisplacements;
+      MPI_Datatype *sourceAggDataTypes;
+
+      int allocatedDerivedTypeArrays = 0;
+
+      ADIO_Offset recvBufferOffset = 0;
+
+      /* Process the range of offsets for this source agg.
+       */
+      int offsetIter;
+         for (offsetIter=sourceAggsForMyDataFirstOffLenIndex[roundIter][aggIter];offsetIter<=sourceAggsForMyDataLastOffLenIndex[roundIter][aggIter];offsetIter++) {
+        if (currentRoundFDEndForMySourceAgg > sourceAggsForMyDataFDEnd[aggIter])
+            currentRoundFDEndForMySourceAgg = sourceAggsForMyDataFDEnd[aggIter];
+
+        ADIO_Offset offsetStart = offset_list[offsetIter], offsetEnd = (offset_list[offsetIter]+len_list[offsetIter]-(ADIO_Offset)1);
+
+        /* Get the base source buffer offset for this iterataion of the source agg in the round.
+         * For the first one just get the predetermined base value, for subsequent ones keep track
+         * of the value for each iteration and increment it.
+         */
+        if (offsetIter == sourceAggsForMyDataFirstOffLenIndex[roundIter][aggIter]) {
+          if (bufTypeIsContig)
+            currentBaseRecvBufferOffset = baseRecvBufferOffset[roundIter][aggIter];
+          else {
+            currentFlatBufIndice = baseNonContigSourceBufferOffset[roundIter][aggIter].flatBufIndice;
+            currentDataTypeExtent = baseNonContigSourceBufferOffset[roundIter][aggIter].dataTypeExtent;
+            currentRemLen = baseNonContigSourceBufferOffset[roundIter][aggIter].remLen;
+#ifdef onesidedtrace
+            printf("currentFlatBufIndice initially set to %d starting this round/agg %d/%d\n",currentFlatBufIndice,roundIter,aggIter);
+#endif
+          }
+        }
+        else {
+          if (bufTypeIsContig)
+            currentBaseRecvBufferOffset += len_list[offsetIter-1];
+          else {
+
+          /* For non-contiguous source datatype count up the extents and indices to this point.
+           */
+          ADIO_Offset sourceBlockTotal = 0-currentRemLen;
+          while (sourceBlockTotal < len_list[offsetIter-1]) {
+            sourceBlockTotal += flatBuf->blocklens[currentFlatBufIndice];
+            currentFlatBufIndice++;
+            if (currentFlatBufIndice == flatBuf->count) {
+              currentFlatBufIndice = 0;
+              currentDataTypeExtent++;
+            }
+          } // while
+          if (sourceBlockTotal > len_list[offsetIter-1])
+            currentFlatBufIndice--;
+          currentRemLen = len_list[offsetIter-1] - sourceBlockTotal;
+#ifdef onesidedtrace
+          printf("contig_access_count source agg %d currentFlatBufIndice is now %d currentDataTypeExtent is now %d currentRemLen is now %ld\n",aggIter,currentFlatBufIndice,currentDataTypeExtent,currentRemLen);
+#endif
+          }
+
+        }
+
+        /* For source buffer contiguous here is the offset we will use for this iteration.
+         */
+        if (bufTypeIsContig)
+          recvBufferOffset = currentBaseRecvBufferOffset;
+
+#ifdef onesidedtrace
+        printf("roundIter %d source iter %d sourceAggsForMyData is %d offset_list[%d] is %ld len_list[%d] is %ld sourceAggsForMyDataFDStart is %ld sourceAggsForMyDataFDEnd is %ld currentRoundFDStartForMySourceAgg is %ld currentRoundFDEndForMySourceAgg is %ld sourceAggsForMyDataFirstOffLenIndex is %d baseRecvBufferOffset is %ld\n",
+            roundIter,aggIter,sourceAggsForMyData[aggIter],offsetIter,offset_list[offsetIter],offsetIter,len_list[offsetIter],
+            sourceAggsForMyDataFDStart[aggIter],sourceAggsForMyDataFDEnd[aggIter],
+            currentRoundFDStartForMySourceAgg,currentRoundFDEndForMySourceAgg,sourceAggsForMyDataFirstOffLenIndex[roundIter][aggIter],baseRecvBufferOffset[roundIter][aggIter]);
+#endif
+
+
+        /* Determine the amount of data and exact source buffer offsets to use.
+         */
+        int bufferAmountToRecv = 0;
+
+        if ((offsetStart >= currentRoundFDStartForMySourceAgg) && (offsetStart <= currentRoundFDEndForMySourceAgg)) {
+            if (offsetEnd > currentRoundFDEndForMySourceAgg)
+            bufferAmountToRecv = (currentRoundFDEndForMySourceAgg - offsetStart) +1;
+            else
+            bufferAmountToRecv = (offsetEnd - offsetStart) +1;
+        }
+        else if ((offsetEnd >= currentRoundFDStartForMySourceAgg) && (offsetEnd <= currentRoundFDEndForMySourceAgg)) {
+            if (offsetEnd > currentRoundFDEndForMySourceAgg)
+            bufferAmountToRecv = (currentRoundFDEndForMySourceAgg - currentRoundFDStartForMySourceAgg) +1;
+            else
+            bufferAmountToRecv = (offsetEnd - currentRoundFDStartForMySourceAgg) +1;
+            if (offsetStart < currentRoundFDStartForMySourceAgg) {
+              if (bufTypeIsContig)
+                recvBufferOffset += (currentRoundFDStartForMySourceAgg-offsetStart);
+              offsetStart = currentRoundFDStartForMySourceAgg;
+            }
+        }
+        else if ((offsetStart <= currentRoundFDStartForMySourceAgg) && (offsetEnd >= currentRoundFDEndForMySourceAgg)) {
+            bufferAmountToRecv = (currentRoundFDEndForMySourceAgg - currentRoundFDStartForMySourceAgg) +1;
+            if (bufTypeIsContig)
+              recvBufferOffset += (currentRoundFDStartForMySourceAgg-offsetStart);
+            offsetStart = currentRoundFDStartForMySourceAgg;
+        }
+
+        if (bufferAmountToRecv > 0) { /* we have data to recv this round */
+          if (gpfsmpio_aggmethod == 2) {
+            /* Only allocate these arrays if we are using method 2 and only do it once for this round/source agg.
+             */
+            if (!allocatedDerivedTypeArrays) {
+              sourceAggBlockLengths = (int *)ADIOI_Malloc(maxNumContigOperations * sizeof(int));
+              sourceAggDisplacements = (MPI_Aint *)ADIOI_Malloc(maxNumContigOperations * sizeof(MPI_Aint));
+              recvBufferDisplacements = (MPI_Aint *)ADIOI_Malloc(maxNumContigOperations * sizeof(MPI_Aint));
+              sourceAggDataTypes = (MPI_Datatype *)ADIOI_Malloc(maxNumContigOperations * sizeof(MPI_Datatype));
+              allocatedDerivedTypeArrays = 1;
+            }
+          }
+
+          /* For the non-contiguous source datatype we need to determine the number,size and offsets of the chunks
+           * from the source buffer to be sent to this contiguous chunk defined by the round/agg/iter in the source.
+           */
+          int numNonContigSourceChunks = 0;
+          ADIO_Offset *nonContigSourceOffsets;
+          int *nonContigSourceLens;
+          ADIO_Offset baseDatatypeInstanceOffset = 0;
+
+          if (!bufTypeIsContig) {
+
+            currentRecvBufferOffset = (ADIO_Offset)((ADIO_Offset)currentDataTypeExtent * (ADIO_Offset)bufTypeExtent) + flatBuf->indices[currentFlatBufIndice] + currentRemLen;
+#ifdef onesidedtrace
+            printf("!bufTypeIsContig currentRecvBufferOffset set to %ld for roundIter %d source %d currentDataTypeExtent %d flatBuf->indices[currentFlatBufIndice] %ld currentRemLen %ld currentFlatBufIndice %d\n",currentRecvBufferOffset,roundIter,aggIter,currentDataTypeExtent,flatBuf->indices[currentFlatBufIndice],currentRemLen,currentFlatBufIndice);
+#endif
+
+            /* Count the chunks first to see how much to malloc from the ending point from above code.
+             */
+            int recvBytesCounted = 0;
+            int tmpFlatBufIndice = currentFlatBufIndice;
+            int maxNumNonContigSourceChunks = 2; // over-initialize for potential remnants on both ends
+
+            while (recvBytesCounted < bufferAmountToRecv) {
+              maxNumNonContigSourceChunks++;
+              if (tmpFlatBufIndice == flatBuf->count) {
+                tmpFlatBufIndice = 0;
+              }
+              recvBytesCounted += flatBuf->blocklens[tmpFlatBufIndice];
+              tmpFlatBufIndice++;
+            }
+
+            nonContigSourceOffsets = (ADIO_Offset *)ADIOI_Malloc(maxNumNonContigSourceChunks * sizeof(ADIO_Offset));
+            nonContigSourceLens = (int *)ADIOI_Malloc(maxNumNonContigSourceChunks * sizeof(int));
+
+            /* now populate the nonContigSourceOffsets and nonContigSourceLens arrays for use in the one-sided operations.
+             */
+            int ncArrayIndex = 0;
+            int remainingBytesToLoadedIntoNCArrays = bufferAmountToRecv;
+            ADIO_Offset indexIntoCurrentIndice = 0;
+            if (currentRemLen > 0)
+              indexIntoCurrentIndice = flatBuf->blocklens[currentFlatBufIndice] - currentRemLen;
+
+            int datatypeInstances = currentDataTypeExtent;
+            tmpFlatBufIndice = currentFlatBufIndice;
+            while (remainingBytesToLoadedIntoNCArrays > 0) {
+             nonContigSourceOffsets[ncArrayIndex] = currentRecvBufferOffset;
+
+              if ((flatBuf->blocklens[tmpFlatBufIndice] - indexIntoCurrentIndice) > remainingBytesToLoadedIntoNCArrays) {
+                nonContigSourceLens[ncArrayIndex] = remainingBytesToLoadedIntoNCArrays;
+                remainingBytesToLoadedIntoNCArrays = 0;
+              }
+              else {
+                nonContigSourceLens[ncArrayIndex] = (int)(flatBuf->blocklens[tmpFlatBufIndice] - indexIntoCurrentIndice);
+                remainingBytesToLoadedIntoNCArrays -= (flatBuf->blocklens[tmpFlatBufIndice] - indexIntoCurrentIndice);
+              }
+              indexIntoCurrentIndice = 0; // only worry about beginning remnant for first iter
+
+              tmpFlatBufIndice++;
+              if (tmpFlatBufIndice == flatBuf->count) {
+                tmpFlatBufIndice = 0;
+                datatypeInstances++;
+                baseDatatypeInstanceOffset = datatypeInstances * bufTypeExtent;
+              }
+              currentRecvBufferOffset = baseDatatypeInstanceOffset + flatBuf->indices[tmpFlatBufIndice];
+              ncArrayIndex++;
+              numNonContigSourceChunks++;
+            } // while
+#ifdef onesidedtrace
+            printf("currentRecvBufferOffset finally set to %ld\n",currentRecvBufferOffset);
+#endif
+          } // !bufTypeIsContig
+
+          /* Determine the offset into the source window.
+           */
+          MPI_Aint sourceDisplacementToUseThisRound = (MPI_Aint) ((ADIO_Offset)offsetStart - currentRoundFDStartForMySourceAgg);
+
+          /* If using the thread readr select the appropriate side of the split window.
+           */
+          if (useIOBuffer && (read_buf == read_buf1)) {
+            sourceDisplacementToUseThisRound += coll_bufsize;
+          }
+
+          /* For gpfsmpio_aggmethod of 1 do the mpi_put using the primitive MPI_BYTE type on each contiguous chunk of source data.
+           */
+          if (gpfsmpio_aggmethod == 1) {
+#ifndef ACTIVE_TARGET
+            MPI_Win_lock(MPI_LOCK_SHARED, sourceAggsForMyData[aggIter], 0, read_buf_window);
+#endif
+            if (bufTypeIsContig) {
+              MPI_Get(&((char*)buf)[recvBufferOffset],bufferAmountToRecv, MPI_BYTE,sourceAggsForMyData[aggIter],sourceDisplacementToUseThisRound, bufferAmountToRecv,MPI_BYTE,read_buf_window);
+            }
+            else {
+              for (i=0;i<numNonContigSourceChunks;i++) {
+                MPI_Get(&((char*)buf)[nonContigSourceOffsets[i]],nonContigSourceLens[i], MPI_BYTE,sourceAggsForMyData[aggIter],sourceDisplacementToUseThisRound, nonContigSourceLens[i],MPI_BYTE,read_buf_window);
+#ifdef onesidedtrace
+                printf("mpi_put[%d] nonContigSourceOffsets is %d of nonContigSourceLens %d to source disp %d\n",i,nonContigSourceOffsets[i],nonContigSourceLens[i],sourceDisplacementToUseThisRound);
+#endif
+                sourceDisplacementToUseThisRound += nonContigSourceLens[i];
+              }
+            }
+#ifndef ACTIVE_TARGET
+            MPI_Win_unlock(sourceAggsForMyData[aggIter], read_buf_window);
+#endif
+
+          }
+
+          /* For gpfsmpio_aggmethod of 2 populate the data structures for this round/agg for this offset iter
+           * to be used subsequently when building the derived type for 1 mpi_put for all the data for this
+           * round/agg.
+           */
+          else if (gpfsmpio_aggmethod == 2) {
+
+            if (bufTypeIsContig) {
+              sourceAggBlockLengths[sourceAggContigAccessCount]= bufferAmountToRecv;
+              sourceAggDataTypes[sourceAggContigAccessCount] = MPI_BYTE;
+              sourceAggDisplacements[sourceAggContigAccessCount] = sourceDisplacementToUseThisRound;
+              recvBufferDisplacements[sourceAggContigAccessCount] = (MPI_Aint)recvBufferOffset;
+              sourceAggContigAccessCount++;
+            }
+            else {
+              for (i=0;i<numNonContigSourceChunks;i++) {
+                sourceAggBlockLengths[sourceAggContigAccessCount]= nonContigSourceLens[i];
+                sourceAggDataTypes[sourceAggContigAccessCount] = MPI_BYTE;
+                sourceAggDisplacements[sourceAggContigAccessCount] = sourceDisplacementToUseThisRound;
+                recvBufferDisplacements[sourceAggContigAccessCount] = (MPI_Aint)nonContigSourceOffsets[i];
+#ifdef onesidedtrace
+                printf("mpi_put building arrays for iter %d - recvBufferDisplacements is %d of nonContigSourceLens %d to source disp %d sourceAggContigAccessCount is %d sourceAggBlockLengths[sourceAggContigAccessCount] is %d\n",i,recvBufferDisplacements[sourceAggContigAccessCount],nonContigSourceLens[i],sourceAggDisplacements[sourceAggContigAccessCount],sourceAggContigAccessCount, sourceAggBlockLengths[sourceAggContigAccessCount]);
+#endif
+                sourceAggContigAccessCount++;
+                sourceDisplacementToUseThisRound += nonContigSourceLens[i];
+              }
+            }
+          }
+#ifdef onesidedtrace
+        printf("roundIter %d bufferAmountToRecv is %d recvBufferOffset is %d offsetStart is %ld currentRoundFDStartForMySourceAgg is %ld sourceDisplacementToUseThisRound is %ld sourceAggsForMyDataFDStart[aggIter] is %ld\n",roundIter, bufferAmountToRecv,recvBufferOffset, offsetStart,currentRoundFDStartForMySourceAgg,sourceDisplacementToUseThisRound,sourceAggsForMyDataFDStart[aggIter]);
+#endif
+
+          if (!bufTypeIsContig) {
+            ADIOI_Free(nonContigSourceOffsets);
+            ADIOI_Free(nonContigSourceLens);
+          }
+          } // bufferAmountToRecv > 0
+      } // contig list
+
+      /* For gpfsmpio_aggmethod of 2 now build the derived type using the data from this round/agg and do 1 single mpi_put.
+       */
+      if (gpfsmpio_aggmethod == 2) {
+        MPI_Datatype recvBufferDerivedDataType, sourceBufferDerivedDataType;
+        MPI_Type_create_struct(sourceAggContigAccessCount, sourceAggBlockLengths, recvBufferDisplacements, sourceAggDataTypes, &recvBufferDerivedDataType);
+        MPI_Type_commit(&recvBufferDerivedDataType);
+        MPI_Type_create_struct(sourceAggContigAccessCount, sourceAggBlockLengths, sourceAggDisplacements, sourceAggDataTypes, &sourceBufferDerivedDataType);
+        MPI_Type_commit(&sourceBufferDerivedDataType);
+// printf("round %d mpi_put of derived type to agg %d sourceAggContigAccessCount is %d\n",roundIter, sourceAggsForMyData[aggIter],sourceAggContigAccessCount);
+#ifndef ACTIVE_TARGET
+        MPI_Win_lock(MPI_LOCK_SHARED, sourceAggsForMyData[aggIter], 0, read_buf_window);
+#endif
+        MPI_Get(((char*)buf),1, recvBufferDerivedDataType,sourceAggsForMyData[aggIter],0, 1,sourceBufferDerivedDataType,read_buf_window);
+
+#ifndef ACTIVE_TARGET
+        MPI_Win_unlock(sourceAggsForMyData[aggIter], read_buf_window);
+#endif
+
+        if (allocatedDerivedTypeArrays) {
+          ADIOI_Free(sourceAggBlockLengths);
+          ADIOI_Free(sourceAggDisplacements);
+          ADIOI_Free(sourceAggDataTypes);
+          ADIOI_Free(recvBufferDisplacements);
+        }
+        MPI_Type_free(&recvBufferDerivedDataType);
+        MPI_Type_free(&sourceBufferDerivedDataType);
+      }
+      } // baseoffset != -1
+    } // source aggs
+
+    /* the source procs recv the requested data to the aggs */
+
+#ifdef ACTIVE_TARGET
+    MPI_Win_fence(0, read_buf_window);
+#else
+    MPI_Barrier(fd->comm);
+#endif
+
+    nextRoundFDStart = currentRoundFDStart + coll_bufsize;
+
+    } /* for-loop roundIter */
+
+#ifdef ROMIO_GPFS
+    endTimeBase = MPI_Wtime();
+    gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH] += (endTimeBase-startTimeBase);
+#endif
+
+    if (useIOBuffer) { /* thread readr cleanup */
+
+    if ( !pthread_equal(io_thread, pthread_self()) ) {
+        pthread_join(io_thread, &thread_ret);
+        *error_code = *(int *)thread_ret;
+    }
+
+    }
+
+    ADIOI_Free(sourceAggsForMyData);
+    ADIOI_Free(sourceAggsForMyDataFDStart);
+    ADIOI_Free(sourceAggsForMyDataFDEnd);
+
+    for (i=0;i<numberOfRounds;i++) {
+      ADIOI_Free(sourceAggsForMyDataFirstOffLenIndex[i]);
+      ADIOI_Free(sourceAggsForMyDataLastOffLenIndex[i]);
+      if (bufTypeIsContig)
+        ADIOI_Free(baseRecvBufferOffset[i]);
+      else
+        ADIOI_Free(baseNonContigSourceBufferOffset[i]);
+    }
+    ADIOI_Free(sourceAggsForMyDataFirstOffLenIndex);
+    ADIOI_Free(sourceAggsForMyDataLastOffLenIndex);
+    if (bufTypeIsContig)
+      ADIOI_Free(baseRecvBufferOffset);
+    else
+      ADIOI_Free(baseNonContigSourceBufferOffset);
+
+    return;
+}

-----------------------------------------------------------------------

Summary of changes:
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c      |   15 +
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c      |   47 +
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h      |    5 +-
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c      |   28 +
 src/mpi/romio/adio/common/Makefile.mk            |    1 +
 src/mpi/romio/adio/common/ad_close.c             |    2 +-
 src/mpi/romio/adio/common/ad_open.c              |    6 +-
 src/mpi/romio/adio/common/onesided_aggregation.c | 2075 ++++++++++++++++++++++
 src/mpi/romio/adio/include/adio.h                |    4 +
 src/mpi/romio/adio/include/adioi.h               |   24 +
 src/mpid/common/datatype/mpid_type_commit.c      |    7 +-
 src/mpid/common/datatype/mpid_type_dup.c         |    7 +-
 src/mpid/pamid/include/mpidi_datatypes.h         |    1 +
 src/mpid/pamid/include/mpidpre.h                 |    8 +
 src/mpid/pamid/src/Makefile.mk                   |    3 +-
 src/mpid/pamid/src/mpid_init.c                   |    5 +-
 src/mpid/pamid/src/mpidi_env.c                   |    5 +
 src/mpid/pamid/src/mpidi_pami_datatype.c         |  155 ++
 src/mpid/pamid/src/onesided/mpid_win_get.c       |   55 +-
 src/mpid/pamid/src/onesided/mpid_win_put.c       |   53 +-
 src/mpid/pamid/src/onesided/mpidi_onesided.h     |    2 +
 21 files changed, 2491 insertions(+), 17 deletions(-)
 create mode 100644 src/mpi/romio/adio/common/onesided_aggregation.c
 create mode 100644 src/mpid/pamid/src/mpidi_pami_datatype.c


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list