[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.2-359-ga9c78f2

Service Account noreply at mpich.org
Mon Jul 25 11:30:18 CDT 2016


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  a9c78f27dd8d49a5a6ec05d563e965c8b8e141b8 (commit)
       via  a78e899047778329e4f87416ab89aebb005cd8c1 (commit)
       via  8fe8cc157c8330d0e940ba67895c7a3604b3c6c3 (commit)
       via  28d1b074f135173c79244c2ce92e8b5197af00da (commit)
       via  e474997fd5e859fe363d3b2ce116ca63868ad3e2 (commit)
       via  598aa493dcef472ac32498fa0100976df15c2b26 (commit)
       via  ccd815498267d6cc342d25fce7ce3107f3e18294 (commit)
       via  b95b4e4ef0ed1b6ea4b47b56859eef18487d5eb8 (commit)
      from  ea008be1976a301092ccb7a35b3ee1f9a4ec4d07 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/a9c78f27dd8d49a5a6ec05d563e965c8b8e141b8

commit a9c78f27dd8d49a5a6ec05d563e965c8b8e141b8
Author: Paul Coffman <pcoffman at anl.gov>
Date:   Fri Mar 25 13:59:49 2016 -0700

    ROMIO apply TUNEGATHER to ADIOI_LUSTRE_WriteStridedColl
    
    Implement ROMIO_TUNEGATHER within ADIOI_LUSTRE_WriteStridedColl
    to use one MPI_Allreduce instead of 2 (or 3 in the case of onesided aggregation)
    MPI_Allgathers to communication all starting and ending file offsets
    (and onesided count sizes)
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/ad_lustre/ad_lustre_wrcoll.c b/src/mpi/romio/adio/ad_lustre/ad_lustre_wrcoll.c
index 0e47e5e..cf6787c 100644
--- a/src/mpi/romio/adio/ad_lustre/ad_lustre_wrcoll.c
+++ b/src/mpi/romio/adio/ad_lustre/ad_lustre_wrcoll.c
@@ -96,7 +96,7 @@ void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
     int *striping_info = NULL;
     ADIO_Offset **buf_idx = NULL;
     int old_error, tmp_error;
-    ADIO_Offset *count_sizes;
+    ADIO_Offset *lustre_offsets0, *lustre_offsets, *count_sizes;
 
     MPI_Comm_size(fd->comm, &nprocs);
     MPI_Comm_rank(fd->comm, &myrank);
@@ -133,6 +133,46 @@ void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
       MPI_Type_size_x(datatype, &buftype_size);
       my_count_size = (ADIO_Offset) count  * (ADIO_Offset)buftype_size;
     }
+    if (romio_tunegather) {
+      if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+        lustre_offsets0 = (ADIO_Offset *) ADIOI_Malloc(3*nprocs*sizeof(ADIO_Offset));
+        lustre_offsets  = (ADIO_Offset *) ADIOI_Malloc(3*nprocs*sizeof(ADIO_Offset));
+        for (i=0; i<nprocs; i++)  {
+          lustre_offsets0[i*3]   = 0;
+          lustre_offsets0[i*3+1] = 0;
+          lustre_offsets0[i*3+2] = 0;
+        }
+        lustre_offsets0[myrank*3]   = start_offset;
+        lustre_offsets0[myrank*3+1] =   end_offset;
+        lustre_offsets0[myrank*3+2] =   my_count_size;
+        MPI_Allreduce( lustre_offsets0, lustre_offsets, nprocs*3, ADIO_OFFSET, MPI_MAX, fd->comm );
+        for (i=0; i<nprocs; i++)  {
+          st_offsets [i] = lustre_offsets[i*3]  ;
+          end_offsets[i] = lustre_offsets[i*3+1];
+          count_sizes[i] = lustre_offsets[i*3+2];
+        }
+      }
+      else {
+        lustre_offsets0 = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
+        lustre_offsets  = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
+        for (i=0; i<nprocs; i++)  {
+          lustre_offsets0[i*2]   = 0;
+          lustre_offsets0[i*2+1] = 0;
+        }
+        lustre_offsets0[myrank*2]   = start_offset;
+        lustre_offsets0[myrank*2+1] =   end_offset;
+
+        MPI_Allreduce( lustre_offsets0, lustre_offsets, nprocs*2, ADIO_OFFSET, MPI_MAX, fd->comm );
+
+        for (i=0; i<nprocs; i++)  {
+          st_offsets [i] = lustre_offsets[i*2]  ;
+          end_offsets[i] = lustre_offsets[i*2+1];
+        }
+      }
+      ADIOI_Free( lustre_offsets0 );
+      ADIOI_Free( lustre_offsets  );
+    }
+    else {	
 	MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
 		      ADIO_OFFSET, fd->comm);
 	MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
@@ -141,6 +181,7 @@ void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 	  MPI_Allgather(&my_count_size, 1, ADIO_OFFSET, count_sizes, 1,
                 ADIO_OFFSET, fd->comm);
     }
+    }
 	/* are the accesses of different processes interleaved? */
 	for (i = 1; i < nprocs; i++)
 	    if ((st_offsets[i] < end_offsets[i-1]) &&

http://git.mpich.org/mpich.git/commitdiff/a78e899047778329e4f87416ab89aebb005cd8c1

commit a78e899047778329e4f87416ab89aebb005cd8c1
Author: Paul Coffman <pcoffman at anl.gov>
Date:   Fri Mar 25 13:57:47 2016 -0700

    Generalize ROMIO collective aggregation tuning
    
    Use the TUNEGATHER environment variable to select different collective
    tuning approaches.  The current implementation for GPFS allows the user
    to use one MPI_Allreduce instead of 2 (or 3 in the case of onesided
    aggregation) MPI_Allgathers to communication all starting and ending
    file offsets (and onesided count sizes) if the GPFSMPIO_TUNEGATHER
    environment variable is set.  Generalize to other file systems as
    ROMIO_TUNEGATHER.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
index bcbd2a4..68df849 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
@@ -185,7 +185,7 @@ void ADIOI_GPFS_ReadStridedColl(ADIO_File fd, void *buf, int count,
         MPI_Type_size_x(datatype, &buftype_size);
         my_count_size = (ADIO_Offset) count  * (ADIO_Offset)buftype_size;
     }
-    if (gpfsmpio_tunegather) {
+    if (romio_tunegather) {
       if ((romio_read_aggmethod == 1) || (romio_read_aggmethod == 2)) {
         gpfs_offsets0 = (ADIO_Offset *) ADIOI_Malloc(3*nprocs*sizeof(ADIO_Offset));
         gpfs_offsets  = (ADIO_Offset *) ADIOI_Malloc(3*nprocs*sizeof(ADIO_Offset));
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
index 8b4d06d..548ae67 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
@@ -31,7 +31,6 @@ int 	gpfsmpio_timing;
 int 	gpfsmpio_timing2;
 int     gpfsmpio_timing_cw_level;
 int 	gpfsmpio_comm;
-int 	gpfsmpio_tunegather;
 int 	gpfsmpio_tuneblocking;
 long    bglocklessmpio_f_type;
 int     gpfsmpio_bg_nagg_pset;
@@ -59,12 +58,6 @@ double	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST+1];
  *   - 1 - Collect/report timing.
  *   - Default is 0.
  *
- * - GPFSMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
- *   for aggregator collective i/o.  Possible values:
- *   - 0 - Use two MPI_Allgather's to collect starting and ending offsets.
- *   - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
- *   - Default is 1.
- *
  * - GPFSMPIO_TUNEBLOCKING - Tune how aggregate file domains are
  *   calculated (block size).  Possible values:
  *   - 0 - Evenly calculate file domains across aggregators.  Also use
@@ -139,9 +132,6 @@ void ad_gpfs_get_env_vars() {
     gpfsmpio_timing = 0;
 	x = getenv( "GPFSMPIO_TIMING"       );
 	if (x) gpfsmpio_timing       = atoi(x);
-    gpfsmpio_tunegather = 1;
-	x = getenv( "GPFSMPIO_TUNEGATHER"   );
-	if (x) gpfsmpio_tunegather   = atoi(x);
     gpfsmpio_tuneblocking = 1;
     x = getenv( "GPFSMPIO_TUNEBLOCKING" );
     if (x) gpfsmpio_tuneblocking = atoi(x);
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
index 21f98f6..df0139d 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
@@ -61,7 +61,6 @@ extern double 	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST+1];
 extern int 	gpfsmpio_timing;
 extern int      gpfsmpio_timing_cw_level;
 extern int 	gpfsmpio_comm;
-extern int 	gpfsmpio_tunegather;
 extern int 	gpfsmpio_tuneblocking;
 extern long bglocklessmpio_f_type;
 extern int      gpfsmpio_pthreadio;
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
index bf9707e..f52f2d7 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
@@ -187,7 +187,7 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
         MPI_Type_size_x(datatype, &buftype_size);
         my_count_size = (ADIO_Offset) count  * (ADIO_Offset)buftype_size;
     }
-    if (gpfsmpio_tunegather) {
+    if (romio_tunegather) {
       if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
         gpfs_offsets0 = (ADIO_Offset *) ADIOI_Malloc(3*nprocs*sizeof(ADIO_Offset));
         gpfs_offsets  = (ADIO_Offset *) ADIOI_Malloc(3*nprocs*sizeof(ADIO_Offset));
diff --git a/src/mpi/romio/adio/common/ad_tuning.c b/src/mpi/romio/adio/common/ad_tuning.c
index ee9c318..dc89bd6 100644
--- a/src/mpi/romio/adio/common/ad_tuning.c
+++ b/src/mpi/romio/adio/common/ad_tuning.c
@@ -26,6 +26,7 @@ int     romio_read_aggmethod;
 int     romio_onesided_no_rmw;
 int     romio_onesided_always_rmw;
 int     romio_onesided_inform_rmw;
+int 	romio_tunegather;
 
 /* set internal variables for tuning environment variables */
 /** \page mpiio_vars MPIIO Configuration
@@ -71,6 +72,12 @@ int     romio_onesided_inform_rmw;
  *   - 0 (disabled) or 1 (enabled)
  *   - Default is 0
  *
+ * - ROMIO_TUNEGATHER - Tune how starting and ending offsets are communicated
+ *   for aggregator collective i/o.  Possible values:
+ *   - 0 - Use two or three MPI_Allgather's to collect starting and ending offsets.
+ *   - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
+ *   - Default is 1.
+ *
  */
 
 void ad_get_env_vars() {
@@ -97,4 +104,8 @@ void ad_get_env_vars() {
     romio_onesided_inform_rmw = 0;
     x = getenv( "ROMIO_ONESIDED_INFORM_RMW" );
     if (x) romio_onesided_inform_rmw = atoi(x);
+
+    romio_tunegather = 1;
+	x = getenv( "ROMIO_TUNEGATHER"   );
+	if (x) romio_tunegather   = atoi(x);
 }
diff --git a/src/mpi/romio/adio/include/ad_tuning.h b/src/mpi/romio/adio/include/ad_tuning.h
index 51fc926..f481ccc 100644
--- a/src/mpi/romio/adio/include/ad_tuning.h
+++ b/src/mpi/romio/adio/include/ad_tuning.h
@@ -10,7 +10,6 @@
  * ad_tuning.h
  *
  * declares common global variables and functions for performance tuning
- * and functional debugging.
  *---------------------------------------------------------------------*/
 
 #ifndef AD_TUNING_H_
@@ -23,14 +22,13 @@
  *  Global variables for the control of performance tuning.
  *-----------------------------------------*/
 
-/* corresponds to environment variables to select optimizations and timing level */
-extern int      romio_pthreadio;
-extern int      romio_p2pcontig;
+/* corresponds to environment variables to select optimizations */
 extern int      romio_write_aggmethod;
 extern int      romio_read_aggmethod;
 extern int      romio_onesided_no_rmw;
 extern int      romio_onesided_always_rmw;
 extern int      romio_onesided_inform_rmw;
+extern int      romio_tunegather;
 
 /* set internal variables for tuning environment variables */
 void ad_get_env_vars(void);

http://git.mpich.org/mpich.git/commitdiff/8fe8cc157c8330d0e940ba67895c7a3604b3c6c3

commit 8fe8cc157c8330d0e940ba67895c7a3604b3c6c3
Author: Paul Coffman <pcoffman at anl.gov>
Date:   Fri Mar 25 13:43:47 2016 -0700

    ROMIO Onesided write aggregation fixes to ADIOI_GPFS_WriteStridedColl
    
    Fix count_sizes allgather and pass correct parms for the starting and ending file offset
    to ADIOI_OneSidedWriteAggregation when re-running for RMW.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
index 9a48a4f..bf9707e 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
@@ -231,7 +231,7 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 	MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
 		      ADIO_OFFSET, fd->comm);
         if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
-	    MPI_Allgather(&count_sizes, 1, ADIO_OFFSET, count_sizes, 1,
+	    MPI_Allgather(&my_count_size, 1, ADIO_OFFSET, count_sizes, 1,
                      ADIO_OFFSET, fd->comm);
         }
     }
@@ -389,7 +389,7 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
           romio_onesided_always_rmw = 1;
           int prev_romio_onesided_no_rmw = romio_onesided_no_rmw;
           romio_onesided_no_rmw = 1;
-          ADIOI_OneSidedWriteAggregation(fd, offset_list, len_list, contig_access_count, buf, datatype, error_code, st_offsets, end_offsets, currentValidDataIndex, fd_start, fd_end, &holeFound, noStripeParms);
+          ADIOI_OneSidedWriteAggregation(fd, offset_list, len_list, contig_access_count, buf, datatype, error_code, firstFileOffset, lastFileOffset, currentValidDataIndex, fd_start, fd_end, &holeFound, noStripeParms);
           romio_onesided_no_rmw = prev_romio_onesided_no_rmw;
           GPFSMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
           ADIOI_Free(offset_list);

http://git.mpich.org/mpich.git/commitdiff/28d1b074f135173c79244c2ce92e8b5197af00da

commit 28d1b074f135173c79244c2ce92e8b5197af00da
Author: Paul Coffman <pcoffman at anl.gov>
Date:   Thu Mar 24 21:20:14 2016 -0700

    ROMIO Onesided write aggregation enhancements to support file striping
    
    The algorithm has been extended to support multiple calls for a single
    one-sided collective write operation for a file system utilizing
    striping.  The algorithm can now be called once for each segment of
    data, a segment being defined as a contiguous region of the file which
    has up to one occurrence of each stripe - the data for each stripe
    being written out by a particular aggregator.  The parameters for
    ADIOI_OneSidedWriteAggregation have changed slightly - pushing the
    computation of the first and last file offsets to the caller to avoid
    re-computation of this based on the offset list and an extra parm for
    a striping parameters data structure.  The call from GPFS therefore
    needed to change to support this new definition, but indicating that
    no striping was enabled via the striping parameter.  Each call for a
    striping file system effectively packs one striping unit of data into
    the collective buffer on each agg, with additional parameters which
    govern flushing the collective buffer to the file.  In practice the
    collective write for a file system such as lustre on a dataset
    composed of multiple segments would call the algorithm several times
    without a flush parameter to fill the collective buffers with multiple
    stripes of data, before calling it again to flush the collective buffer
    to the file system.  In this fashion the synchronization can be
    minimized as that only needs to occur around the actual read from or
    write to the file system.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
index 0e1d6b6..9a48a4f 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
@@ -286,6 +286,7 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
    done by (logically) dividing the file into file domains (FDs); each
    process may directly access only its own file domain. */
 
+    ADIO_Offset lastFileOffset = 0, firstFileOffset = -1;
     int currentValidDataIndex = 0;
     if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
       /* Take out the 0-data offsets by shifting the indexes with data to the front
@@ -295,6 +296,12 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
         if (count_sizes[i] > 0) {
           st_offsets[currentValidDataIndex] = st_offsets[i];
           end_offsets[currentValidDataIndex] = end_offsets[i];
+          lastFileOffset = MPL_MAX(lastFileOffset,end_offsets[currentValidDataIndex]);
+          if (firstFileOffset == -1)
+            firstFileOffset = st_offsets[currentValidDataIndex];
+          else
+            firstFileOffset = MPL_MIN(firstFileOffset,st_offsets[currentValidDataIndex]);
+
           currentValidDataIndex++;
         }
       }
@@ -337,10 +344,22 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
     /* If the user has specified to use a one-sided aggregation method then do that at
      * this point instead of the two-phase I/O.
      */
+      /* pass this datastructure to indicate we are a non-striping filesystem
+       * to the onesided algorithm by setting stripe size to 0.
+       */
+      ADIOI_OneSidedStripeParms noStripeParms;
+      noStripeParms.stripeSize = 0;
+      noStripeParms.segmentLen = 0;
+      noStripeParms.stripesPerAgg = 0;
+      noStripeParms.segmentIter = 0;
+      noStripeParms.flushCB = 1;
+      noStripeParms.stripedLastFileOffset = 0;
+      noStripeParms.firstStripedWriteCall = 0;
+      noStripeParms.lastStripedWriteCall = 0;
       int holeFound = 0;
       ADIOI_OneSidedWriteAggregation(fd, offset_list, len_list, contig_access_count,
-	      buf, datatype, error_code, st_offsets, end_offsets,
-	      currentValidDataIndex, fd_start, fd_end, &holeFound);
+	      buf, datatype, error_code, firstFileOffset, lastFileOffset,
+	      currentValidDataIndex, fd_start, fd_end, &holeFound, noStripeParms);
       int anyHolesFound = 0;
       if (!romio_onesided_no_rmw)
         MPI_Allreduce(&holeFound, &anyHolesFound, 1, MPI_INT, MPI_MAX, fd->comm);
@@ -366,11 +385,11 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
         if (romio_onesided_inform_rmw && (myrank ==0))
           FPRINTF(stderr,"Information: Holes found during one-sided "
 		  "write aggregation algorithm --- re-running one-sided "
-		  "write aggregation with GPFSMPIO_ONESIDED_ALWAYS_RMW set to 1.\n");
+		  "write aggregation with ROMIO_ONESIDED_ALWAYS_RMW set to 1.\n");
           romio_onesided_always_rmw = 1;
           int prev_romio_onesided_no_rmw = romio_onesided_no_rmw;
           romio_onesided_no_rmw = 1;
-          ADIOI_OneSidedWriteAggregation(fd, offset_list, len_list, contig_access_count, buf, datatype, error_code, st_offsets, end_offsets, currentValidDataIndex, fd_start, fd_end, &holeFound);
+          ADIOI_OneSidedWriteAggregation(fd, offset_list, len_list, contig_access_count, buf, datatype, error_code, st_offsets, end_offsets, currentValidDataIndex, fd_start, fd_end, &holeFound, noStripeParms);
           romio_onesided_no_rmw = prev_romio_onesided_no_rmw;
           GPFSMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
           ADIOI_Free(offset_list);
diff --git a/src/mpi/romio/adio/common/onesided_aggregation.c b/src/mpi/romio/adio/common/onesided_aggregation.c
index c96b8bf..1445000 100644
--- a/src/mpi/romio/adio/common/onesided_aggregation.c
+++ b/src/mpi/romio/adio/common/onesided_aggregation.c
@@ -16,7 +16,30 @@
 
 //  #define onesidedtrace 1
 
-
+/* Data that needs to persist throughout multiple calls to ADIOI_OneSidedWriteAggregation
+ * to support file systems that stripe data -- the algorithm can now be called once
+ * for each segment of data, a segment being defined as a contiguous region of the file which
+ * is the size of one striping unit times the number of aggregators.  Each call effectively packs one
+ * striping unit of data into the collective buffer on each agg, with additional parameters which govern when to flush
+ * the collective buffer to the file.  Therefore in practice the collective write call for a file system such as
+ * lustre on a dataset composed of multiple segments would call the algorithm several times without a
+ * flush parameter to fill the collective buffers with multiple stripes of data, before calling it again to flush
+ * the collective buffer to the file system.  In this fashion the synchronization can be minimized as that
+ * only needs to occur during the actual read from or write to the file system.
+ */
+int iWasUsedStripingAgg; /* whether this rank was ever a used agg for this striping segement */
+int numStripesUsed;      /* the number of stripes packed into an aggregator */
+/* These 2 variables are the offset and lengths in the file corresponding to the actual stripes */
+ADIO_Offset *stripeWriteOffsets, *stripeWriteLens;
+int amountOfStripedDataExpected; /* used to determine holes in this segment thereby requiring a rmw */
+/* Since ADIOI_OneSidedWriteAggregation can be called multiple times now only flatten the buffer once */
+/* for optimal performance so persist these two variables through multiple calls */
+MPI_Aint bufTypeExtent;
+ADIOI_Flatlist_node *flatBuf;
+/* These three variables track the state of the source buffer advancement through multiple calls */
+int lastDataTypeExtent;
+int lastFlatBufIndice;
+ADIO_Offset lastIndiceOffset;
 /* This data structure holds the number of extents, the index into the flattened buffer and the remnant length
  * beyond the flattened buffer index corresponding to the base buffer offset for non-contiguous source data
  * for the range to be written coresponding to the round and target agg.
@@ -49,7 +72,6 @@ typedef struct FDSourceBufferState {
 
 } FDSourceBufferState;
 
-
 static int ADIOI_OneSidedSetup(ADIO_File fd, int procs) {
     int ret = MPI_SUCCESS;
 
@@ -59,6 +81,7 @@ static int ADIOI_OneSidedSetup(ADIO_File fd, int procs) {
     fd->io_buf_put_amounts = 0;
     ret =MPI_Win_create(&(fd->io_buf_put_amounts),sizeof(int),sizeof(int),
 	    MPI_INFO_NULL,fd->comm, &fd->io_buf_put_amounts_window);
+
 fn_exit:
     return ret;
 }
@@ -182,16 +205,19 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
     const void *buf,
     MPI_Datatype datatype,
     int *error_code,
-    ADIO_Offset *st_offsets,
-    ADIO_Offset *end_offsets,
+    ADIO_Offset firstFileOffset,
+    ADIO_Offset lastFileOffset,
     int numNonZeroDataOffsets,
     ADIO_Offset *fd_start,
     ADIO_Offset* fd_end,
-    int *hole_found)
+    int *hole_found,
+    ADIOI_OneSidedStripeParms stripe_parms)
 
 {
     int i,j; /* generic iterators */
 
+    if ((stripe_parms.stripeSize > 0) && stripe_parms.firstStripedWriteCall)
+      iWasUsedStripingAgg = 0;
 #ifdef onesidedtrace
     if (buf == NULL) {
       printf("ADIOI_OneSidedWriteAggregation - buf is NULL contig_access_count is %d\n",contig_access_count);
@@ -244,14 +270,15 @@ printf("ADIOI_OneSidedWriteAggregation started on rank %d\n",myrank);
      */
     int bufTypeIsContig;
 
-    MPI_Aint bufTypeExtent;
-    ADIOI_Flatlist_node *flatBuf=NULL;
     ADIOI_Datatype_iscontig(datatype, &bufTypeIsContig);
 
     if (!bufTypeIsContig) {
    /* Flatten the non-contiguous source datatype and set the extent. */
-      flatBuf = ADIOI_Flatten_and_find(datatype);
-      MPI_Type_extent(datatype, &bufTypeExtent);
+      if ((stripe_parms.stripeSize == 0) || stripe_parms.firstStripedWriteCall) {
+        flatBuf = ADIOI_Flatten_and_find(datatype);
+        MPI_Type_extent(datatype, &bufTypeExtent);
+      }
+
 #ifdef onesidedtrace
       printf("flatBuf->count is %d bufTypeExtent is %d\n", flatBuf->count,bufTypeExtent);
       for (i=0;i<flatBuf->count;i++)
@@ -272,9 +299,9 @@ printf("ADIOI_OneSidedWriteAggregation started on rank %d\n",myrank);
 #ifdef onesidedtrace
     printf("sizeof(FDSourceBufferState) is %d - make sure is 32 for 32-byte memalign optimal\n",sizeof(FDSourceBufferState));
 #endif
-    FDSourceBufferState *currentFDSourceBufferState;
 
-    currentFDSourceBufferState = (FDSourceBufferState *) ADIOI_Malloc(naggs * sizeof(FDSourceBufferState));
+    FDSourceBufferState *currentFDSourceBufferState = (FDSourceBufferState *) ADIOI_Malloc(naggs * sizeof(FDSourceBufferState));
+
     for (i=0;i<naggs;i++) {
       /* initialize based on the bufType to indicate that it is unset.
        */
@@ -295,27 +322,17 @@ printf("ADIOI_OneSidedWriteAggregation started on rank %d\n",myrank);
      */
     int maxNumContigOperations = contig_access_count;
 
-    ADIO_Offset lastFileOffset = 0, firstFileOffset = -1;
-    /* Get the total range being written - in the case of just 1 byte the starting and ending offsets
-     * will match the same as they would for 0 bytes so to distinguish we need the actual data count.
-     */
-    for (j=0;j<numNonZeroDataOffsets;j++) {
-#ifdef onesidedtrace
-printf("end_offsets[%d] is %ld st_offsets[%d] is %ld\n",j,end_offsets[j],j,st_offsets[j]);
-#endif
-        lastFileOffset = MPL_MAX(lastFileOffset,end_offsets[j]);
-        if (firstFileOffset == -1)
-          firstFileOffset = st_offsets[j];
-        else
-          firstFileOffset = MPL_MIN(firstFileOffset,st_offsets[j]);
-    }
-
     int myAggRank = -1; /* if I am an aggregor this is my index into fd->hints->ranklist */
     int iAmUsedAgg = 0; /* whether or not this rank is used as an aggregator. */
 
     /* Make coll_bufsize an ADIO_Offset since it is used in calculations with offsets.
      */
-    ADIO_Offset coll_bufsize = (ADIO_Offset)(fd->hints->cb_buffer_size);
+    ADIO_Offset coll_bufsize = 0;
+    if (stripe_parms.stripeSize == 0)
+      coll_bufsize = (ADIO_Offset)(fd->hints->cb_buffer_size);
+    else
+      coll_bufsize = stripe_parms.stripeSize;
+
 #ifdef ROMIO_GPFS
     if (gpfsmpio_pthreadio == 1) {
       /* split buffer in half for a kind of double buffering with the threads*/
@@ -342,6 +359,7 @@ printf("end_offsets[%d] is %ld st_offsets[%d] is %ld\n",j,end_offsets[j],j,st_of
         myAggRank = j;
         if (fd_end[j] > fd_start[j]) {
           iAmUsedAgg = 1;
+          iWasUsedStripingAgg = 1;
         }
       }
     }
@@ -406,6 +424,16 @@ printf("end_offsets[%d] is %ld st_offsets[%d] is %ld\n",j,end_offsets[j],j,st_of
     int currentFlatBufIndice=0;
     ADIO_Offset currentIndiceOffset = 0;
 
+    /* Remember where we left off in the source buffer when packing stripes. */
+    if ((stripe_parms.stripeSize > 0) && !stripe_parms.firstStripedWriteCall) {
+      currentDataTypeExtent = lastDataTypeExtent;
+      currentFlatBufIndice = lastFlatBufIndice;
+      currentIndiceOffset = lastIndiceOffset;
+#ifdef onesidedtrace
+      printf("using lastDataTypeExtent %d lastFlatBufIndice %d lastIndiceOffset %ld\n",lastDataTypeExtent,lastFlatBufIndice,lastIndiceOffset);
+#endif
+    }
+
     /* This denotes the coll_bufsize boundaries within the source buffer for writing for the same round.
      */
     ADIO_Offset intraRoundCollBufsizeOffset = 0;
@@ -592,8 +620,7 @@ printf("end_offsets[%d] is %ld st_offsets[%d] is %ld\n",j,end_offsets[j],j,st_of
         ADIO_Offset amountToAdvanceSBOffsetForFD = 0;
         int additionalFDCounter = 0;
 
-        while (blockEnd >= fd_end[currentAggRankListIndex]) {
-          ADIO_Offset thisAggBlockEnd = fd_end[currentAggRankListIndex];
+        while (blockEnd > fd_end[currentAggRankListIndex]) {          ADIO_Offset thisAggBlockEnd = fd_end[currentAggRankListIndex];
           if (thisAggBlockEnd >= intraRoundCollBufsizeOffset) {
             while (thisAggBlockEnd >= intraRoundCollBufsizeOffset) {
               targetAggsForMyDataCurrentRoundIter[numTargetAggs]++;
@@ -706,10 +733,11 @@ printf("end_offsets[%d] is %ld st_offsets[%d] is %ld\n",j,end_offsets[j],j,st_of
 	    additionalFDCounter++;
 
 #ifdef onesidedtrace
-            printf("block extended beyond fd init settings numTargetAggs %d offset_list[%d] with value %ld past fd border %ld with len %ld\n",numTargetAggs,i,offset_list[blockIter],fd_start[currentAggRankListIndex],len_list[blockIter]);
+            printf("block extended beyond fd init settings numTargetAggs %d offset_list[%d] with value %ld past fd border %ld with len %ld\n",numTargetAggs,blockIter,offset_list[blockIter],fd_start[currentAggRankListIndex],len_list[blockIter]);
 #endif
             intraRoundCollBufsizeOffset = fd_start[currentAggRankListIndex] + coll_bufsize;
             targetAggsForMyDataLastOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+
           } // if (blockEnd >= fd_start[currentAggRankListIndex])
         } // while (blockEnd >= fd_end[currentAggRankListIndex])
       } // if (blockEnd > fd_end[currentAggRankListIndex])
@@ -780,6 +808,7 @@ printf("end_offsets[%d] is %ld st_offsets[%d] is %ld\n",j,end_offsets[j],j,st_of
 
     if (iAmUsedAgg) {
       currentRoundFDStart = fd_start[myAggRank];
+      currentRoundFDEnd = fd_end[myAggRank];
       if (myAggRank == smallestFileDomainAggRank) {
         if (currentRoundFDStart < firstFileOffset)
           currentRoundFDStart = firstFileOffset;
@@ -791,7 +820,49 @@ printf("end_offsets[%d] is %ld st_offsets[%d] is %ld\n",j,end_offsets[j],j,st_of
 #ifdef onesidedtrace
 printf("iAmUsedAgg - currentRoundFDStart initialized to %ld currentRoundFDEnd to %ld\n",currentRoundFDStart,currentRoundFDEnd);
 #endif
-      if (romio_onesided_always_rmw) { // read in the first buffer
+
+      if ((stripe_parms.stripeSize > 0) && (stripe_parms.segmentIter == 0)) {
+        numStripesUsed = 0;
+        stripeWriteOffsets = (ADIO_Offset *) ADIOI_Malloc(stripe_parms.stripesPerAgg*sizeof(ADIO_Offset));
+        stripeWriteLens = (ADIO_Offset *) ADIOI_Malloc(stripe_parms.stripesPerAgg*sizeof(ADIO_Offset));
+        amountOfStripedDataExpected = 0;
+        int stripeIter = 0;
+        for (stripeIter=0;stripeIter<stripe_parms.stripesPerAgg;stripeIter++) {
+        if (stripeIter == 0) {
+          stripeWriteOffsets[stripeIter] = currentRoundFDStart;
+          stripeWriteLens[stripeIter] = (int)(currentRoundFDEnd - currentRoundFDStart)+1;
+          amountOfStripedDataExpected += (int)(currentRoundFDEnd - currentRoundFDStart)+1;
+          numStripesUsed++;
+        }
+          else {
+            if (((currentRoundFDEnd + (ADIO_Offset)1 + ((ADIO_Offset)stripeIter * stripe_parms.segmentLen))) > stripe_parms.stripedLastFileOffset) {
+		      if (((currentRoundFDEnd + (ADIO_Offset)1 - (ADIO_Offset)(stripe_parms.stripeSize) + ((ADIO_Offset)stripeIter * stripe_parms.segmentLen))) <= stripe_parms.stripedLastFileOffset) {
+			    stripeWriteOffsets[stripeIter] = (currentRoundFDEnd + (ADIO_Offset)1) - (ADIO_Offset)(stripe_parms.stripeSize) + ((ADIO_Offset)stripeIter * stripe_parms.segmentLen);
+			    stripeWriteLens[stripeIter] = (int)(stripe_parms.stripedLastFileOffset - (currentRoundFDEnd + (ADIO_Offset)1 - (ADIO_Offset)(stripe_parms.stripeSize) + ((ADIO_Offset)stripeIter * stripe_parms.segmentLen)) + (ADIO_Offset)1);
+                amountOfStripedDataExpected += (int)(stripe_parms.stripedLastFileOffset - (currentRoundFDEnd + (ADIO_Offset)1 - (ADIO_Offset)(stripe_parms.stripeSize) + ((ADIO_Offset)stripeIter * stripe_parms.segmentLen)) + (ADIO_Offset)1);
+                numStripesUsed++;
+			  }
+			}
+            else {
+			  stripeWriteOffsets[stripeIter] = (currentRoundFDEnd + (ADIO_Offset)1) - (ADIO_Offset)(stripe_parms.stripeSize) + ((ADIO_Offset)stripeIter * stripe_parms.segmentLen);
+              stripeWriteLens[stripeIter] = stripe_parms.stripeSize;
+              amountOfStripedDataExpected += stripe_parms.stripeSize;
+              numStripesUsed++;
+		    }
+		  }
+        } // for-loop
+
+#ifdef onesidedtrace
+        printf("amountOfStripedDataExpected is %d numStripesUsed is %d offsets and lengths are ",amountOfStripedDataExpected,numStripesUsed);
+        for (i=0;i<numStripesUsed;i++) {
+          printf("%ld %ld --",stripeWriteOffsets[i],stripeWriteLens[i]);
+        }
+        printf("\n");
+#endif
+      } // if ((stripe_parms.stripeSize>0) && (stripe_parms.segmentIter==0))
+
+
+      if (romio_onesided_always_rmw && ((stripe_parms.stripeSize==0) || (stripe_parms.segmentIter==0))) { // read in the first buffer
         ADIO_Offset tmpCurrentRoundFDEnd = 0;
         if ((fd_end[myAggRank] - currentRoundFDStart) < coll_bufsize) {
           if (myAggRank == greatestFileDomainAggRank) {
@@ -808,12 +879,21 @@ printf("iAmUsedAgg - currentRoundFDStart initialized to %ld currentRoundFDEnd to
 #ifdef onesidedtrace
 printf("romio_onesided_always_rmw - first buffer pre-read for file offsets %ld to %ld total is %d\n",currentRoundFDStart,tmpCurrentRoundFDEnd,(int)(tmpCurrentRoundFDEnd - currentRoundFDStart)+1);
 #endif
-        ADIO_ReadContig(fd, write_buf, (int)(tmpCurrentRoundFDEnd - currentRoundFDStart)+1,
-          MPI_BYTE, ADIO_EXPLICIT_OFFSET,currentRoundFDStart, &status, error_code);
 
+        if (stripe_parms.stripeSize==0) {
+          ADIO_ReadContig(fd, write_buf, (int)(tmpCurrentRoundFDEnd - currentRoundFDStart)+1,
+            MPI_BYTE, ADIO_EXPLICIT_OFFSET,currentRoundFDStart, &status, error_code);
+		}
+		else {
+          // pre-read the entire batch of stripes we will do before writing
+          int stripeIter = 0;
+          for (stripeIter=0;stripeIter<numStripesUsed;stripeIter++)
+            ADIO_ReadContig(fd, (char*)write_buf + ((ADIO_Offset)stripeIter * (ADIO_Offset)stripe_parms.stripeSize), stripeWriteLens[stripeIter],
+              MPI_BYTE, ADIO_EXPLICIT_OFFSET,stripeWriteOffsets[stripeIter], &status, error_code);
+        }
       }
-    }
-    if (romio_onesided_always_rmw) // wait until the first buffer is read
+    } // if iAmUsedAgg
+    if (romio_onesided_always_rmw && ((stripe_parms.stripeSize == 0) || (stripe_parms.segmentIter == 0))) // wait until the first buffer is read
       MPI_Barrier(fd->comm);
 
 #ifdef ROMIO_GPFS
@@ -822,6 +902,7 @@ printf("romio_onesided_always_rmw - first buffer pre-read for file offsets %ld t
     startTimeBase = MPI_Wtime();
 #endif
 
+
     /* This is the second main loop of the algorithm, actually nested loop of target aggs within rounds.  There are 2 flavors of this.
      * For romio_write_aggmethod of 1 each nested iteration for the target
      * agg does an mpi_put on a contiguous chunk using a primative datatype
@@ -938,7 +1019,7 @@ printf("romio_onesided_always_rmw - first buffer pre-read for file offsets %ld t
 
           /* Determine the offset into the target window.
            */
-          MPI_Aint targetDisplacementToUseThisRound = (MPI_Aint) (offsetStart - currentRoundFDStartForMyTargetAgg);
+          MPI_Aint targetDisplacementToUseThisRound = (MPI_Aint) (offsetStart - currentRoundFDStartForMyTargetAgg)  + ((MPI_Aint)(stripe_parms.segmentIter)*(MPI_Aint)(stripe_parms.stripeSize));
 
           /* If using the thread writer select the appropriate side of the split window.
            */
@@ -961,6 +1042,7 @@ printf("romio_onesided_always_rmw - first buffer pre-read for file offsets %ld t
               putSourceData = (char *) ADIOI_Malloc(bufferAmountToSend*sizeof(char));
               nonContigSourceDataBufferAdvance(((char*)buf), flatBuf, bufferAmountToSend, 1, &currentFDSourceBufferState[aggIter], putSourceData);
               MPI_Put(putSourceData,bufferAmountToSend, MPI_BYTE,targetAggsForMyData[aggIter],targetDisplacementToUseThisRound, bufferAmountToSend,MPI_BYTE,write_buf_window);
+
             }
             MPI_Win_unlock(targetAggsForMyData[aggIter], write_buf_window);
             if (!bufTypeIsContig)
@@ -992,7 +1074,7 @@ printf("romio_onesided_always_rmw - first buffer pre-read for file offsets %ld t
             }
           }
 #ifdef onesidedtrace
-        printf("roundIter %d bufferAmountToSend is %d offsetStart is %ld currentRoundFDStartForMyTargetAgg is %ld targetDisplacementToUseThisRound is %ld targetAggsForMyDataFDStart[aggIter] is %ld\n",roundIter, bufferAmountToSend, offsetStart,currentRoundFDStartForMyTargetAgg,targetDisplacementToUseThisRound,targetAggsForMyDataFDStart[aggIter]);
+        printf("roundIter %d bufferAmountToSend is %d offsetStart is %ld currentRoundFDStartForMyTargetAgg is %ld currentRoundFDEndForMyTargetAgg is %ld targetDisplacementToUseThisRound is %ld targetAggsForMyDataFDStart[aggIter] is %ld\n",roundIter, bufferAmountToSend, offsetStart,currentRoundFDStartForMyTargetAgg,currentRoundFDEndForMyTargetAgg,targetDisplacementToUseThisRound,targetAggsForMyDataFDStart[aggIter]);
 #endif
 
         } // bufferAmountToSend > 0
@@ -1045,16 +1127,33 @@ printf("romio_onesided_always_rmw - first buffer pre-read for file offsets %ld t
       }
       } // baseoffset != -1
     } // target aggs
-	} /// contig_access_count > 0
+
+    if (stripe_parms.stripeSize > 0) {
+      lastDataTypeExtent = currentFDSourceBufferState[numTargetAggs-1].dataTypeExtent;
+      lastFlatBufIndice = currentFDSourceBufferState[numTargetAggs-1].flatBufIndice;
+      lastIndiceOffset = currentFDSourceBufferState[numTargetAggs-1].indiceOffset;
 
 #ifdef onesidedtrace
-printf("first barrier roundIter %d\n",roundIter);
+printf("setting lastDataTypeExtent %d lastFlatBufIndice %d lastIndiceOffset %ld\n",lastDataTypeExtent,lastFlatBufIndice,lastIndiceOffset);
 #endif
-    MPI_Barrier(fd->comm);
 
-    if (iAmUsedAgg) {
+    }
+
+    } /// contig_access_count > 0
+
+    /* Synchronize all procs before the file write */
+    if ((stripe_parms.stripeSize == 0) || (stripe_parms.flushCB)) {
+#ifdef onesidedtrace
+      printf("first barrier roundIter %d\n",roundIter);
+#endif
+      MPI_Barrier(fd->comm);
+    }
+
+    if ((iAmUsedAgg || iWasUsedStripingAgg)  && ((stripe_parms.stripeSize == 0) || (stripe_parms.flushCB))) {
+      iWasUsedStripingAgg = 0;
     /* Determine what offsets define the portion of the file domain the agg is writing this round.
      */
+        if (iAmUsedAgg) {
         if ((fd_end[myAggRank] - currentRoundFDStart) < coll_bufsize) {
           if (myAggRank == greatestFileDomainAggRank) {
             if (fd_end[myAggRank] > lastFileOffset)
@@ -1069,23 +1168,59 @@ printf("first barrier roundIter %d\n",roundIter);
         currentRoundFDEnd = currentRoundFDStart + coll_bufsize - (ADIO_Offset)1;
 
 #ifdef onesidedtrace
-        printf("used agg about to writecontig - currentRoundFDStart is %ld currentRoundFDEnd is %ld within file domeain %ld to %ld\n",currentRoundFDStart,currentRoundFDEnd,fd_start[myAggRank],fd_end[myAggRank]);
+        printf("current used agg about to writecontig - currentRoundFDStart is %ld currentRoundFDEnd is %ld within file domeain %ld to %ld\n",currentRoundFDStart,currentRoundFDEnd,fd_start[myAggRank],fd_end[myAggRank]);
+#endif
+        }
+#ifdef onesidedtrace
+        else {
+          printf("former used agg about to writecontig\n");
+        }
 #endif
-
         int doWriteContig = 1;
         if (!romio_onesided_no_rmw) {
-          if (fd->io_buf_put_amounts != ((int)(currentRoundFDEnd - currentRoundFDStart)+1)) {
-            doWriteContig = 0;
-            *hole_found = 1;
+          if (stripe_parms.stripeSize == 0) {
+            if (fd->io_buf_put_amounts != ((int)(currentRoundFDEnd - currentRoundFDStart)+1)) {
+              doWriteContig = 0;
+              *hole_found = 1;
+#ifdef onesidedtrace
+              printf("hole found --- fd->io_buf_put_amounts is %d currentRoundFDEnd is %ld currentRoundFDStart is %ld on roundIter %d\n",fd->io_buf_put_amounts,currentRoundFDEnd,currentRoundFDStart,roundIter);
+#endif
+            }
+          }
+          else { // file striping
+           if (fd->io_buf_put_amounts != amountOfStripedDataExpected) {
+             doWriteContig = 0;
+             *hole_found = 1;
+#ifdef onesidedtrace
+             printf("striping hole found --- fd->io_buf_put_amounts is %d amountOfStripedDataExpected is %d on roundIter %d\n",fd->io_buf_put_amounts,amountOfStripedDataExpected,roundIter);
+#endif
+            }
           }
           fd->io_buf_put_amounts = 0;
         }
 
         if (!useIOBuffer) {
-          if (doWriteContig)
-            ADIO_WriteContig(fd, write_buf, (int)(currentRoundFDEnd - currentRoundFDStart)+1,
-              MPI_BYTE, ADIO_EXPLICIT_OFFSET,currentRoundFDStart, &status, error_code);
-
+          if (doWriteContig) {
+            if (stripe_parms.stripeSize > 0) {
+#ifdef onesidedtrace
+              printf("about to write out %d stripes\n",numStripesUsed);
+#endif
+              int stripeIter = 0;
+              for (stripeIter=0;stripeIter<numStripesUsed;stripeIter++) {
+#ifdef onesidedtrace
+                printf("writing write_buf offset %ld len %ld file offset %ld\n",((ADIO_Offset)stripeIter * (ADIO_Offset)(stripe_parms.stripeSize)),stripeWriteLens[stripeIter],stripeWriteOffsets[stripeIter]);
+#endif
+                ADIO_WriteContig(fd, (char*)write_buf + ((ADIO_Offset)stripeIter * (ADIO_Offset)(stripe_parms.stripeSize)), stripeWriteLens[stripeIter],
+                  MPI_BYTE, ADIO_EXPLICIT_OFFSET,stripeWriteOffsets[stripeIter], &status, error_code);
+              }
+              ADIOI_Free(stripeWriteLens);
+              ADIOI_Free(stripeWriteOffsets);
+            }
+            else {
+              ADIO_WriteContig(fd, write_buf, (int)(currentRoundFDEnd - currentRoundFDStart)+1,
+                MPI_BYTE, ADIO_EXPLICIT_OFFSET,currentRoundFDStart, &status, error_code);
+	        }
+          }
         } else { /* use the thread writer */
 
         if(!pthread_equal(io_thread, pthread_self())) {
@@ -1113,7 +1248,8 @@ printf("first barrier roundIter %d\n",roundIter);
         io_thread_args.io_kind = ADIOI_WRITE;
         io_thread_args.size = (currentRoundFDEnd-currentRoundFDStart) + 1;
         io_thread_args.offset = currentRoundFDStart;
-        io_thread_args.status = &status;
+        ADIO_Status adio_status;
+        io_thread_args.status = &adio_status;
         io_thread_args.error_code = *error_code;
 
         if ( (pthread_create(&io_thread, NULL,
@@ -1124,7 +1260,7 @@ printf("first barrier roundIter %d\n",roundIter);
 
     } // iAmUsedAgg
 
-    if (!iAmUsedAgg && useIOBuffer) {
+    if (!iAmUsedAgg && useIOBuffer && ((stripe_parms.stripeSize == 0) || (stripe_parms.flushCB))) {
         if (currentWriteBuf == 0) {
             currentWriteBuf = 1;
             write_buf = write_buf1;
@@ -1135,7 +1271,7 @@ printf("first barrier roundIter %d\n",roundIter);
         }
     }
 
-    if (iAmUsedAgg) {
+    if (iAmUsedAgg && stripe_parms.stripeSize == 0) {
       currentRoundFDStart += coll_bufsize;
 
       if (romio_onesided_always_rmw && (roundIter<(numberOfRounds-1))) { // read in the buffer for the next round unless this is the last round
@@ -1153,7 +1289,7 @@ printf("first barrier roundIter %d\n",roundIter);
         else
         tmpCurrentRoundFDEnd = currentRoundFDStart + coll_bufsize - (ADIO_Offset)1;
 #ifdef onesidedtrace
-printf("romio_onesided_always_rmw - round %d buffer pre-read for file offsets %ld to %ld total is %d\n",roundIter, currentRoundFDStart,tmpCurrentRoundFDEnd,(int)(tmpCurrentRoundFDEnd - currentRoundFDStart)+1);
+        printf("romio_onesided_always_rmw - round %d buffer pre-read for file offsets %ld to %ld total is %d\n",roundIter, currentRoundFDStart,tmpCurrentRoundFDEnd,(int)(tmpCurrentRoundFDEnd - currentRoundFDStart)+1);
 #endif
         ADIO_ReadContig(fd, write_buf, (int)(tmpCurrentRoundFDEnd - currentRoundFDStart)+1,
           MPI_BYTE, ADIO_EXPLICIT_OFFSET,currentRoundFDStart, &status, error_code);
@@ -1163,7 +1299,7 @@ printf("romio_onesided_always_rmw - round %d buffer pre-read for file offsets %l
 
     if (roundIter<(numberOfRounds-1)) {
 #ifdef onesidedtrace
-printf("second barrier roundIter %d\n",roundIter);
+printf("second barrier roundIter %d --- waiting in loop this time\n",roundIter);
 #endif
       MPI_Barrier(fd->comm);
     }
diff --git a/src/mpi/romio/adio/include/adioi.h b/src/mpi/romio/adio/include/adioi.h
index e6a80b6..bea225d 100644
--- a/src/mpi/romio/adio/include/adioi.h
+++ b/src/mpi/romio/adio/include/adioi.h
@@ -692,6 +692,28 @@ void ADIOI_P2PContigReadAggregation(ADIO_File fd,
 				     ADIO_Offset *fd_start,
 				     ADIO_Offset *fd_end);
 
+/* This data structure holds parameters releated to file   */
+/* striping needed by the one-sided aggregation algorithm. */
+/* A stripeSize of 0 indicates there is no striping.       */
+typedef struct ADIOI_OneSidedStripeParms {    
+    int stripeSize; /* size in bytes of the striping unit - a size of 0 indicates to the */
+                    /* onesided algorithm that we are a non-striping file system         */
+    ADIO_Offset segmentLen; /* size in bytes of the segment (stripeSize*number of aggs) */
+                            /* up to the size of the file)                              */
+    int stripesPerAgg; /* the number of stripes to be packed into an agg cb for this segment */
+    int segmentIter; /* segment number for the group of stripes currently being packed into  */
+                     /* the agg cb - resets to 0 for each cb flush to the file system        */
+    int flushCB; /* once we have fully packed the cb on an agg this flags */
+                 /* tells us to now write to the file                     */
+    ADIO_Offset stripedLastFileOffset; /* since we are now just calling the onesided algorithm */
+                                       /* with the offset range of segment, we still need to   */
+                                       /* know the actual last offset of the file.             */
+    int firstStripedWriteCall; /* whether this is the first call in the first segement of the  */
+                               /* onesided algorithm.                                          */
+    int lastStripedWriteCall; /* whether this is the last call in the last segement of the  */
+                              /* onesided algorithm.                                        */
+} ADIOI_OneSidedStripeParms;
+
 int ADIOI_OneSidedCleanup(ADIO_File fd);
 void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
         ADIO_Offset *offset_list,
@@ -700,12 +722,13 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd,
         const void *buf,
         MPI_Datatype datatype,
         int *error_code,
-        ADIO_Offset *st_offsets,
-        ADIO_Offset *end_offsets,
+        ADIO_Offset firstFileOffset,
+        ADIO_Offset lastFileOffset,
         int numNonZeroDataOffsets,
         ADIO_Offset *fd_start,
         ADIO_Offset* fd_end,
-        int *hole_found);
+        int *hole_found,
+        ADIOI_OneSidedStripeParms stripe_parms);
 void ADIOI_OneSidedReadAggregation(ADIO_File fd,
         ADIO_Offset *offset_list,
         ADIO_Offset *len_list,

http://git.mpich.org/mpich.git/commitdiff/e474997fd5e859fe363d3b2ce116ca63868ad3e2

commit e474997fd5e859fe363d3b2ce116ca63868ad3e2
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Thu Jul 14 13:24:18 2016 -0500

    ROMIO Onesided write aggregation utilization for Lustre
    
    The collective write aggregation call for Lustre
    (ADIOI_LUSTRE_WriteStridedColl) has been enhanced to utilize
    ADIOI_OneSidedWriteAggregation if the one-sided environment
    variables are set.  The general algorithm is to divide the file up into
    segements, a segment being defined as a contiguous region of the file
    which has up to one occurrence of each stripe - the data for each
    stripe being written out by a particular aggregator.  Iteratively call
    ADIOI_OneSidedWriteAggregation for each segment to aggregate
    the data to the collective buffers, but only do the actual write
    once the appropriate number of stripes have been packed (based
    on the size of the collective buffer) or the aggregation for all the
    data is complete, minimizing synchronization.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/ad_lustre/ad_lustre.h b/src/mpi/romio/adio/ad_lustre/ad_lustre.h
index 5e8f017..27f13e7 100644
--- a/src/mpi/romio/adio/ad_lustre/ad_lustre.h
+++ b/src/mpi/romio/adio/ad_lustre/ad_lustre.h
@@ -29,6 +29,7 @@
 #include <sys/ioctl.h>
 #include <lustre/lustre_user.h>
 #include "adio.h"
+#include "ad_tuning.h"
 /*#include "adioi.h"*/
 
 #ifdef HAVE_SIGNAL_H
diff --git a/src/mpi/romio/adio/ad_lustre/ad_lustre_wrcoll.c b/src/mpi/romio/adio/ad_lustre/ad_lustre_wrcoll.c
index e87f927..0e47e5e 100644
--- a/src/mpi/romio/adio/ad_lustre/ad_lustre_wrcoll.c
+++ b/src/mpi/romio/adio/ad_lustre/ad_lustre_wrcoll.c
@@ -42,7 +42,7 @@ static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, const void *buf,
 					 ADIO_Offset *len_list, int *send_size,
 					 int *recv_size, ADIO_Offset off,
 					 int size, int *count,
-					 int *start_pos, 
+					 int *start_pos,
 					 int *sent_to_proc, int nprocs,
 					 int myrank, int buftype_is_contig,
 					 int contig_access_count,
@@ -59,6 +59,14 @@ void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
                       ADIO_Offset *srt_off, int *srt_len, int *start_pos,
                       int nprocs, int nprocs_recv, int total_elements);
 
+static void ADIOI_LUSTRE_IterateOneSided(ADIO_File fd, const void *buf, int *striping_info,
+                                         ADIO_Offset *offset_list, ADIO_Offset *len_list,
+                                         int contig_access_count, int currentValidDataIndex,
+                                         int count, int file_ptr_type, ADIO_Offset offset,
+                                         ADIO_Offset start_offset, ADIO_Offset end_offset,
+                                         ADIO_Offset firstFileOffset, ADIO_Offset lastFileOffset,
+                                         MPI_Datatype datatype, int myrank, int *error_code);
+
 void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 				   MPI_Datatype datatype,
 				   int file_ptr_type, ADIO_Offset offset,
@@ -88,6 +96,7 @@ void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
     int *striping_info = NULL;
     ADIO_Offset **buf_idx = NULL;
     int old_error, tmp_error;
+    ADIO_Offset *count_sizes;
 
     MPI_Comm_size(fd->comm, &nprocs);
     MPI_Comm_rank(fd->comm, &myrank);
@@ -113,10 +122,25 @@ void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
          */
 	st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
 	end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
+    ADIO_Offset my_count_size=0;
+    /* One-sided aggregation needs the amount of data per rank as well because
+     * the difference in starting and ending offsets for 1 byte is 0 the same
+     * as 0 bytes so it cannot be distiguished.
+     */
+    if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+      count_sizes = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
+      MPI_Count buftype_size;
+      MPI_Type_size_x(datatype, &buftype_size);
+      my_count_size = (ADIO_Offset) count  * (ADIO_Offset)buftype_size;
+    }
 	MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
 		      ADIO_OFFSET, fd->comm);
 	MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
 		      ADIO_OFFSET, fd->comm);
+    if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+	  MPI_Allgather(&my_count_size, 1, ADIO_OFFSET, count_sizes, 1,
+                ADIO_OFFSET, fd->comm);
+    }
 	/* are the accesses of different processes interleaved? */
 	for (i = 1; i < nprocs; i++)
 	    if ((st_offsets[i] < end_offsets[i-1]) &&
@@ -148,6 +172,8 @@ void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 	    ADIOI_Free(len_list);
             ADIOI_Free(st_offsets);
             ADIOI_Free(end_offsets);
+        if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2))
+          ADIOI_Free(count_sizes);
 	}
 
 	fd->fp_ind = orig_fp;
@@ -168,8 +194,47 @@ void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 	return;
     }
 
+    ADIO_Offset lastFileOffset = 0, firstFileOffset = -1;
+    int currentValidDataIndex = 0;
+    if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+      /* Take out the 0-data offsets by shifting the indexes with data to the front
+       * and keeping track of the valid data index for use as the length.
+       */
+      for (i=0; i<nprocs; i++) {
+        if (count_sizes[i] > 0) {
+          st_offsets[currentValidDataIndex] = st_offsets[i];
+          end_offsets[currentValidDataIndex] = end_offsets[i];
+
+          lastFileOffset = MPL_MAX(lastFileOffset,end_offsets[currentValidDataIndex]);
+          if (firstFileOffset == -1)
+            firstFileOffset = st_offsets[currentValidDataIndex];
+          else
+            firstFileOffset = MPL_MIN(firstFileOffset,st_offsets[currentValidDataIndex]);
+
+          currentValidDataIndex++;
+        }
+      }
+    }
+
     /* Get Lustre hints information */
     ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1);
+    /* If the user has specified to use a one-sided aggregation method then do that at
+     * this point instead of the two-phase I/O.
+     */
+    if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+
+      ADIOI_LUSTRE_IterateOneSided(fd, buf, striping_info, offset_list, len_list, contig_access_count, currentValidDataIndex, count, file_ptr_type, offset, start_offset, end_offset, firstFileOffset, lastFileOffset, datatype, myrank, error_code);
+
+      ADIOI_Free(offset_list);
+      ADIOI_Free(len_list);
+      ADIOI_Free(st_offsets);
+      ADIOI_Free(end_offsets);
+      ADIOI_Free(count_sizes);
+      ADIOI_Free(striping_info);
+
+      goto fn_exit;
+
+    } // onesided aggregation
 
     /* calculate what portions of the access requests of this process are
      * located in which process
@@ -262,6 +327,7 @@ void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
     ADIOI_Free(end_offsets);
     ADIOI_Free(striping_info);
 
+fn_exit:
 #ifdef HAVE_STATUS_SET_BYTES
     if (status) {
 	MPI_Count bufsize, size;
@@ -982,3 +1048,337 @@ static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, const void *buf,
 	if (send_size[i])
 	    sent_to_proc[i] = curr_to_proc[i];
 }
+
+/* This function calls ADIOI_OneSidedWriteAggregation iteratively to 
+ * essentially pack stripes of data into the collective buffer and then
+ * flush the collective buffer to the file when fully packed, repeating this
+ * process until all the data is written to the file.
+ */
+static void ADIOI_LUSTRE_IterateOneSided(ADIO_File fd, const void *buf, int *striping_info,
+                                         ADIO_Offset *offset_list, ADIO_Offset *len_list,
+                                         int contig_access_count, int currentValidDataIndex,
+                                         int count, int file_ptr_type, ADIO_Offset offset,
+                                         ADIO_Offset start_offset, ADIO_Offset end_offset,
+                                         ADIO_Offset firstFileOffset, ADIO_Offset lastFileOffset,
+                                         MPI_Datatype datatype, int myrank, int *error_code)
+{
+    int i;
+    int stripesPerAgg = fd->hints->cb_buffer_size/striping_info[0];
+    if (stripesPerAgg == 0) {
+      /* The striping unit is larger than the collective buffer size
+       * therefore we must abort since the buffer has already been
+       * allocated during the open.
+       */
+      FPRINTF(stderr,"Error: The collective buffer size %d is less "
+                     "than the striping unit size %d - the ROMIO "
+                     "Lustre one-sided write aggregation algorithm "
+                     "cannot continue.\n",fd->hints->cb_buffer_size,striping_info[0]);
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    /* The maximum number of aggregators we can use is the number of
+     * stripes used in the file - each agg writes exactly 1 stripe.
+     */
+    int numStripedAggs = striping_info[2];
+
+    int orig_cb_nodes = fd->hints->cb_nodes;
+    if (fd->hints->cb_nodes > numStripedAggs)
+      fd->hints->cb_nodes = numStripedAggs;
+    else if (fd->hints->cb_nodes < numStripedAggs)
+      numStripedAggs = fd->hints->cb_nodes;
+
+    /* Declare ADIOI_OneSidedStripeParms here as some fields will not change.
+     */
+    ADIOI_OneSidedStripeParms stripeParms;
+    stripeParms.stripeSize = striping_info[0];
+    stripeParms.stripedLastFileOffset = lastFileOffset;
+
+    /* The general algorithm here is to divide the file up into segements, a segment
+     * being defined as a contiguous region of the file which has up to one occurrence
+     * of each stripe - the data for each stripe being written out by a particular
+     * aggregator.  The segmentLen is the maximum size in bytes of each segment
+     * (stripeSize*number of aggs).  Iteratively call ADIOI_OneSidedWriteAggregation
+     * for each segment to aggregate the data to the collective buffers, but only do
+     * the actual write (via flushCB stripe parm) once stripesPerAgg stripes
+     * have been packed or the aggregation for all the data is complete, minimizing
+     * synchronization.
+     */
+    stripeParms.segmentLen = ((ADIO_Offset)numStripedAggs)*((ADIO_Offset)(striping_info[0]));
+    ADIO_Offset totalFileSize = (lastFileOffset-firstFileOffset)+(ADIO_Offset)1;
+
+    /* These arrays define the file offsets for the stripes for a given segment - similar
+     * to the concept of file domains in GPFS, essentially file domeains for the segment.
+     */
+    ADIO_Offset *segment_stripe_start = (ADIO_Offset *) ADIOI_Malloc(numStripedAggs*sizeof(ADIO_Offset));
+    ADIO_Offset *segment_stripe_end = (ADIO_Offset *) ADIOI_Malloc(numStripedAggs*sizeof(ADIO_Offset));
+
+    /* Find the actual range of stripes in the file that have data in the offset
+     * ranges being written -- skip holes at the front and back of the file.
+     */
+    int currentOffsetListIndex = 0;
+    int fileSegmentIter = 0;
+    int startingStripeWithData = 0;
+    int foundStartingStripeWithData = 0;
+    while (!foundStartingStripeWithData) {
+      if ( ((startingStripeWithData+1) * (ADIO_Offset)(striping_info[0])) > firstFileOffset)
+        foundStartingStripeWithData = 1;
+      else
+        startingStripeWithData++;
+    }
+
+    ADIO_Offset currentSegementOffset = (ADIO_Offset)startingStripeWithData * (ADIO_Offset)(striping_info[0]);
+
+    int numSegments = (int) ((lastFileOffset+(ADIO_Offset)1 - currentSegementOffset)/stripeParms.segmentLen);
+    if ((lastFileOffset+(ADIO_Offset)1 - currentSegementOffset)%stripeParms.segmentLen > 0)
+      numSegments++;
+
+    /* To support read-modify-write use a while-loop to redo the aggregation if necessary
+     * to fill in the holes.
+     */
+    int doAggregation = 1;
+    int holeFound = 0;
+    
+    /* Remember romio_onesided_no_rmw setting if we have to re-do
+     * the aggregation if holes are found.
+     */
+    int prev_romio_onesided_no_rmw = romio_onesided_no_rmw;
+
+    while (doAggregation) {
+
+      int totalDataWrittenLastRound = 0;
+
+      /* This variable tracks how many segment stripes we have packed into the agg
+       * buffers so we know when to flush to the file system.
+       */
+      stripeParms.segmentIter = 0;
+
+      /* stripeParms.stripesPerAgg is the number of stripes to aggregate before doing a flush.
+       */
+      stripeParms.stripesPerAgg = stripesPerAgg;
+      if (stripeParms.stripesPerAgg > numSegments)
+        stripeParms.stripesPerAgg = numSegments;
+
+      for (fileSegmentIter=0;fileSegmentIter < numSegments;fileSegmentIter++) {
+
+        int dataWrittenThisRound = 0;
+
+        /* Define the segment range in terms of file offsets.
+         */
+        ADIO_Offset segmentFirstFileOffset = currentSegementOffset;
+        if ((currentSegementOffset+stripeParms.segmentLen-(ADIO_Offset)1) > lastFileOffset)
+          currentSegementOffset = lastFileOffset;
+        else
+          currentSegementOffset += (stripeParms.segmentLen-(ADIO_Offset)1);
+        ADIO_Offset segmentLastFileOffset = currentSegementOffset;
+        currentSegementOffset++;
+
+        ADIO_Offset segment_stripe_offset = segmentFirstFileOffset;
+        for (i=0;i<numStripedAggs;i++) {
+          if (firstFileOffset > segmentFirstFileOffset)
+            segment_stripe_start[i] = firstFileOffset;
+          else
+            segment_stripe_start[i] = segment_stripe_offset;
+          if ((segment_stripe_offset + (ADIO_Offset)(striping_info[0])) > lastFileOffset)
+            segment_stripe_end[i] = lastFileOffset;
+          else
+            segment_stripe_end[i] = segment_stripe_offset + (ADIO_Offset)(striping_info[0]) - (ADIO_Offset)1;
+          segment_stripe_offset += (ADIO_Offset)(striping_info[0]);
+        }
+
+        /* In the interest of performance for non-contiguous data with large offset lists
+         * essentially modify the given offset and length list appropriately for this segment
+         * and then pass pointers to the sections of the lists being used for this segment
+         * to ADIOI_OneSidedWriteAggregation.  Remember how we have modified the list for this
+         * segment, and then restore it appropriately after processing for this segment has
+         * concluded, so it is ready for the next segment.
+         */
+        int segmentContigAccessCount = 0;
+        int startingOffsetListIndex = -1;
+        int endingOffsetListIndex = -1;
+        ADIO_Offset startingOffsetAdvancement = 0;
+        ADIO_Offset startingLenTrim = 0;
+        ADIO_Offset endingLenTrim = 0;
+
+        while ( ((offset_list[currentOffsetListIndex] + ((ADIO_Offset)(len_list[currentOffsetListIndex]))-(ADIO_Offset)1) < segmentFirstFileOffset) && (currentOffsetListIndex < (contig_access_count-1)))
+          currentOffsetListIndex++;
+        startingOffsetListIndex = currentOffsetListIndex;
+        endingOffsetListIndex = currentOffsetListIndex;
+        int offsetInSegment = 0;
+        ADIO_Offset offsetStart = offset_list[currentOffsetListIndex];
+        ADIO_Offset offsetEnd = (offset_list[currentOffsetListIndex] + ((ADIO_Offset)(len_list[currentOffsetListIndex]))-(ADIO_Offset)1);
+
+        if (len_list[currentOffsetListIndex] == 0)
+          offsetInSegment = 0;
+        else if ((offsetStart >= segmentFirstFileOffset) && (offsetStart <= segmentLastFileOffset)) {
+          offsetInSegment = 1;
+        }
+        else if ((offsetEnd >= segmentFirstFileOffset) && (offsetEnd <= segmentLastFileOffset)) {
+          offsetInSegment = 1;
+        }
+        else if ((offsetStart <= segmentFirstFileOffset) && (offsetEnd >= segmentLastFileOffset)) {
+          offsetInSegment = 1;
+        }
+
+        if (!offsetInSegment) {
+          segmentContigAccessCount = 0;
+        }
+        else { 
+          /* We are in the segment, advance currentOffsetListIndex until we are out of segment.
+           */
+          segmentContigAccessCount = 1;
+
+          while ((offset_list[currentOffsetListIndex] <= segmentLastFileOffset) && (currentOffsetListIndex < contig_access_count)) {
+            dataWrittenThisRound += (int) len_list[currentOffsetListIndex];
+            currentOffsetListIndex++;
+          }
+
+          if (currentOffsetListIndex > startingOffsetListIndex) {
+            /* If we did advance, if we are at the end need to check if we are still in segment.
+            */
+            if (currentOffsetListIndex == contig_access_count) {
+              currentOffsetListIndex--;
+            }
+            else if (offset_list[currentOffsetListIndex] > segmentLastFileOffset) {
+              /* We advanced into the last one and it still in the segment.
+               */
+              currentOffsetListIndex--;
+            }
+            else {
+              dataWrittenThisRound += (int) len_list[currentOffsetListIndex];
+            }
+            segmentContigAccessCount += (currentOffsetListIndex-startingOffsetListIndex);
+            endingOffsetListIndex = currentOffsetListIndex;
+          }
+        }
+
+        if (segmentContigAccessCount > 0) {
+          /* Trim edges here so all data in the offset list range fits exactly in the segment.
+           */
+          if (offset_list[startingOffsetListIndex] < segmentFirstFileOffset) {
+            startingOffsetAdvancement = segmentFirstFileOffset-offset_list[startingOffsetListIndex];
+            offset_list[startingOffsetListIndex] += startingOffsetAdvancement;
+            dataWrittenThisRound -= (int) startingOffsetAdvancement;
+            startingLenTrim = startingOffsetAdvancement;
+            len_list[startingOffsetListIndex] -= startingLenTrim;
+          }
+
+          if ((offset_list[endingOffsetListIndex] + ((ADIO_Offset)(len_list[endingOffsetListIndex]))-(ADIO_Offset)1) > segmentLastFileOffset) {
+            endingLenTrim = offset_list[endingOffsetListIndex]+ ((ADIO_Offset)(len_list[endingOffsetListIndex]))-(ADIO_Offset)1 - segmentLastFileOffset;
+            len_list[endingOffsetListIndex] -= endingLenTrim;
+            dataWrittenThisRound -= (int) endingLenTrim;
+          }
+        }
+
+        int holeFoundThisRound = 0;          
+
+        /* Once we have packed the collective buffers do the actual write.
+         */
+        if ((stripeParms.segmentIter == (stripeParms.stripesPerAgg-1)) || (fileSegmentIter == (numSegments-1))) {
+          stripeParms.flushCB = 1;
+        }
+        else
+          stripeParms.flushCB = 0;
+
+        stripeParms.firstStripedWriteCall = 0;
+        stripeParms.lastStripedWriteCall = 0;
+        if (fileSegmentIter == 0) {
+          stripeParms.firstStripedWriteCall = 1;
+        }
+        else if (fileSegmentIter == (numSegments-1))
+          stripeParms.lastStripedWriteCall = 1;
+
+        /* The difference in calls to ADIOI_OneSidedWriteAggregation is based on the whether the buftype is
+         * contiguous.  The algorithm tracks the position in the source buffer when called
+         * multiple times --  in the case of contiguous data this is simple and can be externalized with
+         * a buffer offset, in the case of non-contiguous data this is complex and the state must be tracked
+         * internally, therefore no external buffer offset.  Care was taken to minimize
+         * ADIOI_OneSidedWriteAggregation changes at the expense of some added complexity to the caller.
+         */
+        int bufTypeIsContig;
+        ADIOI_Datatype_iscontig(datatype, &bufTypeIsContig);
+        if (bufTypeIsContig) {
+          ADIOI_OneSidedWriteAggregation(fd,(ADIO_Offset*)&(offset_list[startingOffsetListIndex]), (ADIO_Offset*)&(len_list[startingOffsetListIndex]), segmentContigAccessCount, buf+totalDataWrittenLastRound, datatype, error_code, segmentFirstFileOffset, segmentLastFileOffset, currentValidDataIndex, segment_stripe_start, segment_stripe_end, &holeFoundThisRound,stripeParms);
+        }
+        else {
+          ADIOI_OneSidedWriteAggregation(fd,(ADIO_Offset*)&(offset_list[startingOffsetListIndex]), (ADIO_Offset*)&(len_list[startingOffsetListIndex]), segmentContigAccessCount, buf, datatype, error_code, segmentFirstFileOffset, segmentLastFileOffset, currentValidDataIndex, segment_stripe_start, segment_stripe_end, &holeFoundThisRound,stripeParms);
+        }
+
+        if (stripeParms.flushCB) {
+          stripeParms.segmentIter = 0;
+          if (stripesPerAgg > (numSegments-fileSegmentIter-1))
+            stripeParms.stripesPerAgg = numSegments-fileSegmentIter-1;
+          else
+            stripeParms.stripesPerAgg = stripesPerAgg;
+        }
+        else
+          stripeParms.segmentIter++;
+
+        if (holeFoundThisRound)
+          holeFound = 1;
+
+        /* If we know we won't be doing a pre-read in a subsequent call to 
+         * ADIOI_OneSidedWriteAggregation which will have a barrier to keep
+         * feeder ranks from doing rma to the collective buffer before the
+         * write completes that we told it do with the stripeParms.flushCB
+         * flag then we need to do a barrier here.
+         */
+        if (!romio_onesided_always_rmw && stripeParms.flushCB) {
+          if (fileSegmentIter < (numSegments-1)) {
+            MPI_Barrier(fd->comm);
+          }
+        }
+
+        /* Restore the offset_list and len_list to values that are ready for the
+         * next iteration.
+         */
+        if (segmentContigAccessCount > 0) {
+          offset_list[endingOffsetListIndex] += len_list[endingOffsetListIndex];
+          len_list[endingOffsetListIndex] = endingLenTrim;
+        }
+        totalDataWrittenLastRound += dataWrittenThisRound;
+      } // fileSegmentIter for-loop
+
+      /* Check for holes in the data unless romio_onesided_no_rmw is set.
+       * If a hole is found redo the entire aggregation and write.
+       */
+      if (!romio_onesided_no_rmw) {
+        int anyHolesFound = 0;
+        MPI_Allreduce(&holeFound, &anyHolesFound, 1, MPI_INT, MPI_MAX, fd->comm);
+
+        if (anyHolesFound) {
+          ADIOI_Free(offset_list);
+          ADIOI_Free(len_list);
+          ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
+	                        &offset_list, &len_list, &start_offset,
+	                        &end_offset, &contig_access_count);
+
+          currentSegementOffset = (ADIO_Offset)startingStripeWithData * (ADIO_Offset)(striping_info[0]);
+          romio_onesided_always_rmw = 1;
+          romio_onesided_no_rmw = 1;
+
+          /* Holes are found in the data and the user has not set
+           * romio_onesided_no_rmw --- set romio_onesided_always_rmw to 1
+           * and redo the entire aggregation and write and if the user has
+           * romio_onesided_inform_rmw set then inform him of this condition
+           * and behavior.
+           */
+          if (romio_onesided_inform_rmw && (myrank ==0)) {
+            FPRINTF(stderr,"Information: Holes found during one-sided "
+            "write aggregation algorithm --- re-running one-sided "
+            "write aggregation with ROMIO_ONESIDED_ALWAYS_RMW set to 1.\n");
+          }
+        }
+        else
+          doAggregation = 0;
+      }
+      else
+        doAggregation = 0;
+    } // while doAggregation
+    romio_onesided_no_rmw = prev_romio_onesided_no_rmw;
+
+    ADIOI_Free(segment_stripe_start);
+    ADIOI_Free(segment_stripe_end);
+
+    fd->hints->cb_nodes = orig_cb_nodes;
+
+}

http://git.mpich.org/mpich.git/commitdiff/598aa493dcef472ac32498fa0100976df15c2b26

commit 598aa493dcef472ac32498fa0100976df15c2b26
Author: Paul Coffman <pcoffman at anl.gov>
Date:   Tue Mar 22 21:27:18 2016 -0700

    ROMIO Onesided write fix hole checking
    
    change implementation of hole checking to use MPI_Accumulate
    
    The current implementation to check for holes to determine if rmw is necessary uses an
    nproc-size array of ints and MPI_Put.  With MPI3 the atomic MPI_Accumulate can be used
    instead with a single integer to track this.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/common/onesided_aggregation.c b/src/mpi/romio/adio/common/onesided_aggregation.c
index f7aca5f..c96b8bf 100644
--- a/src/mpi/romio/adio/common/onesided_aggregation.c
+++ b/src/mpi/romio/adio/common/onesided_aggregation.c
@@ -56,8 +56,8 @@ static int ADIOI_OneSidedSetup(ADIO_File fd, int procs) {
     ret = MPI_Win_create(fd->io_buf,fd->hints->cb_buffer_size,1,
 	    MPI_INFO_NULL,fd->comm, &fd->io_buf_window);
     if (ret != MPI_SUCCESS) goto fn_exit;
-    fd->io_buf_put_amounts = (int *) ADIOI_Malloc(procs*sizeof(int));
-    ret =MPI_Win_create(fd->io_buf_put_amounts,procs*sizeof(int),sizeof(int),
+    fd->io_buf_put_amounts = 0;
+    ret =MPI_Win_create(&(fd->io_buf_put_amounts),sizeof(int),sizeof(int),
 	    MPI_INFO_NULL,fd->comm, &fd->io_buf_put_amounts_window);
 fn_exit:
     return ret;
@@ -70,8 +70,6 @@ int ADIOI_OneSidedCleanup(ADIO_File fd)
 	ret = MPI_Win_free(&fd->io_buf_window);
     if (fd->io_buf_put_amounts_window != MPI_WIN_NULL)
 	ret = MPI_Win_free(&fd->io_buf_put_amounts_window);
-    if (fd->io_buf_put_amounts != NULL)
-	ADIOI_Free(fd->io_buf_put_amounts);
 
     return ret;
 }
@@ -771,11 +769,8 @@ printf("end_offsets[%d] is %ld st_offsets[%d] is %ld\n",j,end_offsets[j],j,st_of
     char *write_buf = write_buf0;
     MPI_Win write_buf_window = fd->io_buf_window;
 
-    int *write_buf_put_amounts = fd->io_buf_put_amounts;
     if(!romio_onesided_no_rmw) {
       *hole_found = 0;
-      for (i=0;i<nprocs;i++)
-        write_buf_put_amounts[i] = 0;
     }
 
     /* Counters to track the offset range being written by the used aggs.
@@ -1045,7 +1040,7 @@ printf("romio_onesided_always_rmw - first buffer pre-read for file offsets %ld t
       }
       if (!romio_onesided_no_rmw) {
         MPI_Win_lock(MPI_LOCK_SHARED, targetAggsForMyData[aggIter], 0, fd->io_buf_put_amounts_window);
-        MPI_Put(&numBytesPutThisAggRound,1, MPI_INT,targetAggsForMyData[aggIter],myrank, 1,MPI_INT,fd->io_buf_put_amounts_window);
+        MPI_Accumulate(&numBytesPutThisAggRound,1, MPI_INT,targetAggsForMyData[aggIter],0, 1, MPI_INT, MPI_SUM, fd->io_buf_put_amounts_window);
         MPI_Win_unlock(targetAggsForMyData[aggIter], fd->io_buf_put_amounts_window);
       }
       } // baseoffset != -1
@@ -1079,15 +1074,11 @@ printf("first barrier roundIter %d\n",roundIter);
 
         int doWriteContig = 1;
         if (!romio_onesided_no_rmw) {
-          int numBytesPutIntoBuf = 0;
-          for (i=0;i<nprocs;i++) {
-            numBytesPutIntoBuf += write_buf_put_amounts[i];
-            write_buf_put_amounts[i] = 0;
-          }
-          if (numBytesPutIntoBuf != ((int)(currentRoundFDEnd - currentRoundFDStart)+1)) {
+          if (fd->io_buf_put_amounts != ((int)(currentRoundFDEnd - currentRoundFDStart)+1)) {
             doWriteContig = 0;
             *hole_found = 1;
           }
+          fd->io_buf_put_amounts = 0;
         }
 
         if (!useIOBuffer) {
diff --git a/src/mpi/romio/adio/include/adio.h b/src/mpi/romio/adio/include/adio.h
index 34e1914..d29a01f 100644
--- a/src/mpi/romio/adio/include/adio.h
+++ b/src/mpi/romio/adio/include/adio.h
@@ -230,7 +230,7 @@ typedef struct ADIOI_FileD {
     int my_cb_nodes_index; /* my index into cb_config_list. -1 if N/A */
     char *io_buf;          /* two-phase buffer allocated out of i/o path */
     MPI_Win io_buf_window; /* Window over the io_buf to support one-sided aggregation */
-    int *io_buf_put_amounts; /* array tracking the amount of data mpi_put into the io_buf
+    int io_buf_put_amounts; /* the amount of data mpi_put into the io_buf
                                 during the same round of one-sided write aggregation */
     MPI_Win io_buf_put_amounts_window; /* Window over the io_buf_put_amounts */
     /* External32 */

http://git.mpich.org/mpich.git/commitdiff/ccd815498267d6cc342d25fce7ce3107f3e18294

commit ccd815498267d6cc342d25fce7ce3107f3e18294
Author: Paul Coffman <pcoffman at anl.gov>
Date:   Tue Mar 22 20:43:19 2016 -0700

    ROMIO Onesided write fix improper buffer free
    
    Code was calling MPI_Put buffer after MPI_Win_unlock The current
    implementation errantly frees a buffer used by MPI_Put for aggmethod 1
    (the put-at-a-time approach) BEFORE the call to MPI_Win_unlock - this
    must be done AFTER.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/common/onesided_aggregation.c b/src/mpi/romio/adio/common/onesided_aggregation.c
index ec68f5b..f7aca5f 100644
--- a/src/mpi/romio/adio/common/onesided_aggregation.c
+++ b/src/mpi/romio/adio/common/onesided_aggregation.c
@@ -957,17 +957,19 @@ printf("romio_onesided_always_rmw - first buffer pre-read for file offsets %ld t
 
           if (romio_write_aggmethod == 1) {
             MPI_Win_lock(MPI_LOCK_SHARED, targetAggsForMyData[aggIter], 0, write_buf_window);
+            char *putSourceData;
             if (bufTypeIsContig) {
               MPI_Put(((char*)buf) + currentFDSourceBufferState[aggIter].sourceBufferOffset,bufferAmountToSend, MPI_BYTE,targetAggsForMyData[aggIter],targetDisplacementToUseThisRound, bufferAmountToSend,MPI_BYTE,write_buf_window);
               currentFDSourceBufferState[aggIter].sourceBufferOffset += (ADIO_Offset)bufferAmountToSend;
             }
             else {
-              char *putSourceData = (char *) ADIOI_Malloc(bufferAmountToSend*sizeof(char));
+              putSourceData = (char *) ADIOI_Malloc(bufferAmountToSend*sizeof(char));
               nonContigSourceDataBufferAdvance(((char*)buf), flatBuf, bufferAmountToSend, 1, &currentFDSourceBufferState[aggIter], putSourceData);
               MPI_Put(putSourceData,bufferAmountToSend, MPI_BYTE,targetAggsForMyData[aggIter],targetDisplacementToUseThisRound, bufferAmountToSend,MPI_BYTE,write_buf_window);
-              ADIOI_Free(putSourceData);
             }
             MPI_Win_unlock(targetAggsForMyData[aggIter], write_buf_window);
+            if (!bufTypeIsContig)
+              ADIOI_Free(putSourceData);
           }
 
           /* For romio_write_aggmethod of 2 populate the data structures for this round/agg for this offset iter

http://git.mpich.org/mpich.git/commitdiff/b95b4e4ef0ed1b6ea4b47b56859eef18487d5eb8

commit b95b4e4ef0ed1b6ea4b47b56859eef18487d5eb8
Author: Paul Coffman <pcoffman at anl.gov>
Date:   Mon Mar 21 09:08:42 2016 -0700

    Generalize ROMIO collective aggregation tuning environment variables
    
    Several ROMIO optimizations including a one-sided aggregation method
    were done only for GPFS which were enabled via GPFS-specific
    environment variables, now we need to enable them for other file
    systems such as Lustre.  The names and utilization functions must be
    generalized - ad_gpfs_get_env var has been reduced to gpfs-only env
    vars and a new function ad_get_env has been created and called
    appropriately.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs.h b/src/mpi/romio/adio/ad_gpfs/ad_gpfs.h
index d64dcae..9f1e0e0 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs.h
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs.h
@@ -67,6 +67,7 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 
 void ADIOI_GPFS_Flush(ADIO_File fd, int *error_code);
 
+#include "ad_tuning.h"
 #include "ad_gpfs_tuning.h"
 
 
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_hints.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_hints.c
index fd6cdbc..c561ec9 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_hints.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_hints.c
@@ -87,6 +87,7 @@ void ADIOI_GPFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
      */
     if (!fd->hints->initialized) {
 
+	ad_get_env_vars();
 	ad_gpfs_get_env_vars();
 	did_anything = 1;
 
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
index d504836..bcbd2a4 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
@@ -179,14 +179,14 @@ void ADIOI_GPFS_ReadStridedColl(ADIO_File fd, void *buf, int count,
     /* One-sided aggregation needs the amount of data per rank as well because the difference in
      * starting and ending offsets for 1 byte is 0 the same as 0 bytes so it cannot be distiguished.
      */
-    if ((gpfsmpio_read_aggmethod == 1) || (gpfsmpio_read_aggmethod == 2)) {
+    if ((romio_read_aggmethod == 1) || (romio_read_aggmethod == 2)) {
         count_sizes = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
         MPI_Count buftype_size;
         MPI_Type_size_x(datatype, &buftype_size);
         my_count_size = (ADIO_Offset) count  * (ADIO_Offset)buftype_size;
     }
     if (gpfsmpio_tunegather) {
-      if ((gpfsmpio_read_aggmethod == 1) || (gpfsmpio_read_aggmethod == 2)) {
+      if ((romio_read_aggmethod == 1) || (romio_read_aggmethod == 2)) {
         gpfs_offsets0 = (ADIO_Offset *) ADIOI_Malloc(3*nprocs*sizeof(ADIO_Offset));
         gpfs_offsets  = (ADIO_Offset *) ADIOI_Malloc(3*nprocs*sizeof(ADIO_Offset));
         for (ii=0; ii<nprocs; ii++)  {
@@ -228,7 +228,7 @@ void ADIOI_GPFS_ReadStridedColl(ADIO_File fd, void *buf, int count,
                       ADIO_OFFSET, fd->comm);
         MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
                       ADIO_OFFSET, fd->comm);
-        if ((gpfsmpio_read_aggmethod == 1) || (gpfsmpio_read_aggmethod == 2)) {
+        if ((romio_read_aggmethod == 1) || (romio_read_aggmethod == 2)) {
 	      MPI_Allgather(&count_sizes, 1, ADIO_OFFSET, count_sizes, 1,
                         ADIO_OFFSET, fd->comm);
         }
@@ -295,7 +295,7 @@ void ADIOI_GPFS_ReadStridedColl(ADIO_File fd, void *buf, int count,
      *
      */
     int currentNonZeroDataIndex = 0;
-    if ((gpfsmpio_read_aggmethod == 1) || (gpfsmpio_read_aggmethod == 2)) {
+    if ((romio_read_aggmethod == 1) || (romio_read_aggmethod == 2)) {
       /* Take out the 0-data offsets by shifting the indexes with data to the
        * front and keeping track of the non-zero data index for use as the
        * length.  By doing this we will optimally use all available aggs
@@ -312,7 +312,7 @@ void ADIOI_GPFS_ReadStridedColl(ADIO_File fd, void *buf, int count,
       }
     }
     if (gpfsmpio_tuneblocking) {
-    if ((gpfsmpio_read_aggmethod == 1) || (gpfsmpio_read_aggmethod == 2)) {
+    if ((romio_read_aggmethod == 1) || (romio_read_aggmethod == 2)) {
     ADIOI_GPFS_Calc_file_domains(fd, st_offsets, end_offsets, currentNonZeroDataIndex,
 			    nprocs_for_coll, &min_st_offset,
 			    &fd_start, &fd_end, &fd_size, fd->fs_ptr);
@@ -324,7 +324,7 @@ void ADIOI_GPFS_ReadStridedColl(ADIO_File fd, void *buf, int count,
     }
     }
     else {
-    if ((gpfsmpio_read_aggmethod == 1) || (gpfsmpio_read_aggmethod == 2)) {
+    if ((romio_read_aggmethod == 1) || (romio_read_aggmethod == 2)) {
     ADIOI_Calc_file_domains(st_offsets, end_offsets, currentNonZeroDataIndex,
 			    nprocs_for_coll, &min_st_offset,
 			    &fd_start, &fd_end,
@@ -341,7 +341,7 @@ void ADIOI_GPFS_ReadStridedColl(ADIO_File fd, void *buf, int count,
     }
 
     GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_MYREQ, GPFSMPIO_CIO_T_FD_PART );
-    if ((gpfsmpio_read_aggmethod == 1) || (gpfsmpio_read_aggmethod == 2)) {
+    if ((romio_read_aggmethod == 1) || (romio_read_aggmethod == 2)) {
     /* If the user has specified to use a one-sided aggregation method then do that at
      * this point instead of the two-phase I/O.
      */
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
index 333612b..8b4d06d 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
@@ -37,14 +37,9 @@ long    bglocklessmpio_f_type;
 int     gpfsmpio_bg_nagg_pset;
 int     gpfsmpio_pthreadio;
 int     gpfsmpio_p2pcontig;
-int     gpfsmpio_write_aggmethod;
-int     gpfsmpio_read_aggmethod;
 int	gpfsmpio_balancecontig;
 int     gpfsmpio_devnullio;
 int     gpfsmpio_bridgeringagg;
-int     gpfsmpio_onesided_no_rmw;
-int     gpfsmpio_onesided_always_rmw;
-int     gpfsmpio_onesided_inform_rmw;
 
 double	gpfsmpio_prof_cw    [GPFSMPIO_CIO_LAST+1];
 double	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST+1];
@@ -110,40 +105,6 @@ double	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST+1];
  * 3.) There are no gaps between the offsets.
  * 4.) No single rank has a data size which spans multiple file domains.
  *
- * - GPFSMPIO_WRITE_AGGMETHOD/GPFSMPIO_READ_AGGMETHOD -  Replaces the two-phase
- *   collective IO aggregation
- *   with a one-sided algorithm, significantly reducing communication and
- *   memory overhead.  Fully
- *   supports all datasets and datatypes, the only caveat is that any holes in the data
- *   when writing to a pre-existing file are ignored -- there is no read-modify-write
- *   support to maintain the correctness of regions of pre-existing data so every byte
- *   must be explicitly written to maintain correctness.  Users must beware of middle-ware
- *   libraries like PNETCDF which may count on read-modify-write functionality for certain
- *   features (like fill values).  Possible values:
- *   - 0 - Normal two-phase collective IO is used.
- *   - 1 - A separate one-sided MPI_Put or MPI_Get is used for each contigous chunk of data
- *         for a compute to write to or read from the collective buffer on the aggregator.
- *   - 2 - An MPI derived datatype is created using all the contigous chunks and just one
- *         call to MPI_Put or MPI_Get is done with the derived datatype.  On Blue Gene /Q
- *         optimal performance for this is achieved when paired with PAMID_TYPED_ONESIDED=1.
- *   - Default is 0
- *
- * - GPFSMPIO_ONESIDED_NO_RMW - For one-sided aggregation (GPFSMPIO_WRITE_AGGMETHOD = 1 or 2)
- *   disable the detection of holes in the data when writing to a pre-existing
- *   file requiring a read-modify-write, thereby avoiding the communication
- *   overhead for this detection.
- *   - 0 (hole detection enabled) or 1 (hole detection disabled)
- *   - Default is 0
- *
- * - GPFSMPIO_ONESIDED_INFORM_RMW - For one-sided aggregation
- *   (GPFSMPIO_AGGMETHOD = 1 or 2) generate an informational message informing
- *   the user whether holes exist in the data when writing to a pre-existing
- *   file requiring a read-modify-write, thereby educating the user to set
- *   GPFSMPIO_ONESIDED_NO_RMW=1 on a future run to avoid the communication
- *   overhead for this detection.
- *   - 0 (disabled) or 1 (enabled)
- *   - Default is 0
- *
  * - GPFSMPIO_BALANCECONTIG -  Relevant only to BGQ.  File domain blocks are assigned
  *   to aggregators in a breadth-first fashion relative to the ions - additionally,
  *   file domains on the aggregators sharing the same bridgeset and ion have contiguous
@@ -196,22 +157,10 @@ void ad_gpfs_get_env_vars() {
     x = getenv("GPFSMPIO_NAGG_PSET");
     if (x) gpfsmpio_bg_nagg_pset = atoi(x);
 
-    gpfsmpio_pthreadio = 0;
-    x = getenv( "GPFSMPIO_PTHREADIO" );
-    if (x) gpfsmpio_pthreadio = atoi(x);
-
     gpfsmpio_p2pcontig = 0;
     x = getenv( "GPFSMPIO_P2PCONTIG" );
     if (x) gpfsmpio_p2pcontig = atoi(x);
 
-    gpfsmpio_write_aggmethod = 0;
-    x = getenv( "GPFSMPIO_WRITE_AGGMETHOD" );
-    if (x) gpfsmpio_write_aggmethod = atoi(x);
-
-    gpfsmpio_read_aggmethod = 0;
-    x = getenv( "GPFSMPIO_READ_AGGMETHOD" );
-    if (x) gpfsmpio_read_aggmethod = atoi(x);
-
     gpfsmpio_balancecontig = 0;
     x = getenv( "GPFSMPIO_BALANCECONTIG" );
     if (x) gpfsmpio_balancecontig = atoi(x);
@@ -224,19 +173,6 @@ void ad_gpfs_get_env_vars() {
     x = getenv( "GPFSMPIO_BRIDGERINGAGG" );
     if (x) gpfsmpio_bridgeringagg = atoi(x);
 
-    gpfsmpio_onesided_no_rmw = 0;
-    x = getenv( "GPFSMPIO_ONESIDED_NO_RMW" );
-    if (x) gpfsmpio_onesided_no_rmw = atoi(x);
-
-    gpfsmpio_onesided_always_rmw = 0;
-    x = getenv( "GPFSMPIO_ONESIDED_ALWAYS_RMW" );
-    if (x) gpfsmpio_onesided_always_rmw = atoi(x);
-    if (gpfsmpio_onesided_always_rmw)
-      gpfsmpio_onesided_no_rmw = 1;
-
-    gpfsmpio_onesided_inform_rmw = 0;
-    x = getenv( "GPFSMPIO_ONESIDED_INFORM_RMW" );
-    if (x) gpfsmpio_onesided_inform_rmw = atoi(x);
 }
 
 /* report timing breakdown for MPI I/O collective call */
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
index 35fdc8d..21f98f6 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
@@ -66,14 +66,9 @@ extern int 	gpfsmpio_tuneblocking;
 extern long bglocklessmpio_f_type;
 extern int      gpfsmpio_pthreadio;
 extern int      gpfsmpio_p2pcontig;
-extern int      gpfsmpio_write_aggmethod;
-extern int      gpfsmpio_read_aggmethod;
 extern int  gpfsmpio_balancecontig;
 extern int      gpfsmpio_devnullio;
 extern int      gpfsmpio_bridgeringagg;
-extern int      gpfsmpio_onesided_no_rmw;
-extern int      gpfsmpio_onesided_always_rmw;
-extern int      gpfsmpio_onesided_inform_rmw;
 
 /* Default is, well, kind of complicated. Blue Gene /L and /P had "psets": one
  * i/o node and all compute nodes wired to it.  On Blue Gene /Q that
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
index 036aad2..0e1d6b6 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
@@ -181,14 +181,14 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
      * the difference in starting and ending offsets for 1 byte is 0 the same
      * as 0 bytes so it cannot be distiguished.
      */
-    if ((gpfsmpio_write_aggmethod == 1) || (gpfsmpio_write_aggmethod == 2)) {
+    if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
         count_sizes = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
         MPI_Count buftype_size;
         MPI_Type_size_x(datatype, &buftype_size);
         my_count_size = (ADIO_Offset) count  * (ADIO_Offset)buftype_size;
     }
     if (gpfsmpio_tunegather) {
-      if ((gpfsmpio_write_aggmethod == 1) || (gpfsmpio_write_aggmethod == 2)) {
+      if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
         gpfs_offsets0 = (ADIO_Offset *) ADIOI_Malloc(3*nprocs*sizeof(ADIO_Offset));
         gpfs_offsets  = (ADIO_Offset *) ADIOI_Malloc(3*nprocs*sizeof(ADIO_Offset));
         for (ii=0; ii<nprocs; ii++)  {
@@ -230,7 +230,7 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 		      ADIO_OFFSET, fd->comm);
 	MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
 		      ADIO_OFFSET, fd->comm);
-        if ((gpfsmpio_write_aggmethod == 1) || (gpfsmpio_write_aggmethod == 2)) {
+        if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
 	    MPI_Allgather(&count_sizes, 1, ADIO_OFFSET, count_sizes, 1,
                      ADIO_OFFSET, fd->comm);
         }
@@ -287,7 +287,7 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
    process may directly access only its own file domain. */
 
     int currentValidDataIndex = 0;
-    if ((gpfsmpio_write_aggmethod == 1) || (gpfsmpio_write_aggmethod == 2)) {
+    if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
       /* Take out the 0-data offsets by shifting the indexes with data to the front
        * and keeping track of the valid data index for use as the length.
        */
@@ -301,7 +301,7 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
     }
 
     if (gpfsmpio_tuneblocking) {
-	if ((gpfsmpio_write_aggmethod == 1) || (gpfsmpio_write_aggmethod == 2)) {
+	if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
 	    ADIOI_GPFS_Calc_file_domains(fd, st_offsets, end_offsets,
 		    currentValidDataIndex,
 		    nprocs_for_coll, &min_st_offset,
@@ -315,7 +315,7 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 	}
     }
     else {
-	if ((gpfsmpio_write_aggmethod == 1) || (gpfsmpio_write_aggmethod == 2)) {
+	if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
 	    ADIOI_Calc_file_domains(st_offsets, end_offsets, currentValidDataIndex,
 		    nprocs_for_coll, &min_st_offset,
 		    &fd_start, &fd_end,
@@ -333,7 +333,7 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 
     GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_MYREQ, GPFSMPIO_CIO_T_FD_PART );
 
-    if ((gpfsmpio_write_aggmethod == 1) || (gpfsmpio_write_aggmethod == 2)) {
+    if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
     /* If the user has specified to use a one-sided aggregation method then do that at
      * this point instead of the two-phase I/O.
      */
@@ -342,7 +342,7 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 	      buf, datatype, error_code, st_offsets, end_offsets,
 	      currentValidDataIndex, fd_start, fd_end, &holeFound);
       int anyHolesFound = 0;
-      if (!gpfsmpio_onesided_no_rmw)
+      if (!romio_onesided_no_rmw)
         MPI_Allreduce(&holeFound, &anyHolesFound, 1, MPI_INT, MPI_MAX, fd->comm);
       if (anyHolesFound == 0) {
       GPFSMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
@@ -357,21 +357,21 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 	  }
       else {
 	/* Holes are found in the data and the user has not set
-	 * gpfsmpio_onesided_no_rmw --- set gpfsmpio_onesided_always_rmw to 1
+	 * romio_onesided_no_rmw --- set romio_onesided_always_rmw to 1
 	 * and re-call ADIOI_OneSidedWriteAggregation and if the user has
-	 * gpfsmpio_onesided_inform_rmw set then inform him of this condition
+	 * romio_onesided_inform_rmw set then inform him of this condition
 	 * and behavior.
          */
 
-        if (gpfsmpio_onesided_inform_rmw && (myrank ==0))
+        if (romio_onesided_inform_rmw && (myrank ==0))
           FPRINTF(stderr,"Information: Holes found during one-sided "
 		  "write aggregation algorithm --- re-running one-sided "
 		  "write aggregation with GPFSMPIO_ONESIDED_ALWAYS_RMW set to 1.\n");
-          gpfsmpio_onesided_always_rmw = 1;
-          int prev_gpfsmpio_onesided_no_rmw = gpfsmpio_onesided_no_rmw;
-          gpfsmpio_onesided_no_rmw = 1;
+          romio_onesided_always_rmw = 1;
+          int prev_romio_onesided_no_rmw = romio_onesided_no_rmw;
+          romio_onesided_no_rmw = 1;
           ADIOI_OneSidedWriteAggregation(fd, offset_list, len_list, contig_access_count, buf, datatype, error_code, st_offsets, end_offsets, currentValidDataIndex, fd_start, fd_end, &holeFound);
-          gpfsmpio_onesided_no_rmw = prev_gpfsmpio_onesided_no_rmw;
+          romio_onesided_no_rmw = prev_romio_onesided_no_rmw;
           GPFSMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
           ADIOI_Free(offset_list);
           ADIOI_Free(len_list);
diff --git a/src/mpi/romio/adio/common/Makefile.mk b/src/mpi/romio/adio/common/Makefile.mk
index 80194ef..a06f059 100644
--- a/src/mpi/romio/adio/common/Makefile.mk
+++ b/src/mpi/romio/adio/common/Makefile.mk
@@ -72,5 +72,6 @@ romio_other_sources +=                  \
     adio/common/ad_threaded_io.c        \
     adio/common/p2p_aggregation.c       \
     adio/common/onesided_aggregation.c  \
+    adio/common/ad_tuning.c             \
     adio/common/utils.c
 
diff --git a/src/mpi/romio/adio/common/ad_hints.c b/src/mpi/romio/adio/common/ad_hints.c
index 83f0533..dcad549 100644
--- a/src/mpi/romio/adio/common/ad_hints.c
+++ b/src/mpi/romio/adio/common/ad_hints.c
@@ -8,6 +8,7 @@
 #include "adio.h"
 #include "adio_extern.h"
 #include "hint_fns.h"
+#include "ad_tuning.h"
 
 void ADIOI_GEN_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
 {
@@ -31,6 +32,7 @@ void ADIOI_GEN_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
 	    *error_code = MPI_SUCCESS;
 	    return;
     }
+	ad_get_env_vars();
 
     if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info));
     info = fd->info;
diff --git a/src/mpi/romio/adio/common/ad_tuning.c b/src/mpi/romio/adio/common/ad_tuning.c
new file mode 100644
index 0000000..ee9c318
--- /dev/null
+++ b/src/mpi/romio/adio/common/ad_tuning.c
@@ -0,0 +1,100 @@
+/* ---------------------------------------------------------------- */
+/* (C)Copyright IBM Corp.  2007, 2008                               */
+/* ---------------------------------------------------------------- */
+/**
+ * \file ad_tuning.c
+ * \brief Defines common performance tuning env var options
+ */
+
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *   Copyright (C) 2008 University of Chicago.
+ *   See COPYRIGHT notice in top-level directory.
+ */
+
+/*---------------------------------------------------------------------
+ * ad_tuning.c
+ *
+ * defines common global variables and functions for performance tuning
+ * and functional debugging.
+ *---------------------------------------------------------------------*/
+
+#include "ad_tuning.h"
+
+int     romio_write_aggmethod;
+int     romio_read_aggmethod;
+int     romio_onesided_no_rmw;
+int     romio_onesided_always_rmw;
+int     romio_onesided_inform_rmw;
+
+/* set internal variables for tuning environment variables */
+/** \page mpiio_vars MPIIO Configuration
+  \section env_sec Environment Variables
+ *
+ * - ROMIO_WRITE_AGGMETHOD/ROMIO_READ_AGGMETHOD -  Replaces the two-phase
+ *   collective IO aggregation
+ *   with a one-sided algorithm, significantly reducing communication and
+ *   memory overhead.  Fully
+ *   supports all datasets and datatypes, the only caveat is that any holes in the data
+ *   when writing to a pre-existing file are ignored -- there is no read-modify-write
+ *   support to maintain the correctness of regions of pre-existing data so every byte
+ *   must be explicitly written to maintain correctness.  Users must beware of middle-ware
+ *   libraries like PNETCDF which may count on read-modify-write functionality for certain
+ *   features (like fill values).  Possible values:
+ *   - 0 - Normal two-phase collective IO is used.
+ *   - 1 - A separate one-sided MPI_Put or MPI_Get is used for each contigous chunk of data
+ *         for a compute to write to or read from the collective buffer on the aggregator.
+ *   - 2 - An MPI derived datatype is created using all the contigous chunks and just one
+ *         call to MPI_Put or MPI_Get is done with the derived datatype.  On Blue Gene /Q
+ *         optimal performance for this is achieved when paired with PAMID_TYPED_ONESIDED=1.
+ *   - Default is 0
+ *
+ * - ROMIO_ONESIDED_NO_RMW - For one-sided write aggregation (ROMIO_WRITE_AGGMETHOD = 1 or 2)
+ *   disable the detection of holes in the data when writing to a pre-existing
+ *   file requiring a read-modify-write, thereby avoiding the communication
+ *   overhead for this detection.
+ *   - 0 (hole detection enabled) or 1 (hole detection disabled)
+ *   - Default is 0
+ *
+ * - ROMIO_ONESIDED_ALWAYS_RMW - For one-sided write aggregation (ROMIO_WRITE_AGGMETHOD = 1 or 2)
+ *   always pre-read the offset range being written to a pre-existing file thereby filling
+ *   any holes that may exist in the data before being written.
+ *   - 0 (do not pre-read file offset range) or 1 (pre-read file offset range)
+ *   - Default is 0
+ *
+ * - ROMIO_ONESIDED_INFORM_RMW - For one-sided aggregation
+ *   (ROMIO_WRITE_AGGMETHOD = 1 or 2) generate an informational message informing
+ *   the user whether holes exist in the data when writing to a pre-existing
+ *   file requiring a read-modify-write, thereby educating the user to set
+ *   ROMIO_ONESIDED_NO_RMW=1 on a future run to avoid the communication
+ *   overhead for this detection.
+ *   - 0 (disabled) or 1 (enabled)
+ *   - Default is 0
+ *
+ */
+
+void ad_get_env_vars() {
+    char *x, *dummy;
+
+    romio_write_aggmethod = 0;
+    x = getenv( "ROMIO_WRITE_AGGMETHOD" );
+    if (x) romio_write_aggmethod = atoi(x);
+
+    romio_read_aggmethod = 0;
+    x = getenv( "ROMIO_READ_AGGMETHOD" );
+    if (x) romio_read_aggmethod = atoi(x);
+
+    romio_onesided_no_rmw = 0;
+    x = getenv( "ROMIO_ONESIDED_NO_RMW" );
+    if (x) romio_onesided_no_rmw = atoi(x);
+
+    romio_onesided_always_rmw = 0;
+    x = getenv( "ROMIO_ONESIDED_ALWAYS_RMW" );
+    if (x) romio_onesided_always_rmw = atoi(x);
+    if (romio_onesided_always_rmw)
+      romio_onesided_no_rmw = 1;
+
+    romio_onesided_inform_rmw = 0;
+    x = getenv( "ROMIO_ONESIDED_INFORM_RMW" );
+    if (x) romio_onesided_inform_rmw = atoi(x);
+}
diff --git a/src/mpi/romio/adio/common/onesided_aggregation.c b/src/mpi/romio/adio/common/onesided_aggregation.c
index c627570..ec68f5b 100644
--- a/src/mpi/romio/adio/common/onesided_aggregation.c
+++ b/src/mpi/romio/adio/common/onesided_aggregation.c
@@ -6,16 +6,12 @@
 
 #include "adio.h"
 #include "adio_extern.h"
+#include "ad_tuning.h"
 #ifdef ROMIO_GPFS
-/* right now this is GPFS only but TODO: extend this to all file systems */
 #include "../ad_gpfs/ad_gpfs_tuning.h"
-#else
-int gpfsmpio_onesided_no_rmw = 0;
-int gpfsmpio_write_aggmethod = 0;
-int gpfsmpio_read_aggmethod = 0;
-int gpfsmpio_onesided_always_rmw = 0;
 #endif
 
+
 #include <pthread.h>
 
 //  #define onesidedtrace 1
@@ -776,7 +772,7 @@ printf("end_offsets[%d] is %ld st_offsets[%d] is %ld\n",j,end_offsets[j],j,st_of
     MPI_Win write_buf_window = fd->io_buf_window;
 
     int *write_buf_put_amounts = fd->io_buf_put_amounts;
-    if(!gpfsmpio_onesided_no_rmw) {
+    if(!romio_onesided_no_rmw) {
       *hole_found = 0;
       for (i=0;i<nprocs;i++)
         write_buf_put_amounts[i] = 0;
@@ -800,7 +796,7 @@ printf("end_offsets[%d] is %ld st_offsets[%d] is %ld\n",j,end_offsets[j],j,st_of
 #ifdef onesidedtrace
 printf("iAmUsedAgg - currentRoundFDStart initialized to %ld currentRoundFDEnd to %ld\n",currentRoundFDStart,currentRoundFDEnd);
 #endif
-      if (gpfsmpio_onesided_always_rmw) { // read in the first buffer
+      if (romio_onesided_always_rmw) { // read in the first buffer
         ADIO_Offset tmpCurrentRoundFDEnd = 0;
         if ((fd_end[myAggRank] - currentRoundFDStart) < coll_bufsize) {
           if (myAggRank == greatestFileDomainAggRank) {
@@ -815,14 +811,14 @@ printf("iAmUsedAgg - currentRoundFDStart initialized to %ld currentRoundFDEnd to
         else
         tmpCurrentRoundFDEnd = currentRoundFDStart + coll_bufsize - (ADIO_Offset)1;
 #ifdef onesidedtrace
-printf("gpfsmpio_onesided_always_rmw - first buffer pre-read for file offsets %ld to %ld total is %d\n",currentRoundFDStart,tmpCurrentRoundFDEnd,(int)(tmpCurrentRoundFDEnd - currentRoundFDStart)+1);
+printf("romio_onesided_always_rmw - first buffer pre-read for file offsets %ld to %ld total is %d\n",currentRoundFDStart,tmpCurrentRoundFDEnd,(int)(tmpCurrentRoundFDEnd - currentRoundFDStart)+1);
 #endif
         ADIO_ReadContig(fd, write_buf, (int)(tmpCurrentRoundFDEnd - currentRoundFDStart)+1,
           MPI_BYTE, ADIO_EXPLICIT_OFFSET,currentRoundFDStart, &status, error_code);
 
       }
     }
-    if (gpfsmpio_onesided_always_rmw) // wait until the first buffer is read
+    if (romio_onesided_always_rmw) // wait until the first buffer is read
       MPI_Barrier(fd->comm);
 
 #ifdef ROMIO_GPFS
@@ -832,10 +828,10 @@ printf("gpfsmpio_onesided_always_rmw - first buffer pre-read for file offsets %l
 #endif
 
     /* This is the second main loop of the algorithm, actually nested loop of target aggs within rounds.  There are 2 flavors of this.
-     * For gpfsmpio_write_aggmethod of 1 each nested iteration for the target
+     * For romio_write_aggmethod of 1 each nested iteration for the target
      * agg does an mpi_put on a contiguous chunk using a primative datatype
      * determined using the data structures from the first main loop.  For
-     * gpfsmpio_write_aggmethod of 2 each nested iteration for the target agg
+     * romio_write_aggmethod of 2 each nested iteration for the target agg
      * builds up data to use in created a derived data type for 1 mpi_put that is done for the target agg for each round.
      * To support lustre there will need to be an additional layer of nesting
      * for the multiple file domains within target aggs.
@@ -859,7 +855,7 @@ printf("gpfsmpio_onesided_always_rmw - first buffer pre-read for file offsets %l
       int targetAggContigAccessCount = 0;
 
       /* These data structures are used for the derived datatype mpi_put
-       * in the gpfsmpio_write_aggmethod of 2 case.
+       * in the romio_write_aggmethod of 2 case.
        */
       int *targetAggBlockLengths=NULL;
       MPI_Aint *targetAggDisplacements=NULL, *sourceBufferDisplacements=NULL;
@@ -920,7 +916,7 @@ printf("gpfsmpio_onesided_always_rmw - first buffer pre-read for file offsets %l
         printf("bufferAmountToSend is %d\n",bufferAmountToSend);
 #endif
         if (bufferAmountToSend > 0) { /* we have data to send this round */
-          if (gpfsmpio_write_aggmethod == 2) {
+          if (romio_write_aggmethod == 2) {
             /* Only allocate these arrays if we are using method 2 and only do it once for this round/target agg.
              */
             if (!allocatedDerivedTypeArrays) {
@@ -955,11 +951,11 @@ printf("gpfsmpio_onesided_always_rmw - first buffer pre-read for file offsets %l
             targetDisplacementToUseThisRound += (MPI_Aint) coll_bufsize;
           }
 
-          /* For gpfsmpio_write_aggmethod of 1 do the mpi_put using the primitive MPI_BYTE type for each contiguous
+          /* For romio_write_aggmethod of 1 do the mpi_put using the primitive MPI_BYTE type for each contiguous
            * chunk in the target, of source data is non-contiguous then pack the data first.
            */
 
-          if (gpfsmpio_write_aggmethod == 1) {
+          if (romio_write_aggmethod == 1) {
             MPI_Win_lock(MPI_LOCK_SHARED, targetAggsForMyData[aggIter], 0, write_buf_window);
             if (bufTypeIsContig) {
               MPI_Put(((char*)buf) + currentFDSourceBufferState[aggIter].sourceBufferOffset,bufferAmountToSend, MPI_BYTE,targetAggsForMyData[aggIter],targetDisplacementToUseThisRound, bufferAmountToSend,MPI_BYTE,write_buf_window);
@@ -974,11 +970,11 @@ printf("gpfsmpio_onesided_always_rmw - first buffer pre-read for file offsets %l
             MPI_Win_unlock(targetAggsForMyData[aggIter], write_buf_window);
           }
 
-          /* For gpfsmpio_write_aggmethod of 2 populate the data structures for this round/agg for this offset iter
+          /* For romio_write_aggmethod of 2 populate the data structures for this round/agg for this offset iter
            * to be used subsequently when building the derived type for 1 mpi_put for all the data for this
            * round/agg.
            */
-          else if (gpfsmpio_write_aggmethod == 2) {
+          else if (romio_write_aggmethod == 2) {
 
             if (bufTypeIsContig) {
               targetAggBlockLengths[targetAggContigAccessCount]= bufferAmountToSend;
@@ -1005,9 +1001,9 @@ printf("gpfsmpio_onesided_always_rmw - first buffer pre-read for file offsets %l
         } // bufferAmountToSend > 0
       } // contig list
 
-      /* For gpfsmpio_write_aggmethod of 2 now build the derived type using the data from this round/agg and do 1 single mpi_put.
+      /* For romio_write_aggmethod of 2 now build the derived type using the data from this round/agg and do 1 single mpi_put.
        */
-      if (gpfsmpio_write_aggmethod == 2) {
+      if (romio_write_aggmethod == 2) {
 
         MPI_Datatype sourceBufferDerivedDataType, targetBufferDerivedDataType;
         MPI_Type_create_struct(targetAggContigAccessCount, targetAggBlockLengths, sourceBufferDisplacements, targetAggDataTypes, &sourceBufferDerivedDataType);
@@ -1045,7 +1041,7 @@ printf("gpfsmpio_onesided_always_rmw - first buffer pre-read for file offsets %l
         MPI_Type_free(&targetBufferDerivedDataType);
         }
       }
-      if (!gpfsmpio_onesided_no_rmw) {
+      if (!romio_onesided_no_rmw) {
         MPI_Win_lock(MPI_LOCK_SHARED, targetAggsForMyData[aggIter], 0, fd->io_buf_put_amounts_window);
         MPI_Put(&numBytesPutThisAggRound,1, MPI_INT,targetAggsForMyData[aggIter],myrank, 1,MPI_INT,fd->io_buf_put_amounts_window);
         MPI_Win_unlock(targetAggsForMyData[aggIter], fd->io_buf_put_amounts_window);
@@ -1080,7 +1076,7 @@ printf("first barrier roundIter %d\n",roundIter);
 #endif
 
         int doWriteContig = 1;
-        if (!gpfsmpio_onesided_no_rmw) {
+        if (!romio_onesided_no_rmw) {
           int numBytesPutIntoBuf = 0;
           for (i=0;i<nprocs;i++) {
             numBytesPutIntoBuf += write_buf_put_amounts[i];
@@ -1149,7 +1145,7 @@ printf("first barrier roundIter %d\n",roundIter);
     if (iAmUsedAgg) {
       currentRoundFDStart += coll_bufsize;
 
-      if (gpfsmpio_onesided_always_rmw && (roundIter<(numberOfRounds-1))) { // read in the buffer for the next round unless this is the last round
+      if (romio_onesided_always_rmw && (roundIter<(numberOfRounds-1))) { // read in the buffer for the next round unless this is the last round
         ADIO_Offset tmpCurrentRoundFDEnd = 0;
         if ((fd_end[myAggRank] - currentRoundFDStart) < coll_bufsize) {
           if (myAggRank == greatestFileDomainAggRank) {
@@ -1164,7 +1160,7 @@ printf("first barrier roundIter %d\n",roundIter);
         else
         tmpCurrentRoundFDEnd = currentRoundFDStart + coll_bufsize - (ADIO_Offset)1;
 #ifdef onesidedtrace
-printf("gpfsmpio_onesided_always_rmw - round %d buffer pre-read for file offsets %ld to %ld total is %d\n",roundIter, currentRoundFDStart,tmpCurrentRoundFDEnd,(int)(tmpCurrentRoundFDEnd - currentRoundFDStart)+1);
+printf("romio_onesided_always_rmw - round %d buffer pre-read for file offsets %ld to %ld total is %d\n",roundIter, currentRoundFDStart,tmpCurrentRoundFDEnd,(int)(tmpCurrentRoundFDEnd - currentRoundFDStart)+1);
 #endif
         ADIO_ReadContig(fd, write_buf, (int)(tmpCurrentRoundFDEnd - currentRoundFDStart)+1,
           MPI_BYTE, ADIO_EXPLICIT_OFFSET,currentRoundFDStart, &status, error_code);
@@ -1720,7 +1716,7 @@ printf("end_offsets[%d] is %ld st_offsets[%d] is %ld\n",j,end_offsets[j],j,st_of
             }
 
             additionalFDCounter++;
- 
+
 
 #ifdef onesidedtrace
             printf("block extended beyond fd init settings numSourceAggs %d offset_list[%d] with value %ld past fd border %ld with len %ld\n",numSourceAggs,i,offset_list[blockIter],fd_start[currentAggRankListIndex],len_list[blockIter]);
@@ -1820,8 +1816,8 @@ printf("iAmUsedAgg - currentRoundFDStart initialized "
 
 
     /* This is the second main loop of the algorithm, actually nested loop of target aggs within rounds.  There are 2 flavors of this.
-     * For gpfsmpio_read_aggmethod of 1 each nested iteration for the source agg does an mpi_put on a contiguous chunk using a primative datatype
-     * determined using the data structures from the first main loop.  For gpfsmpio_read_aggmethod of 2 each nested iteration for the source agg
+     * For romio_read_aggmethod of 1 each nested iteration for the source agg does an mpi_put on a contiguous chunk using a primative datatype
+     * determined using the data structures from the first main loop.  For romio_read_aggmethod of 2 each nested iteration for the source agg
      * builds up data to use in created a derived data type for 1 mpi_put that is done for the target agg for each round.
      * To support lustre there will need to be an additional layer of nesting for the multiple file domains
      * within target aggs.
@@ -1959,7 +1955,7 @@ printf("iAmUsedAgg - currentRoundFDStart initialized "
       int sourceAggContigAccessCount = 0;
 
       /* These data structures are used for the derived datatype mpi_get
-       * in the gpfsmpio_read_aggmethod of 2 case.
+       * in the romio_read_aggmethod of 2 case.
        */
       int *sourceAggBlockLengths=NULL;
       MPI_Aint *sourceAggDisplacements=NULL, *recvBufferDisplacements=NULL;
@@ -2004,7 +2000,7 @@ printf("iAmUsedAgg - currentRoundFDStart initialized "
         }
 
         if (bufferAmountToRecv > 0) { /* we have data to recv this round */
-          if (gpfsmpio_read_aggmethod == 2) {
+          if (romio_read_aggmethod == 2) {
             /* Only allocate these arrays if we are using method 2 and only do it once for this round/source agg.
              */
             if (!allocatedDerivedTypeArrays) {
@@ -2039,11 +2035,11 @@ printf("iAmUsedAgg - currentRoundFDStart initialized "
             sourceDisplacementToUseThisRound += (MPI_Aint)coll_bufsize;
           }
 
-          /* For gpfsmpio_read_aggmethod of 1 do the mpi_get using the primitive MPI_BYTE type from each
+          /* For romio_read_aggmethod of 1 do the mpi_get using the primitive MPI_BYTE type from each
            * contiguous chunk from the target, if the source is non-contiguous then unpack the data after
            * the MPI_Win_unlock is done to make sure the data has arrived first.
            */
-          if (gpfsmpio_read_aggmethod == 1) {
+          if (romio_read_aggmethod == 1) {
             MPI_Win_lock(MPI_LOCK_SHARED, sourceAggsForMyData[aggIter], 0, read_buf_window);
             char *getSourceData = NULL;
             if (bufTypeIsContig) {
@@ -2063,11 +2059,11 @@ printf("iAmUsedAgg - currentRoundFDStart initialized "
             }
           }
 
-          /* For gpfsmpio_read_aggmethod of 2 populate the data structures for this round/agg for this offset iter
+          /* For romio_read_aggmethod of 2 populate the data structures for this round/agg for this offset iter
            * to be used subsequently when building the derived type for 1 mpi_put for all the data for this
            * round/agg.
            */
-          else if (gpfsmpio_read_aggmethod == 2) {
+          else if (romio_read_aggmethod == 2) {
             if (bufTypeIsContig) {
               sourceAggBlockLengths[sourceAggContigAccessCount]= bufferAmountToRecv;
               sourceAggDataTypes[sourceAggContigAccessCount] = MPI_BYTE;
@@ -2088,9 +2084,9 @@ printf("iAmUsedAgg - currentRoundFDStart initialized "
           } // bufferAmountToRecv > 0
       } // contig list
 
-      /* For gpfsmpio_read_aggmethod of 2 now build the derived type using the data from this round/agg and do 1 single mpi_put.
+      /* For romio_read_aggmethod of 2 now build the derived type using the data from this round/agg and do 1 single mpi_put.
        */
-      if (gpfsmpio_read_aggmethod == 2) {
+      if (romio_read_aggmethod == 2) {
         MPI_Datatype recvBufferDerivedDataType, sourceBufferDerivedDataType;
 
         MPI_Type_create_struct(sourceAggContigAccessCount, sourceAggBlockLengths, recvBufferDisplacements, sourceAggDataTypes, &recvBufferDerivedDataType);
diff --git a/src/mpi/romio/adio/common/p2p_aggregation.c b/src/mpi/romio/adio/common/p2p_aggregation.c
index ac77a8f..5ea590f 100644
--- a/src/mpi/romio/adio/common/p2p_aggregation.c
+++ b/src/mpi/romio/adio/common/p2p_aggregation.c
@@ -6,6 +6,7 @@
 
 #include "adio.h"
 #include "adio_extern.h"
+#include "ad_tuning.h"
 #include "../ad_gpfs/ad_gpfs_tuning.h"
 
 #include <pthread.h>
diff --git a/src/mpi/romio/adio/include/ad_tuning.h b/src/mpi/romio/adio/include/ad_tuning.h
new file mode 100644
index 0000000..51fc926
--- /dev/null
+++ b/src/mpi/romio/adio/include/ad_tuning.h
@@ -0,0 +1,38 @@
+/* ---------------------------------------------------------------- */
+/* (C)Copyright IBM Corp.  2007, 2008                               */
+/* ---------------------------------------------------------------- */
+/**
+ * \file ad_tuning.h
+ * \brief Defines common performance tuning env var options
+ */
+
+/*---------------------------------------------------------------------
+ * ad_tuning.h
+ *
+ * declares common global variables and functions for performance tuning
+ * and functional debugging.
+ *---------------------------------------------------------------------*/
+
+#ifndef AD_TUNING_H_
+#define AD_TUNING_H_
+
+#include "adio.h"
+
+
+/*-----------------------------------------
+ *  Global variables for the control of performance tuning.
+ *-----------------------------------------*/
+
+/* corresponds to environment variables to select optimizations and timing level */
+extern int      romio_pthreadio;
+extern int      romio_p2pcontig;
+extern int      romio_write_aggmethod;
+extern int      romio_read_aggmethod;
+extern int      romio_onesided_no_rmw;
+extern int      romio_onesided_always_rmw;
+extern int      romio_onesided_inform_rmw;
+
+/* set internal variables for tuning environment variables */
+void ad_get_env_vars(void);
+
+#endif  /* AD_TUNING_H_ */

-----------------------------------------------------------------------

Summary of changes:
 src/mpi/romio/adio/ad_gpfs/ad_gpfs.h             |    1 +
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_hints.c       |    1 +
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c      |   16 +-
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c      |   74 ----
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h      |    6 -
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c      |   61 ++-
 src/mpi/romio/adio/ad_lustre/ad_lustre.h         |    1 +
 src/mpi/romio/adio/ad_lustre/ad_lustre_wrcoll.c  |  443 +++++++++++++++++++++-
 src/mpi/romio/adio/common/Makefile.mk            |    1 +
 src/mpi/romio/adio/common/ad_hints.c             |    2 +
 src/mpi/romio/adio/common/ad_tuning.c            |  111 ++++++
 src/mpi/romio/adio/common/onesided_aggregation.c |  331 +++++++++++-----
 src/mpi/romio/adio/common/p2p_aggregation.c      |    1 +
 src/mpi/romio/adio/include/ad_tuning.h           |   36 ++
 src/mpi/romio/adio/include/adio.h                |    2 +-
 src/mpi/romio/adio/include/adioi.h               |   29 ++-
 16 files changed, 899 insertions(+), 217 deletions(-)
 create mode 100644 src/mpi/romio/adio/common/ad_tuning.c
 create mode 100644 src/mpi/romio/adio/include/ad_tuning.h


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list