[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.1-109-g1d622ee

Service Account noreply at mpich.org
Wed Mar 26 16:02:27 CDT 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  1d622ee99084f0e624c83f961e6b21d2021178b7 (commit)
       via  30b4ac9c730e264a7d5d89a336372c910c78408b (commit)
       via  3e8d8114783308f31b6da14b27b67adc42d052f4 (commit)
       via  3e37d5b4930de76d45f5440f1cc6e5c19930d37d (commit)
       via  f0617bdcede575c84eaada54718e3ea72520af30 (commit)
       via  e567464f8a1cb72ff487fa3b7d44ea3e2b326f1b (commit)
       via  019b42180dbeaf016bc3c117b1a15f1c46a817d3 (commit)
       via  e96115075b6fdaca4bdd285fe7f0f71debb3c5d6 (commit)
       via  b991dc7cb200ade30e28b189534b730bf27c07b3 (commit)
       via  d1c97cb08a84bd0390f5e678f2aa9a8e040744b2 (commit)
       via  8fa2b391d852a348dfbf0bd35f9fdae23001c869 (commit)
       via  88ccf46760a07a6a1eb83a2c3401c6372fade946 (commit)
       via  d4b3106d7ba6372fde66dc7d3476064edb9f803b (commit)
       via  bc1ae63767f19ba4e9de97612eb59348c6ca2c61 (commit)
       via  614819fd401b2c8452a5d33937a6f7761ece6a93 (commit)
       via  283629cd960f01957b99b8a6254af47a2fedcb1d (commit)
       via  e8b5dfdbb9765dde0c17f37735d6f4381cd59d13 (commit)
       via  030fd0f12b1648851c8773ff31db5cd128e63445 (commit)
       via  1ce0fe811842913d79dfb3f316b7e5f8caca771f (commit)
       via  fdc4cb6f8227adb5ebee35edcb5fe3dfc281b438 (commit)
       via  87102f400cab7635ae95ddec6cb67fedcf34d131 (commit)
       via  35d0c5b45aacff992cc72c9bbf3735a54ddfabe5 (commit)
       via  917af7dca45c7ec63eef8532b6401371f27e64a3 (commit)
       via  7ec40e90620e04b7abc699c8ae75facc41eaa4f8 (commit)
       via  a19edd236c3bfb574c6d49080202a17e3e6a4cbe (commit)
       via  dde97df0e58d3b8b9ba4f32b4dace0285e90ddbc (commit)
       via  0a437100052f921abb2a73c3d806c8828e68bfe3 (commit)
       via  d1e292ca3b0f21b63adc631ef728eb48c0860cab (commit)
       via  da9d3398de3c2cd339cb43f4755f4067944701db (commit)
       via  5e34974e1da0038bfc0fbd65598d3e871c1541bf (commit)
       via  6ca13e5d2b1ceafa649d1b66700208470ccd03a2 (commit)
       via  f3a43a5acf948c84e816ee304156247cda31b341 (commit)
       via  5bc8aedcff265252754a4c3ac01e709ad66ee9af (commit)
       via  c97af627dc0611881f28041f5451ee7d5603d1bf (commit)
       via  751176bc3c371056039ab77f7832c24eafa2ef02 (commit)
      from  a4b73a8e9260a46ac258e2172e3cd2fe3dad2bad (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/1d622ee99084f0e624c83f961e6b21d2021178b7

commit 1d622ee99084f0e624c83f961e6b21d2021178b7
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Wed Mar 26 09:19:45 2014 -0500

    include pthread.h for pthread routines

diff --git a/src/mpi/romio/adio/common/ad_threaded_io.c b/src/mpi/romio/adio/common/ad_threaded_io.c
index d06d828..a4b3da3 100644
--- a/src/mpi/romio/adio/common/ad_threaded_io.c
+++ b/src/mpi/romio/adio/common/ad_threaded_io.c
@@ -7,6 +7,7 @@
 #include "adio.h"
 #include "adio_extern.h"
 
+#include <pthread.h>
 /* Function for running in another thread for doing the file reading while the
  * main thread is doing data aggregation - useful only when multiple rounds are
  * needed due to file size relative to the read buffer size and number of

http://git.mpich.org/mpich.git/commitdiff/30b4ac9c730e264a7d5d89a336372c910c78408b

commit 30b4ac9c730e264a7d5d89a336372c910c78408b
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Wed Mar 26 09:18:36 2014 -0500

    unused variable cleanup
    
    we introduced some bluegene code to common code, which introduced some
    variables not used in the common case. also, the lseek call no longer
    exists now that we use pread/pwrite.

diff --git a/src/mpi/romio/adio/common/ad_read.c b/src/mpi/romio/adio/common/ad_read.c
index 0f263c3..13d5986 100644
--- a/src/mpi/romio/adio/common/ad_read.c
+++ b/src/mpi/romio/adio/common/ad_read.c
@@ -22,13 +22,12 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
 			  ADIO_Offset offset, ADIO_Status *status,
 			  int *error_code)
 {
-    off_t err_lseek = -1;
     ssize_t err = -1;
     MPI_Count datatype_size;
     ADIO_Offset len, bytes_xfered=0;
     size_t rd_count;
     static char myname[] = "ADIOI_GEN_READCONTIG";
-    double io_time=0, io_time2=0;
+    double io_time=0;
     char *p;
 
 #ifdef AGGREGATION_PROFILE
@@ -37,9 +36,9 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
     MPI_Type_size_x(datatype, &datatype_size);
     len = datatype_size * (ADIO_Offset)count;
 
+    io_time = MPI_Wtime();
 #ifdef ROMIO_GPFS
     if (gpfsmpio_timing) {
-	io_time = MPI_Wtime();
 	gpfsmpio_prof_cr[ GPFSMPIO_CIO_DATA_SIZE ] += len;
     }
 #endif
@@ -48,9 +47,6 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
 	offset = fd->fp_ind;
     }
 
-#ifdef ROMIO_GPFS
-    if (gpfsmpio_timing) io_time2 = MPI_Wtime();
-#endif
     p=buf;
     while (bytes_xfered < len) {
 #ifdef ADIOI_MPE_LOGGING
@@ -86,7 +82,7 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
 	p += err;
     }
 #ifdef ROMIO_GPFS
-    if (gpfsmpio_timing) gpfsmpio_prof_cr[ GPFSMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
+    if (gpfsmpio_timing) gpfsmpio_prof_cr[ GPFSMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time);
 #endif
     fd->fp_sys_posn = offset + bytes_xfered;
 
diff --git a/src/mpi/romio/adio/common/ad_write.c b/src/mpi/romio/adio/common/ad_write.c
index 55e4a31..72a9a53 100644
--- a/src/mpi/romio/adio/common/ad_write.c
+++ b/src/mpi/romio/adio/common/ad_write.c
@@ -24,13 +24,12 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
 			   ADIO_Offset offset, ADIO_Status *status,
 			   int *error_code)
 {
-    off_t err_lseek = -1;
     ssize_t err = -1;
     MPI_Count datatype_size;
     ADIO_Offset len, bytes_xfered=0;
     size_t wr_count;
     static char myname[] = "ADIOI_GEN_WRITECONTIG";
-    double io_time=0, io_time2=0;
+    double io_time=0;
     char * p;
 
 #ifdef AGGREGATION_PROFILE
@@ -40,9 +39,9 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
     MPI_Type_size_x(datatype, &datatype_size);
     len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
 
+    io_time = MPI_Wtime();
 #ifdef ROMIO_GPFS
     if (gpfsmpio_timing) {
-	io_time = MPI_Wtime();
 	gpfsmpio_prof_cw[ GPFSMPIO_CIO_DATA_SIZE ] += len;
     }
 #endif
@@ -51,9 +50,6 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
 	offset = fd->fp_ind;
     }
 
-#ifdef ROMIO_GPFS
-    if (gpfsmpio_timing) io_time2 = MPI_Wtime();
-#endif
     p = (char *)buf;
     while (bytes_xfered < len) {
 #ifdef ADIOI_MPE_LOGGING
@@ -85,7 +81,7 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
     }
 
 #ifdef ROMIO_GPFS
-    if (gpfsmpio_timing) gpfsmpio_prof_cw[ GPFSMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
+    if (gpfsmpio_timing) gpfsmpio_prof_cw[ GPFSMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time);
 #endif
     fd->fp_sys_posn = offset + bytes_xfered;
 

http://git.mpich.org/mpich.git/commitdiff/3e8d8114783308f31b6da14b27b67adc42d052f4

commit 3e8d8114783308f31b6da14b27b67adc42d052f4
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Wed Mar 26 09:15:50 2014 -0500

    feature request for pread/pwrite
    
    as with lustre, pread/pwrite need a feature level newer than
    --enable-strict requests.  autoconf already checked for the function at
    configure time, so we know it's there.  TODO: more robust autoconf
    checks and provide a "my_pwrite" that wraps lseek/write for a fallback.

diff --git a/src/mpi/romio/adio/common/ad_read.c b/src/mpi/romio/adio/common/ad_read.c
index 93bb7d1..0f263c3 100644
--- a/src/mpi/romio/adio/common/ad_read.c
+++ b/src/mpi/romio/adio/common/ad_read.c
@@ -5,11 +5,11 @@
  *   See COPYRIGHT notice in top-level directory.
  */
 
-#include "adio.h"
 
-#ifdef HAVE_UNISTD_H
+#define _XOPEN_SOURCE 500
 #include <unistd.h>
-#endif
+
+#include "adio.h"
 #ifdef AGGREGATION_PROFILE
 #include "mpe.h"
 #endif
diff --git a/src/mpi/romio/adio/common/ad_write.c b/src/mpi/romio/adio/common/ad_write.c
index 45c2eec..55e4a31 100644
--- a/src/mpi/romio/adio/common/ad_write.c
+++ b/src/mpi/romio/adio/common/ad_write.c
@@ -5,11 +5,11 @@
  *   See COPYRIGHT notice in top-level directory.
  */
 
-#include "adio.h"
 
-#ifdef HAVE_UNISTD_H
+#define _XOPEN_SOURCE 500
 #include <unistd.h>
-#endif
+
+#include "adio.h"
 #ifdef AGGREGATION_PROFILE
 #include "mpe.h"
 #endif

http://git.mpich.org/mpich.git/commitdiff/3e37d5b4930de76d45f5440f1cc6e5c19930d37d

commit 3e37d5b4930de76d45f5440f1cc6e5c19930d37d
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Tue Mar 25 22:22:57 2014 -0500

    clean up P2Pcontig warnings
    
    P2Pcontig needed a prototype and clang complained about several shadowed
    declarations.

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
index f583dc4..2ff9842 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
@@ -264,7 +264,7 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 	if (inOrderAndNoGaps && buftype_is_contig) {
 	    /* if these conditions exist then execute the P2PContig code else
 	     * execute the original code */
-	    P2PContigWriteAggregation(fd, buf, 
+	    ADIOI_P2PContigWriteAggregation(fd, buf,
 		    error_code, st_offsets, end_offsets, fd_start, fd_end);
 	    /* NOTE: we are skipping the rest of two-phase in this path */
             GPFSMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
diff --git a/src/mpi/romio/adio/common/p2p_aggregation.c b/src/mpi/romio/adio/common/p2p_aggregation.c
index 53df38e..692de74 100644
--- a/src/mpi/romio/adio/common/p2p_aggregation.c
+++ b/src/mpi/romio/adio/common/p2p_aggregation.c
@@ -1,9 +1,10 @@
 #include "adio.h"
 #include "adio_extern.h"
-#include <mpix.h>
 #include "../ad_gpfs/ad_gpfs_tuning.h"
 
-void P2PContigWriteAggregation(ADIO_File fd,
+#include <pthread.h>
+
+void ADIOI_P2PContigWriteAggregation(ADIO_File fd,
 	const void *buf,
 	int *error_code,
 	ADIO_Offset *st_offsets,
@@ -32,10 +33,12 @@ void P2PContigWriteAggregation(ADIO_File fd,
 
     int naggs = fd->hints->cb_nodes;
     int coll_bufsize = fd->hints->cb_buffer_size;
+#ifdef ROMIO_GPFS
     if (gpfsmpio_pthreadio == 1) {
 	/* split buffer in half for a kind of double buffering with the threads*/
 	coll_bufsize = fd->hints->cb_buffer_size/2;
     }
+#endif
 
     int j;
     for (j=0;j<naggs;j++) {
@@ -85,7 +88,6 @@ void P2PContigWriteAggregation(ADIO_File fd,
 			 need from what procs */
 
 	// count numSourceProcs so we know how large to make the arrays
-	int i;
 	for (i=0;i<nprocs;i++)
 	    if ( ((st_offsets[i] >= fd_start[myAggRank]) &&  (st_offsets[i] <= fd_end[myAggRank])) || ((end_offsets[i] >= fd_start[myAggRank]) &&  (end_offsets[i] <= fd_end[myAggRank])))
 		numSourceProcs++;
@@ -154,10 +156,12 @@ void P2PContigWriteAggregation(ADIO_File fd,
 
     int currentWriteBuf = 0;
     int useIOBuffer = 0;
+#ifdef ROMIO_GPFS
     if (gpfsmpio_pthreadio && (numberOfRounds>1)) {
 	useIOBuffer = 1;
 	io_thread = pthread_self();
     }
+#endif
 
     ADIO_Offset currentRoundFDStart = 0;
     ADIO_Offset currentRoundFDEnd = 0;
@@ -170,7 +174,9 @@ void P2PContigWriteAggregation(ADIO_File fd,
     int *mpiRequestMapPerProc = (int *)ADIOI_Malloc(numSourceProcs * sizeof(int));
 
     endTimeBase = MPI_Wtime();
+#ifdef ROMIO_GPFS
     gpfsmpio_prof_cw[GPFSMPIO_CIO_T_MYREQ] += (endTimeBase-startTimeBase);
+#endif
     startTimeBase = MPI_Wtime();
 
     /* each iteration of this loop writes a coll_bufsize portion of the file
@@ -194,7 +200,6 @@ void P2PContigWriteAggregation(ADIO_File fd,
 	int irecv;
 
 	/* the source procs receive the amount of data the aggs want them to send */
-	int i;
 	startTimeBase = MPI_Wtime();
 	for (i=0;i<numTargetAggs;i++) {
 	    MPI_Irecv(&amountOfDataReqestedByTargetAgg[i],1,
@@ -248,14 +253,16 @@ void P2PContigWriteAggregation(ADIO_File fd,
 
 	}
 
+#ifdef ROMIO_GPFS
 	gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SETUP] += (endTimeBase-startTimeBase);
+#endif
 	startTimeBase = MPI_Wtime();
 
 	// the aggs receive the data from the source procs
 	int numDataRecvToWaitFor = 0;
 	for (i=0;i<numSourceProcs;i++) {
 
-	    int j, currentWBOffset = 0;
+	    int currentWBOffset = 0;
 	    for (j=0;j<i;j++)
 		currentWBOffset += dataSizeGottenThisRoundPerProc[j];
 
@@ -296,7 +303,9 @@ void P2PContigWriteAggregation(ADIO_File fd,
 	}
 
 	endTimeBase = MPI_Wtime();
+#ifdef ROMIO_GPFS
 	gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_NET] += (endTimeBase-startTimeBase);
+#endif
 	// the aggs now write the data
 	if (numDataRecvToWaitFor > 0) {
 
@@ -350,7 +359,9 @@ void P2PContigWriteAggregation(ADIO_File fd,
     } // for-loop roundIter
 
     endTimeBase=MPI_Wtime();
+#ifdef ROMIO_GPFS
     gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH] += (endTimeBase-startTimeBase);
+#endif
 
     if (useIOBuffer) { // thread writer cleanup
 
@@ -385,7 +396,7 @@ void P2PContigWriteAggregation(ADIO_File fd,
     return;
 }
 
-void P2PContigReadAggregation(ADIO_File fd,
+void ADIOI_P2PContigReadAggregation(ADIO_File fd,
 	const void *buf,
 	int *error_code,
 	ADIO_Offset *st_offsets,
@@ -413,9 +424,11 @@ void P2PContigReadAggregation(ADIO_File fd,
 
     int naggs = fd->hints->cb_nodes;
     int coll_bufsize = fd->hints->cb_buffer_size;
+#ifdef ROMIO_GPFS
     if (gpfsmpio_pthreadio == 1)
 	/* share buffer between working threads */
 	coll_bufsize = coll_bufsize/2;
+#endif
 
     int j;
     for (j=0;j<naggs;j++) {
@@ -515,7 +528,6 @@ void P2PContigReadAggregation(ADIO_File fd,
 
 
     int totalAmountDataSent = 0;
-    int totalAmountDataReceived = 0;
     MPI_Request *mpiSizeToSendRequest = (MPI_Request *) ADIOI_Malloc(numSourceAggs * sizeof(MPI_Request));
     MPI_Request *mpiRecvDataFromSourceAggsRequest = (MPI_Request *) ADIOI_Malloc(numSourceAggs * sizeof(MPI_Request));
     MPI_Request *mpiSendDataSizeRequest = (MPI_Request *) ADIOI_Malloc(numTargetProcs * sizeof(MPI_Request));
@@ -550,13 +562,17 @@ void P2PContigReadAggregation(ADIO_File fd,
 
     int currentReadBuf = 0;
     int useIOBuffer = 0;
+#ifdef ROMIO_GPFS
     if (gpfsmpio_pthreadio && (numberOfRounds>1)) {
 	useIOBuffer = 1;
 	io_thread = pthread_self();
     }
+#endif
 
     endTimeBase = MPI_Wtime();
+#ifdef ROMIO_GPFS
     gpfsmpio_prof_cw[GPFSMPIO_CIO_T_MYREQ] += (endTimeBase-startTimeBase);
+#endif
 
 
     // each iteration of this loop reads a coll_bufsize portion of the file domain
@@ -658,7 +674,6 @@ void P2PContigReadAggregation(ADIO_File fd,
 	} // IAmUsedAgg
 
 	/* the source procs receive the amount of data the aggs will be sending them */
-	int i;
 	for (i=0;i<numSourceAggs;i++) {
 	    MPI_Irecv(&amountOfDataReqestedFromSourceAgg[i],1,
 		    MPI_INT,sourceAggsForMyData[i],0,
@@ -717,7 +732,7 @@ void P2PContigReadAggregation(ADIO_File fd,
 	// the aggs send the data to the source procs
 	for (i=0;i<numTargetProcs;i++) {
 
-	    int j, currentWBOffset = 0;
+	    int currentWBOffset = 0;
 	    for (j=0;j<i;j++)
 		currentWBOffset += dataSizeSentThisRoundPerProc[j];
 
diff --git a/src/mpi/romio/adio/include/adioi.h b/src/mpi/romio/adio/include/adioi.h
index 205a9cb..f600b77 100644
--- a/src/mpi/romio/adio/include/adioi.h
+++ b/src/mpi/romio/adio/include/adioi.h
@@ -558,6 +558,22 @@ int ADIOI_Build_client_req(ADIO_File fd,
 			   ADIO_Offset agg_comm_sz,
 			   MPI_Datatype *agg_comm_dtype_p);
 
+void ADIOI_P2PContigWriteAggregation(ADIO_File fd,
+	                             const void *buf,
+				     int *error_code,
+				     ADIO_Offset *st_offsets,
+				     ADIO_Offset *end_offset,
+				     ADIO_Offset *fd_start,
+				     ADIO_Offset *fd_end);
+
+void ADIOI_P2PContigReadAggregation(ADIO_File fd,
+	                             const void *buf,
+				     int *error_code,
+				     ADIO_Offset *st_offsets,
+				     ADIO_Offset *end_offset,
+				     ADIO_Offset *fd_start,
+				     ADIO_Offset *fd_end);
+
 ADIO_Offset ADIOI_GEN_SeekIndividual(ADIO_File fd, ADIO_Offset offset, 
 				     int whence, int *error_code);
 void ADIOI_GEN_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);

http://git.mpich.org/mpich.git/commitdiff/f0617bdcede575c84eaada54718e3ea72520af30

commit f0617bdcede575c84eaada54718e3ea72520af30
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Tue Mar 25 21:48:22 2014 -0500

    blksize_t not available under enable-strict

diff --git a/src/mpi/romio/configure.ac b/src/mpi/romio/configure.ac
index ae61052..30b2f92 100644
--- a/src/mpi/romio/configure.ac
+++ b/src/mpi/romio/configure.ac
@@ -1361,6 +1361,17 @@ AC_CHECK_FUNCS(statvfs,
     )
 )
 
+AC_CHECK_TYPE([blksize_t],[],[AC_DEFINE_UNQUOTED([blksize_t],[__blksize_t],[Provide blksize_t if not available]) ], [[
+	       #ifdef HAVE_SYS_TYPES_H
+	       #include <sys/types.h>
+	       #endif
+	       #ifdef HAVE_SYS_STAT_H
+	       #include <sys/stat.h>
+	       #endif
+	       #ifdef HAVE_UNISTD_H
+	       #include <unistd.h>
+	       #endif]] )
+
 #
 # Check for large file support.  Make sure that we can use the off64_t 
 # type (in some cases, it is an array, and the ROMIO code isn't prepared for

http://git.mpich.org/mpich.git/commitdiff/e567464f8a1cb72ff487fa3b7d44ea3e2b326f1b

commit e567464f8a1cb72ff487fa3b7d44ea3e2b326f1b
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Fri Mar 21 21:52:05 2014 +0000

    disable aio on BlueGene
    
    until we figure out what's up with aio routines on blue gene, let's
    just disable it.  the romio aio tests would just hang in aio_suspend

diff --git a/src/mpi/romio/configure.ac b/src/mpi/romio/configure.ac
index ed34946..ae61052 100644
--- a/src/mpi/romio/configure.ac
+++ b/src/mpi/romio/configure.ac
@@ -807,6 +807,8 @@ AM_CONDITIONAL([BUILD_AD_PE],[false])
 if test "$file_system_args" = "BGQ" -a -n "$file_system_gpfs"; then
     AC_DEFINE(BGQPLATFORM,1,BGQ platform)
     AM_CONDITIONAL([BUILD_AD_BG],[true])
+    dnl what if anything can make Blue Gene support aio?
+    disable_aio=yes
 fi
 if test "$file_system_args" = "PE" -a -n "$file_system_gpfs"; then
     AC_DEFINE(PEPLATFORM,1,PE platform)

http://git.mpich.org/mpich.git/commitdiff/019b42180dbeaf016bc3c117b1a15f1c46a817d3

commit 019b42180dbeaf016bc3c117b1a15f1c46a817d3
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Thu Mar 13 11:57:35 2014 -0500

    Replace ADIOI_GPFS_assert with ADIOI_Assert

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.c
index 4af0504..dc2b22e 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.c
@@ -105,7 +105,7 @@ int ADIOI_GPFS_Calc_aggregator(ADIO_File fd,
     ADIO_Offset avail_bytes;
     TRACE_ERR("Entering ADIOI_GPFS_Calc_aggregator\n");
 
-    ADIOI_GPFS_assert ( (off <= fd_end[fd->hints->cb_nodes-1] && off >= min_off && fd_start[0] >= min_off ) );
+    ADIOI_Assert ( (off <= fd_end[fd->hints->cb_nodes-1] && off >= min_off && fd_start[0] >= min_off ) );
 
     /* binary search --> rank_index is returned */
     int ub = fd->hints->cb_nodes;
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
index a145043..735eba6 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
@@ -18,11 +18,6 @@
 
 #include "adio.h"
 
-#define ADIOI_GPFS_assert( a ) if (!(a)) { \
-                                fprintf( stderr, "AD_GPFS_assert, file=%s, line=%d\n", __FILE__, __LINE__ ); \
-                                MPI_Abort( MPI_COMM_WORLD, 1 ); \
-                           }
-
 
 /*-----------------------------------------
  *  Global variables for the control of
diff --git a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c
index 41ab2af..959e564 100644
--- a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c
+++ b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c
@@ -255,7 +255,7 @@ ADIOI_BG_compute_agg_ranklist_serial_do (const ADIOI_BG_ConfInfo_t *confInfo,
        {
          for(j = 0; j < numAggs; j++)
          {
-           ADIOI_GPFS_assert(nextAggr<aggTotal);
+           ADIOI_Assert(nextAggr<aggTotal);
            aggList[nextAggr] = bridgelist[procIndex+j*distance+1].rank;
            TRACE_ERR("agglist[%d] -> bridgelist[%d] = %d\n", nextAggr, procIndex+j*distance+1,aggList[nextAggr]);
            if(aggList[nextAggr]==lastBridge) /* can't have bridge in the list twice */
diff --git a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_hints.c b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_hints.c
index 443020c..6669b30 100644
--- a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_hints.c
+++ b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_hints.c
@@ -70,7 +70,7 @@ void ADIOI_BG_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
      */
 
     value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
-    ADIOI_GPFS_assert ((value != NULL));
+    ADIOI_Assert ((value != NULL));
 
     /* initialize info and hints to default values if they haven't been
      * previously initialized
diff --git a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.c b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.c
index e358432..499c757 100644
--- a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.c
+++ b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.c
@@ -31,7 +31,7 @@ ADIOI_BG_ProcInfo_t *
 ADIOI_BG_ProcInfo_new()
 {
     ADIOI_BG_ProcInfo_t *p = (ADIOI_BG_ProcInfo_t *) ADIOI_Malloc (sizeof(ADIOI_BG_ProcInfo_t));
-    ADIOI_GPFS_assert ((p != NULL));
+    ADIOI_Assert ((p != NULL));
     return p;
 }
 
@@ -39,7 +39,7 @@ ADIOI_BG_ProcInfo_t *
 ADIOI_BG_ProcInfo_new_n( int n )
 {
     ADIOI_BG_ProcInfo_t *p = (ADIOI_BG_ProcInfo_t *) ADIOI_Malloc (n * sizeof(ADIOI_BG_ProcInfo_t));
-    ADIOI_GPFS_assert ((p != NULL));
+    ADIOI_Assert ((p != NULL));
     return p;
 }
 
@@ -53,7 +53,7 @@ ADIOI_BG_ConfInfo_t *
 ADIOI_BG_ConfInfo_new ()
 {
     ADIOI_BG_ConfInfo_t *p = (ADIOI_BG_ConfInfo_t *) ADIOI_Malloc (sizeof(ADIOI_BG_ConfInfo_t));
-    ADIOI_GPFS_assert ((p != NULL));
+    ADIOI_Assert ((p != NULL));
     return p;
 }
 
@@ -135,7 +135,7 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
                   pers.Network_Config.cnBridge_C << 12 | 
                   pers.Network_Config.cnBridge_D << 6 | 
                   pers.Network_Config.cnBridge_E << 2;
-   ADIOI_BG_assert((bridgeCoords >= 0)); /* A dim is < 6 bits or sorting won't work */
+   ADIOI_Assert((bridgeCoords >= 0)); /* A dim is < 6 bits or sorting won't work */
 
    if((hw.Coords[0] == pers.Network_Config.cnBridge_A) && 
       (hw.Coords[1] == pers.Network_Config.cnBridge_B) && 
@@ -273,7 +273,7 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
       TRACE_ERR("Maximum ranks under a bridge rank: %d, minimum: %d, nAggrs: %d, numBridgeRanks: %d pset dflt: %d naggrs: %d ratio: %f\n", maxcompute, mincompute, conf->nAggrs, conf->numBridgeRanks, ADIOI_BG_NAGG_PSET_DFLT, conf->nAggrs, conf->aggRatio);
    }
 
-   ADIOI_GPFS_assert((bridgerank != -1));
+   ADIOI_Assert((bridgerank != -1));
    proc->bridgeRank = bridgerank;
    proc->iamBridge = iambridge;
    TRACE_ERR("Rank %d has bridge set index %d (bridge rank: %d) with %d other ranks, ioNodeIndex: %d\n", rank,  proc->ioNodeIndex, bridgerank, proc->myIOSize, proc->ioNodeIndex);

http://git.mpich.org/mpich.git/commitdiff/e96115075b6fdaca4bdd285fe7f0f71debb3c5d6

commit e96115075b6fdaca4bdd285fe7f0f71debb3c5d6
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Wed Mar 12 01:09:08 2014 -0500

    fixed configuration bug so gpfs-only build does not include bluegene files

diff --git a/src/mpi/romio/adio/Makefile.mk b/src/mpi/romio/adio/Makefile.mk
index caca412..825abad 100644
--- a/src/mpi/romio/adio/Makefile.mk
+++ b/src/mpi/romio/adio/Makefile.mk
@@ -22,6 +22,7 @@ noinst_HEADERS +=                      \
     adio/include/nopackage.h
 
 include $(top_srcdir)/adio/ad_gpfs/Makefile.mk
+include $(top_srcdir)/adio/ad_gpfs/bg/Makefile.mk
 include $(top_srcdir)/adio/ad_gridftp/Makefile.mk
 include $(top_srcdir)/adio/ad_hfs/Makefile.mk
 include $(top_srcdir)/adio/ad_lustre/Makefile.mk
diff --git a/src/mpi/romio/adio/ad_gpfs/Makefile.mk b/src/mpi/romio/adio/ad_gpfs/Makefile.mk
index fbcc4d2..fdf94f5 100644
--- a/src/mpi/romio/adio/ad_gpfs/Makefile.mk
+++ b/src/mpi/romio/adio/ad_gpfs/Makefile.mk
@@ -10,19 +10,14 @@ if BUILD_AD_GPFS
 AM_CPPFLAGS += -DBGL_OPTIM_STEP1_2=1 -DBGL_OPTIM_STEP1_1=1
 
 noinst_HEADERS +=                                                    \
-    adio/ad_gpfs/bg/ad_bg_aggrs.h                                         \
     adio/ad_gpfs/ad_gpfs_aggrs.h                                         \
     adio/ad_gpfs/ad_gpfs.h                                               \
-    adio/ad_gpfs/bg/ad_bg_pset.h                                          \
     adio/ad_gpfs/ad_gpfs_tuning.h
 
 romio_other_sources +=                                               \
-    adio/ad_gpfs/bg/ad_bg_aggrs.c                                         \
     adio/ad_gpfs/ad_gpfs_aggrs.c                                         \
     adio/ad_gpfs/ad_gpfs_close.c                                         \
     adio/ad_gpfs/ad_gpfs_flush.c                                         \
-    adio/ad_gpfs/bg/ad_bg_hints.c                                         \
-    adio/ad_gpfs/bg/ad_bg_pset.c                                          \
     adio/ad_gpfs/ad_gpfs_tuning.c                                        \
     adio/ad_gpfs/ad_gpfs.c                                               \
     adio/ad_gpfs/ad_gpfs_open.c                                          \
diff --git a/src/mpi/romio/adio/ad_gpfs/bg/Makefile.mk b/src/mpi/romio/adio/ad_gpfs/bg/Makefile.mk
new file mode 100644
index 0000000..c6436d0
--- /dev/null
+++ b/src/mpi/romio/adio/ad_gpfs/bg/Makefile.mk
@@ -0,0 +1,21 @@
+## -*- Mode: Makefile; -*-
+## vim: set ft=automake :
+##
+## (C) 2012 by Argonne National Laboratory.
+##     See COPYRIGHT in top-level directory.
+##
+
+if BUILD_AD_BG
+
+AM_CPPFLAGS += -DBGL_OPTIM_STEP1_2=1 -DBGL_OPTIM_STEP1_1=1
+
+noinst_HEADERS +=                                                    \
+    adio/ad_gpfs/bg/ad_bg_aggrs.h                                         \
+    adio/ad_gpfs/bg/ad_bg_pset.h
+
+romio_other_sources +=                                               \
+    adio/ad_gpfs/bg/ad_bg_aggrs.c                                         \
+    adio/ad_gpfs/bg/ad_bg_hints.c                                         \
+    adio/ad_gpfs/bg/ad_bg_pset.c 
+
+endif BUILD_AD_BG
diff --git a/src/mpi/romio/configure.ac b/src/mpi/romio/configure.ac
index d614fbe..ed34946 100644
--- a/src/mpi/romio/configure.ac
+++ b/src/mpi/romio/configure.ac
@@ -802,11 +802,15 @@ if test -n "$file_system_gpfs"; then
     AC_DEFINE(ROMIO_GPFS,1,[Define for ROMIO with GPFS])
 fi
 
+AM_CONDITIONAL([BUILD_AD_BG],[false])
+AM_CONDITIONAL([BUILD_AD_PE],[false])
 if test "$file_system_args" = "BGQ" -a -n "$file_system_gpfs"; then
     AC_DEFINE(BGQPLATFORM,1,BGQ platform)
+    AM_CONDITIONAL([BUILD_AD_BG],[true])
 fi
 if test "$file_system_args" = "PE" -a -n "$file_system_gpfs"; then
     AC_DEFINE(PEPLATFORM,1,PE platform)
+    AM_CONDITIONAL([BUILD_AD_PE],[true])
 fi
 
 # echo "with_file_system is :"$with_file_system": file_system_args is :"$file_system_args": FILE_SYSTEM is :"$FILE_SYSTEM":"
@@ -1418,8 +1422,6 @@ if test -n "$mpi_hp"; then
 fi
 #
 AC_CHECK_FUNCS(strerror)
-AC_CHECK_FUNCS(pwrite pread ,, AC_MSG_ERROR([pwrite/pread not detected and no workaround has been implemented]))
-
 if test -z "$srcdir" -o "$srcdir" = "." ; then srcdir="$ROMIO_HOME" ; fi
 AC_SUBST(srcdir)
 

http://git.mpich.org/mpich.git/commitdiff/b991dc7cb200ade30e28b189534b730bf27c07b3

commit b991dc7cb200ade30e28b189534b730bf27c07b3
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Mon Mar 10 18:24:04 2014 -0500

    further lockless removal/doc fixes
    
    remove gpfs-specific shared fp call
    further lockless removal
    documentation fixups

diff --git a/src/mpi/romio/adio/ad_gpfs/Makefile.mk b/src/mpi/romio/adio/ad_gpfs/Makefile.mk
index 9cd1dc8..fbcc4d2 100644
--- a/src/mpi/romio/adio/ad_gpfs/Makefile.mk
+++ b/src/mpi/romio/adio/ad_gpfs/Makefile.mk
@@ -25,10 +25,8 @@ romio_other_sources +=                                               \
     adio/ad_gpfs/bg/ad_bg_pset.c                                          \
     adio/ad_gpfs/ad_gpfs_tuning.c                                        \
     adio/ad_gpfs/ad_gpfs.c                                               \
-    adio/ad_gpfs/ad_gpfs_getsh.c                                         \
     adio/ad_gpfs/ad_gpfs_open.c                                          \
     adio/ad_gpfs/ad_gpfs_rdcoll.c                                        \
-    adio/ad_gpfs/ad_gpfs_setsh.c                                         \
     adio/ad_gpfs/ad_gpfs_wrcoll.c
 
 endif BUILD_AD_GPFS
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs.h b/src/mpi/romio/adio/ad_gpfs/ad_gpfs.h
index d7db201..b4b1556 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs.h
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs.h
@@ -65,9 +65,6 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
                        ADIO_Offset offset, ADIO_Status *status, int
                        *error_code);
 
-void ADIOI_GPFS_Get_shared_fp(ADIO_File fd, ADIO_Offset size, ADIO_Offset *shared_fp, int *error_code);
-void ADIOI_GPFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
-
 void ADIOI_GPFS_Flush(ADIO_File fd, int *error_code);
 
 #include "ad_gpfs_tuning.h"
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.h b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.h
index d257cb2..e5b666e 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.h
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.h
@@ -9,12 +9,8 @@
 /*
  * File: ad_gpfs_aggrs.h
  *
- * Declares functions specific for GPFS parallel I/O solution. The implemented optimizations are:
- * 	. Aligned file-domain partitioning, integrated in 7/28/2005
+ * Declares functions optimized specifically for GPFS parallel I/O solution.
  *
- * In addition, following optimizations are planned:
- * 	. Integrating multiple file-domain partitioning schemes
- *	  (corresponding to Alok Chouhdary's persistent file domain work).
  */
 
 #ifndef AD_GPFS_AGGRS_H_
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_flush.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_flush.c
index 8d9603e..5f7b182 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_flush.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_flush.c
@@ -3,7 +3,7 @@
 /* ---------------------------------------------------------------- */
 /**
  * \file ad_gpfs_flush.c
- * \brief Scalable flush based on underlying filesystem and psets
+ * \brief Scalable flush for GPFS
  */
 
 /* -*- Mode: C; c-basic-offset:4 ; -*- */
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_getsh.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_getsh.c
deleted file mode 100644
index f68771d..0000000
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_getsh.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bg_getsh.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/*
- *   Copyright (C) 1997 University of Chicago.
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "ad_gpfs.h"
-
-/* returns the current location of the shared_fp in terms of the
-   no. of etypes relative to the current view, and also increments the
-   shared_fp by the number of etypes to be accessed (incr) in the read
-   or write following this function. */
-
-void ADIOI_GPFS_Get_shared_fp(ADIO_File fd, ADIO_Offset incr, ADIO_Offset *shared_fp,
-			 int *error_code)
-{
-    ADIO_Offset new_fp;
-    int err;
-    MPI_Comm dupcommself;
-    static char myname[] = "ADIOI_BG_GET_SHARED_FP";
-
-    if (fd->shared_fp_fd == ADIO_FILE_NULL) {
-	MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
-	fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF,
-				     dupcommself,
-				     fd->shared_fp_fname,
-				     fd->file_system,
-				     fd->fns,
-				     ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE,
-				     0,
-				     MPI_BYTE,
-				     MPI_BYTE,
-				     MPI_INFO_NULL,
-				     ADIO_PERM_NULL,
-				     error_code);
-	if (*error_code != MPI_SUCCESS) return;
-	*shared_fp = 0;
-	ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-	err = read(fd->shared_fp_fd->fd_sys, shared_fp, sizeof(ADIO_Offset));
-        /* if the file is empty, the above read may return error
-           (reading beyond end of file). In that case, shared_fp = 0,
-           set above, is the correct value. */
-    }
-    else {
-	ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-
-	err = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
-	if (err == 0) {
-	    err = read(fd->shared_fp_fd->fd_sys, shared_fp,
-		       sizeof(ADIO_Offset));
-	}
-	if (err == -1) {
-	    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_IO, "**io",
-					       "**io %s", strerror(errno));
-	    return;
-	}
-    }
-
-    new_fp = *shared_fp + incr;
-
-    err = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
-    if (err == 0) {
-	err = write(fd->shared_fp_fd->fd_sys, &new_fp, sizeof(ADIO_Offset));
-    }
-    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-    if (err == -1) {
-	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
-					   myname, __LINE__, MPI_ERR_IO,
-					   "**io",
-					   "**io %s", strerror(errno));
-    }
-    else *error_code = MPI_SUCCESS;
-}
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_setsh.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_setsh.c
deleted file mode 100644
index f169776..0000000
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_setsh.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bg_setsh.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/*
- *   Copyright (C) 1997 University of Chicago.
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "ad_gpfs.h"
-
-/* set the shared file pointer to "offset" etypes relative to the current
-   view */
-
-/*
-This looks very similar to ADIOI_GEN_Set_shared_fp, except this
-function avoids locking the file twice.  The generic version does
-
-Write lock
-ADIO_WriteContig
-Unlock
-
-For BG, ADIOI_BG_WriteContig does a lock before writing to disable
-caching. To avoid the lock being called twice, this version for BG does
-
-Write lock
-Lseek
-Write
-Unlock
-
-*/
-
-void ADIOI_GPFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code)
-{
-    int err;
-    MPI_Comm dupcommself;
-    static char myname[] = "ADIOI_BG_SET_SHARED_FP";
-
-    if (fd->shared_fp_fd == ADIO_FILE_NULL) {
-	MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
-	fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF, dupcommself,
-				     fd->shared_fp_fname,
-				     fd->file_system, fd->fns,
-				     ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE,
-				     0, MPI_BYTE, MPI_BYTE, MPI_INFO_NULL,
-				     ADIO_PERM_NULL, error_code);
-    }
-
-    if (*error_code != MPI_SUCCESS) return;
-
-    ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-    lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
-    err = write(fd->shared_fp_fd->fd_sys, &offset, sizeof(ADIO_Offset));
-    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-
-    if (err == -1) {
-	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
-					   myname, __LINE__, MPI_ERR_IO,
-					   "**io",
-					   "**io %s", strerror(errno));
-    }
-    else *error_code = MPI_SUCCESS;
-}
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
new file mode 100644
index 0000000..902c3ac
--- /dev/null
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
@@ -0,0 +1,265 @@
+/* ---------------------------------------------------------------- */
+/* (C)Copyright IBM Corp.  2007, 2008                               */
+/* ---------------------------------------------------------------- */
+/**
+ * \file ad_gpfs_tuning.c
+ * \brief Defines ad_gpfs performance tuning
+ */
+
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *   Copyright (C) 2008 University of Chicago.
+ *   See COPYRIGHT notice in top-level directory.
+ */
+
+/*---------------------------------------------------------------------
+ * ad_gpfs_tuning.c
+ *
+ * defines global variables and functions for performance tuning and
+ * functional debugging.
+ *---------------------------------------------------------------------*/
+
+#include "ad_gpfs_tuning.h"
+#include "mpi.h"
+
+#if !defined(PVFS2_SUPER_MAGIC)
+  #define PVFS2_SUPER_MAGIC (0x20030528)
+#endif
+
+
+int 	gpfsmpio_timing;
+int 	gpfsmpio_timing2;
+int     gpfsmpio_timing_cw_level;
+int 	gpfsmpio_comm;
+int 	gpfsmpio_tunegather;
+int 	gpfsmpio_tuneblocking;
+long    bglocklessmpio_f_type;
+int     gpfsmpio_bg_nagg_pset;
+int     gpfsmpio_pthreadio;
+int     gpfsmpio_p2pcontig;
+int	gpfsmpio_balancecontig;
+int     gpfsmpio_devnullio;
+
+double	gpfsmpio_prof_cw    [GPFSMPIO_CIO_LAST];
+double	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST];
+
+/* set internal variables for tuning environment variables */
+/** \page mpiio_vars MPIIO Configuration
+  \section env_sec Environment Variables
+ * - GPFSMPIO_COMM - Define how data is exchanged on collective
+ *   reads and writes.  Possible values:
+ *   - 0 - Use MPI_Alltoallv.
+ *   - 1 - Use MPI_Isend/MPI_Irecv.
+ *   - Default is 0.
+ *
+ * - GPFSMPIO_TIMING - collect timing breakdown for MPI I/O collective calls.
+ *   Possible values:
+ *   - 0 - Do not collect/report timing.
+ *   - 1 - Collect/report timing.
+ *   - Default is 0.
+ *
+ * - GPFSMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
+ *   for aggregator collective i/o.  Possible values:
+ *   - 0 - Use two MPI_Allgather's to collect starting and ending offsets.
+ *   - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
+ *   - Default is 1.
+ *
+ * - GPFSMPIO_TUNEBLOCKING - Tune how aggregate file domains are
+ *   calculated (block size).  Possible values:
+ *   - 0 - Evenly calculate file domains across aggregators.  Also use
+ *   MPI_Isend/MPI_Irecv to exchange domain information.
+ *   - 1 - Align file domains with the underlying file system's block size.  Also use
+ *   MPI_Alltoallv to exchange domain information.
+ *   - Default is 1.
+ *
+ * - BGLOCKLESSMPIO_F_TYPE - Specify a filesystem type that should run
+ *   the ad_bglockless driver.   NOTE: Using romio prefixes (such as
+ *   "bg:" or "bglockless:") on a file name will override this environment
+ *   variable.  Possible values:
+ *   - 0xnnnnnnnn - Any valid file system type (or "magic number") from
+ *                  statfs() field f_type.
+ *   - The default is 0x20030528 (PVFS2_SUPER_MAGIC)
+ *
+ * - GPFSMPIO_NAGG_PSET - Specify a ratio of "I/O aggregators" to use for each
+ *   compute group (compute nodes + i/o nodes).    Possible values:
+ *   - any integer
+ *   - Default is 16 (ADIOI_BG_NAGG_PSET_DFLT)
+ *
+ * - GPFSMPIO_PTHREADIO - Enables a very simple form of asynchronous io where a
+ *   pthread is spawned to do the posix writes while the main thread does the
+ *   data aggregation - useful for large files where multiple rounds are
+ *   required (more than the cb_buffer_size of data per aggregator).   User
+ *   must ensure there is hw resource available for the thread to run.  I
+ *   am sure there is a better way to do this involving comm threads - this is
+ *   just a start.  NOTE: For some reason the stats collected when this is
+ *   enabled miss some of the data so the data sizes are off a bit - this is
+ *   a statistical issue only, the data is still accurately written out
+ *
+ * - GPFSMPIO_P2PCONTIG -  Does simple point-to-point communication between the
+ *   aggregator and the procs that feed it.  Performance could be enhanced by a
+ *   one-sided put algorithm.  Current implementation allows only 1 round of
+ *   data.  Useful/allowed only when:
+ * 1.) The datatype is contiguous.
+ * 2.) The offsets are increasing in rank-order.
+ * 3.) There are no gaps between the offsets.
+ * 4.) No single rank has a data size which spans multiple file domains.
+ *
+ * - GPFSMPIO_BALANCECONTIG -  Relevant only to BGQ.  File domain blocks are assigned
+ *   to aggregators in a breadth-first fashion relative to the ions - additionally,
+ *   file domains on the aggregators sharing the same bridgeset and ion have contiguous
+ *   offsets.  The breadth-first assignment improves performance in the case of
+ *   a relatively small file of size less than the gpfs block size multiplied
+ *   by the number of ions. Files: ad_gpfs_aggrs.c ad_bg_aggrs.c.  Possible Values
+ *   - 0 - assign file domain blocks in the traditional manner
+ *   - 1 - if there are variable sized file domain blocks, spread them out
+ *         (balance) across bridge nodes
+ *
+ * - GPFSMPIO_DEVNULLIO - do everything *except* write to / read from the file
+ *   system. When experimenting with different two-phase I/O strategies, it's
+ *   helpful to remove the highly variable file system from the experiment.
+ *   - 0 (disabled) or 1 (enabled)
+ *   - Default is 0
+ *
+ */
+
+void ad_gpfs_get_env_vars() {
+    char *x, *dummy;
+
+    gpfsmpio_comm   = 0;
+	x = getenv( "GPFSMPIO_COMM"         );
+	if (x) gpfsmpio_comm         = atoi(x);
+    gpfsmpio_timing = 0;
+	x = getenv( "GPFSMPIO_TIMING"       );
+	if (x) gpfsmpio_timing       = atoi(x);
+    gpfsmpio_tunegather = 1;
+	x = getenv( "GPFSMPIO_TUNEGATHER"   );
+	if (x) gpfsmpio_tunegather   = atoi(x);
+    gpfsmpio_tuneblocking = 1;
+    x = getenv( "GPFSMPIO_TUNEBLOCKING" );
+    if (x) gpfsmpio_tuneblocking = atoi(x);
+    bglocklessmpio_f_type = PVFS2_SUPER_MAGIC;
+    x = getenv( "BGLOCKLESSMPIO_F_TYPE" );
+    if (x) bglocklessmpio_f_type = strtol(x,&dummy,0);
+    DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n",
+            bglocklessmpio_f_type,bglocklessmpio_f_type);
+    /* note: this value will be 'sanity checked' in ADIOI_BG_persInfo_init(),
+     * when we know a bit more about what "largest possible value" and
+     * "smallest possible value" should be */
+    gpfsmpio_bg_nagg_pset = ADIOI_BG_NAGG_PSET_DFLT;
+    x = getenv("GPFSMPIO_NAGG_PSET");
+    if (x) gpfsmpio_bg_nagg_pset = atoi(x);
+
+    gpfsmpio_pthreadio = 0;
+    x = getenv( "GPFSMPIO_PTHREADIO" );
+    if (x) gpfsmpio_pthreadio = atoi(x);
+
+    gpfsmpio_p2pcontig = 0;
+    x = getenv( "GPFSMPIO_P2PCONTIG" );
+    if (x) gpfsmpio_p2pcontig = atoi(x);
+
+    gpfsmpio_balancecontig = 0;
+    x = getenv( "GPFSMPIO_BALANCECONTIG" );
+    if (x) gpfsmpio_balancecontig = atoi(x);
+
+    gpfsmpio_devnullio = 0;
+    x = getenv( "GPFSMPIO_DEVNULLIO" );
+    if (x) gpfsmpio_devnullio = atoi(x);
+}
+
+/* report timing breakdown for MPI I/O collective call */
+void ad_gpfs_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs )
+{
+    int i;
+
+    if (gpfsmpio_timing) {
+	/* Timing across the whole communicator is a little bit interesting,
+	 * but what is *more* interesting is if we single out the aggregators
+	 * themselves.  non-aggregators spend a lot of time in "exchange" not
+	 * exchanging data, but blocked because they are waiting for
+	 * aggregators to finish writing.  If we focus on just the aggregator
+	 * processes we will get a more clear picture about the data exchange
+	 * vs. i/o time breakdown */
+
+	/* if deferred open enabled, we could use the aggregator communicator */
+	MPI_Comm agg_comm;
+	int nr_aggs, agg_rank;
+	MPI_Comm_split(fd->comm, (fd->is_agg ? 1 : MPI_UNDEFINED), 0, &agg_comm);
+	if(agg_comm != MPI_COMM_NULL) {
+	    MPI_Comm_size(agg_comm, &nr_aggs);
+	    MPI_Comm_rank(agg_comm, &agg_rank);
+	}
+
+	double *gpfsmpio_prof_org = gpfsmpio_prof_cr;
+	if (rw) gpfsmpio_prof_org = gpfsmpio_prof_cw;
+
+	double gpfsmpio_prof_avg[ GPFSMPIO_CIO_LAST ];
+	double gpfsmpio_prof_max[ GPFSMPIO_CIO_LAST ];
+
+	if( agg_comm != MPI_COMM_NULL) {
+	    MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_avg, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, agg_comm);
+	    MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_max, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, agg_comm);
+	}
+	if (agg_comm != MPI_COMM_NULL && agg_rank == 0) {
+
+	    for (i=0; i<GPFSMPIO_CIO_LAST; i++) gpfsmpio_prof_avg[i] /= nr_aggs;
+
+	    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW  ] =
+		gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
+		gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW  ];
+	    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW  ] =
+		gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
+		gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_RW  ];
+
+	    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] =
+		gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
+		gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_CRW ];
+
+	    fprintf(stderr,"TIMING-%1s,", (rw ? "W" : "R") );
+	    fprintf(stderr,"SIZE: %12.4lld , ", (long long int)(gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs));
+	    fprintf(stderr,"SEEK-avg: %10.3f , ",
+		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_SEEK ]     );
+	    fprintf(stderr,"SEEK-max: %10.3f , ",
+		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_SEEK ]     );
+	    fprintf(stderr,"LOCAL-avg: %10.3f , ",
+		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_LCOMP ]    );
+	    fprintf(stderr,"GATHER-max: %10.3f , ",
+		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_GATHER ]   );
+	    fprintf(stderr,"PATTERN-avg: %10.3f , ",
+		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_PATANA ]   );
+	    fprintf(stderr,"FILEDOMAIN-avg: %10.3f , ",
+		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_FD_PART ]  );
+	    fprintf(stderr,"MYREQ-avg: %10.3f , ",
+		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MYREQ ]    );
+	    fprintf(stderr,"OTHERREQ-max: %10.3f , ",
+		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_OTHREQ ]   );
+	    fprintf(stderr,"EXCHANGE-max: %10.3f , ",
+		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH ]    );
+	    fprintf(stderr, "EXCHANGE-RECV_EXCH-max: %10.3f , ",
+		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_RECV_EXCH]  );
+	    fprintf(stderr, "EXCHANGE-SETUP-max: %10.3f , ",
+		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SETUP]  );
+	    fprintf(stderr, "EXCHANGE-NET-max: %10.3f , ",
+		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_NET]  );
+	    fprintf(stderr, "EXCHANGE-SORT-max: %10.3f , ",
+		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SORT]  );
+	    fprintf(stderr, "EXCHANGE-SIEVE-max: %10.3f , ",
+		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SIEVE]  );
+	    fprintf(stderr,"POSIX-TIME-avg: %10.3f , ",
+		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_POSI_RW ]  );
+	    fprintf(stderr,"POSIX-TIME-max: %10.3f , ",
+		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW ]  );
+	    fprintf(stderr,"MPIIO-CONTIG-TIME-avg: %10.3f , ",
+		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_RW ]  );
+	    fprintf(stderr,"MPIIO-STRIDED-TIME-avg: %10.3f , ",
+		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_CRW ] );
+	    fprintf(stderr,"POSIX-BW-avg: %10.3f , ",
+		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW ]  );
+	    fprintf(stderr,"MPI-BW-avg: %10.3f , ",
+		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW ]  );
+	    fprintf(stderr,"MPI-BW-collective-avg: %10.3f\n ",
+		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] );
+	}
+	if (agg_comm != MPI_COMM_NULL) MPI_Comm_free(&agg_comm);
+    }
+
+}
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
new file mode 100644
index 0000000..a145043
--- /dev/null
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
@@ -0,0 +1,117 @@
+/* ---------------------------------------------------------------- */
+/* (C)Copyright IBM Corp.  2007, 2008                               */
+/* ---------------------------------------------------------------- */
+/**
+ * \file ad_gpfs_tuning.h
+ * \brief Declares ad_gpfs performance tuning globals, timing fields and macros
+ */
+
+/*---------------------------------------------------------------------
+ * ad_gpfs_tuning.h
+ *
+ * declares global variables and macros for performance tuning and
+ * functional debugging.
+ *---------------------------------------------------------------------*/
+
+#ifndef AD_GPFS_TUNING_H_
+#define AD_GPFS_TUNING_H_
+
+#include "adio.h"
+
+#define ADIOI_GPFS_assert( a ) if (!(a)) { \
+                                fprintf( stderr, "AD_GPFS_assert, file=%s, line=%d\n", __FILE__, __LINE__ ); \
+                                MPI_Abort( MPI_COMM_WORLD, 1 ); \
+                           }
+
+
+/*-----------------------------------------
+ *  Global variables for the control of
+ *  1.  timing
+ *  2.  select specific optimizations
+ *-----------------------------------------*/
+
+/* timing fields */
+enum {
+    GPFSMPIO_CIO_DATA_SIZE=0,
+    GPFSMPIO_CIO_T_SEEK,
+    GPFSMPIO_CIO_T_LCOMP,	/* time for ADIOI_Calc_my_off_len(), local */
+    GPFSMPIO_CIO_T_GATHER,	/* time for previous MPI_Allgather, now Allreduce */
+    GPFSMPIO_CIO_T_PATANA,	/* time for a quick test if access is contiguous or not, local */
+    GPFSMPIO_CIO_T_FD_PART,	/* time for file domain partitioning, local */
+    GPFSMPIO_CIO_T_MYREQ,	/* time for ADIOI_Calc_my_req(), local */
+    GPFSMPIO_CIO_T_OTHREQ,	/* time for ADIOI_Calc_others_req(), short Alltoall */
+    GPFSMPIO_CIO_T_DEXCH,	/* time for I/O data exchange */
+    /* the next DEXCH_* timers capture finer-grained portions of T_DEXCH */
+    GPFSMPIO_CIO_T_DEXCH_RECV_EXCH,/* time for each process to exchange receive
+				    size info with everyone else */
+    GPFSMPIO_CIO_T_DEXCH_SETUP,	/* time for setup portion of I/O data exchange */
+    GPFSMPIO_CIO_T_DEXCH_NET,	/* time for network portion of I/O data exchange */
+    GPFSMPIO_CIO_T_DEXCH_SORT, 	/* time to sort requests in I/O data exchange */
+    GPFSMPIO_CIO_T_DEXCH_SIEVE, 	/* time for read portion of RMW in two phase */
+    GPFSMPIO_CIO_T_POSI_RW,
+    GPFSMPIO_CIO_B_POSI_RW,
+    GPFSMPIO_CIO_T_MPIO_RW,	/* time for ADIOI_WriteContig() */
+    GPFSMPIO_CIO_B_MPIO_RW,
+    GPFSMPIO_CIO_T_MPIO_CRW,	/* time for ADIOI_GPFS_WriteStridedColl() */
+    GPFSMPIO_CIO_B_MPIO_CRW,
+    GPFSMPIO_CIO_LAST
+};
+
+extern double 	gpfsmpio_prof_cw    [GPFSMPIO_CIO_LAST];
+extern double 	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST];
+
+
+/* corresponds to environment variables to select optimizations and timing level */
+extern int 	gpfsmpio_timing;
+extern int      gpfsmpio_timing_cw_level;
+extern int 	gpfsmpio_comm;
+extern int 	gpfsmpio_tunegather;
+extern int 	gpfsmpio_tuneblocking;
+extern long bglocklessmpio_f_type;
+extern int      gpfsmpio_pthreadio;
+extern int      gpfsmpio_p2pcontig;
+extern int  gpfsmpio_balancecontig;
+extern int      gpfsmpio_devnullio;
+
+/* Default is, well, kind of complicated. Blue Gene /L and /P had "psets": one
+ * i/o node and all compute nodes wired to it.  On Blue Gene /Q that
+ * relationship is a lot more fluid.  There are still I/O nodes, and compute
+ * nodes are assigned to an i/o node, but there are two routes to the i/o node,
+ * via compute nodes designated as "bridge nodes".  In this code, what we used
+ * to call a "pset" is actually "compute nodes associated with and including a
+ * bridge node".  So, "nAgg" is roughly "number of aggregators per bridge", but
+ * look closely at ADIOI_BG_persInfo_init() for the details */
+
+#define ADIOI_BG_NAGG_PSET_DFLT 16
+
+extern int     gpfsmpio_bg_nagg_pset;
+
+
+/* set internal variables for tuning environment variables */
+void ad_gpfs_get_env_vars();
+
+/* report timing breakdown for MPI I/O collective call */
+void ad_gpfs_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs );
+
+/* note:
+ *   T := timing;
+ * CIO := collective I/O
+ */
+#define GPFSMPIO_T_CIO_RESET( RW ) \
+	{ \
+	  int i; \
+	  for ( i = 0; i < GPFSMPIO_CIO_LAST; i ++ ) \
+	    gpfsmpio_prof_c##RW [ i ] = 0; \
+	}
+
+#define GPFSMPIO_T_CIO_REPORT( RW, FD, MYRANK, NPROCS ) \
+	ad_gpfs_timing_crw_report ( RW, FD, MYRANK, NPROCS ); \
+
+#define GPFSMPIO_T_CIO_SET_GET(RW, ISSET, ISGET, VAR1, VAR2 ) \
+         {\
+	 double temp = MPI_Wtime(); \
+	 if ( ISSET ) gpfsmpio_prof_c##RW [ VAR1 ] = temp; \
+	 if ( ISGET ) gpfsmpio_prof_c##RW [ VAR2 ] = temp - gpfsmpio_prof_c##RW [ VAR2 ] ;\
+	 }
+
+#endif  /* AD_GPFS_TUNING_H_ */
diff --git a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.h b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.h
index a5322a0..cdd0497 100644
--- a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.h
+++ b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.h
@@ -7,14 +7,11 @@
  */
 
 /* 
- * File: ad_bg_aggrs.h
- * 
- * Declares functions specific for BG/L - GPFS parallel I/O solution. The implemented optimizations are:
- * 	. Aligned file-domain partitioning, integrated in 7/28/2005
- * 
- * In addition, following optimizations are planned:
- * 	. Integrating multiple file-domain partitioning schemes 
- *	  (corresponding to Alok Chouhdary's persistent file domain work).
+ *
+ * Declares functions specific for the BlueGene platform within the GPFS
+ * parallel I/O solution.  Implements aligned file-domain partitioning
+ * (7/28/2005);  persistent file domain work not implemented
+ *
  */
 
 #ifndef AD_BG_AGGRS_H_
diff --git a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.h b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.h
index 832c01e..51ae4a0 100644
--- a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.h
+++ b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.h
@@ -7,8 +7,8 @@
  */
 
 /* File: ad_bg_pset.h
- * 
- * Defines two structures that keep BG/L PSET specific information and their public interfaces:
+ *
+ * Defines two structures that keep BlueGene PSET specific information and their public interfaces:
  * 	. ADIOI_BG_ProcInfo_t object keeps specific information to each process
  * 	. ADIOI_BG_ConfInfo_t object keeps general information for the whole communicator, only kept
  *	  on process 0.
@@ -66,10 +66,10 @@ typedef struct {
 
 
 /* public funcs for a pair of ADIOI_BG_ConfInfo_t and ADIOI_BG_ProcInfo_t objects */
-    void ADIOI_BG_persInfo_init( ADIOI_BG_ConfInfo_t *conf, 
-				  ADIOI_BG_ProcInfo_t *proc, 
+    void ADIOI_BG_persInfo_init( ADIOI_BG_ConfInfo_t *conf,
+				  ADIOI_BG_ProcInfo_t *proc,
 				  int s, int r, int n_aggrs, MPI_Comm comm);
-    void ADIOI_BG_persInfo_free( ADIOI_BG_ConfInfo_t *conf, 
+    void ADIOI_BG_persInfo_free( ADIOI_BG_ConfInfo_t *conf,
 				  ADIOI_BG_ProcInfo_t *proc );
 
 
diff --git a/src/mpi/romio/adio/common/ad_get_sh_fp.c b/src/mpi/romio/adio/common/ad_get_sh_fp.c
index 786a4b3..1213327 100644
--- a/src/mpi/romio/adio/common/ad_get_sh_fp.c
+++ b/src/mpi/romio/adio/common/ad_get_sh_fp.c
@@ -6,10 +6,6 @@
 
 #include "adio.h"
 
-#ifdef ROMIO_GPFS
-void ADIOI_GPFS_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset *shared_fp, int *error_code);
-#endif
-
 /* returns the current location of the shared_fp in terms of the
    no. of etypes relative to the current view, and also increments the
    shared_fp by the number of etypes to be accessed (incr) in the read
@@ -39,14 +35,6 @@ void ADIO_Get_shared_fp(ADIO_File fd, ADIO_Offset incr, ADIO_Offset *shared_fp,
     }
 #endif
 
-#ifdef ROMIO_GPFS
-    if (fd->file_system == ADIO_GPFS) {
-	ADIOI_GPFS_Get_shared_fp(fd, incr, shared_fp, error_code);
-	return;
-    }
-#endif
-
-
     if (fd->shared_fp_fd == ADIO_FILE_NULL) {
 	MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
 	fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF, dupcommself, 
diff --git a/src/mpi/romio/adio/common/ad_set_sh_fp.c b/src/mpi/romio/adio/common/ad_set_sh_fp.c
index 77bcc6c..c55f2ef 100644
--- a/src/mpi/romio/adio/common/ad_set_sh_fp.c
+++ b/src/mpi/romio/adio/common/ad_set_sh_fp.c
@@ -5,9 +5,6 @@
  */
 
 #include "adio.h"
-#ifdef ROMIO_GPFS
-void ADIOI_GPFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
-#endif
 
 /* set the shared file pointer to "offset" etypes relative to the current 
    view */
@@ -26,13 +23,6 @@ void ADIO_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code)
     }
 #endif
 
-#ifdef ROMIO_GPFS
-    if (fd->file_system == ADIO_GPFS) {
-	ADIOI_GPFS_Set_shared_fp(fd, offset, error_code);
-	return;
-    }
-#endif
-
     if (fd->shared_fp_fd == ADIO_FILE_NULL) {
 	MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
 	fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF, dupcommself, 

http://git.mpich.org/mpich.git/commitdiff/d1c97cb08a84bd0390f5e678f2aa9a8e040744b2

commit d1c97cb08a84bd0390f5e678f2aa9a8e040744b2
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Mon Mar 10 17:20:46 2014 -0500

    pushed bluegene platform code to bg subdir and removed ad_bg_getsh.c and ad_bg_setsh.c

diff --git a/src/mpi/romio/adio/ad_gpfs/Makefile.mk b/src/mpi/romio/adio/ad_gpfs/Makefile.mk
index d04a8b7..9cd1dc8 100644
--- a/src/mpi/romio/adio/ad_gpfs/Makefile.mk
+++ b/src/mpi/romio/adio/ad_gpfs/Makefile.mk
@@ -10,19 +10,19 @@ if BUILD_AD_GPFS
 AM_CPPFLAGS += -DBGL_OPTIM_STEP1_2=1 -DBGL_OPTIM_STEP1_1=1
 
 noinst_HEADERS +=                                                    \
-    adio/ad_gpfs/ad_bg_aggrs.h                                         \
+    adio/ad_gpfs/bg/ad_bg_aggrs.h                                         \
     adio/ad_gpfs/ad_gpfs_aggrs.h                                         \
     adio/ad_gpfs/ad_gpfs.h                                               \
-    adio/ad_gpfs/ad_bg_pset.h                                          \
+    adio/ad_gpfs/bg/ad_bg_pset.h                                          \
     adio/ad_gpfs/ad_gpfs_tuning.h
 
 romio_other_sources +=                                               \
-    adio/ad_gpfs/ad_bg_aggrs.c                                         \
+    adio/ad_gpfs/bg/ad_bg_aggrs.c                                         \
     adio/ad_gpfs/ad_gpfs_aggrs.c                                         \
     adio/ad_gpfs/ad_gpfs_close.c                                         \
     adio/ad_gpfs/ad_gpfs_flush.c                                         \
-    adio/ad_gpfs/ad_bg_hints.c                                         \
-    adio/ad_gpfs/ad_bg_pset.c                                          \
+    adio/ad_gpfs/bg/ad_bg_hints.c                                         \
+    adio/ad_gpfs/bg/ad_bg_pset.c                                          \
     adio/ad_gpfs/ad_gpfs_tuning.c                                        \
     adio/ad_gpfs/ad_gpfs.c                                               \
     adio/ad_gpfs/ad_gpfs_getsh.c                                         \
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.c b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c
similarity index 99%
rename from src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.c
rename to src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c
index 58f005c..41ab2af 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.c
+++ b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c
@@ -13,11 +13,13 @@
  */
 
 /*#define TRACE_ON */
+
+// Uncomment the following line to turn tracing on for the gpfsmpio_balancecontig aggr selection optimization
 // #define balancecontigtrace 1
 
 #include "adio.h"
 #include "adio_cb_config_list.h"
-#include "ad_gpfs.h"
+#include "../ad_gpfs.h"
 #include "ad_bg_pset.h"
 #include "ad_bg_aggrs.h"
 #ifdef AGGREGATION_PROFILE
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.h b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.h
similarity index 100%
rename from src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.h
rename to src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.h
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_bg_hints.c b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_hints.c
similarity index 99%
rename from src/mpi/romio/adio/ad_gpfs/ad_bg_hints.c
rename to src/mpi/romio/adio/ad_gpfs/bg/ad_bg_hints.c
index 68163fa..443020c 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_bg_hints.c
+++ b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_hints.c
@@ -16,7 +16,7 @@
 #include "adio_extern.h"
 #include "hint_fns.h"
 
-#include "ad_gpfs.h"
+#include "../ad_gpfs.h"
 #include "ad_bg_pset.h"
 #include "ad_bg_aggrs.h"
 
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_bg_pset.c b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.c
similarity index 99%
rename from src/mpi/romio/adio/ad_gpfs/ad_bg_pset.c
rename to src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.c
index 6470969..e358432 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_bg_pset.c
+++ b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.c
@@ -14,7 +14,7 @@
 
 /* #define TRACE_ON */
 #include <stdlib.h>
-#include "ad_gpfs.h"
+#include "../ad_gpfs.h"
 #include "ad_bg_pset.h"
 #include <spi/include/kernel/process.h>
 #include <firmware/include/personality.h>
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_bg_pset.h b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.h
similarity index 100%
rename from src/mpi/romio/adio/ad_gpfs/ad_bg_pset.h
rename to src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.h

http://git.mpich.org/mpich.git/commitdiff/8fa2b391d852a348dfbf0bd35f9fdae23001c869

commit 8fa2b391d852a348dfbf0bd35f9fdae23001c869
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Mon Mar 10 15:02:35 2014 -0500

    remove ad_bgl and ad_bglockless directories

diff --git a/src/mpi/romio/adio/ad_bgl/Makefile.mk b/src/mpi/romio/adio/ad_bgl/Makefile.mk
deleted file mode 100644
index 45ecd71..0000000
--- a/src/mpi/romio/adio/ad_bgl/Makefile.mk
+++ /dev/null
@@ -1,34 +0,0 @@
-## -*- Mode: Makefile; -*-
-## vim: set ft=automake :
-##
-## (C) 2011 by Argonne National Laboratory.
-##     See COPYRIGHT in top-level directory.
-##
-
-if BUILD_AD_BGL
-
-noinst_HEADERS +=               \
-    adio/ad_bgl/ad_bgl.h        \
-    adio/ad_bgl/ad_bgl_aggrs.h  \
-    adio/ad_bgl/ad_bgl_pset.h   \
-    adio/ad_bgl/ad_bgl_tuning.h
-
-romio_other_sources +=          \
-    adio/ad_bgl/ad_bgl_open.c   \
-    adio/ad_bgl/ad_bgl_close.c  \
-    adio/ad_bgl/ad_bgl_fcntl.c  \
-    adio/ad_bgl/ad_bgl_flush.c  \
-    adio/ad_bgl/ad_bgl_read.c   \
-    adio/ad_bgl/ad_bgl_write.c  \
-    adio/ad_bgl/ad_bgl_getsh.c  \
-    adio/ad_bgl/ad_bgl_setsh.c  \
-    adio/ad_bgl/ad_bgl.c        \
-    adio/ad_bgl/ad_bgl_aggrs.c  \
-    adio/ad_bgl/ad_bgl_pset.c   \
-    adio/ad_bgl/ad_bgl_hints.c  \
-    adio/ad_bgl/ad_bgl_rdcoll.c \
-    adio/ad_bgl/ad_bgl_wrcoll.c \
-    adio/ad_bgl/ad_bgl_tuning.c
-
-endif BUILD_AD_BGL
-
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl.c b/src/mpi/romio/adio/ad_bgl/ad_bgl.c
deleted file mode 100644
index e17cf31..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 2001 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "ad_bgl.h"
-
-/* adioi.h has the ADIOI_Fns_struct define */
-#include "adioi.h"
-
-struct ADIOI_Fns_struct ADIO_BGL_operations = {
-    ADIOI_BGL_Open, /* Open */
-    ADIOI_GEN_OpenColl, /* Collective open */
-    ADIOI_BGL_ReadContig, /* ReadContig */
-    ADIOI_BGL_WriteContig, /* WriteContig */
-#if BGL_OPTIM_STEP1_2
-    ADIOI_BGL_ReadStridedColl, /* ReadStridedColl */
-    ADIOI_BGL_WriteStridedColl, /* WriteStridedColl */
-#else
-    ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
-    ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
-#endif
-    ADIOI_GEN_SeekIndividual, /* SeekIndividual */
-    ADIOI_BGL_Fcntl, /* Fcntl */
-#if BGL_OPTIM_STEP1_1
-    ADIOI_BGL_SetInfo, /* SetInfo */
-#else
-    ADIOI_GEN_SetInfo, /* SetInfo */
-#endif
-    ADIOI_BGL_ReadStrided, /* ReadStrided */
-    ADIOI_BGL_WriteStrided, /* WriteStrided */
-    ADIOI_BGL_Close, /* Close */
-#ifdef ROMIO_HAVE_WORKING_AIO
-#warning Consider BG support for NFS before enabling this.
-    ADIOI_GEN_IreadContig, /* IreadContig */
-    ADIOI_GEN_IwriteContig, /* IwriteContig */
-#else
-    ADIOI_FAKE_IreadContig, /* IreadContig */
-    ADIOI_FAKE_IwriteContig, /* IwriteContig */
-#endif
-    ADIOI_GEN_IODone, /* ReadDone */
-    ADIOI_GEN_IODone, /* WriteDone */
-    ADIOI_GEN_IOComplete, /* ReadComplete */
-    ADIOI_GEN_IOComplete, /* WriteComplete */
-    ADIOI_GEN_IreadStrided, /* IreadStrided */
-    ADIOI_GEN_IwriteStrided, /* IwriteStrided */
-    ADIOI_BGL_Flush, /* Flush */
-    ADIOI_GEN_Resize, /* Resize */
-    ADIOI_GEN_Delete, /* Delete */
-    ADIOI_GEN_Feature, /* Features */
-};
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl.h b/src/mpi/romio/adio/ad_bgl/ad_bgl.h
deleted file mode 100644
index 73ab111..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl.h
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#ifndef AD_BGL_INCLUDE
-#define AD_BGL_INCLUDE
-
-#include <unistd.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <fcntl.h>
-#include "adio.h"
-
-#ifdef HAVE_SIGNAL_H
-#include <signal.h>
-#endif
-#ifdef HAVE_AIO_H
-#include <aio.h>
-#endif
-
-#if 0 
-int ADIOI_BGL_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
-		  int wr, void *handle);
-#endif
-
-void ADIOI_BGL_Open(ADIO_File fd, int *error_code);
-
-void ADIOI_BGL_Close(ADIO_File fd, int *error_code);
-
-void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count, 
-                      MPI_Datatype datatype, int file_ptr_type,
-                     ADIO_Offset offset, ADIO_Status *status, int
-		     *error_code);
-void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count, 
-                      MPI_Datatype datatype, int file_ptr_type,
-                      ADIO_Offset offset, ADIO_Status *status, int
-		      *error_code);   
-#if 0
-void ADIOI_BGL_IwriteContig(ADIO_File fd, void *buf, int count, 
-                      MPI_Datatype datatype, int file_ptr_type,
-                      ADIO_Offset offset, ADIO_Request *request, int
-		      *error_code);   
-void ADIOI_BGL_IreadContig(ADIO_File fd, void *buf, int count, 
-                      MPI_Datatype datatype, int file_ptr_type,
-                      ADIO_Offset offset, ADIO_Request *request, int
-		      *error_code);   
-int ADIOI_BGL_ReadDone(ADIO_Request *request, ADIO_Status *status, int
-		       *error_code);
-int ADIOI_BGL_WriteDone(ADIO_Request *request, ADIO_Status *status, int
-		       *error_code);
-void ADIOI_BGL_ReadComplete(ADIO_Request *request, ADIO_Status *status, int
-		       *error_code); 
-void ADIOI_BGL_WriteComplete(ADIO_Request *request, ADIO_Status *status,
-			int *error_code); 
-#endif
-void ADIOI_BGL_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int
-		*error_code); 
-void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
-
-void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
-		       MPI_Datatype datatype, int file_ptr_type,
-		       ADIO_Offset offset, ADIO_Status *status, int
-		       *error_code);
-void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
-		       MPI_Datatype datatype, int file_ptr_type,
-		       ADIO_Offset offset, ADIO_Status *status, int
-		       *error_code);
-
-void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
-                               MPI_Datatype datatype, int file_ptr_type,
-                               ADIO_Offset offset, ADIO_Status *status, int
-                               *error_code);
-
-void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
-                       MPI_Datatype datatype, int file_ptr_type,
-                       ADIO_Offset offset, ADIO_Status *status, int
-                       *error_code);
-
-void ADIOI_BGL_Get_shared_fp(ADIO_File fd, ADIO_Offset size, ADIO_Offset *shared_fp, int *error_code);
-void ADIOI_BGL_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
-
-void ADIOI_BGL_Flush(ADIO_File fd, int *error_code);
-
-#include "ad_bgl_tuning.h"
-
-
-#endif
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_aggrs.c b/src/mpi/romio/adio/ad_bgl/ad_bgl_aggrs.c
deleted file mode 100644
index cbfbb23..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_aggrs.c
+++ /dev/null
@@ -1,966 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_aggrs.c
- * \brief The externally used function from this file is is declared in ad_bgl_aggrs.h
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997-2001 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "adio.h"
-#include "adio_cb_config_list.h"
-#include "ad_bgl.h"
-#include "ad_bgl_pset.h"
-#include "ad_bgl_aggrs.h"
-#ifdef AGGREGATION_PROFILE
-#include "mpe.h"
-#endif
-
-#ifdef USE_DBG_LOGGING
-  #define AGG_DEBUG 1
-#endif
-
-
-
-static int aggrsInPsetSize=0;
-static int *aggrsInPset=NULL;
-
-/* Comments copied from common:
- * This file contains four functions:
- *
- * ADIOI_Calc_aggregator()
- * ADIOI_Calc_file_domains()
- * ADIOI_Calc_my_req()
- * ADIOI_Calc_others_req()
- *
- * The last three of these were originally in ad_read_coll.c, but they are
- * also shared with ad_write_coll.c.  I felt that they were better kept with
- * the rest of the shared aggregation code.  
- */
-
-/* Discussion of values available from above:
- *
- * ADIO_Offset st_offsets[0..nprocs-1]
- * ADIO_Offset end_offsets[0..nprocs-1]
- *    These contain a list of start and end offsets for each process in 
- *    the communicator.  For example, an access at loc 10, size 10 would
- *    have a start offset of 10 and end offset of 19.
- * int nprocs
- *    number of processors in the collective I/O communicator
- * ADIO_Offset min_st_offset
- * ADIO_Offset fd_start[0..nprocs_for_coll-1]
- *    starting location of "file domain"; region that a given process will
- *    perform aggregation for (i.e. actually do I/O)
- * ADIO_Offset fd_end[0..nprocs_for_coll-1]
- *    start + size - 1 roughly, but it can be less, or 0, in the case of 
- *    uneven distributions
- */
-
-/* forward declaration */
-static void 
-ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd, 
-					const ADIOI_BGL_ConfInfo_t *confInfo, 
-					ADIOI_BGL_ProcInfo_t *all_procInfo,
-					int *aggrsInPset );
-
-/*
- * Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO.
- * The parameters are 
- * 	. the number of aggregators (proxies) : fd->hints->cb_nodes
- *	. the ranks of the aggregators :        fd->hints->ranklist
- * By compute these two parameters in a BGL-PSET-aware way, the default 2-phase collective IO of 
- *	ADIO can work more efficiently.
- */
-int 
-ADIOI_BGL_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset) 
-{
-    int r, s;
-    ADIOI_BGL_ProcInfo_t  *procInfo, *all_procInfo;
-    ADIOI_BGL_ConfInfo_t  *confInfo;
-
-    MPI_Comm_size( fd->comm, &s );
-    MPI_Comm_rank( fd->comm, &r );
-
-  /* Collect individual BGL personality information */
-    confInfo = ADIOI_BGL_ConfInfo_new ();
-    procInfo = ADIOI_BGL_ProcInfo_new ();
-    ADIOI_BGL_persInfo_init( confInfo, procInfo, s, r, n_aggrs_per_pset );
-
-  /* Gather BGL personality infomation onto process 0 */
-    // if (r == 0) 
-    all_procInfo  = ADIOI_BGL_ProcInfo_new_n  (s);
-    if(s > aggrsInPsetSize)
-    {
-      if(aggrsInPset) ADIOI_Free(aggrsInPset);
-      aggrsInPset   = (int *) ADIOI_Malloc (s *sizeof(int));
-      aggrsInPsetSize = s;
-    }
-
-
-    MPI_Gather( (void *)procInfo,     sizeof(ADIOI_BGL_ProcInfo_t), MPI_BYTE, 
-		(void *)all_procInfo, sizeof(ADIOI_BGL_ProcInfo_t), MPI_BYTE, 
-		0, 
-		fd->comm );
-
-  /* Compute a list of the ranks of chosen IO proxy CN on process 0 */
-    if (r == 0) { 
-	ADIOI_BGL_compute_agg_ranklist_serial (fd, confInfo, all_procInfo, aggrsInPset);    
-	// ADIOI_BGL_ProcInfo_free (all_procInfo);
-    }
-    ADIOI_BGL_ProcInfo_free (all_procInfo);
-
-  /* Send the info of IO proxy CN to all processes and keep the info in fd->hints struct.
-     Declared in adio_cb_config_list.h */
-    ADIOI_cb_bcast_rank_map(fd);		
-
-  /* Broadcast the BGL-GPFS related file domain info */
-    MPI_Bcast( (void *)aggrsInPset, 
-	  	fd->hints->cb_nodes * sizeof(int), MPI_BYTE, 
-		0, 
-		fd->comm );
-    
-    ADIOI_BGL_persInfo_free( confInfo, procInfo );
-    return 0;
-}
-
-/*  
- * the purpose of abstracting out this routine is to make it easy for trying different proxy-selection criteria. 
- */
-static int 
-ADIOI_BGL_select_agg_in_pset (const ADIOI_BGL_ConfInfo_t *confInfo,
-			      ADIOI_BGL_ProcInfo_t *pset_procInfo, 
-			      int nCN_in_pset, 
-			      int *tmp_ranklist)
-{
-/* first implementation, based on their rank order. */
-
-    int i, j, k; 
-
-    /* The number of aggregators in the PSET is proportional to the CNs in the PSET */
-    int nAggrs = nCN_in_pset * confInfo->aggRatio;	
-    if (nAggrs < ADIOI_BGL_NAGG_PSET_MIN) nAggrs = ADIOI_BGL_NAGG_PSET_MIN;
-
-    /* for not virtual-node-mode, pick aggregators in this PSET based on the order of the global rank */
-    if (!confInfo->isVNM) 
-    {
-	for (i=0; i<nAggrs; i++) tmp_ranklist[i] = pset_procInfo[i].rank;
-    }
-
-    /* for virtual-node-mode, first pick aggregators among CPU-0 */
-    else 
-    {
-	/* Try to pick from CPU-0 first, then CPU-1, then ... CPU-n */
-      j = 0;
-      for (k=0; k < confInfo->cpuidSize; k++){
-  	  for (i=0; i< nCN_in_pset ; i++) {
-	    if (pset_procInfo[i].cpuid == k) 
-	      tmp_ranklist[j++] = pset_procInfo[i].rank;
-	    if ( j >= nAggrs) break;
-  	  }
-	if ( j >= nAggrs) break;
-      }
-    }
-
-    return nAggrs;
-}
-
-/* 
- * Pick IO aggregators based on the under PSET organization and stores the ranks of the proxy CNs in tmp_ranklist.
- * The first order of tmp_ranklist is : PSET number
- * The secondary order of the list is determined in ADIOI_BGL_select_agg_in_pset() and thus adjustable.
- */
-static int 
-ADIOI_BGL_compute_agg_ranklist_serial_do (const ADIOI_BGL_ConfInfo_t *confInfo, 
-					  ADIOI_BGL_ProcInfo_t       *all_procInfo, 
-					  int *aggrsInPset, 
-					  int *tmp_ranklist)
-{
-    int i, j;
-
-    /* a list of the numbers of all the PSETS */
-    int *psetNumList = (int *) ADIOI_Malloc ( confInfo->nProcs * sizeof(int) );
-
-  /* sweep through all processes' records, collect the numbers of all the PSETS. 
-   * The reason for not doing MIN, MAX is that the owned PSETs may not have contiguous numbers */
-    int n_psets=0;
-    for (i=0; i<confInfo->nProcs; i++) {
-
-	ADIOI_BGL_ProcInfo_t *info_p = all_procInfo+i;
-
-	int exist = 0;
-	for (j=n_psets-1; j>=0; j--) 
-	    if (info_p->psetNum == psetNumList[j]) { exist=1; break; }
-
-	if (!exist) {
-	    psetNumList [n_psets] = info_p->psetNum;
-	    n_psets ++;
-	}
-    }
-
-  /* bucket sort:  put the CN nodes into ordered buckets, each of which represents a PSET */
-
-    /* bucket space for bucket sort */
-    ADIOI_BGL_ProcInfo_t *sorted_procInfo = ADIOI_BGL_ProcInfo_new_n ( n_psets * confInfo->virtualPsetSize );
-    int *PsetIdx = (int *) ADIOI_Malloc ( n_psets * sizeof(int) );
-    AD_BGL_assert ( (PsetIdx != NULL) );
-
-    /* initialize bucket pointer */
-    for (i=0; i<n_psets; i++) {
-        PsetIdx[i] = i*confInfo->virtualPsetSize;
-    }
-
-    /* sort */
-    for (i=0; i<confInfo->nProcs; i++) {
-        int pset_id = all_procInfo[i].psetNum;
-
-	for (j=n_psets-1; j>=0; j--) if (pset_id == psetNumList[j]) break;
-	AD_BGL_assert ( (j >= 0) ); 				/* got to find a PSET bucket */
-
-        sorted_procInfo[ PsetIdx[j] ++ ] = all_procInfo[i];
-    }
-
-    ADIOI_Free(psetNumList);
-
-  /* select a number of CN aggregators from each Pset */
-    int naggs = 0;
-    for (i=0; i<n_psets; i++) {
-
-	/* the number of CN in this PSET -- may not be a full PSET */
-        int nCN_in_pset = PsetIdx[i] - i*confInfo->virtualPsetSize;	
-
-	/* select aggregators and put them into tmp_ranklist contiguously. */
-	int local_naggs = ADIOI_BGL_select_agg_in_pset( confInfo, 
-				      sorted_procInfo + i*confInfo->virtualPsetSize, 
-				      nCN_in_pset, 
-				      tmp_ranklist + naggs);
-	aggrsInPset[i+1] = local_naggs;
-
-        naggs += local_naggs;
-    }
-        aggrsInPset[0] = n_psets;
-
-  /* leave */
-    ADIOI_Free ( PsetIdx );
-    ADIOI_BGL_ProcInfo_free ( sorted_procInfo );
-    return naggs;
-}
-
-/* 
- * compute aggregators ranklist and put it into fd->hints struct
- */ 
-static void 
-ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd, 
-					const ADIOI_BGL_ConfInfo_t *confInfo, 
-					ADIOI_BGL_ProcInfo_t *all_procInfo,
-					int *aggrsInPset )
-{
-#   if AGG_DEBUG
-    int i; 
-#   endif
-    int naggs; 
-    int *tmp_ranklist;
-
-  /* compute the ranklist of IO aggregators and put into tmp_ranklist */
-    tmp_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));
-
-#   if AGG_DEBUG
-    for (i=0; i<confInfo->nProcs; i++) {
-      DBG_FPRINTF(stderr, "\tcpuid %1d, rank = %6d\n", all_procInfo[i].cpuid, all_procInfo[i].rank );
-    }
-#   endif
-
-    naggs = 
-    ADIOI_BGL_compute_agg_ranklist_serial_do (confInfo, all_procInfo, aggrsInPset, tmp_ranklist);
-
-#   define VERIFY 0
-#   if VERIFY
-    DBG_FPRINTF(stderr, "\tconfInfo = %3d,%3d,%3d,%3d,%3d,%3d,%.4f; naggs = %d\n", 
-	    confInfo->PsetSize        ,
-	    confInfo->numPsets        ,
-	    confInfo->isVNM           ,
-	    confInfo->virtualPsetSize ,
-	    confInfo->nProcs          ,
-	    confInfo->nAggrs          ,
-	    confInfo->aggRatio        ,
-	    naggs );
-#   endif
-
-#   if AGG_DEBUG
-    for (i=0; i<naggs; i++) {
-      DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
-    }
-#   endif
-
-  /* copy the ranklist of IO aggregators to fd->hints */
-    if(fd->hints->ranklist != NULL) ADIOI_Free (fd->hints->ranklist);
-
-    fd->hints->cb_nodes = naggs;
-    fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
-    memcpy( fd->hints->ranklist, tmp_ranklist, naggs*sizeof(int) );
-
-  /* */
-    ADIOI_Free( tmp_ranklist );
-    return;
-}
-
-/* Description from common/ad_aggregate.c.  (Does it completely apply to bgl?)
- * ADIOI_Calc_aggregator()
- *
- * The intention here is to implement a function which provides basically 
- * the same functionality as in Rajeev's original version of 
- * ADIOI_Calc_my_req().  He used a ceiling division approach to assign the 
- * file domains, and we use the same approach here when calculating the
- * location of an offset/len in a specific file domain.  Further we assume
- * this same distribution when calculating the rank_index, which is later
- *  used to map to a specific process rank in charge of the file domain.
- *
- * A better (i.e. more general) approach would be to use the list of file
- * domains only.  This would be slower in the case where the
- * original ceiling division was used, but it would allow for arbitrary
- * distributions of regions to aggregators.  We'd need to know the 
- * nprocs_for_coll in that case though, which we don't have now.
- *
- * Note a significant difference between this function and Rajeev's old code:
- * this code doesn't necessarily return a rank in the range
- * 0..nprocs_for_coll; instead you get something in 0..nprocs.  This is a
- * result of the rank mapping; any set of ranks in the communicator could be
- * used now.
- *
- * Returns an integer representing a rank in the collective I/O communicator.
- *
- * The "len" parameter is also modified to indicate the amount of data
- * actually available in this file domain.
- */
-/* 
- * This is more general aggregator search function which does not base on the assumption
- * that each aggregator hosts the file domain with the same size 
- */
-int ADIOI_BGL_Calc_aggregator(ADIO_File fd,
-			      ADIO_Offset off,
-			      ADIO_Offset min_off,
-			      ADIO_Offset *len,
-			      ADIO_Offset fd_size,
-			      ADIO_Offset *fd_start,
-			      ADIO_Offset *fd_end)
-{
-    int rank_index, rank;
-    ADIO_Offset avail_bytes;
-
-    AD_BGL_assert ( (off <= fd_end[fd->hints->cb_nodes-1] && off >= min_off && fd_start[0] >= min_off ) );
-
-    /* binary search --> rank_index is returned */
-    int ub = fd->hints->cb_nodes;
-    int lb = 0;
-    /* get an index into our array of aggregators */
-    /* Common code for striping - bgl doesn't use it but it's
-       here to make diff'ing easier.
-    rank_index = (int) ((off - min_off + fd_size)/ fd_size - 1);
-
-    if (fd->hints->striping_unit > 0) {
-        * wkliao: implementation for file domain alignment
-           fd_start[] and fd_end[] have been aligned with file lock
-	   boundaries when returned from ADIOI_Calc_file_domains() so cannot
-	   just use simple arithmatic as above *
-        rank_index = 0;
-        while (off > fd_end[rank_index]) rank_index++;
-    } 
-    bgl does it's own striping below 
-    */
-    rank_index = fd->hints->cb_nodes / 2;
-    while ( off < fd_start[rank_index] || off > fd_end[rank_index] ) {
-	if ( off > fd_end  [rank_index] ) {
-	    lb = rank_index;
-	    rank_index = (rank_index + ub) / 2;
-	}
-	else 
-	if ( off < fd_start[rank_index] ) {
-	    ub = rank_index;
-	    rank_index = (rank_index + lb) / 2;
-	}
-    }
-    /* we index into fd_end with rank_index, and fd_end was allocated to be no
-     * bigger than fd->hins->cb_nodes.   If we ever violate that, we're
-     * overrunning arrays.  Obviously, we should never ever hit this abort */
-    if (rank_index >= fd->hints->cb_nodes || rank_index < 0) {
-        FPRINTF(stderr, "Error in ADIOI_Calc_aggregator(): rank_index(%d) >= fd->hints->cb_nodes (%d) fd_size=%lld off=%lld\n",
-			rank_index,fd->hints->cb_nodes,fd_size,off);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    // DBG_FPRINTF ("ADIOI_BGL_Calc_aggregator: rank_index = %d\n", rank_index );
-
-    /* 
-     * remember here that even in Rajeev's original code it was the case that
-     * different aggregators could end up with different amounts of data to
-     * aggregate.  here we use fd_end[] to make sure that we know how much
-     * data this aggregator is working with.  
-     *
-     * the +1 is to take into account the end vs. length issue.
-     */
-    avail_bytes = fd_end[rank_index] + 1 - off;
-    if (avail_bytes < *len && avail_bytes > 0) {
-        /* this file domain only has part of the requested contig. region */
-
-        *len = avail_bytes;
-    }
-
-    /* map our index to a rank */
-    /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
-    rank = fd->hints->ranklist[rank_index];
-
-    return rank;
-}
-
-/* 
- * Compute a dynamic access range based file domain partition among I/O aggregators,
- * which align to the GPFS block size
- * Divide the I/O workload among "nprocs_for_coll" processes. This is
- * done by (logically) dividing the file into file domains (FDs); each
- * process may directly access only its own file domain. 
- * Additional effort is to make sure that each I/O aggregator get
- * a file domain that aligns to the GPFS block size.  So, there will 
- * not be any false sharing of GPFS file blocks among multiple I/O nodes. 
- *  
- * The common version of this now accepts a min_fd_size and striping_unit. 
- * It doesn't seem necessary here (using GPFS block sizes) but keep it in mind
- * (e.g. we could pass striping unit instead of using fs_ptr->blksize). 
- */
-void ADIOI_BGL_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
-                                      ADIO_Offset *end_offsets,
-                                      int          nprocs,
-                                      int          nprocs_for_coll,
-                                      ADIO_Offset *min_st_offset_ptr,
-                                      ADIO_Offset **fd_start_ptr,
-                                      ADIO_Offset **fd_end_ptr,
-                                      ADIO_Offset *fd_size_ptr,
-                                      void        *fs_ptr)
-{
-    ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
-    int i, aggr;
-
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5004, 0, NULL);
-#endif
-
-#   if AGG_DEBUG
-    static char myname[] = "ADIOI_BGL_GPFS_Calc_file_domains";
-    DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n", 
-	    myname,__LINE__,nprocs_for_coll);
-#   endif
-    __blksize_t blksize = 1048576; /* default to 1M */
-    if(fs_ptr && ((ADIOI_BGL_fs*)fs_ptr)->blksize) /* ignore null ptr or 0 blksize */
-      blksize = ((ADIOI_BGL_fs*)fs_ptr)->blksize;
-#   if AGG_DEBUG
-    DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
-#   endif
-/* find min of start offsets and max of end offsets of all processes */
-    min_st_offset  = st_offsets [0];
-    max_end_offset = end_offsets[0];
-    for (i=1; i<nprocs; i++) {
-        min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
-        max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
-    }
-
-    // DBG_FPRINTF(stderr, "_calc_file_domains, min_st_offset, max_ = %qd, %qd\n", min_st_offset, max_end_offset );
-
-    /* determine the "file domain (FD)" of each process, i.e., the portion of
-       the file that will be "owned" by each process */
-
-    ADIO_Offset gpfs_ub       = (max_end_offset +blksize-1) / blksize * blksize - 1;
-    ADIO_Offset gpfs_lb       = min_st_offset / blksize * blksize;
-    ADIO_Offset gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset;
-    ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize;
-    ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;
-
-    int         naggs    = nprocs_for_coll;
-
-    /* Tweak the file domains so that no fd is smaller than a threshold.  We
-     * have to strike a balance between efficency and parallelism: somewhere
-     * between 10k processes sending 32-byte requests and one process sending a
-     * 320k request is a (system-dependent) sweet spot 
-     
-    This is from the common code - the new min_fd_size parm that we didn't implement. 
-    (And common code uses a different declaration of fd_size so beware)  */
-     
-
-    /* this is not entirely sufficient on BlueGene: we must be mindful of
-     * imbalance over psets.  the hint processing code has already picked, say,
-     * 8 processors per pset, so if we go increasing fd_size we'll end up with
-     * some psets with 8 processors and some psets with none.  */
-    /*
-    if (fd_size < min_fd_size)
-        fd_size = min_fd_size;
-	*/
-    fd_size              = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
-    *fd_start_ptr        = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
-    *fd_end_ptr          = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
-    fd_start             = *fd_start_ptr;
-    fd_end               = *fd_end_ptr;
-
-    ADIO_Offset n_gpfs_blk    = fd_gpfs_range / blksize;
-    ADIO_Offset nb_cn_small   = n_gpfs_blk/naggs;
-    ADIO_Offset naggs_large   = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
-    ADIO_Offset naggs_small   = naggs - naggs_large;
-
-    /* nb_cn_small * blksize: evenly split file domain among processors:
-     *      equivalent to fd_gpfs_rnage/naggs 
-     * (nb_cn_small+1) * blksize: keeps file domain at least 'blksize' big
-     */
-    for (i=0; i<naggs; i++)
-        if (i < naggs_small) fd_size[i] = nb_cn_small     * blksize;
-			else fd_size[i] = (nb_cn_small+1) * blksize;
-			/*potential optimization: if n_gpfs_blk smalller than
-			 * naggs, slip in some zero-sized file
-			 * domains to spread the work across all psets.  */
-
-#   if AGG_DEBUG
-     DBG_FPRINTF(stderr,"%s(%d): "
-                   "gpfs_ub       %llu, "
-                   "gpfs_lb       %llu, "
-                   "gpfs_ub_rdoff %llu, "
-                   "gpfs_lb_rdoff %llu, "
-                   "fd_gpfs_range %llu, "
-                   "n_gpfs_blk    %llu, "
-                   "nb_cn_small   %llu, "
-                   "naggs_large   %llu, "
-                   "naggs_small   %llu, "
-                   "\n",
-                   myname,__LINE__,
-                   gpfs_ub      ,
-                   gpfs_lb      ,
-                   gpfs_ub_rdoff,
-                   gpfs_lb_rdoff,
-                   fd_gpfs_range,
-                   n_gpfs_blk   ,
-                   nb_cn_small  ,
-                   naggs_large  ,
-                   naggs_small
-                   );
-#   endif
-
-    fd_size[0]       -= gpfs_lb_rdoff;
-    fd_size[naggs-1] -= gpfs_ub_rdoff;
-
-    /* compute the file domain for each aggr */
-    ADIO_Offset offset = min_st_offset;
-    for (aggr=0; aggr<naggs; aggr++) {
-        fd_start[aggr] = offset;
-        fd_end  [aggr] = offset + fd_size[aggr] - 1;
-        offset += fd_size[aggr];
-    }
-
-    *fd_size_ptr = fd_size[0];
-    *min_st_offset_ptr = min_st_offset;
-
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5005, 0, NULL);
-#endif
-    ADIOI_Free (fd_size);
-}
-
-/* 
- * When a process is an IO aggregator, this will return its index in the aggrs list.
- * Otherwise, this will return -1 
- */
-int ADIOI_BGL_Aggrs_index( ADIO_File fd, int myrank )
-{
-    int i;
-    for (i=0; i<fd->hints->cb_nodes; i++) 
-	if (fd->hints->ranklist[i] == myrank) return i;
-    return -1;
-}
-
-/* 
- * ADIOI_BGL_Calc_my_req() overrides ADIOI_Calc_my_req for the default implementation 
- * is specific for static file domain partitioning.
- *
- * ADIOI_Calc_my_req() - calculate what portions of the access requests
- * of this process are located in the file domains of various processes
- * (including this one)
- */
-void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list, 
-			   int contig_access_count, ADIO_Offset 
-			   min_st_offset, ADIO_Offset *fd_start,
-			   ADIO_Offset *fd_end, ADIO_Offset fd_size,
-			   int nprocs,
-			   int *count_my_req_procs_ptr,
-			   int **count_my_req_per_proc_ptr,
-			   ADIOI_Access **my_req_ptr,
-			   int **buf_idx_ptr)
-/* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets? 
-   They are used as memory buffer indices so it seems like the 2G limit is in effect */
-{
-    int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
-    int i, l, proc;
-    ADIO_Offset fd_len, rem_len, curr_idx, off;
-    ADIOI_Access *my_req;
-
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5024, 0, NULL);
-#endif
-
-    *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int)); 
-    count_my_req_per_proc = *count_my_req_per_proc_ptr;
-/* count_my_req_per_proc[i] gives the no. of contig. requests of this
-   process in process i's file domain. calloc initializes to zero.
-   I'm allocating memory of size nprocs, so that I can do an 
-   MPI_Alltoall later on.*/
-
-    buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int));
-/* buf_idx is relevant only if buftype_is_contig.
-   buf_idx[i] gives the index into user_buf where data received
-   from proc. i should be placed. This allows receives to be done
-   without extra buffer. This can't be done if buftype is not contig. */
-   
-    /* initialize buf_idx to -1 */
-    for (i=0; i < nprocs; i++) buf_idx[i] = -1;
-
-    /* one pass just to calculate how much space to allocate for my_req;
-     * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
-     */
-    for (i=0; i < contig_access_count; i++) {
-	/* short circuit offset/len processing if len == 0 
-	 * 	(zero-byte  read/write */
-	if (len_list[i] == 0) 
-		continue;
-	off = offset_list[i];
-	fd_len = len_list[i];
-	/* note: we set fd_len to be the total size of the access.  then
-	 * ADIOI_Calc_aggregator() will modify the value to return the 
-	 * amount that was available from the file domain that holds the
-	 * first part of the access.
-	 */
-	proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, 
-				     fd_start, fd_end);
-	count_my_req_per_proc[proc]++;
-
-	/* figure out how much data is remaining in the access (i.e. wasn't 
-	 * part of the file domain that had the starting byte); we'll take 
-	 * care of this data (if there is any) in the while loop below.
-	 */
-	rem_len = len_list[i] - fd_len;
-
-	while (rem_len > 0) {
-	    off += fd_len; /* point to first remaining byte */
-	    fd_len = rem_len; /* save remaining size, pass to calc */
-	    proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, 
-					 fd_size, fd_start, fd_end);
-
-	    count_my_req_per_proc[proc]++;
-	    rem_len -= fd_len; /* reduce remaining length by amount from fd */
-	}
-    }
-
-/* now allocate space for my_req, offset, and len */
-
-    *my_req_ptr = (ADIOI_Access *)
-	ADIOI_Malloc(nprocs*sizeof(ADIOI_Access)); 
-    my_req = *my_req_ptr;
-
-    count_my_req_procs = 0;
-    for (i=0; i < nprocs; i++) {
-	if (count_my_req_per_proc[i]) {
-	    my_req[i].offsets = (ADIO_Offset *)
-		ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
-	    my_req[i].lens = (int *)
-		ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(int));
-	    count_my_req_procs++;
-	}	    
-	my_req[i].count = 0;  /* will be incremented where needed
-				      later */
-    }
-
-/* now fill in my_req */
-    curr_idx = 0;
-    for (i=0; i<contig_access_count; i++) { 
-	/* short circuit offset/len processing if len == 0 
-	 * 	(zero-byte  read/write */
-	if (len_list[i] == 0)
-		continue;
-	off = offset_list[i];
-	fd_len = len_list[i];
-	proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, 
-				     fd_start, fd_end);
-
-	/* for each separate contiguous access from this process */
-	if (buf_idx[proc] == -1)
-  {
-    ADIOI_Assert(curr_idx == (int) curr_idx);
-    buf_idx[proc] = (int) curr_idx;
-  }
-
-	l = my_req[proc].count;
-	curr_idx += fd_len;
-
-	rem_len = len_list[i] - fd_len;
-
-	/* store the proc, offset, and len information in an array
-         * of structures, my_req. Each structure contains the 
-         * offsets and lengths located in that process's FD, 
-	 * and the associated count. 
-	 */
-	my_req[proc].offsets[l] = off;
-  ADIOI_Assert(fd_len == (int) fd_len);
-	my_req[proc].lens[l] = (int) fd_len;
-	my_req[proc].count++;
-
-	while (rem_len > 0) {
-	    off += fd_len;
-	    fd_len = rem_len;
-	    proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, 
-					 fd_size, fd_start, fd_end);
-
-	    if (buf_idx[proc] == -1) 
-      {
-        ADIOI_Assert(curr_idx == (int) curr_idx);
-        buf_idx[proc] = (int) curr_idx;
-      }
-
-	    l = my_req[proc].count;
-	    curr_idx += fd_len;
-	    rem_len -= fd_len;
-
-	    my_req[proc].offsets[l] = off;
-      ADIOI_Assert(fd_len == (int) fd_len);
-	    my_req[proc].lens[l] = (int) fd_len;
-	    my_req[proc].count++;
-	}
-    }
-
-#ifdef AGG_DEBUG
-    for (i=0; i<nprocs; i++) {
-	if (count_my_req_per_proc[i] > 0) {
-	    DBG_FPRINTF(stderr, "data needed from %d (count = %d):\n", i, 
-		    my_req[i].count);
-	    for (l=0; l < my_req[i].count; l++) {
-		DBG_FPRINTF(stderr, "   off[%d] = %lld, len[%d] = %d\n", l,
-			my_req[i].offsets[l], l, my_req[i].lens[l]);
-	    }
-	}
-	DBG_FPRINTF(stderr, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
-    }
-#endif
-
-    *count_my_req_procs_ptr = count_my_req_procs;
-    *buf_idx_ptr = buf_idx;
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5025, 0, NULL);
-#endif
-}
-
-/*
- * ADIOI_Calc_others_req (copied to bgl and switched to all to all for performance)
- *
- * param[in]  count_my_req_procs        Number of processes whose file domain my
- *                                        request touches.
- * param[in]  count_my_req_per_proc     count_my_req_per_proc[i] gives the no. of 
- *                                        contig. requests of this process in 
- *                                        process i's file domain.
- * param[in]  my_req                    A structure defining my request
- * param[in]  nprocs                    Number of nodes in the block
- * param[in]  myrank                    Rank of this node
- * param[out] count_others_req_proc_ptr Number of processes whose requests lie in
- *                                        my process's file domain (including my 
- *                                        process itself)
- * param[out] others_req_ptr            Array of other process' requests that lie
- *                                        in my process's file domain
- */
-void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs, 
-				int *count_my_req_per_proc,
-				ADIOI_Access *my_req, 
-				int nprocs, int myrank,
-				int *count_others_req_procs_ptr,
-				ADIOI_Access **others_req_ptr)  
-{
-/* determine what requests of other processes lie in this process's
-   file domain */
-
-/* count_others_req_procs = number of processes whose requests lie in
-   this process's file domain (including this process itself) 
-   count_others_req_per_proc[i] indicates how many separate contiguous
-   requests of proc. i lie in this process's file domain. */
-
-    int *count_others_req_per_proc, count_others_req_procs;
-    int i;
-    ADIOI_Access *others_req;
-    
-    /* Parameters for MPI_Alltoallv */
-    int *scounts, *sdispls, *rcounts, *rdispls;
-
-    /* Parameters for MPI_Alltoallv.  These are the buffers, which
-     * are later computed to be the lowest address of all buffers
-     * to be sent/received for offsets and lengths.  Initialize to
-     * the highest possible address which is the current minimum.
-     */
-    void *sendBufForOffsets=(void*)0xFFFFFFFF, 
-	 *sendBufForLens   =(void*)0xFFFFFFFF, 
-	 *recvBufForOffsets=(void*)0xFFFFFFFF, 
-	 *recvBufForLens   =(void*)0xFFFFFFFF; 
-
-/* first find out how much to send/recv and from/to whom */
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5026, 0, NULL);
-#endif
-    /* Send 1 int to each process.  count_my_req_per_proc[i] is the number of 
-     * requests that my process will do to the file domain owned by process[i].
-     * Receive 1 int from each process.  count_others_req_per_proc[i] is the number of
-     * requests that process[i] will do to the file domain owned by my process.
-     */
-    count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
-/*     cora2a1=timebase(); */
-    MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
-		 count_others_req_per_proc, 1, MPI_INT, fd->comm);
-/*     total_cora2a+=timebase()-cora2a1; */
-
-    /* Allocate storage for an array of other nodes' accesses of our
-     * node's file domain.  Also allocate storage for the alltoallv
-     * parameters.
-     */
-    *others_req_ptr = (ADIOI_Access *)
-	ADIOI_Malloc(nprocs*sizeof(ADIOI_Access)); 
-    others_req = *others_req_ptr;
-
-    scounts = ADIOI_Malloc(nprocs*sizeof(int));
-    sdispls = ADIOI_Malloc(nprocs*sizeof(int));
-    rcounts = ADIOI_Malloc(nprocs*sizeof(int));
-    rdispls = ADIOI_Malloc(nprocs*sizeof(int));
-
-    /* If process[i] has any requests in my file domain,
-     *   initialize an ADIOI_Access structure that will describe each request
-     *   from process[i].  The offsets, lengths, and buffer pointers still need
-     *   to be obtained to complete the setting of this structure.
-     */
-    count_others_req_procs = 0;
-    for (i=0; i<nprocs; i++) {
-	if (count_others_req_per_proc[i]) {
-	    others_req[i].count = count_others_req_per_proc[i];
-
-	    others_req[i].offsets = (ADIO_Offset *)
-		ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(ADIO_Offset));
-	    others_req[i].lens = (int *)
-		ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(int)); 
-
-	    if ( (MPIR_Upint)others_req[i].offsets < (MPIR_Upint)recvBufForOffsets )
-		recvBufForOffsets = others_req[i].offsets;
-	    if ( (MPIR_Upint)others_req[i].lens < (MPIR_Upint)recvBufForLens )
-		recvBufForLens = others_req[i].lens;
-
-	    others_req[i].mem_ptrs = (MPI_Aint *)
-		ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(MPI_Aint)); 
-
-	    count_others_req_procs++;
-	}
-	else 
-	{
-	    others_req[i].count = 0;
-	    others_req[i].offsets = NULL;
-	    others_req[i].lens    = NULL;
-	}
-    }
-    /* If no recv buffer was allocated in the loop above, make it NULL */
-    if ( recvBufForOffsets == (void*)0xFFFFFFFF) recvBufForOffsets = NULL;
-    if ( recvBufForLens    == (void*)0xFFFFFFFF) recvBufForLens    = NULL;
-    
-    /* Now send the calculated offsets and lengths to respective processes */
-
-    /************************/
-    /* Exchange the offsets */
-    /************************/
-
-    /* Determine the lowest sendBufForOffsets/Lens */
-    for (i=0; i<nprocs; i++)
-    {
-	if ( (my_req[i].count) &&
-	     ((MPIR_Upint)my_req[i].offsets <= (MPIR_Upint)sendBufForOffsets) )
-	  sendBufForOffsets = my_req[i].offsets;
-	   
-	if ( (my_req[i].count) &&
-	     ((MPIR_Upint)my_req[i].lens <= (MPIR_Upint)sendBufForLens) )
-	    sendBufForLens = my_req[i].lens;
-    }
-
-    /* If no send buffer was found in the loop above, make it NULL */
-    if ( sendBufForOffsets == (void*)0xFFFFFFFF) sendBufForOffsets = NULL;
-    if ( sendBufForLens    == (void*)0xFFFFFFFF) sendBufForLens    = NULL;
-
-    /* Calculate the displacements from the sendBufForOffsets/Lens */
-    for (i=0; i<nprocs; i++)
-    {
-	// Send these offsets to process i.
-	scounts[i] = count_my_req_per_proc[i];
-	if ( scounts[i] == 0 )
-	    sdispls[i] = 0;
-	else
-  	  sdispls[i] =  (int)
-	                ( ( (MPIR_Upint)my_req[i].offsets - 
-			   (MPIR_Upint)sendBufForOffsets ) / 
-			  (MPIR_Upint)sizeof(ADIO_Offset) );
-
-	// Receive these offsets from process i.
-	rcounts[i] = count_others_req_per_proc[i];
-	if ( rcounts[i] == 0 )
-	    rdispls[i] = 0;
-	else
-	    rdispls[i] = (int)
-	                 ( ( (MPIR_Upint)others_req[i].offsets - 
-			     (MPIR_Upint)recvBufForOffsets ) / 
-			   (MPIR_Upint)sizeof(ADIO_Offset) );
-    }
-
-    /* Exchange the offsets */
-    MPI_Alltoallv(sendBufForOffsets,
-		  scounts, sdispls, ADIO_OFFSET,
-		  recvBufForOffsets,
-		  rcounts, rdispls, ADIO_OFFSET,
-		  fd->comm);
-
-    /************************/
-    /* Exchange the lengths */
-    /************************/
-
-    for (i=0; i<nprocs; i++)
-    {
-	// Send these lengths to process i.
-	scounts[i] = count_my_req_per_proc[i];
-	if ( scounts[i] == 0 )
-	    sdispls[i] = 0;
-	else
-	  sdispls[i] = (int)
-	               ( ( (MPIR_Upint)my_req[i].lens - 
-			   (MPIR_Upint)sendBufForLens ) / 
-			 (MPIR_Upint) sizeof(int) );
-	
-	// Receive these offsets from process i.
-	rcounts[i] = count_others_req_per_proc[i];
-	if ( rcounts[i] == 0 )
-	    rdispls[i] = 0;
-	else
-	    rdispls[i] = (int)
-	                 ( ( (MPIR_Upint)others_req[i].lens - 
-			     (MPIR_Upint)recvBufForLens ) / 
-			   (MPIR_Upint) sizeof(int) );
-    }
-
-    /* Exchange the lengths */
-    MPI_Alltoallv(sendBufForLens,
-		  scounts, sdispls, MPI_INT,
-		  recvBufForLens,
-		  rcounts, rdispls, MPI_INT,
-		  fd->comm);
-
-    /* Clean up */
-    ADIOI_Free(count_others_req_per_proc);
-    ADIOI_Free (scounts);
-    ADIOI_Free (sdispls);
-    ADIOI_Free (rcounts);
-    ADIOI_Free (rdispls);
-
-    *count_others_req_procs_ptr = count_others_req_procs;
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5027, 0, NULL);
-#endif
-}
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_aggrs.h b/src/mpi/romio/adio/ad_bgl/ad_bgl_aggrs.h
deleted file mode 100644
index ffa9ec0..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_aggrs.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_aggrs.h
- * \brief ???
- */
-
-/* 
- * File: ad_bgl_aggrs.h
- * 
- * Declares functions specific for BG/L - GPFS parallel I/O solution. The implemented optimizations are:
- * 	. Aligned file-domain partitioning, integrated in 7/28/2005
- * 
- * In addition, following optimizations are planned:
- * 	. Integrating multiple file-domain partitioning schemes 
- *	  (corresponding to Alok Chouhdary's persistent file domain work).
- */
-
-#ifndef AD_BGL_AGGRS_H_
-#define AD_BGL_AGGRS_H_
-
-#include "adio.h"
-#include <sys/stat.h>
-
-#if !defined(GPFS_SUPER_MAGIC)
-  #define GPFS_SUPER_MAGIC (0x47504653)
-#endif
-
-#if !defined(PVFS2_SUPER_MAGIC)
-  #define PVFS2_SUPER_MAGIC (0x20030528)
-#endif
-
-    /* File system (BGL) specific information - 
-         hung off of ADIOI_FileD file descriptor (fd->fs_ptr) at open */
-    typedef struct ADIOI_BGL_fs_s {
-      __blksize_t blksize;
-      int         fsync_aggr; /* "fsync aggregation" flags (below) */
-#define ADIOI_BGL_FSYNC_AGGREGATION_DISABLED  0x00
-#define ADIOI_BGL_FSYNC_AGGREGATION_ENABLED   0x01
-#define ADIOI_BGL_FSYNC_AGGREGATOR            0x10 /* This rank is an aggregator */
-    }  ADIOI_BGL_fs;
-
-    /* generate a list of I/O aggregators that utilizes BGL-PSET orginization. */
-    int ADIOI_BGL_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset);
-
-    /* overriding ADIOI_Calc_file_domains() to apply 'aligned file domain partitioning'. */
-    void ADIOI_BGL_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
-				          ADIO_Offset *end_offsets,
-				          int          nprocs,
-				          int          nprocs_for_coll,
-				          ADIO_Offset *min_st_offset_ptr,
-				          ADIO_Offset **fd_start_ptr,
-				          ADIO_Offset **fd_end_ptr,
-				          ADIO_Offset *fd_size_ptr,
-                  void        *fs_ptr);
-
-    /* a utilitiy function for debugging */
-    int ADIOI_BGL_Aggrs_index(ADIO_File fd, int myrank );
-
-    /* overriding ADIOI_Calc_aggregator() for the default implementation is specific for 
-       static file domain partitioning */
-    int ADIOI_BGL_Calc_aggregator(ADIO_File fd,
-				  ADIO_Offset off,
-				  ADIO_Offset min_off,
-				  ADIO_Offset *len,
-				  ADIO_Offset fd_size,
-				  ADIO_Offset *fd_start,
-				  ADIO_Offset *fd_end);
-
-    /* overriding ADIOI_Calc_my_req for the default implementation is specific for 
-       static file domain partitioning */
-    void ADIOI_BGL_Calc_my_req ( ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
-				 int contig_access_count, ADIO_Offset
-				 min_st_offset, ADIO_Offset *fd_start,
-				 ADIO_Offset *fd_end, ADIO_Offset fd_size,
-				 int nprocs,
-				 int *count_my_req_procs_ptr,
-				 int **count_my_req_per_proc_ptr,
-				 ADIOI_Access **my_req_ptr,
-				 int **buf_idx_ptr);
-
-    /*
-     * ADIOI_Calc_others_req
-     *
-     * param[in]  count_my_req_procs        Number of processes whose file domain my
-     *                                        request touches.
-     * param[in]  count_my_req_per_proc     count_my_req_per_proc[i] gives the no. of 
-     *                                        contig. requests of this process in 
-     *                                        process i's file domain.
-     * param[in]  my_req                    A structure defining my request
-     * param[in]  nprocs                    Number of nodes in the block
-     * param[in]  myrank                    Rank of this node
-     * param[out] count_others_req_proc_ptr Number of processes whose requests lie in
-     *                                        my process's file domain (including my 
-     *                                        process itself)
-     * param[out] others_req_ptr            Array of other process' requests that lie
-     *                                        in my process's file domain
-     */
-     void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs, 
-				    int *count_my_req_per_proc,
-				    ADIOI_Access *my_req, 
-				    int nprocs, int myrank,
-				    int *count_others_req_procs_ptr,
-				    ADIOI_Access **others_req_ptr);
-
-
-#endif  /* AD_BGL_AGGRS_H_ */
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_close.c b/src/mpi/romio/adio/ad_bgl/ad_bgl_close.c
deleted file mode 100644
index 8fcf857..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_close.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_close.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "ad_bgl.h"
-#include "ad_bgl_aggrs.h"
-
-void ADIOI_BGL_Close(ADIO_File fd, int *error_code)
-{
-  int err, derr=0;
-  static char myname[] = "ADIOI_BGL_CLOSE";
-
-#ifdef PROFILE
-  MPE_Log_event(9, 0, "start close");
-#endif
-
-  err = close(fd->fd_sys);
-  if (fd->fd_direct >= 0)
-  {
-    derr = close(fd->fd_direct);
-  }
-
-#ifdef PROFILE
-  MPE_Log_event(10, 0, "end close");
-#endif
-
-/*  FPRINTF(stderr,"%s(%d):'%s'. Free %#X\n",myname,__LINE__,fd->filename,(int)fd->fs_ptr);*/
-  if (fd->fs_ptr != NULL) {
-	  ADIOI_Free(fd->fs_ptr);
-	  fd->fs_ptr = NULL;
-  }
-  fd->fd_sys    = -1;
-  fd->fd_direct = -1;
-
-  if (err == -1 || derr == -1)
-  {
-    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
-                                       myname, __LINE__, MPI_ERR_IO,
-                                       "**io",
-                                       "**io %s", strerror(errno));
-  }
-  else *error_code = MPI_SUCCESS;
-}
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_fcntl.c b/src/mpi/romio/adio/ad_bgl/ad_bgl_fcntl.c
deleted file mode 100644
index afae2c2..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_fcntl.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_fcntl.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "ad_bgl.h"
-#include "adio_extern.h"
-/* #ifdef MPISGI
-#include "mpisgi2.h"
-#endif */
-
-void ADIOI_BGL_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
-		     int *error_code)
-{
-    static char myname[] = "ADIOI_BGL_FCNTL";
-
-    switch(flag) {
-    case ADIO_FCNTL_GET_FSIZE:
-	fcntl_struct->fsize = lseek(fd->fd_sys, 0, SEEK_END);
-	if (fd->fp_sys_posn != -1) 
-	     lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
-	if (fcntl_struct->fsize == -1) {
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_IO, "**io",
-					       "**io %s", strerror(errno));
-	}
-	else *error_code = MPI_SUCCESS;
-	break;
-
-    case ADIO_FCNTL_SET_DISKSPACE:
-	ADIOI_GEN_Prealloc(fd, fcntl_struct->diskspace, error_code);
-	break;
-
-    case ADIO_FCNTL_SET_ATOMICITY:
-	fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1;
-	*error_code = MPI_SUCCESS;
-	break;
-
-	/* --BEGIN ERROR HANDLING-- */
-    default:
-	*error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					   MPIR_ERR_RECOVERABLE,
-					   myname, __LINE__,
-					   MPI_ERR_ARG,
-					   "**flag", "**flag %d", flag);
-	/* --END ERROR HANDLING-- */
-    }
-}
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_flush.c b/src/mpi/romio/adio/ad_bgl/ad_bgl_flush.c
deleted file mode 100644
index 97fd2ca..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_flush.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_flush.c
- * \brief Scalable flush based on underlying filesystem and psets
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "ad_bgl.h"
-#include "ad_bgl_aggrs.h"
-
-void ADIOI_BGL_Flush(ADIO_File fd, int *error_code)
-{
-  int err=0;
-  static char myname[] = "ADIOI_BGL_FLUSH";
-
-
-  if(((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr & ADIOI_BGL_FSYNC_AGGREGATION_ENABLED)
-  {
-    int rank;
- 
-    /* Barrier so we can collectively do fewer fsync's */
-    MPI_Barrier(fd->comm);
-  
-    MPI_Comm_rank(fd->comm, &rank);
-  
-    /* All ranks marked as "fsync aggregators" should fsync. 
-       (We currently only do one fsync on rank 0 but this is general 
-       enough to support >1 aggregator using allreduce to get the
-       results instead of simply bcast'ing the results from rank 0.)*/
-    if(((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr & ADIOI_BGL_FSYNC_AGGREGATOR)
-    {
-      err = fsync(fd->fd_sys);
-      DBG_FPRINTF(stderr,"aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
-      /* We want errno, not the return code if it failed */
-      if (err == -1) err = errno;
-      else err = 0;
-    }
-    /* Just pick an errno (using unsigned MPI_MAX) from any failures */
-    MPI_Allreduce( MPI_IN_PLACE, (unsigned*)&err, 1, MPI_UNSIGNED, MPI_MAX, fd->comm);
-    DBGV_FPRINTF(stderr,"aggregation result:fsync %s, errno %#X,\n",fd->filename, err);
-
-    if (err) /* if it's non-zero, it must be an errno */
-    {
-      errno = err;
-      err = -1;
-    }
-  }
-  else /* Non-aggregated fsync */
-  {
-#ifdef USE_DBG_LOGGING
-    int rank;
-#endif
-    err = fsync(fd->fd_sys);
-#ifdef USE_DBG_LOGGING
-    MPI_Comm_rank(fd->comm, &rank);
-
-    if(rank == 0)
-    {
-        DBG_FPRINTF(stderr,"no aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
-    }
-    else
-    {
-        DBGV_FPRINTF(stderr,"no aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
-    }
-#endif
-  }
-
-  /* --BEGIN ERROR HANDLING-- */
-  if (err == -1)
-  {
-    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
-                                       myname, __LINE__, MPI_ERR_IO,
-                                       "**io",
-                                       "**io %s", strerror(errno));
-    DBGT_FPRINTF(stderr,"fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
-    return;
-  }
-  /* --END ERROR HANDLING-- */
-
-  *error_code = MPI_SUCCESS;
-}
-
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_getsh.c b/src/mpi/romio/adio/ad_bgl/ad_bgl_getsh.c
deleted file mode 100644
index 4b47c35..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_getsh.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_getsh.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "ad_bgl.h"
-
-/* returns the current location of the shared_fp in terms of the
-   no. of etypes relative to the current view, and also increments the
-   shared_fp by the number of etypes to be accessed (incr) in the read
-   or write following this function. */
-
-void ADIOI_BGL_Get_shared_fp(ADIO_File fd, ADIO_Offset incr, ADIO_Offset *shared_fp,
-			 int *error_code)
-{
-    ADIO_Offset new_fp;
-    int err;
-    MPI_Comm dupcommself;
-    static char myname[] = "ADIOI_BGL_GET_SHARED_FP";
-
-    if (fd->shared_fp_fd == ADIO_FILE_NULL) {
-	MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
-	fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF, 
-				     dupcommself,
-				     fd->shared_fp_fname, 
-				     fd->file_system,
-				     fd->fns,
-				     ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE, 
-				     0, 
-				     MPI_BYTE, 
-				     MPI_BYTE, 
-				     MPI_INFO_NULL, 
-				     ADIO_PERM_NULL, 
-				     error_code);
-	if (*error_code != MPI_SUCCESS) return;
-	*shared_fp = 0;
-	ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-	err = read(fd->shared_fp_fd->fd_sys, shared_fp, sizeof(ADIO_Offset));
-        /* if the file is empty, the above read may return error
-           (reading beyond end of file). In that case, shared_fp = 0, 
-           set above, is the correct value. */
-    }
-    else {
-	ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-
-	err = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
-	if (err == 0) {
-	    err = read(fd->shared_fp_fd->fd_sys, shared_fp,
-		       sizeof(ADIO_Offset));
-	}
-	if (err == -1) {
-	    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_IO, "**io",
-					       "**io %s", strerror(errno));
-	    return;
-	}
-    }
-
-    new_fp = *shared_fp + incr;
-
-    err = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
-    if (err == 0) {
-	err = write(fd->shared_fp_fd->fd_sys, &new_fp, sizeof(ADIO_Offset));
-    }
-    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-    if (err == -1) {
-	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
-					   myname, __LINE__, MPI_ERR_IO,
-					   "**io",
-					   "**io %s", strerror(errno));
-    }
-    else *error_code = MPI_SUCCESS;
-}
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_hints.c b/src/mpi/romio/adio/ad_bgl/ad_bgl_hints.c
deleted file mode 100644
index e9f5a31..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_hints.c
+++ /dev/null
@@ -1,302 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_hints.c
- * \brief BlueGene hint processing
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "adio.h"
-#include "adio_extern.h"
-#include "hints_fn.h"
-
-#include "ad_bgl.h"
-#include "ad_bgl_pset.h"
-#include "ad_bgl_aggrs.h"
-
-#define   ADIOI_BGL_CB_BUFFER_SIZE_DFLT      	"16777216"
-#define	  ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT	"4194304"
-#define   ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT	"4194304"
-#define   ADIOI_BGL_NAGG_IN_PSET_HINT_NAME	"bgl_nodes_pset"
-/** \page mpiio_vars MPIIO Configuration
- *  
- * BlueGene MPIIO configuration and performance tuning. Used by ad_bgl and ad_bglockless ADIO's.
- *  
- * \section hint_sec Hints
- * - bgl_nodes_pset - Specify how many aggregators to use per pset.
- *   This hint will override the cb_nodes hint based on BlueGene psets.
- *   - N - Use N nodes per pset as aggregators.
- *   - Default is based on partition configuration and cb_nodes.
- *  
- *   The following default key/value pairs may differ from other platform defaults.
- *  
- *     - key = cb_buffer_size     value = 16777216
- *     - key = romio_cb_read      value = enable
- *     - key = romio_cb_write     value = enable
- *     - key = ind_rd_buffer_size value = 4194304
- *     - key = ind_wr_buffer_size value = 4194304
- */
-
-/* Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO. */
-extern int 
-ADIOI_BGL_gen_agg_ranklist(ADIO_File fd, int n_proxy_per_pset);
-
-void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
-{
-/* if fd->info is null, create a new info object. 
-   Initialize fd->info to default values.
-   Initialize fd->hints to default values.
-   Examine the info object passed by the user. If it contains values that
-   ROMIO understands, override the default. */
-
-    MPI_Info info;
-    char *value;
-    int flag, intval, tmp_val, nprocs=0;
-    static char myname[] = "ADIOI_BGL_SETINFO";
-
-    int did_anything = 0;
-
-    if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info));
-    info = fd->info;
-
-    /* Note that fd->hints is allocated at file open time; thus it is
-     * not necessary to allocate it, or check for allocation, here.
-     */
-
-    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
-    AD_BGL_assert ((value != NULL));
-
-    /* initialize info and hints to default values if they haven't been
-     * previously initialized
-     */
-    if (!fd->hints->initialized) {
-
-	did_anything = 1;
-
-	/* buffer size for collective I/O */
-	ADIOI_Info_set(info, "cb_buffer_size", ADIOI_BGL_CB_BUFFER_SIZE_DFLT); 
-	fd->hints->cb_buffer_size = atoi(ADIOI_BGL_CB_BUFFER_SIZE_DFLT);
-
-	/* default is to let romio automatically decide when to use
-	 * collective buffering
-	 */
-	ADIOI_Info_set(info, "romio_cb_read", "enable"); 
-	fd->hints->cb_read = ADIOI_HINT_ENABLE;
-	ADIOI_Info_set(info, "romio_cb_write", "enable"); 
-	fd->hints->cb_write = ADIOI_HINT_ENABLE;
-
-   	if ( fd->hints->cb_config_list != NULL ) ADIOI_Free (fd->hints->cb_config_list);
-	fd->hints->cb_config_list = NULL;
-
-	/* number of processes that perform I/O in collective I/O */
-	MPI_Comm_size(fd->comm, &nprocs);
-	ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", nprocs);
-	ADIOI_Info_set(info, "cb_nodes", value);
-	fd->hints->cb_nodes = -1;
-
-	/* hint indicating that no indep. I/O will be performed on this file */
-	ADIOI_Info_set(info, "romio_no_indep_rw", "false");
-	fd->hints->no_indep_rw = 0;
-
-	/* bgl is not implementing file realms (ADIOI_IOStridedColl),
-	   initialize to disabled it. 	   */
-	/* hint instructing the use of persistent file realms */
-	ADIOI_Info_set(info, "romio_cb_pfr", "disable");
-	fd->hints->cb_pfr = ADIOI_HINT_DISABLE;
-	
-	/* hint guiding the assignment of persistent file realms */
-	ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
-	fd->hints->cb_fr_type = ADIOI_FR_AAR;
-
-	/* hint to align file realms with a certain byte value */
-	ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
-	fd->hints->cb_fr_alignment = 1;
-
-	/* hint to set a threshold percentage for a datatype's size/extent at
-	 * which data sieving should be done in collective I/O */
-	ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
-	fd->hints->cb_ds_threshold = 0;
-
-	/* hint to switch between point-to-point or all-to-all for two-phase */
-	ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
-	fd->hints->cb_alltoall = ADIOI_HINT_AUTO;
-
-	 /* deferred_open derived from no_indep_rw and cb_{read,write} */
-	fd->hints->deferred_open = 0;
-
-	/* buffer size for data sieving in independent reads */
-	ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT);
-	fd->hints->ind_rd_buffer_size = atoi(ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT);
-
-	/* buffer size for data sieving in independent writes */
-	ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT);
-	fd->hints->ind_wr_buffer_size = atoi(ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT);
-
-  if(fd->file_system == ADIO_UFS)
-  {
-    /* default for ufs/pvfs is to disable data sieving  */
-    ADIOI_Info_set(info, "romio_ds_read", "disable"); 
-    fd->hints->ds_read = ADIOI_HINT_DISABLE;
-    ADIOI_Info_set(info, "romio_ds_write", "disable"); 
-    fd->hints->ds_write = ADIOI_HINT_DISABLE;
-  }
-  else
-  {
-    /* default is to let romio automatically decide when to use data
-     * sieving
-     */
-    ADIOI_Info_set(info, "romio_ds_read", "automatic"); 
-    fd->hints->ds_read = ADIOI_HINT_AUTO;
-    ADIOI_Info_set(info, "romio_ds_write", "automatic"); 
-    fd->hints->ds_write = ADIOI_HINT_AUTO;
-  }
-
-    /* still to do: tune this a bit for a variety of file systems. there's
-	 * no good default value so just leave it unset */
-    fd->hints->min_fdomain_size = 0;
-    fd->hints->striping_unit = 0;
-
-    fd->hints->initialized = 1;
-    }
-
-    /* add in user's info if supplied */
-    if (users_info != MPI_INFO_NULL) {
-	ADIOI_Info_check_and_install_int(fd, users_info, "cb_buffer_size",
-		&(fd->hints->cb_buffer_size), myname, error_code);
-#if 0
-	/* bgl is not implementing file realms (ADIOI_IOStridedColl) ... */
-	/* aligning file realms to certain sizes (e.g. stripe sizes)
-	 * may benefit I/O performance */
-	ADIOI_Info_check_and_install_int(fd, users_info, "romio_cb_fr_alignment", 
-		&(fd->hints->cb_fr_alignment), myname, error_code);
-
-	/* for collective I/O, try to be smarter about when to do data sieving
-	 * using a specific threshold for the datatype size/extent
-	 * (percentage 0-100%) */
-	ADIOI_Info_check_and_install_int(fd, users_info, "romio_cb_ds_threshold", 
-		&(fd->hints->cb_ds_threshold), myname, error_code);
-
-	ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_alltoall",
-		&(fd->hints->cb_alltoall), myname, error_code);
-#endif
-	/* new hints for enabling/disabling coll. buffering on
-	 * reads/writes
-	 */
-	ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_read",
-		&(fd->hints->cb_read), myname, error_code);
-	if (fd->hints->cb_read == ADIOI_HINT_DISABLE) {
-	    /* romio_cb_read overrides no_indep_rw */
-	    ADIOI_Info_set(info, "romio_no_indep_rw", "false");
-	    fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
-	}
-
-	ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_write",
-		&(fd->hints->cb_write), myname, error_code);
-	if (fd->hints->cb_write == ADIOI_HINT_DISABLE) {
-	    /* romio_cb_write overrides no_indep_rw */
-	    ADIOI_Info_set(info, "romio_no_indep_rw", "false");
-	    fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
-	}
-
-
-#if 0
-	/* bgl is not implementing file realms (ADIOI_IOStridedColl) ... */
-	/* enable/disable persistent file realms for collective I/O */
-	/* may want to check for no_indep_rdwr hint as well */
-	ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_pfr",
-		&(fd->hints->cb_pfr), myname, error_code);
-
-
-	/* file realm assignment types ADIOI_FR_AAR(0),
-	 ADIOI_FR_FSZ(-1), ADIOI_FR_USR_REALMS(-2), all others specify
-	 a regular fr size in bytes. probably not the best way... */
-	ADIOI_Info_check_and_install_int(fd, users_info, "romio_cb_fr_type",
-		&(fd->hints->cb_fr_type), myname, error_code);
-#endif
-	/* Has the user indicated all I/O will be done collectively? */
-	ADIOI_Info_check_and_install_true(fd, users_info, "romio_no_indep_rw",
-		&(fd->hints->no_indep_rw), myname, error_code);
-	if (fd->hints->no_indep_rw == 1) {
-	    /* if 'no_indep_rw' set, also hint that we will do
-	     * collective buffering: if we aren't doing independent io,
-	     * then we have to do collective  */
-	    ADIOI_Info_set(info, "romio_cb_write", "enable");
-	    ADIOI_Info_set(info, "romio_cb_read", "enable");
-	    fd->hints->cb_read = 1;
-	    fd->hints->cb_write = 1;
-	} 
-	/* new hints for enabling/disabling data sieving on
-	 * reads/writes
-	 */
-	ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_read",
-		&(fd->hints->ds_read), myname, error_code);
-	ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_write",
-		&(fd->hints->ds_write), myname, error_code);
-	}
-
-	ADIOI_Info_check_and_install_int(fd, users_info, "ind_wr_buffer_size",
-		&(fd->hints->ind_wr_buffer_size), myname, error_code);
-	ADIOI_Info_check_and_install_int(fd, users_info, "ind_rd_buffer_size",
-		&(fd->hints->ind_rd_buffer_size), myname, error_code);
-
-
-	ADIOI_Info_check_and_install_int(fd, users_info, "romio_min_fdomain_size",
-		&(fd->hints->min_fdomain_size), myname, error_code);
-	}
-  /* Now we use striping unit in common code so we should
-     process hints for it. */
-	ADIOI_Info_check_and_install_int(fd, users_info, "striping_unit", 
-		&(fd->hints->striping_unit), myname, error_code);
-	}
-
-	memset( value, 0, MPI_MAX_INFO_VAL+1 );
-        ADIOI_Info_get(users_info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL,
-		     value, &flag);
-	if (flag && ((intval = atoi(value)) > 0)) {
-
-	    did_anything = 1;
-	    ADIOI_Info_set(info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, value);
-	    fd->hints->cb_nodes = intval;
-	}
-    }
-
-    /* associate CB aggregators to certain CNs in every involved PSET */
-    if (did_anything) {
-	ADIOI_BGL_gen_agg_ranklist(fd, fd->hints->cb_nodes);
-    }
-    /* ignore defered open hints and do not enable it for bluegene: need all
-     * processors in the open path so we can stat-and-broadcast the blocksize
-     */
-    ADIOI_Info_set(info, "romio_no_indep_rw", "false");
-    fd->hints->no_indep_rw = 0;
-    fd->hints->deferred_open = 0;
-
-    /* BobC commented this out, but since hint processing runs on both bgl and
-     * bglockless, we need to keep DS writes enabled on gpfs and disabled on
-     * PVFS */
-    if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
-    /* disable data sieving for fs that do not
-       support file locking */
-       	ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
-		     value, &flag);
-	if (flag) {
-	    /* get rid of this value if it is set */
-	    ADIOI_Info_delete(info, "ind_wr_buffer_size");
-	}
-	/* note: leave ind_wr_buffer_size alone; used for other cases
-	 * as well. -- Rob Ross, 04/22/2003
-	 */
-	ADIOI_Info_set(info, "romio_ds_write", "disable");
-	fd->hints->ds_write = ADIOI_HINT_DISABLE;
-    }
-
-    ADIOI_Free(value);
-
-    *error_code = MPI_SUCCESS;
-}
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_open.c b/src/mpi/romio/adio/ad_bgl/ad_bgl_open.c
deleted file mode 100644
index f811572..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_open.c
+++ /dev/null
@@ -1,304 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_open.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "ad_bgl.h"
-#include "ad_bgl_aggrs.h"
-
-#include <sys/statfs.h>
-#include <sys/vfs.h>
-
-/* COPIED FROM ad_fstype.c since it is static in that file
-
- ADIO_FileSysType_parentdir - determines a string pathname for the
- parent directory of a given filename.
-
-Input Parameters:
-. filename - pointer to file name character array
-
-Output Parameters:
-. dirnamep - pointer to location in which to store a pointer to a string
-
- Note that the caller should free the memory located at the pointer returned
- after the string is no longer needed.
-*/
-
-#ifndef PATH_MAX
-#define PATH_MAX 65535
-#endif
-
-/* In a strict ANSI environment, S_ISLNK may not be defined.  Fix that
-   here.  We assume that S_ISLNK is *always* defined as a macro.  If
-   that is not universally true, then add a test to the romio
-   configure that trys to link a program that references S_ISLNK */
-#if !defined(S_ISLNK) 
-#    if defined(S_IFLNK)
-     /* Check for the link bit */
-#    define S_ISLNK(mode) ((mode) & S_IFLNK)
-#    else
-     /* no way to check if it is a link, so say false */
-#    define S_ISLNK(mode) 0   
-#    endif
-#endif /* !(S_ISLNK) */
-
-/* ADIO_FileSysType_parentdir
- *
- * Returns pointer to string in dirnamep; that string is allocated with
- * strdup and must be free()'d.
- */
-static void ADIO_FileSysType_parentdir(char *filename, char **dirnamep)
-{
-    int err;
-    char *dir = NULL, *slash;
-    struct stat statbuf;
-    
-    err = lstat(filename, &statbuf);
-
-    if (err || (!S_ISLNK(statbuf.st_mode))) {
-	/* no such file, or file is not a link; these are the "normal"
-	 * cases where we can just return the parent directory.
-	 */
-	dir = ADIOI_Strdup(filename);
-    }
-    else {
-	/* filename is a symlink.  we've presumably already tried
-	 * to stat it and found it to be missing (dangling link),
-	 * but this code doesn't care if the target is really there
-	 * or not.
-	 */
-	int namelen;
-	char *linkbuf;
-
-	linkbuf = ADIOI_Malloc(PATH_MAX+1);
-	namelen = readlink(filename, linkbuf, PATH_MAX+1);
-	if (namelen == -1) {
-	    /* something strange has happened between the time that
-	     * we determined that this was a link and the time that
-	     * we attempted to read it; punt and use the old name.
-	     */
-	    dir = ADIOI_Strdup(filename);
-	}
-	else {
-	    /* successfully read the link */
-	    linkbuf[namelen] = '\0'; /* readlink doesn't null terminate */
-	    dir = ADIOI_Strdup(linkbuf);
-	    ADIOI_Free(linkbuf);
-	}
-    }
-
-    slash = strrchr(dir, '/');
-    if (!slash) ADIOI_Strncpy(dir, ".", 2);
-    else {
-	if (slash == dir) *(dir + 1) = '\0';
-	else *slash = '\0';
-    }
-
-    *dirnamep = dir;
-    return;
-}
-
-static void scaleable_stat(ADIO_File fd)
-{
-    struct stat64 bgl_stat;
-    struct statfs bgl_statfs;
-    int rank, rc;
-    char * dir;
-    long buf[2];
-    MPI_Comm_rank(fd->comm, &rank);
-
-    if (rank == 0) {
-	/* Get the (real) underlying file system block size */
-	rc = stat64(fd->filename, &bgl_stat);
-	if (rc >= 0)
-	{
-	    buf[0] = bgl_stat.st_blksize;
-	    DBGV_FPRINTF(stderr,"Successful stat '%s'.  Blocksize=%ld\n",
-		    fd->filename,bgl_stat.st_blksize);
-	}
-	else
-	{
-	    DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
-		    fd->filename,rc,errno);
-	}
-	/* Get the (real) underlying file system type so we can 
-	 * plan our fsync scaling strategy */
-	rc = statfs(fd->filename,&bgl_statfs);
-	if (rc >= 0)
-	{
-	    DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#X\n",
-		    fd->filename,bgl_statfs.f_type);
-	    buf[1] = bgl_statfs.f_type;
-	}
-	else
-	{
-	    DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",
-		    fd->filename,rc,errno);
-	    ADIO_FileSysType_parentdir(fd->filename, &dir);
-	    rc = statfs(dir,&bgl_statfs);
-	    if (rc >= 0)
-	    {
-		DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#X\n",dir,bgl_statfs.f_type);
-		buf[1] = bgl_statfs.f_type;
-	    }
-	    else
-	    {
-		/* Hmm.  Guess we'll assume the worst-case, that it's not GPFS
-		 * or BGLOCKLESSMPIO_F_TYPE (default PVFS2) below */
-		buf[1] = -1; /* bogus magic number */
-		DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",dir,rc,errno);
-	    }
-	    free(dir);
-	}
-    }
-    /* now we can broadcast the stat/statfs data to everyone else */
-    MPI_Bcast(buf, 2, MPI_LONG, 0, fd->comm);
-    bgl_stat.st_blksize = buf[0];
-    bgl_statfs.f_type = buf[1];
-
-    /* data from stat64 */
-    /* store the blksize in the file system specific storage */
-    ((ADIOI_BGL_fs*)fd->fs_ptr)->blksize = bgl_stat.st_blksize;
-
-    /* data from statfs */
-    if ((bgl_statfs.f_type == GPFS_SUPER_MAGIC) ||
-	    (bgl_statfs.f_type == bglocklessmpio_f_type))
-    {
-	((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr = 
-	    ADIOI_BGL_FSYNC_AGGREGATION_ENABLED;
-
-	/* Only one rank is an "fsync aggregator" because only one 
-	 * fsync is needed */
-	if (rank == 0)
-	{
-	    ((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr |= 
-		ADIOI_BGL_FSYNC_AGGREGATOR;
-	    DBG_FPRINTF(stderr,"fsync aggregator %d\n",rank);
-	}
-	else ; /* aggregation enabled but this rank is not an aggregator*/
-    }
-    else; /* Other filesystems default to no fsync aggregation */
-}
-
-
-void ADIOI_BGL_Open(ADIO_File fd, int *error_code)
-{
-    int perm, old_mask, amode;
-    static char myname[] = "ADIOI_BGL_OPEN";
-
-    /* set internal variables for tuning environment variables */
-    ad_bgl_get_env_vars();		
-
-    if (fd->perm == ADIO_PERM_NULL) {
-	old_mask = umask(022);
-	umask(old_mask);
-	perm = old_mask ^ 0666;
-    }
-    else perm = fd->perm;
-
-    amode = 0;
-    if (fd->access_mode & ADIO_CREATE)
-	amode = amode | O_CREAT;
-    if (fd->access_mode & ADIO_RDONLY)
-	amode = amode | O_RDONLY;
-    if (fd->access_mode & ADIO_WRONLY)
-	amode = amode | O_WRONLY;
-    if (fd->access_mode & ADIO_RDWR)
-	amode = amode | O_RDWR;
-    if (fd->access_mode & ADIO_EXCL)
-	amode = amode | O_EXCL;
-#ifdef ADIOI_MPE_LOGGING
-    MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
-#endif
-    fd->fd_sys = open(fd->filename, amode, perm);
-#ifdef ADIOI_MPE_LOGGING
-    MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
-#endif
-  DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,errno);
-    fd->fd_direct = -1;
-
-    if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
-	fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
-
-    if(fd->fd_sys != -1)
-    {
-        /* Initialize the ad_bgl file system specific information */
-        AD_BGL_assert(fd->fs_ptr == NULL);
-        fd->fs_ptr = (ADIOI_BGL_fs*) ADIOI_Malloc(sizeof(ADIOI_BGL_fs));
-
-        ((ADIOI_BGL_fs*)fd->fs_ptr)->blksize = 1048576; /* default to 1M */
-
-        /* default is no fsync aggregation */
-        ((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr = 
-	    ADIOI_BGL_FSYNC_AGGREGATION_DISABLED; 
-
-
-#ifdef ADIOI_MPE_LOGGING
-        MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
-#endif
-        scaleable_stat(fd);
-#ifdef ADIOI_MPE_LOGGING
-        MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
-#endif
-    }
-
-    if (fd->fd_sys == -1) {
-	if (errno == ENAMETOOLONG)
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_BAD_FILE,
-					       "**filenamelong",
-					       "**filenamelong %s %d",
-					       fd->filename,
-					       strlen(fd->filename));
-	else if (errno == ENOENT)
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_NO_SUCH_FILE,
-					       "**filenoexist",
-					       "**filenoexist %s",
-					       fd->filename);
-	else if (errno == ENOTDIR || errno == ELOOP)
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE,
-					       myname, __LINE__,
-					       MPI_ERR_BAD_FILE,
-					       "**filenamedir",
-					       "**filenamedir %s",
-					       fd->filename);
-	else if (errno == EACCES) {
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_ACCESS,
-					       "**fileaccess",
-					       "**fileaccess %s", 
-					       fd->filename );
-	}
-	else if (errno == EROFS) {
-	    /* Read only file or file system and write access requested */
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_READ_ONLY,
-					       "**ioneedrd", 0 );
-	}
-	else {
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_IO, "**io",
-					       "**io %s", strerror(errno));
-	}
-    }
-    else *error_code = MPI_SUCCESS;
-}
-/* 
- *vim: ts=8 sts=4 sw=4 noexpandtab 
- */
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_pset.c b/src/mpi/romio/adio/ad_bgl/ad_bgl_pset.c
deleted file mode 100644
index 370b731..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_pset.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_pset.c
- * \brief Definition of functions associated to structs ADIOI_BGL_ProcInfo_t and ADIOI_BGL_ConfInfo_t 
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include <stdlib.h>
-#include "ad_bgl.h"
-#include "ad_bgl_pset.h"
-#include "mpidimpl.h"
-
-ADIOI_BGL_ProcInfo_t *
-ADIOI_BGL_ProcInfo_new()
-{
-    ADIOI_BGL_ProcInfo_t *p = (ADIOI_BGL_ProcInfo_t *) ADIOI_Malloc (sizeof(ADIOI_BGL_ProcInfo_t));
-    AD_BGL_assert ((p != NULL));
-    return p;
-}
-
-ADIOI_BGL_ProcInfo_t *
-ADIOI_BGL_ProcInfo_new_n( int n )
-{
-    ADIOI_BGL_ProcInfo_t *p = (ADIOI_BGL_ProcInfo_t *) ADIOI_Malloc (n * sizeof(ADIOI_BGL_ProcInfo_t));
-    AD_BGL_assert ((p != NULL));
-    return p;
-}
-
-void
-ADIOI_BGL_ProcInfo_free( ADIOI_BGL_ProcInfo_t *info )
-{
-    if (info != NULL) ADIOI_Free (info);
-}
-
-static
-void 
-ADIOI_BGL_ProcInfo_set(ADIOI_BGL_ProcInfo_t *info, const DCMF_Hardware_t *hw, int r)
-{
-    info->psetNum    = hw->idOfPset;
-    info->xInPset    = hw->xCoord;
-    info->yInPset    = hw->yCoord;
-    info->zInPset    = hw->zCoord;
-    info->cpuid      = hw->tCoord;
-    info->rank       = r;
-    info->rankInPset = hw->rankInPset;
-}
-
-
-ADIOI_BGL_ConfInfo_t *
-ADIOI_BGL_ConfInfo_new ()
-{
-    ADIOI_BGL_ConfInfo_t *p = (ADIOI_BGL_ConfInfo_t *) ADIOI_Malloc (sizeof(ADIOI_BGL_ConfInfo_t));
-    AD_BGL_assert ((p != NULL));
-    return p;
-}
-
-static
-void
-ADIOI_BGL_ConfInfo_set(ADIOI_BGL_ConfInfo_t *info, const DCMF_Hardware_t *hw, int s, int n_aggrs)
-{
-    info->PsetSize        = hw->sizeOfPset;
-    info->numPsets        = (hw->xSize * hw->ySize *
-					hw->zSize) / hw->sizeOfPset;
-    info->isVNM           = (hw->tSize != 1);
-    info->cpuidSize       = hw->tSize;
-    info->virtualPsetSize = hw->sizeOfPset * hw->tSize;
-    info->nProcs          = s;
-
-    /* More complicated logic maybe needed for nAggrs specification */
-    info->nAggrs          = n_aggrs;
-    if ( info->nAggrs <=0 || MIN(info->nProcs, info->virtualPsetSize) < info->nAggrs ) 
-        info->nAggrs      = ADIOI_BGL_NAGG_PSET_DFLT;
-    if ( info->nAggrs > info->virtualPsetSize ) info->nAggrs = info->virtualPsetSize;
-
-    info->aggRatio        = 1. * info->nAggrs / info->virtualPsetSize;
-    if (info->aggRatio > 1) info->aggRatio = 1.;
-}
-
-void
-ADIOI_BGL_ConfInfo_free( ADIOI_BGL_ConfInfo_t *info )
-{
-    if (info != NULL) ADIOI_Free (info);
-}
-
-void 
-ADIOI_BGL_persInfo_init(ADIOI_BGL_ConfInfo_t *conf, 
-			ADIOI_BGL_ProcInfo_t *proc, 
-			int s, int r, int n_aggrs)
-{
-    DCMF_Hardware_t hw;
-    DCMF_Hardware(&hw);
-
-    ADIOI_BGL_ConfInfo_set (conf, &hw, s, n_aggrs);
-    ADIOI_BGL_ProcInfo_set (proc, &hw, r);
-}
-
-void 
-ADIOI_BGL_persInfo_free( ADIOI_BGL_ConfInfo_t *conf, ADIOI_BGL_ProcInfo_t *proc )
-{
-    ADIOI_BGL_ConfInfo_free( conf );
-    ADIOI_BGL_ProcInfo_free( proc );
-}
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_pset.h b/src/mpi/romio/adio/ad_bgl/ad_bgl_pset.h
deleted file mode 100644
index 774e04a..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_pset.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_pset.h
- * \brief ???
- */
-
-/* File: ad_bgl_pset.h
- * 
- * Defines two structures that keep BG/L PSET specific information and their public interfaces:
- * 	. ADIOI_BGL_ProcInfo_t object keeps specific information to each process
- * 	. ADIOI_BGL_ConfInfo_t object keeps general information for the whole communicator, only kept
- *	  on process 0.
- */
-
-#ifndef AD_BGL_PSET_H_
-#define AD_BGL_PSET_H_
-
-/* Keeps specific information to each process, will be exchanged among processes */
-typedef struct {
-
-    int psetNum;	/* which PSET I am in */
-    int rank;		/* my rank */
-    int xInPset;	/* my relative coordinates in my PSET */
-    int yInPset;
-    int zInPset;
-    int cpuid;		/* my CPU id -- for virtual node mode (t coord)*/
-    int rankInPset;	/* my relative rank in my PSET */
-
-    int __pad;          /* pad to 16 byte alignment */
-
-} ADIOI_BGL_ProcInfo_t __attribute__((aligned(16)));
-
-
-/* Keeps general information for the whole communicator, only on process 0 */
-typedef struct {
-
-    int PsetSize;
-    int nAggrs;
-    int numPsets;
-    int isVNM;
-    int virtualPsetSize;
-    int nProcs;
-    float aggRatio;
-    int cpuidSize;       /* how many cpu ids? (t size) */
-
-} ADIOI_BGL_ConfInfo_t __attribute__((aligned(16)));
-
-
-#undef MIN
-#define MIN(a,b) (((a)<(b) ? (a) : (b)))
-
-
-/* Default is to choose 8 aggregator nodes in each 32 CN pset. 
-   Also defines default ratio of aggregator nodes in each a pset.
-   For Virtual Node Mode, the ratio is 8/64 */
-#define ADIOI_BGL_NAGG_PSET_MIN  1
-#define ADIOI_BGL_NAGG_PSET_DFLT 8
-#define ADIOI_BGL_PSET_SIZE_DFLT 32
-
-
-/* public funcs for ADIOI_BGL_ProcInfo_t objects */
-    ADIOI_BGL_ProcInfo_t * ADIOI_BGL_ProcInfo_new();
-    ADIOI_BGL_ProcInfo_t * ADIOI_BGL_ProcInfo_new_n( int n );
-    void ADIOI_BGL_ProcInfo_free( ADIOI_BGL_ProcInfo_t *info );
-
-
-/* public funcs for ADIOI_BGL_ConfInfo_t objects */
-    ADIOI_BGL_ConfInfo_t * ADIOI_BGL_ConfInfo_new ();
-    void ADIOI_BGL_ConfInfo_free( ADIOI_BGL_ConfInfo_t *info );
-
-
-/* public funcs for a pair of ADIOI_BGL_ConfInfo_t and ADIOI_BGL_ProcInfo_t objects */
-    void ADIOI_BGL_persInfo_init( ADIOI_BGL_ConfInfo_t *conf, 
-				  ADIOI_BGL_ProcInfo_t *proc, 
-				  int s, int r, int n_aggrs );
-    void ADIOI_BGL_persInfo_free( ADIOI_BGL_ConfInfo_t *conf, 
-				  ADIOI_BGL_ProcInfo_t *proc );
-
-
-#endif  /* AD_BGL_PSET_H_ */
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_rdcoll.c b/src/mpi/romio/adio/ad_bgl/ad_bgl_rdcoll.c
deleted file mode 100644
index a979d0f..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_rdcoll.c
+++ /dev/null
@@ -1,1147 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_rdcoll.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "adio.h"
-#include "adio_extern.h"
-#include "ad_bgl.h"
-#include "ad_bgl_pset.h"
-#include "ad_bgl_aggrs.h"
-
-#ifdef PROFILE
-#include "mpe.h"
-#endif
-
-#ifdef USE_DBG_LOGGING
-  #define RDCOLL_DEBUG 1
-#endif
-#ifdef AGGREGATION_PROFILE
-#include "mpe.h"
-#endif
-
-/* prototypes of functions used for collective reads only. */
-static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
-				datatype, int nprocs,
-				int myrank, ADIOI_Access
-				*others_req, ADIO_Offset *offset_list,
-				ADIO_Offset *len_list, int contig_access_count, 
-				ADIO_Offset
-				min_st_offset, ADIO_Offset fd_size,
-				ADIO_Offset *fd_start, ADIO_Offset *fd_end,
-				int *buf_idx, int *error_code);
-static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
-				  *flat_buf, ADIO_Offset *offset_list, ADIO_Offset
-				  *len_list, int *send_size, int *recv_size,
-				  int *count, int *start_pos, 
-				  int *partial_send, 
-				  int *recd_from_proc, int nprocs, 
-				  int myrank, int
-				  buftype_is_contig, int contig_access_count,
-				  ADIO_Offset min_st_offset, 
-				  ADIO_Offset fd_size,
-				  ADIO_Offset *fd_start, ADIO_Offset *fd_end, 
-				  ADIOI_Access *others_req, 
-				  int iter, 
-				  MPI_Aint buftype_extent, int *buf_idx);
-static void ADIOI_R_Exchange_data_alltoallv(ADIO_File fd, void *buf, ADIOI_Flatlist_node
-                                  *flat_buf, ADIO_Offset *offset_list, ADIO_Offset
-                                  *len_list, int *send_size, int *recv_size,
-                                  int *count, int *start_pos,
-                                  int *partial_send,
-                                  int *recd_from_proc, int nprocs,
-                                  int myrank, int
-                                  buftype_is_contig, int contig_access_count,
-                                  ADIO_Offset min_st_offset,
-                                  ADIO_Offset fd_size,
-                                  ADIO_Offset *fd_start, ADIO_Offset *fd_end,
-                                  ADIOI_Access *others_req,
-                                  int iter,
-                                  MPI_Aint buftype_extent, int *buf_idx);
-static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
-				   *flat_buf, char **recv_buf, ADIO_Offset 
-				   *offset_list, ADIO_Offset *len_list, 
-				   unsigned *recv_size, 
-				   MPI_Request *requests, MPI_Status *statuses,
-				   int *recd_from_proc, int nprocs,
-				   int contig_access_count, 
-				   ADIO_Offset min_st_offset, 
-				   ADIO_Offset fd_size, ADIO_Offset *fd_start, 
-				   ADIO_Offset *fd_end,
-				   MPI_Aint buftype_extent);
-
-extern void ADIOI_Calc_my_off_len(ADIO_File fd, int bufcount, MPI_Datatype
-			    datatype, int file_ptr_type, ADIO_Offset
-			    offset, ADIO_Offset **offset_list_ptr, ADIO_Offset
-			    **len_list_ptr, ADIO_Offset *start_offset_ptr,
-			    ADIO_Offset *end_offset_ptr, int
-			   *contig_access_count_ptr);
-
-void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
-			       MPI_Datatype datatype, int file_ptr_type,
-			       ADIO_Offset offset, ADIO_Status *status, int
-			       *error_code)
-{
-/* Uses a generalized version of the extended two-phase method described
-   in "An Extended Two-Phase Method for Accessing Sections of 
-   Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
-   Scientific Programming, (5)4:301--317, Winter 1996. 
-   http://www.mcs.anl.gov/home/thakur/ext2ph.ps */
-
-    ADIOI_Access *my_req; 
-    /* array of nprocs structures, one for each other process in
-       whose file domain this process's request lies */
-    
-    ADIOI_Access *others_req;
-    /* array of nprocs structures, one for each other process
-       whose request lies in this process's file domain. */
-
-    int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank;
-    int contig_access_count=0, interleave_count = 0, buftype_is_contig;
-    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
-    ADIO_Offset start_offset, end_offset, orig_fp, fd_size, min_st_offset, off;
-    ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
-	*fd_end = NULL, *end_offsets = NULL;
-    ADIO_Offset *bgl_offsets0 = NULL, *bgl_offsets = NULL;
-    int  ii;
-    ADIO_Offset *len_list = NULL;
-    int *buf_idx = NULL;
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_RESET( 0, r )
-#endif
-
-#ifdef HAVE_STATUS_SET_BYTES
-    MPI_Count bufsize, size;
-#endif
-
-#if 0
-/*   From common code - not implemented for bgl. */
-    if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
-        ADIOI_IOStridedColl (fd, buf, count, ADIOI_READ, datatype, 
-			file_ptr_type, offset, status, error_code);
-        return;
-    } */
-#endif
-#ifdef PROFILE
-        MPE_Log_event(13, 0, "start computation");
-#endif
-
-    MPI_Comm_size(fd->comm, &nprocs);
-    MPI_Comm_rank(fd->comm, &myrank);
-
-    /* number of aggregators, cb_nodes, is stored in the hints */
-    nprocs_for_coll = fd->hints->cb_nodes;
-    orig_fp = fd->fp_ind;
-
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_SET_GET( 0, r, 0, 1, 0, BGLMPIO_CIO_LCOMP, BGLMPIO_CIO_LAST )
-#endif
-
-    /* only check for interleaving if cb_read isn't disabled */
-    if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
-	/* For this process's request, calculate the list of offsets and
-	   lengths in the file and determine the start and end offsets. */
-
-	/* Note: end_offset points to the last byte-offset that will be accessed.
-	   e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/
-
-	ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
-			      &offset_list, &len_list, &start_offset,
-			      &end_offset, &contig_access_count); 
-    
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGLMPIO_CIO_GATHER, BGLMPIO_CIO_LCOMP )
-#endif
-
-#ifdef RDCOLL_DEBUG
-    for (i=0; i<contig_access_count; i++) {
-	      DBG_FPRINTF(stderr, "rank %d  off %lld  len %lld\n", 
-			      myrank, offset_list[i], len_list[i]);
-    }
-#endif
-
-	/* each process communicates its start and end offsets to other 
-	   processes. The result is an array each of start and end offsets
-	   stored in order of process rank. */ 
-    
-	st_offsets   = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
-	end_offsets  = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
-
-    if (bglmpio_tunegather) {
-	    bgl_offsets0 = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
-	    bgl_offsets  = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
-	    for (ii=0; ii<nprocs; ii++)  {
-		bgl_offsets0[ii*2]   = 0;
-		bgl_offsets0[ii*2+1] = 0;
-	    }
-	    bgl_offsets0[myrank*2]   = start_offset;
-	    bgl_offsets0[myrank*2+1] =   end_offset;
-
-	MPI_Allreduce( bgl_offsets0, bgl_offsets, nprocs*2, ADIO_OFFSET, MPI_MAX, fd->comm );
-
-	    for (ii=0; ii<nprocs; ii++)  {
-		st_offsets [ii] = bgl_offsets[ii*2]  ;
-		end_offsets[ii] = bgl_offsets[ii*2+1];
-	    }
-	    ADIOI_Free( bgl_offsets0 );
-	    ADIOI_Free( bgl_offsets  );
-    } else {
-        MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
-                      ADIO_OFFSET, fd->comm);
-        MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
-                      ADIO_OFFSET, fd->comm);
-    }
-
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_SET_GET( 0, r, 0, 1, 1, BGLMPIO_CIO_PATANA, BGLMPIO_CIO_GATHER )
-#endif
-
-	/* are the accesses of different processes interleaved? */
-	for (i=1; i<nprocs; i++)
-	    if ((st_offsets[i] < end_offsets[i-1]) && 
-                (st_offsets[i] <= end_offsets[i]))
-                interleave_count++;
-	/* This is a rudimentary check for interleaving, but should suffice
-	   for the moment. */
-    }
-
-    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
-
-    if (fd->hints->cb_read == ADIOI_HINT_DISABLE
-	|| (!interleave_count && (fd->hints->cb_read == ADIOI_HINT_AUTO))) 
-    {
-	/* don't do aggregation */
-	if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
-	    ADIOI_Free(offset_list);
-	    ADIOI_Free(len_list);
-	    ADIOI_Free(st_offsets);
-	    ADIOI_Free(end_offsets);
-	}
-
-	fd->fp_ind = orig_fp;
-	ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
-
-	if (buftype_is_contig && filetype_is_contig) {
-	    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
-		off = fd->disp + (ADIO_Offset)(fd->etype_size) * offset;
-		ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
-                       off, status, error_code);
-	    }
-	    else ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
-                       0, status, error_code);
-	}
-	else ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type,
-                       offset, status, error_code);
-
-	return;
-    }
-
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGLMPIO_CIO_FD_PART, BGLMPIO_CIO_PATANA )
-#endif
-
-    /* We're going to perform aggregation of I/O.  Here we call
-     * ADIOI_Calc_file_domains() to determine what processes will handle I/O
-     * to what regions.  We pass nprocs_for_coll into this function; it is
-     * used to determine how many processes will perform I/O, which is also
-     * the number of regions into which the range of bytes must be divided.
-     * These regions are called "file domains", or FDs.
-     *
-     * When this function returns, fd_start, fd_end, fd_size, and
-     * min_st_offset will be filled in.  fd_start holds the starting byte
-     * location for each file domain.  fd_end holds the ending byte location.
-     * min_st_offset holds the minimum byte location that will be accessed.
-     *
-     * Both fd_start[] and fd_end[] are indexed by an aggregator number; this
-     * needs to be mapped to an actual rank in the communicator later.
-     *
-     */
-    if (bglmpio_tuneblocking)
-    ADIOI_BGL_GPFS_Calc_file_domains(st_offsets, end_offsets, nprocs,
-			    nprocs_for_coll, &min_st_offset,
-			    &fd_start, &fd_end, &fd_size, fd->fs_ptr);
-    else
-    ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs,
-			    nprocs_for_coll, &min_st_offset,
-			    &fd_start, &fd_end,
-			    fd->hints->min_fdomain_size, &fd_size, 
-			    fd->hints->striping_unit);
-
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_SET_GET( 0, r, 0, 1, 1, BGLMPIO_CIO_MYREQ, BGLMPIO_CIO_FD_PART )
-#endif
-
-    /* calculate where the portions of the access requests of this process 
-     * are located in terms of the file domains.  this could be on the same
-     * process or on other processes.  this function fills in:
-     * count_my_req_procs - number of processes (including this one) for which
-     *     this process has requests in their file domain
-     * count_my_req_per_proc - count of requests for each process, indexed
-     *     by rank of the process
-     * my_req[] - array of data structures describing the requests to be
-     *     performed by each process (including self).  indexed by rank.
-     * buf_idx[] - array of locations into which data can be directly moved;
-     *     this is only valid for contiguous buffer case
-     */
-    if (bglmpio_tuneblocking)
-    ADIOI_BGL_Calc_my_req(fd, offset_list, len_list, contig_access_count,
-		      min_st_offset, fd_start, fd_end, fd_size,
-		      nprocs, &count_my_req_procs, 
-		      &count_my_req_per_proc, &my_req,
-		      &buf_idx);
-    else
-    ADIOI_Calc_my_req(fd, offset_list, len_list, contig_access_count,
-		      min_st_offset, fd_start, fd_end, fd_size,
-		      nprocs, &count_my_req_procs, 
-		      &count_my_req_per_proc, &my_req,
-		      &buf_idx);
-
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGLMPIO_CIO_OTHREQ, BGLMPIO_CIO_MYREQ )
-#endif
-
-    /* perform a collective communication in order to distribute the
-     * data calculated above.  fills in the following:
-     * count_others_req_procs - number of processes (including this
-     *     one) which have requests in this process's file domain.
-     * count_others_req_per_proc[] - number of separate contiguous
-     *     requests from proc i lie in this process's file domain.
-     */
-    if (bglmpio_tuneblocking)
-    ADIOI_BGL_Calc_others_req(fd, count_my_req_procs, 
-			  count_my_req_per_proc, my_req, 
-			  nprocs, myrank, &count_others_req_procs, 
-			  &others_req); 
-
-    else
-    ADIOI_Calc_others_req(fd, count_my_req_procs, 
-			  count_my_req_per_proc, my_req, 
-			  nprocs, myrank, &count_others_req_procs, 
-			  &others_req); 
-
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGLMPIO_CIO_DEXCH, BGLMPIO_CIO_OTHREQ )
-#endif
-
-    /* my_req[] and count_my_req_per_proc aren't needed at this point, so 
-     * let's free the memory 
-     */
-    ADIOI_Free(count_my_req_per_proc);
-    for (i=0; i<nprocs; i++) {
-	if (my_req[i].count) {
-	    ADIOI_Free(my_req[i].offsets);
-	    ADIOI_Free(my_req[i].lens);
-	}
-    }
-    ADIOI_Free(my_req);
-
-
-    /* read data in sizes of no more than ADIOI_Coll_bufsize, 
-     * communicate, and fill user buf. 
-     */
-    ADIOI_Read_and_exch(fd, buf, datatype, nprocs, myrank,
-                        others_req, offset_list,
-			len_list, contig_access_count, min_st_offset,
-			fd_size, fd_start, fd_end, buf_idx, error_code);
-
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_SET_GET( 0, r, 1, 0, 1, BGLMPIO_CIO_LAST, BGLMPIO_CIO_T_DEXCH )
-    BGLMPIO_T_CIO_SET_GET( 0, r, 0, 0, 1, BGLMPIO_CIO_LAST, BGLMPIO_CIO_T_MPIO_CRW )
-
-    BGLMPIO_T_CIO_REPORT( 0, r, fd, myrank )
-#endif
-
-    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
-
-    /* free all memory allocated for collective I/O */
-    for (i=0; i<nprocs; i++) {
-	if (others_req[i].count) {
-	    ADIOI_Free(others_req[i].offsets);
-	    ADIOI_Free(others_req[i].lens);
-	    ADIOI_Free(others_req[i].mem_ptrs);
-	}
-    }
-    ADIOI_Free(others_req);
-
-    ADIOI_Free(buf_idx);
-    ADIOI_Free(offset_list);
-    ADIOI_Free(len_list);
-    ADIOI_Free(st_offsets);
-    ADIOI_Free(end_offsets);
-    ADIOI_Free(fd_start);
-    ADIOI_Free(fd_end);
-
-#ifdef HAVE_STATUS_SET_BYTES
-    MPI_Type_size_x(datatype, &size);
-    bufsize = size * count;
-    MPIR_Status_set_bytes(status, datatype, bufsize);
-/* This is a temporary way of filling in status. The right way is to 
-   keep track of how much data was actually read and placed in buf 
-   during collective I/O. */
-#endif
-
-    fd->fp_sys_posn = -1;   /* set it to null. */
-}
-
-static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
-			 datatype, int nprocs,
-			 int myrank, ADIOI_Access
-			 *others_req, ADIO_Offset *offset_list,
-			 ADIO_Offset *len_list, int contig_access_count, ADIO_Offset
-                         min_st_offset, ADIO_Offset fd_size,
-			 ADIO_Offset *fd_start, ADIO_Offset *fd_end,
-                         int *buf_idx, int *error_code)
-{
-/* Read in sizes of no more than coll_bufsize, an info parameter.
-   Send data to appropriate processes. 
-   Place recd. data in user buf.
-   The idea is to reduce the amount of extra memory required for
-   collective I/O. If all data were read all at once, which is much
-   easier, it would require temp space more than the size of user_buf,
-   which is often unacceptable. For example, to read a distributed
-   array from a file, where each local array is 8Mbytes, requiring
-   at least another 8Mbytes of temp space is unacceptable. */
-
-    int i, j, m, ntimes, max_ntimes, buftype_is_contig;
-    ADIO_Offset st_loc=-1, end_loc=-1, off, done, real_off, req_off;
-    char *read_buf = NULL, *tmp_buf;
-    int *curr_offlen_ptr, *count, *send_size, *recv_size;
-    int *partial_send, *recd_from_proc, *start_pos;
-    /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets*/
-    ADIO_Offset real_size, size, for_curr_iter, for_next_iter;
-    int req_len, flag, rank;
-    MPI_Status status;
-    ADIOI_Flatlist_node *flat_buf=NULL;
-    MPI_Aint buftype_extent;
-    int coll_bufsize;
-#ifdef RDCOLL_DEBUG
-    int iii;
-#endif
-    *error_code = MPI_SUCCESS;  /* changed below if error */
-    /* only I/O errors are currently reported */
-    
-/* calculate the number of reads of size coll_bufsize
-   to be done by each process and the max among all processes.
-   That gives the no. of communication phases as well.
-   coll_bufsize is obtained from the hints object. */
-
-    coll_bufsize = fd->hints->cb_buffer_size;
-
-    /* grab some initial values for st_loc and end_loc */
-    for (i=0; i < nprocs; i++) {
-	if (others_req[i].count) {
-	    st_loc = others_req[i].offsets[0];
-	    end_loc = others_req[i].offsets[0];
-	    break;
-	}
-    }
-
-    /* now find the real values */
-    for (i=0; i < nprocs; i++)
-	for (j=0; j<others_req[i].count; j++) {
-	    st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
-	    end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j]
-					  + others_req[i].lens[j] - 1));
-	}
-
-    /* calculate ntimes, the number of times this process must perform I/O
-     * operations in order to complete all the requests it has received.
-     * the need for multiple I/O operations comes from the restriction that
-     * we only use coll_bufsize bytes of memory for internal buffering.
-     */
-    if ((st_loc==-1) && (end_loc==-1)) {
-	/* this process does no I/O. */
-	ntimes = 0;
-    }
-    else {
-	/* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/
-	ntimes = (int) ((end_loc - st_loc + coll_bufsize)/coll_bufsize);
-    }
-
-    MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm); 
-
-    if (ntimes) read_buf = (char *) ADIOI_Malloc(coll_bufsize);
-
-    curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); 
-    /* its use is explained below. calloc initializes to 0. */
-
-    count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
-    /* to store count of how many off-len pairs per proc are satisfied
-       in an iteration. */
-
-    partial_send = (int *) ADIOI_Calloc(nprocs, sizeof(int));
-    /* if only a portion of the last off-len pair is sent to a process 
-       in a particular iteration, the length sent is stored here.
-       calloc initializes to 0. */
-
-    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
-    /* total size of data to be sent to each proc. in an iteration */
-
-    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
-    /* total size of data to be recd. from each proc. in an iteration.
-       Of size nprocs so that I can use MPI_Alltoall later. */
-
-    recd_from_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
-    /* amount of data recd. so far from each proc. Used in
-       ADIOI_Fill_user_buffer. initialized to 0 here. */
-
-    start_pos = (int *) ADIOI_Malloc(nprocs*sizeof(int));
-    /* used to store the starting value of curr_offlen_ptr[i] in 
-       this iteration */
-
-    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
-    if (!buftype_is_contig) {
-	ADIOI_Flatten_datatype(datatype);
-	flat_buf = ADIOI_Flatlist;
-        while (flat_buf->type != datatype) flat_buf = flat_buf->next;
-    }
-    MPI_Type_extent(datatype, &buftype_extent);
-
-    done = 0;
-    off = st_loc;
-    for_curr_iter = for_next_iter = 0;
-
-    MPI_Comm_rank(fd->comm, &rank);
-
-#ifdef PROFILE
-        MPE_Log_event(14, 0, "end computation");
-#endif
-
-    for (m=0; m<ntimes; m++) {
-       /* read buf of size coll_bufsize (or less) */
-       /* go through all others_req and check if any are satisfied
-          by the current read */
-
-       /* since MPI guarantees that displacements in filetypes are in 
-          monotonically nondecreasing order, I can maintain a pointer
-	  (curr_offlen_ptr) to 
-          current off-len pair for each process in others_req and scan
-          further only from there. There is still a problem of filetypes
-          such as:  (1, 2, 3 are not process nos. They are just numbers for
-          three chunks of data, specified by a filetype.)
-
-                   1  -------!--
-                   2    -----!----
-                   3       --!-----
-
-          where ! indicates where the current read_size limitation cuts 
-          through the filetype.  I resolve this by reading up to !, but
-          filling the communication buffer only for 1. I copy the portion
-          left over for 2 into a tmp_buf for use in the next
-	  iteration. i.e., 2 and 3 will be satisfied in the next
-	  iteration. This simplifies filling in the user's buf at the
-	  other end, as only one off-len pair with incomplete data
-	  will be sent. I also don't need to send the individual
-	  offsets and lens along with the data, as the data is being
-	  sent in a particular order. */ 
-
-          /* off = start offset in the file for the data actually read in 
-                   this iteration 
-             size = size of data read corresponding to off
-             real_off = off minus whatever data was retained in memory from
-                  previous iteration for cases like 2, 3 illustrated above
-             real_size = size plus the extra corresponding to real_off
-             req_off = off in file for a particular contiguous request 
-                       minus what was satisfied in previous iteration
-             req_size = size corresponding to req_off */
-
-#ifdef PROFILE
-        MPE_Log_event(13, 0, "start computation");
-#endif
-	size = ADIOI_MIN((unsigned)coll_bufsize, end_loc-st_loc+1-done); 
-	real_off = off - for_curr_iter;
-	real_size = size + for_curr_iter;
-
-	for (i=0; i<nprocs; i++) count[i] = send_size[i] = 0;
-	for_next_iter = 0;
-
-	for (i=0; i<nprocs; i++) {
-#ifdef RDCOLL_DEBUG
-	    DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n", rank, i, others_req[i].count); 
-#endif
-	    if (others_req[i].count) {
-		start_pos[i] = curr_offlen_ptr[i];
-		for (j=curr_offlen_ptr[i]; j<others_req[i].count;
-		     j++) {
-		    if (partial_send[i]) {
-			/* this request may have been partially
-			   satisfied in the previous iteration. */
-			req_off = others_req[i].offsets[j] +
-			    partial_send[i]; 
-                        req_len = others_req[i].lens[j] -
-			    partial_send[i];
-			partial_send[i] = 0;
-			/* modify the off-len pair to reflect this change */
-			others_req[i].offsets[j] = req_off;
-			others_req[i].lens[j] = req_len;
-		    }
-		    else {
-			req_off = others_req[i].offsets[j];
-                        req_len = others_req[i].lens[j];
-		    }
-		    if (req_off < real_off + real_size) {
-			count[i]++;
-      ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+req_off-real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf+req_off-real_off));
-			MPI_Address(read_buf+req_off-real_off, 
-                               &(others_req[i].mem_ptrs[j]));
-      ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));
-			send_size[i] += (int)(ADIOI_MIN(real_off + real_size - req_off, 
-                                      (ADIO_Offset)(unsigned)req_len)); 
-
-			if (real_off+real_size-req_off < (ADIO_Offset)(unsigned)req_len) {
-			    partial_send[i] = (int) (real_off + real_size - req_off);
-			    if ((j+1 < others_req[i].count) && 
-                                 (others_req[i].offsets[j+1] < 
-                                     real_off+real_size)) { 
-				/* this is the case illustrated in the
-				   figure above. */
-				for_next_iter = ADIOI_MAX(for_next_iter,
-					  real_off + real_size - others_req[i].offsets[j+1]); 
-				/* max because it must cover requests 
-				   from different processes */
-			    }
-			    break;
-			}
-		    }
-		    else break;
-		}
-		curr_offlen_ptr[i] = j;
-	    }
-	}
-
-	flag = 0;
-	for (i=0; i<nprocs; i++)
-	    if (count[i]) flag = 1;
-
-#ifdef PROFILE
-        MPE_Log_event(14, 0, "end computation");
-#endif
-	if (flag) {
-      ADIOI_Assert(size == (int)size);
-	    ADIO_ReadContig(fd, read_buf+for_curr_iter, (int)size, MPI_BYTE,
-			    ADIO_EXPLICIT_OFFSET, off, &status, error_code);
-#ifdef RDCOLL_DEBUG
-	    DBG_FPRINTF(stderr, "\tread_coll: 700, data read [%lld] = ", size );
-	    for (iii=0; iii<size && iii<80; iii++) { DBGV_FPRINTF(stderr, "%3d,", *((unsigned char *)read_buf + for_curr_iter + iii) ); }
-	    DBG_FPRINTF(stderr, "\n" );
-#endif
-
-	    if (*error_code != MPI_SUCCESS) return;
-	}
-	
-	for_curr_iter = for_next_iter;
-	
-#ifdef PROFILE
-        MPE_Log_event(7, 0, "start communication");
-#endif
-	if (bglmpio_comm == 1)
-	ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
-			    send_size, recv_size, count, 
-       			    start_pos, partial_send, recd_from_proc, nprocs,
-			    myrank, 
-			    buftype_is_contig, contig_access_count,
-			    min_st_offset, fd_size, fd_start, fd_end,
-			    others_req, 
-                            m, buftype_extent, buf_idx); 
-        else    
-	if (bglmpio_comm == 0) {
-        ADIOI_R_Exchange_data_alltoallv(fd, buf, flat_buf, offset_list, len_list,
-                            send_size, recv_size, count,
-                            start_pos, partial_send, recd_from_proc, nprocs,
-                            myrank,
-                            buftype_is_contig, contig_access_count,
-                            min_st_offset, fd_size, fd_start, fd_end,
-                            others_req,
-                            m, buftype_extent, buf_idx);
-	}
-
-
-#ifdef PROFILE
-        MPE_Log_event(8, 0, "end communication");
-#endif
-
-	if (for_next_iter) {
-	    tmp_buf = (char *) ADIOI_Malloc(for_next_iter);
-      ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+real_size-for_next_iter) == (ADIO_Offset)(MPIR_Upint)(read_buf+real_size-for_next_iter));
-      ADIOI_Assert((for_next_iter+coll_bufsize) == (size_t)(for_next_iter+coll_bufsize));
-	    memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter);
-	    ADIOI_Free(read_buf);
-	    read_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
-	    memcpy(read_buf, tmp_buf, for_next_iter);
-	    ADIOI_Free(tmp_buf);
-	}
-
-	off += size;
-	done += size;
-    }
-
-    for (i=0; i<nprocs; i++) count[i] = send_size[i] = 0;
-#ifdef PROFILE
-        MPE_Log_event(7, 0, "start communication");
-#endif
-    for (m=ntimes; m<max_ntimes; m++) 
-/* nothing to send, but check for recv. */
-
-	if (bglmpio_comm == 1)
-	ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
-			    send_size, recv_size, count, 
-			    start_pos, partial_send, recd_from_proc, nprocs,
-			    myrank, 
-			    buftype_is_contig, contig_access_count,
-			    min_st_offset, fd_size, fd_start, fd_end,
-			    others_req, m,
-                            buftype_extent, buf_idx); 
-        else    /* strncmp( env_switch, "alltoall", 8 ) == 0 */
-	if (bglmpio_comm == 0)
-        ADIOI_R_Exchange_data_alltoallv(fd, buf, flat_buf, offset_list, len_list,
-                            send_size, recv_size, count, 
-                            start_pos, partial_send, recd_from_proc, nprocs,
-                            myrank, 
-                            buftype_is_contig, contig_access_count,
-                            min_st_offset, fd_size, fd_start, fd_end,
-                            others_req, 
-                            m, buftype_extent, buf_idx);
-
-#ifdef PROFILE
-        MPE_Log_event(8, 0, "end communication");
-#endif
-
-    if (ntimes) ADIOI_Free(read_buf);
-    ADIOI_Free(curr_offlen_ptr);
-    ADIOI_Free(count);
-    ADIOI_Free(partial_send);
-    ADIOI_Free(send_size);
-    ADIOI_Free(recv_size);
-    ADIOI_Free(recd_from_proc);
-    ADIOI_Free(start_pos);
-}
-
-static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
-			 *flat_buf, ADIO_Offset *offset_list, ADIO_Offset
-                         *len_list, int *send_size, int *recv_size,
-			 int *count, int *start_pos, int *partial_send, 
-			 int *recd_from_proc, int nprocs, 
-			 int myrank, int
-			 buftype_is_contig, int contig_access_count,
-			 ADIO_Offset min_st_offset, ADIO_Offset fd_size,
-			 ADIO_Offset *fd_start, ADIO_Offset *fd_end, 
-			 ADIOI_Access *others_req, 
-                         int iter, MPI_Aint buftype_extent, int *buf_idx)
-{
-    int i, j, k=0, tmp=0, nprocs_recv, nprocs_send;
-    char **recv_buf = NULL; 
-    MPI_Request *requests;
-    MPI_Datatype send_type;
-    MPI_Status *statuses;
-
-/* exchange send_size info so that each process knows how much to
-   receive from whom and how much memory to allocate. */
-
-    MPI_Alltoall(send_size, 1, MPI_INT, recv_size, 1, MPI_INT, fd->comm);
-
-    nprocs_recv = 0;
-    for (i=0; i < nprocs; i++) if (recv_size[i]) nprocs_recv++;
-
-    nprocs_send = 0;
-    for (i=0; i<nprocs; i++) if (send_size[i]) nprocs_send++;
-
-    requests = (MPI_Request *)
-	ADIOI_Malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request));
-/* +1 to avoid a 0-size malloc */
-
-/* post recvs. if buftype_is_contig, data can be directly recd. into
-   user buf at location given by buf_idx. else use recv_buf. */
-
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5032, 0, NULL);
-#endif
-
-    if (buftype_is_contig) {
-	j = 0;
-	for (i=0; i < nprocs; i++) 
-	    if (recv_size[i]) { 
-		MPI_Irecv(((char *) buf) + buf_idx[i], recv_size[i], 
-		  MPI_BYTE, i, myrank+i+100*iter, fd->comm, requests+j);
-		j++;
-		buf_idx[i] += recv_size[i];
-	    }
-    }
-    else {
-/* allocate memory for recv_buf and post receives */
-	recv_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char*));
-	for (i=0; i < nprocs; i++) 
-	    if (recv_size[i]) recv_buf[i] = 
-                                  (char *) ADIOI_Malloc(recv_size[i]);
-
-	    j = 0;
-	    for (i=0; i < nprocs; i++) 
-		if (recv_size[i]) {
-		    MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i, 
-			      myrank+i+100*iter, fd->comm, requests+j);
-		    j++;
-#ifdef RDCOLL_DEBUG
-		    DBG_FPRINTF(stderr, "node %d, recv_size %d, tag %d \n", 
-		       myrank, recv_size[i], myrank+i+100*iter); 
-#endif
-		}
-    }
-
-/* create derived datatypes and send data */
-
-    j = 0;
-    for (i=0; i<nprocs; i++) {
-	if (send_size[i]) {
-/* take care if the last off-len pair is a partial send */
-	    if (partial_send[i]) {
-		k = start_pos[i] + count[i] - 1;
-		tmp = others_req[i].lens[k];
-		others_req[i].lens[k] = partial_send[i];
-	    }
-	    MPI_Type_hindexed(count[i], 
-                 &(others_req[i].lens[start_pos[i]]),
-	            &(others_req[i].mem_ptrs[start_pos[i]]), 
-			 MPI_BYTE, &send_type);
-	    /* absolute displacement; use MPI_BOTTOM in send */
-	    MPI_Type_commit(&send_type);
-	    MPI_Isend(MPI_BOTTOM, 1, send_type, i, myrank+i+100*iter,
-		      fd->comm, requests+nprocs_recv+j);
-	    MPI_Type_free(&send_type);
-	    if (partial_send[i]) others_req[i].lens[k] = tmp;
-	    j++;
-	}
-    }
-
-    statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send+nprocs_recv+1) * \
-                                     sizeof(MPI_Status)); 
-     /* +1 to avoid a 0-size malloc */
-
-    /* wait on the receives */
-    if (nprocs_recv) {
-#ifdef NEEDS_MPI_TEST
-	j = 0;
-	while (!j) MPI_Testall(nprocs_recv, requests, &j, statuses);
-#else
-	MPI_Waitall(nprocs_recv, requests, statuses);
-#endif
-
-	/* if noncontiguous, to the copies from the recv buffers */
-	if (!buftype_is_contig) 
-	    ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf,
-				   offset_list, len_list, (unsigned*)recv_size, 
-				   requests, statuses, recd_from_proc, 
-				   nprocs, contig_access_count,
-				   min_st_offset, fd_size, fd_start, fd_end,
-				   buftype_extent);
-    }
-
-    /* wait on the sends*/
-    MPI_Waitall(nprocs_send, requests+nprocs_recv, statuses+nprocs_recv);
-
-    ADIOI_Free(statuses);
-    ADIOI_Free(requests);
-
-    if (!buftype_is_contig) {
-	for (i=0; i < nprocs; i++) 
-	    if (recv_size[i]) ADIOI_Free(recv_buf[i]);
-	ADIOI_Free(recv_buf);
-    }
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5033, 0, NULL);
-#endif
-}
-
-#define ADIOI_BUF_INCR \
-{ \
-    while (buf_incr) { \
-	size_in_buf = ADIOI_MIN(buf_incr, flat_buf_sz); \
-	user_buf_idx += size_in_buf; \
-	flat_buf_sz -= size_in_buf; \
-	if (!flat_buf_sz) { \
-            if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
-            else { \
-                flat_buf_idx = 0; \
-                n_buftypes++; \
-            } \
-            user_buf_idx = flat_buf->indices[flat_buf_idx] + \
-                              (ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
-	    flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
-	} \
-	buf_incr -= size_in_buf; \
-    } \
-}
-
-
-#define ADIOI_BUF_COPY \
-{ \
-    while (size) { \
-	size_in_buf = ADIOI_MIN(size, flat_buf_sz); \
-  ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)buf) + user_buf_idx) == (ADIO_Offset)(MPIR_Upint)(buf + user_buf_idx)); \
-  ADIOI_Assert(size_in_buf == (size_t)size_in_buf); \
-	memcpy(((char *) buf) + user_buf_idx, \
-	       &(recv_buf[p][recv_buf_idx[p]]), size_in_buf); \
-	recv_buf_idx[p] += size_in_buf; /* already tested (size_t)size_in_buf*/ \
-	user_buf_idx += size_in_buf; \
-	flat_buf_sz -= size_in_buf; \
-	if (!flat_buf_sz) { \
-            if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
-            else { \
-                flat_buf_idx = 0; \
-                n_buftypes++; \
-            } \
-            user_buf_idx = flat_buf->indices[flat_buf_idx] + \
-                              (ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
-	    flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
-	} \
-	size -= size_in_buf; \
-	buf_incr -= size_in_buf; \
-    } \
-    ADIOI_BUF_INCR \
-}
-
-static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
-				   *flat_buf, char **recv_buf, ADIO_Offset 
-				   *offset_list, ADIO_Offset *len_list, 
-				   unsigned *recv_size, 
-				   MPI_Request *requests, MPI_Status *statuses,
-				   int *recd_from_proc, int nprocs,
-				   int contig_access_count, 
-				   ADIO_Offset min_st_offset, 
-				   ADIO_Offset fd_size, ADIO_Offset *fd_start, 
-				   ADIO_Offset *fd_end,
-				   MPI_Aint buftype_extent)
-{
-
-/* this function is only called if buftype is not contig */
-
-    int i, p, flat_buf_idx;
-    ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size;
-    int n_buftypes;
-    ADIO_Offset off, len, rem_len, user_buf_idx;
-    /* Not sure unsigned is necessary, but it makes the math safer */
-    unsigned *curr_from_proc, *done_from_proc, *recv_buf_idx;
-
-    ADIOI_UNREFERENCED_ARG(requests);
-    ADIOI_UNREFERENCED_ARG(statuses);
-
-/*  curr_from_proc[p] = amount of data recd from proc. p that has already
-                        been accounted for so far
-    done_from_proc[p] = amount of data already recd from proc. p and 
-                        filled into user buffer in previous iterations
-    user_buf_idx = current location in user buffer 
-    recv_buf_idx[p] = current location in recv_buf of proc. p  */
-    curr_from_proc = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned));
-    done_from_proc = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned));
-    recv_buf_idx   = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned));
-
-    for (i=0; i < nprocs; i++) {
-	recv_buf_idx[i] = curr_from_proc[i] = 0;
-	done_from_proc[i] = recd_from_proc[i];
-    }
-
-    user_buf_idx = flat_buf->indices[0];
-    flat_buf_idx = 0;
-    n_buftypes = 0;
-    flat_buf_sz = flat_buf->blocklens[0];
-
-    /* flat_buf_idx = current index into flattened buftype
-       flat_buf_sz = size of current contiguous component in 
-                flattened buf */
-
-    for (i=0; i<contig_access_count; i++) { 
-	off     = offset_list[i];
-	rem_len = len_list[i];
-
-	/* this request may span the file domains of more than one process */
-	while (rem_len > 0) {
-	    len = rem_len;
-	    /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
-	     * longer than the single region that processor "p" is responsible
-	     * for.
-	     */
-	    p = ADIOI_BGL_Calc_aggregator(fd,
-				      off,
-				      min_st_offset,
-				      &len,
-				      fd_size,
-				      fd_start,
-				      fd_end);
-
-	    if (recv_buf_idx[p] < recv_size[p]) {
-		if (curr_from_proc[p]+len > done_from_proc[p]) {
-		    if (done_from_proc[p] > curr_from_proc[p]) {
-			size = ADIOI_MIN(curr_from_proc[p] + len - 
-			      done_from_proc[p], recv_size[p]-recv_buf_idx[p]);
-			buf_incr = done_from_proc[p] - curr_from_proc[p];
-			ADIOI_BUF_INCR
-			buf_incr = curr_from_proc[p]+len-done_from_proc[p];
-      ADIOI_Assert((done_from_proc[p] + size) == (unsigned)((ADIO_Offset)done_from_proc[p] + size));
-			curr_from_proc[p] = done_from_proc[p] + size;
-			ADIOI_BUF_COPY
-		    }
-		    else {
-			size = ADIOI_MIN(len,recv_size[p]-recv_buf_idx[p]);
-			buf_incr = len;
-      ADIOI_Assert((curr_from_proc[p] + size) == (unsigned)((ADIO_Offset)curr_from_proc[p] + size));
-			curr_from_proc[p] += (unsigned) size;
-			ADIOI_BUF_COPY
-		    }
-		}
-		else {
-        ADIOI_Assert((curr_from_proc[p] + len) == (unsigned)((ADIO_Offset)curr_from_proc[p] + len));
-		    curr_from_proc[p] += (unsigned) len;
-		    buf_incr = len;
-		    ADIOI_BUF_INCR
-		}
-	    }
-	    else {
-		buf_incr = len;
-		ADIOI_BUF_INCR
-	    }
-	    off     += len;
-	    rem_len -= len;
-	}
-    }
-    for (i=0; i < nprocs; i++) 
-	if (recv_size[i]) recd_from_proc[i] = curr_from_proc[i];
-
-    ADIOI_Free(curr_from_proc);
-    ADIOI_Free(done_from_proc);
-    ADIOI_Free(recv_buf_idx);
-}
-
-static void ADIOI_R_Exchange_data_alltoallv(
-                ADIO_File fd, void *buf, ADIOI_Flatlist_node
-                *flat_buf, ADIO_Offset *offset_list, ADIO_Offset
-                *len_list, int *send_size, int *recv_size, 
-                int *count, int *start_pos, int *partial_send,
-                int *recd_from_proc, int nprocs,
-                int myrank, int
-                buftype_is_contig, int contig_access_count,
-                ADIO_Offset min_st_offset, ADIO_Offset fd_size,
-                ADIO_Offset *fd_start, ADIO_Offset *fd_end, 
-                ADIOI_Access *others_req,
-                int iter, MPI_Aint buftype_extent, int *buf_idx)
-{   
-    int i, j, k=0, tmp=0, nprocs_recv, nprocs_send;
-    char **recv_buf = NULL;
-    MPI_Request *requests=NULL;
-    MPI_Status *statuses=NULL;
-    int rtail, stail;
-    char *sbuf_ptr, *from_ptr;
-    int  len;
-    int  *sdispls, *rdispls;
-    char *all_recv_buf, *all_send_buf;
-
-  /* exchange send_size info so that each process knows how much to
-     receive from whom and how much memory to allocate. */
-    MPI_Alltoall(send_size, 1, MPI_INT, recv_size, 1, MPI_INT, fd->comm);
-    
-    nprocs_recv = 0;
-    for (i=0; i<nprocs; i++) if (recv_size[i]) { nprocs_recv++; break; }
-    
-    nprocs_send = 0;
-    for (i=0; i<nprocs; i++) if (send_size[i]) { nprocs_send++; break; }
-    
-  /* receiver side data structures */
-    rdispls = (int *) ADIOI_Malloc( nprocs * sizeof(int) );
-    rtail = 0;
-    for (i=0; i<nprocs; i++) { rdispls[i] = rtail; rtail += recv_size[i]; }
-
-        /* data buffer */
-    all_recv_buf = (char *) ADIOI_Malloc( rtail );
-    recv_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
-    for (i=0; i<nprocs; i++) { recv_buf[i] = all_recv_buf + rdispls[i]; }
-
-  /* sender side data structures */
-    sdispls = (int *) ADIOI_Malloc( nprocs * sizeof(int) );
-    stail = 0;
-    for (i=0; i<nprocs; i++) { sdispls[i] = stail; stail += send_size[i]; }
-
-        /* data buffer */
-    all_send_buf = (char *) ADIOI_Malloc( stail );
-    for (i=0; i<nprocs; i++)
-    {
-        if (send_size[i]) {
-	    if (partial_send[i]) {
-		k = start_pos[i] + count[i] - 1;
-		tmp = others_req[i].lens[k];
-		others_req[i].lens[k] = partial_send[i];
-	    }
-            sbuf_ptr = all_send_buf + sdispls[i];
-            for (j=0; j<count[i]; j++) {
-                ADIOI_ENSURE_AINT_FITS_IN_PTR( others_req[i].mem_ptrs[ start_pos[i]+j ]);
-                from_ptr = (char *) ADIOI_AINT_CAST_TO_VOID_PTR ( others_req[i].mem_ptrs[ start_pos[i]+j ] );
-                len      =           others_req[i].lens[     start_pos[i]+j ]  ;
-                memcpy( sbuf_ptr, from_ptr, len );
-                sbuf_ptr += len;
-            }
-	    if (partial_send[i]) others_req[i].lens[k] = tmp;
-        }
-    }
-
-#if RDCOLL_DEBUG
-    DBG_FPRINTF(stderr, "\tsend_size = [%d]%2d,",0,send_size[0]);
-    for (i=1; i<nprocs; i++) if(send_size[i-1]!=send_size[i]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i,send_size[i] ); }
-    DBG_FPRINTF(stderr, "\trecv_size =  [%d]%2d,",0,recv_size[0]);
-    for (i=1; i<nprocs; i++) if(recv_size[i-1]!=recv_size[i]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i,recv_size[i] ); }
-    DBG_FPRINTF(stderr, "\tsdispls   =  [%d]%2d,",0,sdispls[0]);
-    for (i=1; i<nprocs; i++) if(sdispls[i-1]!=sdispls[i]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i,sdispls  [i] ); }
-    DBG_FPRINTF(stderr, "\trdispls   =  [%d]%2d,",0,rdispls[0]);
-    for (i=1; i<nprocs; i++) if(rdispls[i-1]!=rdispls[i]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i,rdispls  [i] ); }
-    DBG_FPRINTF(stderr, "\ttails = %4d, %4d\n", stail, rtail );
-    if (nprocs_send) {
-    DBG_FPRINTF(stderr, "\tall_send_buf =  [%d]%2d,",0,all_send_buf[0]);
-    for (i=1; i<nprocs; i++) if(all_send_buf[(i-1)*131072]!=all_send_buf[i*131072]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i, all_send_buf  [i*131072] ); }
-    }
-#endif
-    
-  /* alltoallv */
-    MPI_Alltoallv( 
-            all_send_buf, send_size, sdispls, MPI_BYTE,
-            all_recv_buf, recv_size, rdispls, MPI_BYTE,
-            fd->comm ); 
-
-#if 0
-    DBG_FPRINTF(stderr, "\tall_recv_buf = " );
-    for (i=131072; i<131073; i++) { DBG_FPRINTF(stderr, "%2d,", all_recv_buf  [i] ); }
-    DBG_FPRINTF(stderr, "\n" );
-#endif
-    
-  /* unpack at the receiver side */
-    if (nprocs_recv) { 
-        if (!buftype_is_contig)
-            ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf,
-                                   offset_list, len_list, (unsigned*)recv_size,
-                                   requests, statuses,          /* never used inside */
-                                   recd_from_proc,
-                                   nprocs, contig_access_count,
-                                   min_st_offset, fd_size, fd_start, fd_end,
-                               buftype_extent);
-        else {
-	    rtail = 0;
-            for (i=0; i < nprocs; i++)
-                if (recv_size[i]) {
-                    memcpy( (char *)buf + buf_idx[i], all_recv_buf + rtail, recv_size[i] );
-		    buf_idx[i] += recv_size[i];
-		    rtail += recv_size[i];
-		}
-        }
-    }
-    
-    ADIOI_Free( all_send_buf );
-    ADIOI_Free( all_recv_buf );
-    ADIOI_Free( recv_buf  );
-    ADIOI_Free( sdispls );
-    ADIOI_Free( rdispls );
-    return; 
-}   
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_read.c b/src/mpi/romio/adio/ad_bgl/ad_bgl_read.c
deleted file mode 100644
index 96adf91..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_read.c
+++ /dev/null
@@ -1,549 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_read.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "ad_bgl.h"
-#include "adio_extern.h"
-
-#include "ad_bgl_tuning.h"
-
-void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count, 
-                     MPI_Datatype datatype, int file_ptr_type,
-		     ADIO_Offset offset, ADIO_Status *status, int *error_code)
-{
-    MPI_Count err=-1, datatype_size;
-    ADIO_Offset len;
-    static char myname[] = "ADIOI_BGL_READCONTIG";
-#if BGL_PROFILE
-		/* timing */
-		double io_time, io_time2;
-
-		if (bglmpio_timing) {
-		    io_time = MPI_Wtime();
-		    bglmpio_prof_cr[ BGLMPIO_CIO_DATA_SIZE ] += len;
-		}
-#endif
-
-    MPI_Type_size_x(datatype, &datatype_size);
-    len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
-    ADIOI_Assert(len == (unsigned int) len); /* read takes an unsigned int parm */
-
-#if BGL_PROFILE
-
-    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
-        	if (bglmpio_timing2) io_time2 = MPI_Wtime();
-	if (fd->fp_sys_posn != offset)
-	    lseek(fd->fd_sys, offset, SEEK_SET);
-        	if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
-	if (fd->atomicity)
-	    ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-	else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
-        	if (bglmpio_timing2) io_time2 = MPI_Wtime();
-	err = read(fd->fd_sys, buf, (unsigned int)len);
-        	if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
-	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
-	fd->fp_sys_posn = offset + err;
-	/* individual file pointer not updated */        
-    }
-    else {  /* read from curr. location of ind. file pointer */
-	offset = fd->fp_ind;
-        	if (bglmpio_timing2) io_time2 = MPI_Wtime();
-	if (fd->fp_sys_posn != fd->fp_ind)
-	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
-        	if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
-	if (fd->atomicity)
-	    ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-	else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
-        	if (bglmpio_timing2) io_time2 = MPI_Wtime();
-	err = read(fd->fd_sys, buf, (unsigned int)len);
-        	if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
-	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
-	fd->fp_ind += err;
-	fd->fp_sys_posn = fd->fp_ind;
-    }
-
-#else	/* BGL_PROFILE */
-
-    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
-	if (fd->fp_sys_posn != offset)
-	    lseek(fd->fd_sys, offset, SEEK_SET);
-	if (fd->atomicity)
-	    ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-	else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
-	err = read(fd->fd_sys, buf, (unsigned int)len);
-	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
-	fd->fp_sys_posn = offset + err;
-	/* individual file pointer not updated */        
-    }
-    else {  /* read from curr. location of ind. file pointer */
-	offset = fd->fp_ind;
-	if (fd->fp_sys_posn != fd->fp_ind)
-	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
-	if (fd->atomicity)
-	    ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-	else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
-	err = read(fd->fd_sys, buf, (unsigned int)len);
-	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
-	fd->fp_ind += err;
-	fd->fp_sys_posn = fd->fp_ind;
-    }
-
-#endif   /* BGL_PROFILE */
-
-#if BGL_PROFILE
-    		if (bglmpio_timing) bglmpio_prof_cr[ BGLMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
-#endif
-
-    /* --BEGIN ERROR HANDLING-- */
-    if (err == -1) {
-	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
-					   myname, __LINE__, MPI_ERR_IO,
-					   "**io", "**io %s", strerror(errno));
-	return;
-    }
-    /* --END ERROR HANDLING-- */
-
-#ifdef HAVE_STATUS_SET_BYTES
-    MPIR_Status_set_bytes(status, datatype, err);
-#endif
-
-    *error_code = MPI_SUCCESS;
-}
-
-
-#define ADIOI_BUFFERED_READ \
-{ \
-    if (req_off >= readbuf_off + readbuf_len) { \
-	readbuf_off = req_off; \
-	readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));\
-	lseek(fd->fd_sys, readbuf_off, SEEK_SET);\
-        if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\
-        err = read(fd->fd_sys, readbuf, readbuf_len);\
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\
-        if (err == -1) err_flag = 1; \
-    } \
-    while (req_len > readbuf_off + readbuf_len - req_off) { \
-  ADIOI_Assert((readbuf_off + readbuf_len - req_off) == (int) (readbuf_off + readbuf_len - req_off));\
-	partial_read = (int) (readbuf_off + readbuf_len - req_off); \
-	tmp_buf = (char *) ADIOI_Malloc(partial_read); \
-	memcpy(tmp_buf, readbuf+readbuf_len-partial_read, partial_read); \
-	ADIOI_Free(readbuf); \
-	readbuf = (char *) ADIOI_Malloc(partial_read + max_bufsize); \
-	memcpy(readbuf, tmp_buf, partial_read); \
-	ADIOI_Free(tmp_buf); \
-	readbuf_off += readbuf_len-partial_read; \
-	readbuf_len = (unsigned) (partial_read + ADIOI_MIN(max_bufsize, \
-				       end_offset-readbuf_off+1)); \
-	lseek(fd->fd_sys, readbuf_off+partial_read, SEEK_SET);\
-        if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
-        err = read(fd->fd_sys, readbuf+partial_read, readbuf_len-partial_read);\
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
-        if (err == -1) err_flag = 1; \
-    } \
-    ADIOI_Assert(req_len == (size_t)req_len); \
-    memcpy((char *)buf + userbuf_off, readbuf+req_off-readbuf_off, req_len); \
-}
-
-
-void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
-                       MPI_Datatype datatype, int file_ptr_type,
-                       ADIO_Offset offset, ADIO_Status *status, int
-                       *error_code)
-{
-/* offset is in units of etype relative to the filetype. */
-
-
-    ADIOI_Flatlist_node *flat_buf, *flat_file;
-    ADIO_Offset i_offset, new_brd_size, brd_size, size;
-    int i, j, k, err=-1, st_index=0;
-    ADIO_Offset frd_size=0, new_frd_size, st_frd_size;
-    unsigned num, bufsize; 
-    int n_etypes_in_filetype;
-    ADIO_Offset n_filetypes, etype_in_filetype, st_n_filetypes, size_in_filetype;
-    ADIO_Offset abs_off_in_filetype=0;
-    MPI_Count filetype_size, etype_size, buftype_size, partial_read;
-    MPI_Aint filetype_extent, buftype_extent; 
-    int buf_count, buftype_is_contig, filetype_is_contig;
-    ADIO_Offset userbuf_off, req_len, sum;
-    ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off;
-    char *readbuf, *tmp_buf, *value;
-    int err_flag=0, info_flag;
-    unsigned max_bufsize, readbuf_len;
-    static char myname[] = "ADIOI_BGL_READSTRIDED";
-
-    if (fd->hints->ds_read == ADIOI_HINT_DISABLE) {
-  /* if user has disabled data sieving on reads, use naive
-	 * approach instead.
-	 */
-      /*FPRINTF(stderr, "ADIOI_GEN_ReadStrided_naive(%d):\n", __LINE__);*/
-      ADIOI_GEN_ReadStrided_naive(fd, 
-				    buf,
-				    count,
-				    datatype,
-				    file_ptr_type,
-				    offset,
-				    status,
-				    error_code);
-    	return;
-    }
-    /*FPRINTF(stderr, "%s(%d):\n",myname, __LINE__);*/
-
-    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
-    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
-
-    MPI_Type_size_x(fd->filetype, &filetype_size);
-    if ( ! filetype_size ) {
-#ifdef HAVE_STATUS_SET_BYTES
-	MPIR_Status_set_bytes(status, datatype, 0);
-#endif
-	*error_code = MPI_SUCCESS; 
-	return;
-    }
-
-    MPI_Type_extent(fd->filetype, &filetype_extent);
-    MPI_Type_size_x(datatype, &buftype_size);
-    MPI_Type_extent(datatype, &buftype_extent);
-    etype_size = fd->etype_size;
-
-    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
-    bufsize = buftype_size * count;
-
-/* get max_bufsize from the info object. */
-
-    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
-    ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, 
-                 &info_flag);
-    max_bufsize = atoi(value);
-    ADIOI_Free(value);
-
-    if (!buftype_is_contig && filetype_is_contig) {
-
-/* noncontiguous in memory, contiguous in file. */
-
-	ADIOI_Flatten_datatype(datatype);
-	flat_buf = ADIOI_Flatlist;
-	while (flat_buf->type != datatype) flat_buf = flat_buf->next;
-
-        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : 
-                 fd->disp + (ADIO_Offset)etype_size * offset;
-
-	start_off = off;
-	end_offset = off + bufsize - 1;
-        readbuf_off = off;
-        readbuf = (char *) ADIOI_Malloc(max_bufsize);
-        readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));
-
-/* if atomicity is true, lock (exclusive) the region to be accessed */
-        if (fd->atomicity)
-            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
-
-	lseek(fd->fd_sys, readbuf_off, SEEK_SET);
-        if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);
-        err = read(fd->fd_sys, readbuf, readbuf_len);
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len);
-        if (err == -1) err_flag = 1;
-
-        for (j=0; j<count; j++) 
-        {
-          int i;
-              for (i=0; i<flat_buf->count; i++) {
-                  userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
-      req_off = off;
-      req_len = flat_buf->blocklens[i];
-      ADIOI_BUFFERED_READ
-                  off += flat_buf->blocklens[i];
-              }
-        }
-
-        if (fd->atomicity)
-            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
-
-        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
-
-	ADIOI_Free(readbuf); /* malloced in the buffered_read macro */
-
-	if (err_flag) {
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_IO, "**io",
-					       "**io %s", strerror(errno));
-	}
-	else *error_code = MPI_SUCCESS;
-    }
-
-    else {  /* noncontiguous in file */
-
-/* filetype already flattened in ADIO_Open */
-	flat_file = ADIOI_Flatlist;
-	while (flat_file->type != fd->filetype) flat_file = flat_file->next;
-	disp = fd->disp;
-
-	if (file_ptr_type == ADIO_INDIVIDUAL) {
-	    /* Wei-keng reworked type processing to be a bit more efficient */
-            offset       = fd->fp_ind - disp;
-            n_filetypes  = (offset - flat_file->indices[0]) / filetype_extent;
-	    offset -= (ADIO_Offset)n_filetypes * filetype_extent;
-	    /* now offset is local to this extent */
-
-            /* find the block where offset is located, skip blocklens[i]==0 */
-            for (i=0; i<flat_file->count; i++) {
-                ADIO_Offset dist;
-                if (flat_file->blocklens[i] == 0) continue;
-                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
-                /* frd_size is from offset to the end of block i */
-		if (dist == 0) {
-		    i++;
-		    offset   = flat_file->indices[i];
-		    frd_size = flat_file->blocklens[i];
-		    break;
-		}
-		if (dist > 0) {
-                    frd_size = dist;
-		    break;
-		}
-	    }
-            st_index = i;  /* starting index in flat_file->indices[] */
-            offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
-	}
-	else {
-	    n_etypes_in_filetype = filetype_size/etype_size;
-	    n_filetypes = offset / n_etypes_in_filetype;
-	    etype_in_filetype = offset % n_etypes_in_filetype;
-	    size_in_filetype = etype_in_filetype * etype_size;
- 
-	    sum = 0;
-	    for (i=0; i<flat_file->count; i++) {
-		sum += flat_file->blocklens[i];
-		if (sum > size_in_filetype) {
-		    st_index = i;
-		    frd_size = sum - size_in_filetype;
-		    abs_off_in_filetype = flat_file->indices[i] +
-			size_in_filetype - (sum - flat_file->blocklens[i]);
-		    break;
-		}
-	    }
-
-	    /* abs. offset in bytes in the file */
-	    offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + 
-		    abs_off_in_filetype;
-	}
-
-        start_off = offset;
-
-	/* Wei-keng Liao: read request is within a single flat_file contig
-	 * block e.g. with subarray types that actually describe the whole
-	 * array */
-	if (buftype_is_contig && bufsize <= frd_size) {
-            ADIO_ReadContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
-                             offset, status, error_code);
-
-	    if (file_ptr_type == ADIO_INDIVIDUAL) {
-                /* update MPI-IO file pointer to point to the first byte that 
-		 * can be accessed in the fileview. */
-		fd->fp_ind = offset + bufsize;
-		if (bufsize == frd_size) {
-		    do {
-			st_index++;
-			if (st_index == flat_file->count) {
-			    st_index = 0;
-			    n_filetypes++;
-			}
-                    } while (flat_file->blocklens[st_index] == 0);
-		    fd->fp_ind = disp + flat_file->indices[st_index]
-                               + n_filetypes*filetype_extent;
-		}
-	    }
-	    fd->fp_sys_posn = -1;   /* set it to null. */ 
-#ifdef HAVE_STATUS_SET_BYTES
-	    MPIR_Status_set_bytes(status, datatype, bufsize);
-#endif 
-            return;
-	}
-
-       /* Calculate end_offset, the last byte-offset that will be accessed.
-         e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/
-
-	st_frd_size = frd_size;
-	st_n_filetypes = n_filetypes;
-	i_offset = 0;
-	j = st_index;
-	off = offset;
-	frd_size = ADIOI_MIN(st_frd_size, bufsize);
-	while (i_offset < bufsize) {
-	    i_offset += frd_size;
-	    end_offset = off + frd_size - 1;
-
-	    j = (j+1) % flat_file->count;
-            n_filetypes += (j == 0) ? 1 : 0;
-            while (flat_file->blocklens[j]==0) {
-		j = (j+1) % flat_file->count;
-		n_filetypes += (j == 0) ? 1 : 0;
-	    }
-	    off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent;
-	    frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
-	}
-
-/* if atomicity is true, lock (exclusive) the region to be accessed */
-        if (fd->atomicity)
-            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
-
-        /* initial read into readbuf */
-	readbuf_off = offset;
-	readbuf = (char *) ADIOI_Malloc(max_bufsize);
-	readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));
-
-	lseek(fd->fd_sys, offset, SEEK_SET);
-        if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, offset, SEEK_SET, readbuf_len);
-        err = read(fd->fd_sys, readbuf, readbuf_len);
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, offset, SEEK_SET, readbuf_len);
-
-        if (err == -1) err_flag = 1;
-
-	if (buftype_is_contig && !filetype_is_contig) {
-
-/* contiguous in memory, noncontiguous in file. should be the most
-   common case. */
-
-	    i_offset = 0;
-	    j = st_index;
-	    off = offset;
-	    n_filetypes = st_n_filetypes;
-	    frd_size = ADIOI_MIN(st_frd_size, bufsize);
-	    while (i_offset < bufsize) {
-                if (frd_size) { 
-                    /* TYPE_UB and TYPE_LB can result in 
-                       frd_size = 0. save system call in such cases */ 
-		    /* lseek(fd->fd_sys, off, SEEK_SET);
-		    err = read(fd->fd_sys, ((char *) buf) + i, frd_size);*/
-
-		    req_off = off;
-		    req_len = frd_size;
-		    userbuf_off = i_offset;
-		    ADIOI_BUFFERED_READ
-		}
-		i_offset += frd_size;
-
-                if (off + frd_size < disp + flat_file->indices[j] +
-                   flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
-                       off += frd_size;
-                /* did not reach end of contiguous block in filetype.
-                   no more I/O needed. off is incremented by frd_size. */
-                else {
-                    j = (j+1) % flat_file->count;
-                    n_filetypes += (j == 0) ? 1 : 0;
-                    while (flat_file->blocklens[j]==0) {
-                        j = (j+1) % flat_file->count;
-                        n_filetypes += (j == 0) ? 1 : 0;
-		    }
-		    off = disp + flat_file->indices[j] + 
-                                        n_filetypes*(ADIO_Offset)filetype_extent;
-		    frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
-		}
-	    }
-	}
-	else {
-/* noncontiguous in memory as well as in file */
-
-	    ADIOI_Flatten_datatype(datatype);
-	    flat_buf = ADIOI_Flatlist;
-	    while (flat_buf->type != datatype) flat_buf = flat_buf->next;
-
-	    k = num = buf_count = 0;
-	    i_offset = flat_buf->indices[0];
-	    j = st_index;
-	    off = offset;
-	    n_filetypes = st_n_filetypes;
-	    frd_size = st_frd_size;
-	    brd_size = flat_buf->blocklens[0];
-
-	    while (num < bufsize) {
-		size = ADIOI_MIN(frd_size, brd_size);
-		if (size) {
-		    /* lseek(fd->fd_sys, off, SEEK_SET);
-		    err = read(fd->fd_sys, ((char *) buf) + i, size); */
-
-		    req_off = off;
-		    req_len = size;
-		    userbuf_off = i_offset;
-		    ADIOI_BUFFERED_READ
-		}
-
-		new_frd_size = frd_size;
-		new_brd_size = brd_size;
-
-		if (size == frd_size) {
-/* reached end of contiguous block in file */
-                    j = (j+1) % flat_file->count;
-                    n_filetypes += (j == 0) ? 1 : 0;
-                    while (flat_file->blocklens[j]==0) {
-                        j = (j+1) % flat_file->count;
-                        n_filetypes += (j == 0) ? 1 : 0;
-		    }
-
-		    off = disp + flat_file->indices[j] + 
-                                              n_filetypes*(ADIO_Offset)filetype_extent;
-
-		    new_frd_size = flat_file->blocklens[j];
-		    if (size != brd_size) {
-			i_offset += size;
-			new_brd_size -= size;
-		    }
-		}
-
-		if (size == brd_size) {
-/* reached end of contiguous block in memory */
-
-		    k = (k + 1)%flat_buf->count;
-		    buf_count++;
-		    i_offset = ((ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
-			flat_buf->indices[k]); 
-		    new_brd_size = flat_buf->blocklens[k];
-		    if (size != frd_size) {
-			off += size;
-			new_frd_size -= size;
-		    }
-		}
-    ADIOI_Assert(((ADIO_Offset)num + size) == (unsigned)(num + size));
-		num += size;
-		frd_size = new_frd_size;
-                brd_size = new_brd_size;
-	    }
-	}
-	
-        if (fd->atomicity)
-            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
-
-	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
-
-	ADIOI_Free(readbuf); /* malloced in the buffered_read macro */
-
-	if (err_flag) {
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_IO, "**io",
-					       "**io %s", strerror(errno));
-	}
-	else *error_code = MPI_SUCCESS;
-    }
-
-    fd->fp_sys_posn = -1;   /* set it to null. */
-
-#ifdef HAVE_STATUS_SET_BYTES
-    MPIR_Status_set_bytes(status, datatype, bufsize);
-/* This is a temporary way of filling in status. The right way is to 
-   keep track of how much data was actually read and placed in buf 
-   by ADIOI_BUFFERED_READ. */
-#endif
-
-    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
-}
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_setsh.c b/src/mpi/romio/adio/ad_bgl/ad_bgl_setsh.c
deleted file mode 100644
index b7a8bce..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_setsh.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_setsh.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "ad_bgl.h"
-
-/* set the shared file pointer to "offset" etypes relative to the current 
-   view */
-
-/*
-This looks very similar to ADIOI_GEN_Set_shared_fp, except this 
-function avoids locking the file twice.  The generic version does
-
-Write lock
-ADIO_WriteContig
-Unlock
-
-For BGL, ADIOI_BGL_WriteContig does a lock before writing to disable
-caching. To avoid the lock being called twice, this version for BGL does
-
-Write lock
-Lseek
-Write
-Unlock 
-
-*/
-
-void ADIOI_BGL_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code)
-{
-    int err;
-    MPI_Comm dupcommself;
-    static char myname[] = "ADIOI_BGL_SET_SHARED_FP";
-
-    if (fd->shared_fp_fd == ADIO_FILE_NULL) {
-	MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
-	fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF, dupcommself,
-				     fd->shared_fp_fname, 
-				     fd->file_system, fd->fns,
-				     ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE, 
-				     0, MPI_BYTE, MPI_BYTE, MPI_INFO_NULL, 
-				     ADIO_PERM_NULL, error_code);
-    }
-
-    if (*error_code != MPI_SUCCESS) return;
-
-    ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-    lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
-    err = write(fd->shared_fp_fd->fd_sys, &offset, sizeof(ADIO_Offset));
-    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-
-    if (err == -1) {
-	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
-					   myname, __LINE__, MPI_ERR_IO,
-					   "**io",
-					   "**io %s", strerror(errno));
-    }
-    else *error_code = MPI_SUCCESS;
-}
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_tuning.c b/src/mpi/romio/adio/ad_bgl/ad_bgl_tuning.c
deleted file mode 100644
index 0f54de1..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_tuning.c
+++ /dev/null
@@ -1,163 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_tuning.c
- * \brief defines ad_bgl performance tuning
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 2008 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-/*---------------------------------------------------------------------
- * ad_bgl_tuning.c
- *
- * defines global variables and functions for performance tuning and 
- * functional debugging.
- *---------------------------------------------------------------------*/
-
-#include "ad_bgl_tuning.h"
-#include "mpi.h"
-
-#if !defined(PVFS2_SUPER_MAGIC)
-  #define PVFS2_SUPER_MAGIC (0x20030528)
-#endif
-
-int 	bglmpio_timing;
-int 	bglmpio_timing2;
-int 	bglmpio_comm;
-int 	bglmpio_tunegather;
-int 	bglmpio_tuneblocking;
-long    bglocklessmpio_f_type;
-
-double	bglmpio_prof_cw    [BGLMPIO_CIO_LAST];
-double	bglmpio_prof_cr    [BGLMPIO_CIO_LAST];
-
-/* set internal variables for tuning environment variables */
-/** \page mpiio_vars MPIIO Configuration
-  \section env_sec Environment Variables
- * - BGLMPIO_COMM - Define how data is exchanged on collective
- *   reads and writes.  Possible values:
- *   - 0 - Use MPI_Alltoallv.
- *   - 1 - Use MPI_Isend/MPI_Irecv.
- *   - Default is 0.
- *
- * - BGLMPIO_TIMING - collect timing breakdown for MPI I/O collective calls.
- *   Must also compile the library with BGL_PROFILE defined. Possible values:
- *   - 0 - Do not collect/report timing.
- *   - 1 - Collect/report timing.
- *   - Default is 0.
- *
- * - BGLMPIO_TIMING2 - collect additional averages for MPI I/O collective calls.
- *   Must also compile the library with BGL_PROFILE defined. Possible values:
- *   - 0 - Do not collect/report averages.
- *   - 1 - Collect/report averages.
- *   - Default is 0.
- *
- * - BGLMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
- *   for aggregator collective i/o.  Possible values:
- *   - 0 - Use two MPI_Allgather's to collect starting and ending offsets.
- *   - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
- *   - Default is 1.
- *
- * - BGLMPIO_TUNEBLOCKING - Tune how aggregate file domains are 
- *   calculated (block size).  Possible values:
- *   - 0 - Evenly calculate file domains across aggregators.  Also use 
- *   MPI_Isend/MPI_Irecv to exchange domain information.
- *   - 1 - Align file domains with the underlying file system's block size.  Also use 
- *   MPI_Alltoallv to exchange domain information.
- *   - Default is 1.
- * 
- * - BGLOCKLESSMPIO_F_TYPE - Specify a filesystem type that should run
- *   the ad_bglockless driver.   NOTE: Using romio prefixes (such as
- *   "bgl:" or "bglockless:") on a file name will override this environment
- *   variable.  Possible values:
- *   - 0xnnnnnnnn - Any valid file system type (or "magic number") from
- *                  statfs() field f_type.
- *   - The default is 0x20030528 (PVFS2_SUPER_MAGIC)
- *
-*/
-void ad_bgl_get_env_vars() {
-    char *x, *dummy;
-
-    bglmpio_comm   = 0;
-	x = getenv( "BGLMPIO_COMM"         ); 
-	if (x) bglmpio_comm         = atoi(x);
-    bglmpio_timing = 0;
-	x = getenv( "BGLMPIO_TIMING"       ); 
-	if (x) bglmpio_timing       = atoi(x);
-    bglmpio_timing2 = 0;
-	x = getenv( "BGLMPIO_TIMING2"      ); 
-	if (x) bglmpio_timing2      = atoi(x);
-    bglmpio_tunegather = 1;
-	x = getenv( "BGLMPIO_TUNEGATHER"   ); 
-	if (x) bglmpio_tunegather   = atoi(x);
-    bglmpio_tuneblocking = 1;
-	x = getenv( "BGLMPIO_TUNEBLOCKING" ); 
-	if (x) bglmpio_tuneblocking = atoi(x);
-    bglocklessmpio_f_type = PVFS2_SUPER_MAGIC;
-    x = getenv( "BGLOCKLESSMPIO_F_TYPE" ); 
-    if (x) bglocklessmpio_f_type = strtol(x,&dummy,0);
-    DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n",
-            bglocklessmpio_f_type,bglocklessmpio_f_type);
-}
-
-/* report timing breakdown for MPI I/O collective call */
-void ad_bgl_wr_timing_report( int rw, ADIO_File fd, int myrank, int nprocs )
-{
-    int i;
-
-    if (bglmpio_timing) {
-
-	double *bglmpio_prof_org = bglmpio_prof_cr;
-	if (rw) bglmpio_prof_org = bglmpio_prof_cw;
-
-	double bglmpio_prof_avg[ BGLMPIO_CIO_LAST ];
-	double bglmpio_prof_max[ BGLMPIO_CIO_LAST ];
-	
-	MPI_Reduce( bglmpio_prof_org, bglmpio_prof_avg, BGLMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, fd->comm );
-	MPI_Reduce( bglmpio_prof_org, bglmpio_prof_max, BGLMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, fd->comm );
-
-	if (myrank == 0) {
-
-	    for (i=0; i<BGLMPIO_CIO_LAST; i++) bglmpio_prof_avg[i] /= nprocs;
-
-	    if (bglmpio_timing2) {
-		bglmpio_prof_avg[ BGLMPIO_CIO_B_POSI_RW  ] = bglmpio_prof_avg[ BGLMPIO_CIO_DATA_SIZE ] * nprocs / 
-							     bglmpio_prof_max[ BGLMPIO_CIO_T_POSI_RW  ];
-		bglmpio_prof_avg[ BGLMPIO_CIO_B_MPIO_RW  ] = bglmpio_prof_avg[ BGLMPIO_CIO_DATA_SIZE ] * nprocs / 
-							     bglmpio_prof_max[ BGLMPIO_CIO_T_MPIO_RW  ];
-	    } else {
-
-		bglmpio_prof_avg[ BGLMPIO_CIO_B_POSI_RW  ] = 0;
-		bglmpio_prof_avg[ BGLMPIO_CIO_B_MPIO_RW  ] = 0;
-	    }
-
-		bglmpio_prof_avg[ BGLMPIO_CIO_B_MPIO_CRW ] = bglmpio_prof_avg[ BGLMPIO_CIO_DATA_SIZE ] * nprocs / 
-							     bglmpio_prof_max[ BGLMPIO_CIO_T_MPIO_CRW ];
-
-	    printf("\tTIMING-1 %1s , ", (rw ? "W" : "R") );
-	    printf(    "SZ: %12.4f , ", bglmpio_prof_avg[ BGLMPIO_CIO_DATA_SIZE ] * nprocs );
-	    printf(  "SK-a: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_T_SEEK ]     );
-	    printf(  "SK-m: %10.3f , ", bglmpio_prof_max[ BGLMPIO_CIO_T_SEEK ]     );
-	    printf(  "LC-a: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_T_LCOMP ]    );
-	    printf(  "GA-m: %10.3f , ", bglmpio_prof_max[ BGLMPIO_CIO_T_GATHER ]   );
-	    printf(  "AN-a: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_T_PATANA ]   );
-	    printf(  "FD-a: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_T_FD_PART ]  );
-	    printf(  "MY-a: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_T_MYREQ ]    );
-	    printf(  "OT-m: %10.3f , ", bglmpio_prof_max[ BGLMPIO_CIO_T_OTHREQ ]   );
-	    printf(  "EX-m: %10.3f , ", bglmpio_prof_max[ BGLMPIO_CIO_T_DEXCH ]    );
-	    printf("\tTIMING-2 %1s , ", (rw ? "W" : "R") );
-	    printf( "PXT-m: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_T_POSI_RW ]  );
-	    printf( "MPT-m: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_T_MPIO_RW ]  );
-	    printf("MPTC-m: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_T_MPIO_CRW ] );
-	    printf(   "PXB: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_B_POSI_RW ]  );
-	    printf(   "MPB: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_B_MPIO_RW ]  );
-	    printf(  "MPBC: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_B_MPIO_CRW ] );
-	}
-    }
-
-}
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_tuning.h b/src/mpi/romio/adio/ad_bgl/ad_bgl_tuning.h
deleted file mode 100644
index ae69d36..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_tuning.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_tuning.h
- * \brief ???
- */
-
-/*---------------------------------------------------------------------
- * ad_bgl_tuning.h
- *
- * declares global variables and macros for performance tuning and 
- * functional debugging.
- *---------------------------------------------------------------------*/
-
-#ifndef AD_BGL_TUNING_H_
-#define AD_BGL_TUNING_H_
-
-#include "adio.h"
-
-#define AD_BGL_assert( a ) if (!(a)) { \
-                                fprintf( stderr, "AD_BGL_assert, file=%s, line=%d\n", __FILE__, __LINE__ ); \
-                                MPI_Abort( MPI_COMM_WORLD, 1 ); \
-                           }
-
-/*-----------------------------------------
- *  Global variables for the control of
- *  1.  timing
- *  2.  select specific optimizations
- *-----------------------------------------*/
-
-/* timing fields */
-enum {
-    BGLMPIO_CIO_DATA_SIZE=0,	
-    BGLMPIO_CIO_T_SEEK,		
-    BGLMPIO_CIO_T_LCOMP,	/* time for ADIOI_Calc_my_off_len(), local */
-    BGLMPIO_CIO_T_GATHER,	/* time for previous MPI_Allgather, now Allreduce */
-    BGLMPIO_CIO_T_PATANA,	/* time for a quick test if access is contiguous or not, local */
-    BGLMPIO_CIO_T_FD_PART,	/* time for file domain partitioning, local */
-    BGLMPIO_CIO_T_MYREQ,	/* time for ADIOI_BGL_Calc_my_req(), local */
-    BGLMPIO_CIO_T_OTHREQ,	/* time for ADIOI_Calc_others_req(), short Alltoall */
-    BGLMPIO_CIO_T_DEXCH,	/* time for I/O data exchange */
-    BGLMPIO_CIO_T_POSI_RW,
-    BGLMPIO_CIO_B_POSI_RW,
-    BGLMPIO_CIO_T_MPIO_RW,	/* time for ADIOI_BGL_WriteContig() */
-    BGLMPIO_CIO_B_MPIO_RW,
-    BGLMPIO_CIO_T_MPIO_CRW,	/* time for ADIOI_BGL_WriteStridedColl() */
-    BGLMPIO_CIO_B_MPIO_CRW,
-    BGLMPIO_CIO_LAST
-};
-
-extern double 	bglmpio_prof_cw    [BGLMPIO_CIO_LAST];
-extern double 	bglmpio_prof_cr    [BGLMPIO_CIO_LAST];
-
-
-/* corresponds to environment variables to select optimizations and timing level */
-extern int 	bglmpio_timing;
-extern int 	bglmpio_timing2;
-extern int 	bglmpio_comm;
-extern int 	bglmpio_tunegather;
-extern int 	bglmpio_tuneblocking;
-extern long     bglocklessmpio_f_type;
-
-
-/* set internal variables for tuning environment variables */
-void ad_bgl_get_env_vars();
-
-/* report timing breakdown for MPI I/O collective call */
-void ad_bgl_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs );
-
-/* note: 	
- *   T := timing; 
- * CIO := collective I/O 
- */
-#define BGLMPIO_T_CIO_RESET( LEVEL, RW ) \
-	if (bglmpio_timing_cw_level >= LEVEL) { \
-	  int i; \
-	  for ( i = 0; i < BGLMPIO_T_LAST; i ++ ) \
-	    bglmpio_prof_c##RW [ i ] = 0; \
-	}
-
-#define BGLMPIO_T_CIO_REPORT( LEVEL, RW, FD, MYRANK, NPROCS ) \
-	if (bglmpio_timing_cw_level >= LEVEL) { \
-	  ad_bgl_timing_crw_report ( RW, FD, MYRANK, NPROCS ); \
-   	}
-
-#define BGLMPIO_T_CIO_SET_GET( LEVEL, RW, DOBAR, ISSET, ISGET, VAR1, VAR2 ) \
-	if (bglmpio_timing_cw_level >= LEVEL) { \
-	  if ( DOBAR ) MPI_Barrier( fd->comm ); \
-	  double temp = MPI_Wtime(); \
-	  if ( ISSET ) bglmpio_prof_c##RW [ VAR1 ] = temp; \
-	  if ( ISGET ) bglmpio_prof_c##RW [ VAR2 ] = temp - bglmpio_prof_c##RW [ VAR2 ] ; \
-	}
-
-#endif  /* AD_BGL_TUNING_H_ */
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_wrcoll.c b/src/mpi/romio/adio/ad_bgl/ad_bgl_wrcoll.c
deleted file mode 100644
index 3e1bf9e..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_wrcoll.c
+++ /dev/null
@@ -1,1535 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_wrcoll.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "adio.h"
-#include "adio_extern.h"
-#include "ad_bgl.h"
-#include "ad_bgl_pset.h"
-#include "ad_bgl_aggrs.h"
-
-#ifdef AGGREGATION_PROFILE
-#include "mpe.h"
-#endif
-#ifdef PROFILE
-#include "mpe.h"
-#endif
-
-/* prototypes of functions used for collective writes only. */
-static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
-                         datatype, int nprocs, int myrank, ADIOI_Access
-                         *others_req, ADIO_Offset *offset_list,
-                         ADIO_Offset *len_list, int contig_access_count, ADIO_Offset
-                         min_st_offset, ADIO_Offset fd_size,
-                         ADIO_Offset *fd_start, ADIO_Offset *fd_end,
-                         int *buf_idx, int *error_code);
-static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
-                         ADIOI_Flatlist_node *flat_buf, ADIO_Offset 
-                         *offset_list, ADIO_Offset *len_list, int *send_size, 
-                         int *recv_size, ADIO_Offset off, int size,
-                         int *count, int *start_pos, int *partial_recv, 
-                         int *sent_to_proc, int nprocs, 
-                         int myrank, int
-                         buftype_is_contig, int contig_access_count,
-                         ADIO_Offset min_st_offset, ADIO_Offset fd_size,
-                         ADIO_Offset *fd_start, ADIO_Offset *fd_end, 
-                         ADIOI_Access *others_req, 
-                         int *send_buf_idx, int *curr_to_proc,
-                         int *done_to_proc, int *hole, int iter,
-                         MPI_Aint buftype_extent, int *buf_idx, int *error_code);
-static void ADIOI_W_Exchange_data_alltoallv(
-		ADIO_File fd, void *buf,
-		char *write_buf,					/* 1 */
-		ADIOI_Flatlist_node *flat_buf, 
-		ADIO_Offset *offset_list, 
-		ADIO_Offset *len_list, int *send_size, int *recv_size, 
-		ADIO_Offset off, int size,				/* 2 */
-		int *count, int *start_pos, int *partial_recv,
-		int *sent_to_proc, int nprocs, int myrank, 
-		int buftype_is_contig, int contig_access_count,
-		ADIO_Offset min_st_offset,
-		ADIO_Offset fd_size,
-		ADIO_Offset *fd_start, 
-		ADIO_Offset *fd_end,
-		ADIOI_Access *others_req,
-		int *send_buf_idx, int *curr_to_proc,			/* 3 */
-		int *done_to_proc, int *hole, 				/* 4 */
-		int iter, MPI_Aint buftype_extent, int *buf_idx,
-		int *error_code);
-static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
-                           *flat_buf, char **send_buf, ADIO_Offset 
-                           *offset_list, ADIO_Offset *len_list, int *send_size, 
-                           MPI_Request *requests, int *sent_to_proc, 
-                           int nprocs, int myrank, 
-                           int contig_access_count, ADIO_Offset
-                           min_st_offset, ADIO_Offset fd_size,
-                           ADIO_Offset *fd_start, ADIO_Offset *fd_end, 
-                           int *send_buf_idx, int *curr_to_proc, 
-                           int *done_to_proc, int iter, 
-                           MPI_Aint buftype_extent);
-static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlist_node
-                           *flat_buf, char **send_buf, ADIO_Offset 
-                           *offset_list, ADIO_Offset *len_list, int *send_size, 
-                           MPI_Request *requests, int *sent_to_proc, 
-                           int nprocs, int myrank, 
-                           int contig_access_count, ADIO_Offset
-                           min_st_offset, ADIO_Offset fd_size,
-                           ADIO_Offset *fd_start, ADIO_Offset *fd_end, 
-                           int *send_buf_idx, int *curr_to_proc, 
-                           int *done_to_proc, int iter, 
-                           MPI_Aint buftype_extent);
-static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, 
-                      ADIO_Offset *srt_off, int *srt_len, int *start_pos,
-                      int nprocs, int nprocs_recv, int total_elements);
-
-
-void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
-                       MPI_Datatype datatype, int file_ptr_type,
-                       ADIO_Offset offset, ADIO_Status *status, int
-                       *error_code)
-{
-/* Uses a generalized version of the extended two-phase method described
-   in "An Extended Two-Phase Method for Accessing Sections of 
-   Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
-   Scientific Programming, (5)4:301--317, Winter 1996. 
-   http://www.mcs.anl.gov/home/thakur/ext2ph.ps */
-
-    ADIOI_Access *my_req; 
-    /* array of nprocs access structures, one for each other process in
-       whose file domain this process's request lies */
-    
-    ADIOI_Access *others_req;
-    /* array of nprocs access structures, one for each other process
-       whose request lies in this process's file domain. */
-
-    int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank;
-    int contig_access_count=0, interleave_count = 0, buftype_is_contig;
-    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
-    ADIO_Offset orig_fp, start_offset, end_offset, fd_size, min_st_offset, off;
-    ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
-	*fd_end = NULL, *end_offsets = NULL;
-    ADIO_Offset *bgl_offsets0 = NULL, *bgl_offsets = NULL;
-    int  ii;
-
-    int *buf_idx = NULL;
-    ADIO_Offset *len_list = NULL;
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_RESET( 0, w )
-#endif
-#if 0
-    /* From common code - not implemented for bgl.*/
-    int old_error, tmp_error;
-#endif
-#ifdef PROFILE
-	MPE_Log_event(13, 0, "start computation");
-#endif
-
-#if 0
-/*   From common code - not implemented for bgl. */
-     if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) { 
-	ADIOI_IOStridedColl (fd, buf, count, ADIOI_WRITE, datatype, 
-			file_ptr_type, offset, status, error_code);
-	return;
-    }
-#endif
-    MPI_Comm_size(fd->comm, &nprocs);
-    MPI_Comm_rank(fd->comm, &myrank);
-
-/* the number of processes that actually perform I/O, nprocs_for_coll,
- * is stored in the hints off the ADIO_File structure
- */
-    nprocs_for_coll = fd->hints->cb_nodes;
-    orig_fp = fd->fp_ind;
-
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_SET_GET( 0, w, 0, 1, 0, BGLMPIO_CIO_LCOMP, BGLMPIO_CIO_LAST )
-#endif
-
-
-    /* only check for interleaving if cb_write isn't disabled */
-    if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
-	/* For this process's request, calculate the list of offsets and
-	   lengths in the file and determine the start and end offsets. */
-
-	/* Note: end_offset points to the last byte-offset that will be accessed.
-	   e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/
-
-	ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
-			      &offset_list, &len_list, &start_offset,
-			      &end_offset, &contig_access_count); 
-
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_SET_GET( 0, w, 1, 1, 1, BGLMPIO_CIO_GATHER, BGLMPIO_CIO_LCOMP )
-#endif
-
-	/* each process communicates its start and end offsets to other 
-	   processes. The result is an array each of start and end offsets stored
-	   in order of process rank. */ 
-    
-	st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
-	end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
-
-    if (bglmpio_tunegather) {
-            bgl_offsets0 = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
-            bgl_offsets  = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
-            for (ii=0; ii<nprocs; ii++)  {
-                bgl_offsets0[ii*2]   = 0;
-                bgl_offsets0[ii*2+1] = 0;
-            }
-            bgl_offsets0[myrank*2]   = start_offset;
-            bgl_offsets0[myrank*2+1] =   end_offset;
-
-        MPI_Allreduce( bgl_offsets0, bgl_offsets, nprocs*2, ADIO_OFFSET, MPI_MAX, fd->comm );
-
-            for (ii=0; ii<nprocs; ii++)  {
-                st_offsets [ii] = bgl_offsets[ii*2]  ;
-                end_offsets[ii] = bgl_offsets[ii*2+1];
-            }
-            ADIOI_Free( bgl_offsets0 );
-            ADIOI_Free( bgl_offsets  );
-    } else {
-	MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
-		      ADIO_OFFSET, fd->comm);
-	MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
-		      ADIO_OFFSET, fd->comm);
-    }
-
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_SET_GET( 0, w, 0, 1, 1, BGLMPIO_CIO_PATANA, BGLMPIO_CIO_GATHER )
-#endif
-
-	/* are the accesses of different processes interleaved? */
-	for (i=1; i<nprocs; i++)
-      if ((st_offsets[i] < end_offsets[i-1]) &&
-                (st_offsets[i] <= end_offsets[i]))
-                interleave_count++;
-	/* This is a rudimentary check for interleaving, but should suffice
-	   for the moment. */
-    }
-
-    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
-
-    if (fd->hints->cb_write == ADIOI_HINT_DISABLE ||
-	(!interleave_count && (fd->hints->cb_write == ADIOI_HINT_AUTO)))
-    {
-	/* use independent accesses */
-	if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
-	    ADIOI_Free(offset_list);
-	    ADIOI_Free(len_list);
-	    ADIOI_Free(st_offsets);
-	    ADIOI_Free(end_offsets);
-	}
-
-	fd->fp_ind = orig_fp;
-        ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
-
-        if (buftype_is_contig && filetype_is_contig) {
-
-            if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
-                off = fd->disp + (ADIO_Offset)(fd->etype_size) * offset;
-                ADIO_WriteContig(fd, buf, count, datatype,
-				 ADIO_EXPLICIT_OFFSET,
-				 off, status, error_code);
-            }
-            else ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
-				  0, status, error_code);
-        }
-	else ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
-			       offset, status, error_code);
-
-	return;
-    }
-
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_SET_GET( 0, w, 1, 1, 1, BGLMPIO_CIO_FD_PART, BGLMPIO_CIO_PATANA )
-#endif
-	
-/* Divide the I/O workload among "nprocs_for_coll" processes. This is
-   done by (logically) dividing the file into file domains (FDs); each
-   process may directly access only its own file domain. */
-
-    if (bglmpio_tuneblocking)
-    ADIOI_BGL_GPFS_Calc_file_domains(st_offsets, end_offsets, nprocs,
-			    nprocs_for_coll, &min_st_offset,
-			    &fd_start, &fd_end, &fd_size, fd->fs_ptr);   
-    else
-    ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs,
-			    nprocs_for_coll, &min_st_offset,
-			    &fd_start, &fd_end,
-			    fd->hints->min_fdomain_size, &fd_size,
-			    fd->hints->striping_unit);   
-
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_SET_GET( 0, w, 0, 1, 1, BGLMPIO_CIO_MYREQ, BGLMPIO_CIO_FD_PART )
-#endif
-	
-/* calculate what portions of the access requests of this process are
-   located in what file domains */
-
-    if (bglmpio_tuneblocking)
-    ADIOI_BGL_Calc_my_req(fd, offset_list, len_list, contig_access_count,
-		      min_st_offset, fd_start, fd_end, fd_size,
-		      nprocs, &count_my_req_procs, 
-		      &count_my_req_per_proc, &my_req,
-		      &buf_idx); 
-    else
-    ADIOI_Calc_my_req(fd, offset_list, len_list, contig_access_count,
-		      min_st_offset, fd_start, fd_end, fd_size,
-		      nprocs, &count_my_req_procs, 
-		      &count_my_req_per_proc, &my_req,
-		      &buf_idx); 
-
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_SET_GET( 0, w, 1, 1, 1, BGLMPIO_CIO_OTHREQ, BGLMPIO_CIO_MYREQ )
-#endif
-	
-/* based on everyone's my_req, calculate what requests of other
-   processes lie in this process's file domain.
-   count_others_req_procs = number of processes whose requests lie in
-   this process's file domain (including this process itself) 
-   count_others_req_per_proc[i] indicates how many separate contiguous
-   requests of proc. i lie in this process's file domain. */
-
-    if (bglmpio_tuneblocking)
-	ADIOI_BGL_Calc_others_req(fd, count_my_req_procs,
-			      count_my_req_per_proc, my_req,
-			      nprocs, myrank,
-			      &count_others_req_procs, &others_req);
-    else
-    ADIOI_Calc_others_req(fd, count_my_req_procs, 
-			  count_my_req_per_proc, my_req, 
-			  nprocs, myrank,
-			  &count_others_req_procs, &others_req); 
-    
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_SET_GET( 0, w, 1, 1, 1, BGLMPIO_CIO_DEXCH, BGLMPIO_CIO_OTHREQ )
-#endif
-
-    ADIOI_Free(count_my_req_per_proc);
-    for (i=0; i < nprocs; i++) {
-	if (my_req[i].count) {
-	    ADIOI_Free(my_req[i].offsets);
-	    ADIOI_Free(my_req[i].lens);
-	}
-    }
-    ADIOI_Free(my_req);
-
-/* exchange data and write in sizes of no more than coll_bufsize. */
-    ADIOI_Exch_and_write(fd, buf, datatype, nprocs, myrank,
-                        others_req, offset_list,
-			len_list, contig_access_count, min_st_offset,
-			fd_size, fd_start, fd_end, buf_idx, error_code);
-
-#if BGL_PROFILE 
-    BGLMPIO_T_CIO_SET_GET( 0, w, 1, 0, 1, BGLMPIO_CIO_LAST, BGLMPIO_CIO_T_DEXCH )
-    BGLMPIO_T_CIO_SET_GET( 0, w, 0, 0, 1, BGLMPIO_CIO_LAST, BGLMPIO_CIO_T_MPIO_CRW )
-
-    BGLMPIO_T_CIO_REPORT( 0, w, fd, myrank )
-#endif
-#if 0
-    /* From common code - not implemented for bgl.
-     * 
-     * If this collective write is followed by an independent write,
-     * it's possible to have those subsequent writes on other processes
-     * race ahead and sneak in before the read-modify-write completes.
-     * We carry out a collective communication at the end here so no one
-     * can start independent i/o before collective I/O completes. 
-     *
-     * need to do some gymnastics with the error codes so that if something
-     * went wrong, all processes report error, but if a process has a more
-     * specific error code, we can still have that process report the
-     * additional information */
-
-    old_error = *error_code;
-    if (*error_code != MPI_SUCCESS) *error_code = MPI_ERR_IO;
-
-     /* optimization: if only one process performing i/o, we can perform
-     * a less-expensive Bcast  */
-#ifdef ADIOI_MPE_LOGGING
-    MPE_Log_event( ADIOI_MPE_postwrite_a, 0, NULL );
-#endif
-    if (fd->hints->cb_nodes == 1) 
-	    MPI_Bcast(error_code, 1, MPI_INT, 
-			    fd->hints->ranklist[0], fd->comm);
-    else {
-	    tmp_error = *error_code;
-	    MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT, 
-			    MPI_MAX, fd->comm);
-    }
-#ifdef ADIOI_MPE_LOGGING
-    MPE_Log_event( ADIOI_MPE_postwrite_b, 0, NULL );
-#endif
-#ifdef AGGREGATION_PROFILE
-	MPE_Log_event (5012, 0, NULL);
-#endif
-
-    if ( (old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO) )
-	    *error_code = old_error;
-
-
-#endif
-/* free all memory allocated for collective I/O */
-    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
-
-    for (i=0; i<nprocs; i++) {
-	if (others_req[i].count) {
-	    ADIOI_Free(others_req[i].offsets);
-	    ADIOI_Free(others_req[i].lens);
-	    ADIOI_Free(others_req[i].mem_ptrs);
-	}
-    }
-    ADIOI_Free(others_req);
-
-    ADIOI_Free(buf_idx);
-    ADIOI_Free(offset_list);
-    ADIOI_Free(len_list);
-    ADIOI_Free(st_offsets);
-    ADIOI_Free(end_offsets);
-    ADIOI_Free(fd_start);
-    ADIOI_Free(fd_end);
-
-#ifdef HAVE_STATUS_SET_BYTES
-    if (status) {
-      MPI_Count bufsize, size;
-      /* Don't set status if it isn't needed */
-      MPI_Type_size_x(datatype, &size);
-      bufsize = size * count;
-      MPIR_Status_set_bytes(status, datatype, bufsize);
-    }
-/* This is a temporary way of filling in status. The right way is to 
-   keep track of how much data was actually written during collective I/O. */
-#endif
-
-    fd->fp_sys_posn = -1;   /* set it to null. */
-#ifdef AGGREGATION_PROFILE
-	MPE_Log_event (5013, 0, NULL);
-#endif
-}
-
-
-
-/* If successful, error_code is set to MPI_SUCCESS.  Otherwise an error
- * code is created and returned in error_code.
- */
-static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
-				 datatype, int nprocs, 
-				 int myrank,
-				 ADIOI_Access
-				 *others_req, ADIO_Offset *offset_list,
-				 ADIO_Offset *len_list, int contig_access_count,
-				 ADIO_Offset min_st_offset, ADIO_Offset fd_size,
-				 ADIO_Offset *fd_start, ADIO_Offset *fd_end,
-				 int *buf_idx, int *error_code)
-{
-/* Send data to appropriate processes and write in sizes of no more
-   than coll_bufsize.    
-   The idea is to reduce the amount of extra memory required for
-   collective I/O. If all data were written all at once, which is much
-   easier, it would require temp space more than the size of user_buf,
-   which is often unacceptable. For example, to write a distributed
-   array to a file, where each local array is 8Mbytes, requiring
-   at least another 8Mbytes of temp space is unacceptable. */
-
-    /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets*/
-    ADIO_Offset size=0;
-    int hole, i, j, m, ntimes, max_ntimes, buftype_is_contig;
-    ADIO_Offset st_loc=-1, end_loc=-1, off, done, req_off;
-    char *write_buf=NULL;
-    int *curr_offlen_ptr, *count, *send_size, req_len, *recv_size;
-    int *partial_recv, *sent_to_proc, *start_pos, flag;
-    int *send_buf_idx, *curr_to_proc, *done_to_proc;
-    MPI_Status status;
-    ADIOI_Flatlist_node *flat_buf=NULL;
-    MPI_Aint buftype_extent;
-    int info_flag, coll_bufsize;
-    char *value;
-    static char myname[] = "ADIOI_EXCH_AND_WRITE";
-
-    *error_code = MPI_SUCCESS;  /* changed below if error */
-    /* only I/O errors are currently reported */
-
-/* calculate the number of writes of size coll_bufsize
-   to be done by each process and the max among all processes.
-   That gives the no. of communication phases as well. */
-
-    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
-    ADIOI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value, 
-                 &info_flag);
-    coll_bufsize = atoi(value);
-    ADIOI_Free(value);
-
-
-    for (i=0; i < nprocs; i++) {
-	if (others_req[i].count) {
-	    st_loc = others_req[i].offsets[0];
-	    end_loc = others_req[i].offsets[0];
-	    break;
-	}
-    }
-
-    for (i=0; i < nprocs; i++)
-	for (j=0; j < others_req[i].count; j++) {
-	    st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
-	    end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j]
-				       + others_req[i].lens[j] - 1));
-	}
-
-/* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/
-
-    ntimes = (int) ((end_loc - st_loc + coll_bufsize)/coll_bufsize);
-
-    if ((st_loc==-1) && (end_loc==-1)) {
-	ntimes = 0; /* this process does no writing. */
-    }
-
-    MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX,
-		  fd->comm); 
-
-    if (ntimes) write_buf = (char *) ADIOI_Malloc(coll_bufsize);
-
-    curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); 
-    /* its use is explained below. calloc initializes to 0. */
-
-    count = (int *) ADIOI_Malloc(nprocs*sizeof(int));
-    /* to store count of how many off-len pairs per proc are satisfied
-       in an iteration. */
-
-    partial_recv = (int *) ADIOI_Calloc(nprocs, sizeof(int));
-    /* if only a portion of the last off-len pair is recd. from a process
-       in a particular iteration, the length recd. is stored here.
-       calloc initializes to 0. */
-
-    send_size = (int *) ADIOI_Malloc(nprocs*sizeof(int));
-    /* total size of data to be sent to each proc. in an iteration.
-       Of size nprocs so that I can use MPI_Alltoall later. */
-
-    recv_size = (int *) ADIOI_Malloc(nprocs*sizeof(int));
-    /* total size of data to be recd. from each proc. in an iteration.*/
-
-    sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
-    /* amount of data sent to each proc so far. Used in
-       ADIOI_Fill_send_buffer. initialized to 0 here. */
-
-    send_buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int));
-    curr_to_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
-    done_to_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
-    /* Above three are used in ADIOI_Fill_send_buffer*/
-
-    start_pos = (int *) ADIOI_Malloc(nprocs*sizeof(int));
-    /* used to store the starting value of curr_offlen_ptr[i] in 
-       this iteration */
-
-    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
-    if (!buftype_is_contig) {
-	ADIOI_Flatten_datatype(datatype);
-	flat_buf = ADIOI_Flatlist;
-        while (flat_buf->type != datatype) flat_buf = flat_buf->next;
-    }
-    MPI_Type_extent(datatype, &buftype_extent);
-
-
-/* I need to check if there are any outstanding nonblocking writes to
-   the file, which could potentially interfere with the writes taking
-   place in this collective write call. Since this is not likely to be
-   common, let me do the simplest thing possible here: Each process
-   completes all pending nonblocking operations before completing. */
-
-    /*ADIOI_Complete_async(error_code);
-    if (*error_code != MPI_SUCCESS) return;
-    MPI_Barrier(fd->comm);
-    */
-
-    done = 0;
-    off = st_loc;
-
-#ifdef PROFILE
-	MPE_Log_event(14, 0, "end computation");
-#endif
-
-    for (m=0; m < ntimes; m++) {
-       /* go through all others_req and check which will be satisfied
-          by the current write */
-
-       /* Note that MPI guarantees that displacements in filetypes are in 
-          monotonically nondecreasing order and that, for writes, the
-	  filetypes cannot specify overlapping regions in the file. This
-	  simplifies implementation a bit compared to reads. */
-
-          /* off = start offset in the file for the data to be written in 
-                   this iteration 
-             size = size of data written (bytes) corresponding to off
-             req_off = off in file for a particular contiguous request 
-                       minus what was satisfied in previous iteration
-             req_size = size corresponding to req_off */
-
-	/* first calculate what should be communicated */
-
-#ifdef PROFILE
-	MPE_Log_event(13, 0, "start computation");
-#endif
-	for (i=0; i < nprocs; i++) count[i] = recv_size[i] = 0;
-
-	size = ADIOI_MIN((unsigned)coll_bufsize, end_loc-st_loc+1-done); 
-
-	for (i=0; i < nprocs; i++) {
-	    if (others_req[i].count) {
-		start_pos[i] = curr_offlen_ptr[i];
-		for (j=curr_offlen_ptr[i]; j<others_req[i].count; j++) {
-		    if (partial_recv[i]) {
-			/* this request may have been partially
-			   satisfied in the previous iteration. */
-			req_off = others_req[i].offsets[j] +
-			    partial_recv[i]; 
-                        req_len = others_req[i].lens[j] -
-			    partial_recv[i];
-			partial_recv[i] = 0;
-			/* modify the off-len pair to reflect this change */
-			others_req[i].offsets[j] = req_off;
-			others_req[i].lens[j] = req_len;
-		    }
-		    else {
-			req_off = others_req[i].offsets[j];
-                        req_len = others_req[i].lens[j];
-		    }
-		    if (req_off < off + size) {
-			count[i]++;
-      ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)write_buf)+req_off-off) == (ADIO_Offset)(MPIR_Upint)(write_buf+req_off-off));
-			MPI_Address(write_buf+req_off-off, 
-                               &(others_req[i].mem_ptrs[j]));
-      ADIOI_Assert((off + size - req_off) == (int)(off + size - req_off));
-			recv_size[i] += (int)(ADIOI_MIN(off + size - req_off, 
-                                      (unsigned)req_len));
-
-			if (off+size-req_off < (unsigned)req_len)
-			{
-			    partial_recv[i] = (int) (off + size - req_off);
-
-			    /* --BEGIN ERROR HANDLING-- */
-			    if ((j+1 < others_req[i].count) && 
-                                 (others_req[i].offsets[j+1] < off+size))
-			    { 
-				*error_code = MPIO_Err_create_code(MPI_SUCCESS,
-								   MPIR_ERR_RECOVERABLE,
-								   myname,
-								   __LINE__,
-								   MPI_ERR_ARG,
-								   "Filetype specifies overlapping write regions (which is illegal according to the MPI-2 specification)", 0);
-				/* allow to continue since additional
-				 * communication might have to occur
-				 */
-			    }
-			    /* --END ERROR HANDLING-- */
-			    break;
-			}
-		    }
-		    else break;
-		}
-		curr_offlen_ptr[i] = j;
-	    }
-	}
-	
-#ifdef PROFILE
-	MPE_Log_event(14, 0, "end computation");
-	MPE_Log_event(7, 0, "start communication");
-#endif
-        if (bglmpio_comm == 1)
-	ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list, 
-                            len_list, send_size, recv_size, off, size, count, 
-                            start_pos, partial_recv, 
-                            sent_to_proc, nprocs, myrank, 
-			    buftype_is_contig, contig_access_count,
-			    min_st_offset, fd_size, fd_start, fd_end,
-			    others_req, send_buf_idx, curr_to_proc,
-                            done_to_proc, &hole, m, buftype_extent, buf_idx,
-			    error_code); 
-	else
-        if (bglmpio_comm == 0)
-	ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list, 
-                            len_list, send_size, recv_size, off, size, count, 
-                            start_pos, partial_recv, 
-                            sent_to_proc, nprocs, myrank, 
-			    buftype_is_contig, contig_access_count,
-			    min_st_offset, fd_size, fd_start, fd_end,
-			    others_req, send_buf_idx, curr_to_proc,
-                            done_to_proc, &hole, m, buftype_extent, buf_idx,
-			    error_code); 
-        if (*error_code != MPI_SUCCESS) return;
-#ifdef PROFILE
-	MPE_Log_event(8, 0, "end communication");
-#endif
-
-	flag = 0;
-	for (i=0; i<nprocs; i++)
-	    if (count[i]) flag = 1;
-
-	if (flag) {
-      ADIOI_Assert(size == (int)size);
-	    ADIO_WriteContig(fd, write_buf, (int)size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, 
-                        off, &status, error_code);
-	    if (*error_code != MPI_SUCCESS) return;
-	}
-
-	off += size;
-	done += size;
-    }
-
-    for (i=0; i<nprocs; i++) count[i] = recv_size[i] = 0;
-#ifdef PROFILE
-	MPE_Log_event(7, 0, "start communication");
-#endif
-    for (m=ntimes; m<max_ntimes; m++) 
-	/* nothing to recv, but check for send. */
-        if (bglmpio_comm == 1)
-	ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list, 
-                            len_list, send_size, recv_size, off, size, count, 
-                            start_pos, partial_recv, 
-                            sent_to_proc, nprocs, myrank, 
-			    buftype_is_contig, contig_access_count,
-			    min_st_offset, fd_size, fd_start, fd_end,
-			    others_req, send_buf_idx, 
-                            curr_to_proc, done_to_proc, &hole, m, 
-                            buftype_extent, buf_idx, error_code); 
-	else
-        if (bglmpio_comm == 0)
-	ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list, 
-                            len_list, send_size, recv_size, off, size, count, 
-                            start_pos, partial_recv, 
-                            sent_to_proc, nprocs, myrank, 
-			    buftype_is_contig, contig_access_count,
-			    min_st_offset, fd_size, fd_start, fd_end,
-			    others_req, send_buf_idx, 
-                            curr_to_proc, done_to_proc, &hole, m, 
-                            buftype_extent, buf_idx, error_code); 
-        if (*error_code != MPI_SUCCESS) return;
-#ifdef PROFILE
-	MPE_Log_event(8, 0, "end communication");
-#endif
-
-    if (ntimes) ADIOI_Free(write_buf);
-    ADIOI_Free(curr_offlen_ptr);
-    ADIOI_Free(count);
-    ADIOI_Free(partial_recv);
-    ADIOI_Free(send_size);
-    ADIOI_Free(recv_size);
-    ADIOI_Free(sent_to_proc);
-    ADIOI_Free(start_pos);
-    ADIOI_Free(send_buf_idx);
-    ADIOI_Free(curr_to_proc);
-    ADIOI_Free(done_to_proc);
-}
-
-
-/* Sets error_code to MPI_SUCCESS if successful, or creates an error code
- * in the case of error.
- */
-static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf,
-				  ADIOI_Flatlist_node *flat_buf, ADIO_Offset 
-				  *offset_list, ADIO_Offset *len_list, int *send_size, 
-				  int *recv_size, ADIO_Offset off, int size,
-				  int *count, int *start_pos,
-				  int *partial_recv,
-				  int *sent_to_proc, int nprocs, 
-				  int myrank, int
-				  buftype_is_contig, int contig_access_count,
-				  ADIO_Offset min_st_offset,
-				  ADIO_Offset fd_size,
-				  ADIO_Offset *fd_start, ADIO_Offset *fd_end, 
-				  ADIOI_Access *others_req, 
-				  int *send_buf_idx, int *curr_to_proc,
-				  int *done_to_proc, int *hole, int iter, 
-				  MPI_Aint buftype_extent, int *buf_idx,
-				  int *error_code)
-{
-    int i, j, k, *tmp_len, nprocs_recv, nprocs_send, err;
-    char **send_buf = NULL; 
-    MPI_Request *requests, *send_req;
-    MPI_Datatype *recv_types;
-    MPI_Status *statuses, status;
-    int *srt_len, sum;
-    ADIO_Offset *srt_off;
-    static char myname[] = "ADIOI_W_EXCHANGE_DATA";
-
-/* exchange recv_size info so that each process knows how much to
-   send to whom. */
-
-    MPI_Alltoall(recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fd->comm);
-
-    /* create derived datatypes for recv */
-
-    nprocs_recv = 0;
-    for (i=0; i<nprocs; i++) if (recv_size[i]) nprocs_recv++;
-
-    recv_types = (MPI_Datatype *)
-	ADIOI_Malloc((nprocs_recv+1)*sizeof(MPI_Datatype)); 
-/* +1 to avoid a 0-size malloc */
-
-    tmp_len = (int *) ADIOI_Malloc(nprocs*sizeof(int));
-    j = 0;
-    for (i=0; i<nprocs; i++) {
-	if (recv_size[i]) {
-/* take care if the last off-len pair is a partial recv */
-	    if (partial_recv[i]) {
-		k = start_pos[i] + count[i] - 1;
-		tmp_len[i] = others_req[i].lens[k];
-		others_req[i].lens[k] = partial_recv[i];
-	    }
-	    MPI_Type_hindexed(count[i], 
-                 &(others_req[i].lens[start_pos[i]]),
-	             &(others_req[i].mem_ptrs[start_pos[i]]), 
-			 MPI_BYTE, recv_types+j);
-	    /* absolute displacements; use MPI_BOTTOM in recv */
-	    MPI_Type_commit(recv_types+j);
-	    j++;
-	}
-    }
-
-    /* To avoid a read-modify-write, check if there are holes in the 
-       data to be written. For this, merge the (sorted) offset lists
-       others_req using a heap-merge. */
-
-    sum = 0;
-    for (i=0; i<nprocs; i++) sum += count[i];
-    /* valgrind-detected optimization: if there is no work on this process we do
-     * not need to search for holes */
-    if (sum) {
-	srt_off = (ADIO_Offset *) ADIOI_Malloc((sum)*sizeof(ADIO_Offset));
-	srt_len = (int *) ADIOI_Malloc((sum)*sizeof(int));
-
-        ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
-                         nprocs, nprocs_recv, sum);
-    }
-
-/* for partial recvs, restore original lengths */
-    for (i=0; i<nprocs; i++) 
-        if (partial_recv[i]) {
-            k = start_pos[i] + count[i] - 1;
-            others_req[i].lens[k] = tmp_len[i];
-        }
-    ADIOI_Free(tmp_len);
-
-    /* check if there are any holes. If yes, must do read-modify-write.
-     * holes can be in three places.  'middle' is what you'd expect: the
-     * processes are operating on noncontiguous data.  But holes can also show
-     * up at the beginning or end of the file domain (see John Bent ROMIO REQ
-     * #835). Missing these holes would result in us writing more data than
-     * received by everyone else. */
-    *hole = 0;
-    if (sum) {
-        if (off != srt_off[0]) /* hole at the front */
-            *hole = 1;
-        else { /* coalesce the sorted offset-length pairs */
-            for (i=1; i<sum; i++) {
-                if (srt_off[i] <= srt_off[0] + srt_len[0]) {
-		    int new_len = srt_off[i] + srt_len[i] - srt_off[0];
-		    if (new_len > srt_len[0]) srt_len[0] = new_len;
-	        }
-                else
-                    break;
-	    }
-            if (i < sum || size != srt_len[0]) /* hole in middle or end */
-                *hole = 1;
-	    }
-
-    ADIOI_Free(srt_off);
-    ADIOI_Free(srt_len);
-    }
-
-    if (nprocs_recv) {
-	if (*hole) {
-	    ADIO_ReadContig(fd, write_buf, size, MPI_BYTE, 
-			    ADIO_EXPLICIT_OFFSET, off, &status, &err);
-	    /* --BEGIN ERROR HANDLING-- */
-	    if (err != MPI_SUCCESS) {
-		*error_code = MPIO_Err_create_code(err,
-						   MPIR_ERR_RECOVERABLE, myname,
-						   __LINE__, MPI_ERR_IO,
-						   "**ioRMWrdwr", 0);
-		return;
-	    } 
-	    /* --END ERROR HANDLING-- */
-	}
-    }
-
-    nprocs_send = 0;
-    for (i=0; i < nprocs; i++) if (send_size[i]) nprocs_send++;
-
-    if (fd->atomicity) {
-        /* bug fix from Wei-keng Liao and Kenin Coloma */
-        requests = (MPI_Request *)
-	    ADIOI_Malloc((nprocs_send+1)*sizeof(MPI_Request)); 
-        send_req = requests;
-    }
-    else {
-        requests = (MPI_Request *) 	
-            ADIOI_Malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request)); 
-        /* +1 to avoid a 0-size malloc */
-
-        /* post receives */
-        j = 0;
-        for (i=0; i<nprocs; i++) {
-            if (recv_size[i]) {
-                MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i, myrank+i+100*iter,
-                          fd->comm, requests+j);
-                j++;
-            }
-        }
-	send_req = requests + nprocs_recv;
-    }
-
-/* post sends. if buftype_is_contig, data can be directly sent from
-   user buf at location given by buf_idx. else use send_buf. */
-
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5032, 0, NULL);
-#endif
-    if (buftype_is_contig) {
-	j = 0;
-	for (i=0; i < nprocs; i++) 
-	    if (send_size[i]) {
-		MPI_Isend(((char *) buf) + buf_idx[i], send_size[i], 
-  		            MPI_BYTE, i,  myrank+i+100*iter, fd->comm, 
-                                  send_req+j);
-		j++;
-                buf_idx[i] += send_size[i];
-	    }
-    }
-    else if (nprocs_send) {
-	/* buftype is not contig */
-	send_buf = (char **) ADIOI_Malloc(nprocs*sizeof(char*));
-	for (i=0; i < nprocs; i++) 
-	    if (send_size[i]) 
-		send_buf[i] = (char *) ADIOI_Malloc(send_size[i]);
-
-	ADIOI_Fill_send_buffer(fd, buf, flat_buf, send_buf,
-                           offset_list, len_list, send_size, 
-			   send_req,
-                           sent_to_proc, nprocs, myrank, 
-                           contig_access_count,
-                           min_st_offset, fd_size, fd_start, fd_end, 
-                           send_buf_idx, curr_to_proc, done_to_proc, iter,
-                           buftype_extent);
-        /* the send is done in ADIOI_Fill_send_buffer */
-    }
-
-    if (fd->atomicity) {
-        /* bug fix from Wei-keng Liao and Kenin Coloma */
-        j = 0;
-        for (i=0; i<nprocs; i++) {
-            MPI_Status wkl_status;
-	    if (recv_size[i]) {
-	        MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i, myrank+i+100*iter,
-		          fd->comm, &wkl_status);
-	        j++;
-	    }
-        }
-    }
-
-    for (i=0; i<nprocs_recv; i++) MPI_Type_free(recv_types+i);
-    ADIOI_Free(recv_types);
-    
-    if (fd->atomicity) {
-        /* bug fix from Wei-keng Liao and Kenin Coloma */
-        statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send+1) * \
-                                         sizeof(MPI_Status)); 
-         /* +1 to avoid a 0-size malloc */
-    }
-    else {
-        statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send+nprocs_recv+1) * \
-                                     sizeof(MPI_Status)); 
-        /* +1 to avoid a 0-size malloc */
-    }
-
-#ifdef NEEDS_MPI_TEST
-    i = 0;
-    if (fd->atomicity) {
-        /* bug fix from Wei-keng Liao and Kenin Coloma */
-        while (!i) MPI_Testall(nprocs_send, send_req, &i, statuses);
-    }
-    else {
-        while (!i) MPI_Testall(nprocs_send+nprocs_recv, requests, &i, statuses);
-    }
-#else
-    if (fd->atomicity)
-        /* bug fix from Wei-keng Liao and Kenin Coloma */
-        MPI_Waitall(nprocs_send, send_req, statuses);
-    else
-        MPI_Waitall(nprocs_send+nprocs_recv, requests, statuses);
-#endif
-
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5033, 0, NULL);
-#endif
-    ADIOI_Free(statuses);
-    ADIOI_Free(requests);
-    if (!buftype_is_contig && nprocs_send) {
-	for (i=0; i < nprocs; i++) 
-	    if (send_size[i]) ADIOI_Free(send_buf[i]);
-	ADIOI_Free(send_buf);
-    }
-}
-
-
-#define ADIOI_BUF_INCR \
-{ \
-    while (buf_incr) { \
-        size_in_buf = ADIOI_MIN(buf_incr, flat_buf_sz); \
-        user_buf_idx += size_in_buf; \
-        flat_buf_sz -= size_in_buf; \
-        if (!flat_buf_sz) { \
-            if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
-            else { \
-                flat_buf_idx = 0; \
-                n_buftypes++; \
-            } \
-            user_buf_idx = flat_buf->indices[flat_buf_idx] + \
-                              (ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
-            flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
-        } \
-        buf_incr -= size_in_buf; \
-    } \
-}
-
-
-#define ADIOI_BUF_COPY \
-{ \
-    while (size) { \
-        size_in_buf = ADIOI_MIN(size, flat_buf_sz); \
-  ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)buf) + user_buf_idx) == (ADIO_Offset)(MPIR_Upint)((MPIR_Upint)buf + user_buf_idx)); \
-  ADIOI_Assert(size_in_buf == (size_t)size_in_buf); \
-        memcpy(&(send_buf[p][send_buf_idx[p]]), \
-               ((char *) buf) + user_buf_idx, size_in_buf); \
-        send_buf_idx[p] += size_in_buf; \
-        user_buf_idx += size_in_buf; \
-        flat_buf_sz -= size_in_buf; \
-        if (!flat_buf_sz) { \
-            if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
-            else { \
-                flat_buf_idx = 0; \
-                n_buftypes++; \
-            } \
-            user_buf_idx = flat_buf->indices[flat_buf_idx] + \
-                              (ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
-            flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
-        } \
-        size -= size_in_buf; \
-        buf_incr -= size_in_buf; \
-    } \
-    ADIOI_BUF_INCR \
-}
-
-static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
-                           *flat_buf, char **send_buf, ADIO_Offset 
-                           *offset_list, ADIO_Offset *len_list, int *send_size, 
-                           MPI_Request *requests, int *sent_to_proc, 
-                           int nprocs, int myrank, 
-                           int contig_access_count, 
-                           ADIO_Offset min_st_offset, ADIO_Offset fd_size,
-                           ADIO_Offset *fd_start, ADIO_Offset *fd_end, 
-                           int *send_buf_idx, int *curr_to_proc, 
-                           int *done_to_proc, int iter,
-                           MPI_Aint buftype_extent)
-{
-/* this function is only called if buftype is not contig */
-
-    int i, p, flat_buf_idx;
-    ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size;
-    int jj, n_buftypes;
-    ADIO_Offset off, len, rem_len, user_buf_idx;
-
-/*  curr_to_proc[p] = amount of data sent to proc. p that has already
-    been accounted for so far
-    done_to_proc[p] = amount of data already sent to proc. p in 
-    previous iterations
-    user_buf_idx = current location in user buffer 
-    send_buf_idx[p] = current location in send_buf of proc. p  */
-
-    for (i=0; i < nprocs; i++) {
-	send_buf_idx[i] = curr_to_proc[i] = 0;
-	done_to_proc[i] = sent_to_proc[i];
-    }
-    jj = 0;
-
-    user_buf_idx = flat_buf->indices[0];
-    flat_buf_idx = 0;
-    n_buftypes = 0;
-    flat_buf_sz = flat_buf->blocklens[0];
-
-    /* flat_buf_idx = current index into flattened buftype
-       flat_buf_sz = size of current contiguous component in 
-	                 flattened buf */
-
-    for (i=0; i<contig_access_count; i++) { 
-	off     = offset_list[i];
-	rem_len = len_list[i];
-
-	/*this request may span the file domains of more than one process*/
-  while (rem_len != 0) {
-	    len = rem_len;
-	    /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
-	     * longer than the single region that processor "p" is responsible
-	     * for.
-	     */
-	    p = ADIOI_BGL_Calc_aggregator(fd,
-				      off,
-				      min_st_offset,
-				      &len,
-				      fd_size,
-				      fd_start,
-				      fd_end);
-
-	    if (send_buf_idx[p] < send_size[p]) {
-		if (curr_to_proc[p]+len > done_to_proc[p]) {
-		    if (done_to_proc[p] > curr_to_proc[p]) {
-			size = ADIOI_MIN(curr_to_proc[p] + len - 
-                                done_to_proc[p], send_size[p]-send_buf_idx[p]);
-			buf_incr = done_to_proc[p] - curr_to_proc[p];
-			ADIOI_BUF_INCR
-      ADIOI_Assert((curr_to_proc[p] + len - done_to_proc[p]) == (unsigned)(curr_to_proc[p] + len - done_to_proc[p]));
-		        buf_incr = curr_to_proc[p] + len - done_to_proc[p];
-      ADIOI_Assert((done_to_proc[p] + size) == (unsigned)(done_to_proc[p] + size));
-			curr_to_proc[p] = done_to_proc[p] + size;
-		        ADIOI_BUF_COPY
-		    }
-		    else {
-			size = ADIOI_MIN(len,send_size[p]-send_buf_idx[p]);
-			buf_incr = len;
-      ADIOI_Assert((curr_to_proc[p] + size) == (unsigned)((ADIO_Offset)curr_to_proc[p] + size));
-			curr_to_proc[p] += size;
-			ADIOI_BUF_COPY
-		    }
-		    if (send_buf_idx[p] == send_size[p]) {
-			MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p, 
-				myrank+p+100*iter, fd->comm, requests+jj);
-			jj++;
-		    }
-		}
-		else {
-        ADIOI_Assert((curr_to_proc[p] + len) == (unsigned)((ADIO_Offset)curr_to_proc[p] + len));
-		    curr_to_proc[p] += len;
-		    buf_incr = len;
-		    ADIOI_BUF_INCR
-		}
-	    }
-	    else {
-		buf_incr = len;
-		ADIOI_BUF_INCR
-            }
-	    off     += len;
-	    rem_len -= len;
-	}
-    }
-    for (i=0; i < nprocs; i++) 
-	if (send_size[i]) sent_to_proc[i] = curr_to_proc[i];
-}
-
-
-
-static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, 
-		      ADIO_Offset *srt_off, int *srt_len, int *start_pos,
-		      int nprocs, int nprocs_recv, int total_elements)
-{
-    typedef struct {
-	ADIO_Offset *off_list;
-	int *len_list;
-	int nelem;
-    } heap_struct;
-
-    heap_struct *a, tmp;
-    int i, j, heapsize, l, r, k, smallest;
-
-    a = (heap_struct *) ADIOI_Malloc((nprocs_recv+1)*sizeof(heap_struct));
-
-    j = 0;
-    for (i=0; i<nprocs; i++)
-	if (count[i]) {
-	    a[j].off_list = &(others_req[i].offsets[start_pos[i]]);
-	    a[j].len_list = &(others_req[i].lens[start_pos[i]]);
-	    a[j].nelem = count[i];
-	    j++;
-	}
-
-    /* build a heap out of the first element from each list, with
-       the smallest element of the heap at the root */
-
-    heapsize = nprocs_recv;
-    for (i=heapsize/2 - 1; i>=0; i--) {
-	/* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143
-           modified for a heap with smallest element at root. I have 
-           removed the recursion so that there are no function calls.
-           Function calls are too expensive. */
-	k = i;
-	while (1) {
-	    l = 2*(k+1) - 1;
-	    r = 2*(k+1);
-
-	    if ((l < heapsize) && 
-		(*(a[l].off_list) < *(a[k].off_list)))
-		smallest = l;
-	    else smallest = k;
-
-	    if ((r < heapsize) && 
-		(*(a[r].off_list) < *(a[smallest].off_list)))
-		smallest = r;
-
-	    if (smallest != k) {
-		tmp.off_list = a[k].off_list;
-		tmp.len_list = a[k].len_list;
-		tmp.nelem = a[k].nelem;
-
-		a[k].off_list = a[smallest].off_list;
-		a[k].len_list = a[smallest].len_list;
-		a[k].nelem = a[smallest].nelem;
-		
-		a[smallest].off_list = tmp.off_list;
-		a[smallest].len_list = tmp.len_list;
-		a[smallest].nelem = tmp.nelem;
-	    
-		k = smallest;
-	    }
-	    else break;
-	}
-    }
-
-    for (i=0; i<total_elements; i++) {
-        /* extract smallest element from heap, i.e. the root */
-	srt_off[i] = *(a[0].off_list);
-	srt_len[i] = *(a[0].len_list);
-	(a[0].nelem)--;
-
-	if (!a[0].nelem) {
-	    a[0].off_list = a[heapsize-1].off_list;
-	    a[0].len_list = a[heapsize-1].len_list;
-	    a[0].nelem = a[heapsize-1].nelem;
-	    heapsize--;
-	}
-	else {
-	    (a[0].off_list)++;
-	    (a[0].len_list)++;
-	}
-
-	/* Heapify(a, 0, heapsize); */
-	k = 0;
-	while (1) {
-	    l = 2*(k+1) - 1;
-	    r = 2*(k+1);
-
-	    if ((l < heapsize) && 
-		(*(a[l].off_list) < *(a[k].off_list)))
-		smallest = l;
-	    else smallest = k;
-
-	    if ((r < heapsize) && 
-		(*(a[r].off_list) < *(a[smallest].off_list)))
-		smallest = r;
-
-	    if (smallest != k) {
-		tmp.off_list = a[k].off_list;
-		tmp.len_list = a[k].len_list;
-		tmp.nelem = a[k].nelem;
-
-		a[k].off_list = a[smallest].off_list;
-		a[k].len_list = a[smallest].len_list;
-		a[k].nelem = a[smallest].nelem;
-		
-		a[smallest].off_list = tmp.off_list;
-		a[smallest].len_list = tmp.len_list;
-		a[smallest].nelem = tmp.nelem;
-	    
-		k = smallest;
-	    }
-	    else break;
-	}
-    }
-
-    ADIOI_Free(a);
-}
-
-
-static void ADIOI_W_Exchange_data_alltoallv(
-		ADIO_File fd, void *buf,
-		char *write_buf,					/* 1 */
-		ADIOI_Flatlist_node *flat_buf, 
-		ADIO_Offset *offset_list, 
-		ADIO_Offset *len_list, int *send_size, int *recv_size, 
-		ADIO_Offset off, int size,				/* 2 */
-		int *count, int *start_pos, int *partial_recv,
-		int *sent_to_proc, int nprocs, int myrank, 
-		int buftype_is_contig, int contig_access_count,
-		ADIO_Offset min_st_offset,
-		ADIO_Offset fd_size,
-		ADIO_Offset *fd_start, 
-		ADIO_Offset *fd_end,
-		ADIOI_Access *others_req,
-		int *send_buf_idx, int *curr_to_proc,			/* 3 */
-		int *done_to_proc, int *hole, 				/* 4 */
-		int iter, MPI_Aint buftype_extent, int *buf_idx,
-		int *error_code)
-{   
-    int i, j, k=0, nprocs_recv, nprocs_send, *tmp_len, err;
-    char **send_buf = NULL;
-    MPI_Request *send_req=NULL;
-    MPI_Status status;
-    int rtail, stail;
-    char *sbuf_ptr, *to_ptr;
-    int  len;
-    int  *sdispls, *rdispls;
-    char *all_recv_buf, *all_send_buf;
-    int *srt_len, sum;
-    ADIO_Offset *srt_off;
-    static char myname[] = "ADIOI_W_EXCHANGE_DATA";
-
-
-  /* exchange recv_size info so that each process knows how much to
-     send to whom. */
-    MPI_Alltoall(recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fd->comm);
-    
-    nprocs_recv = 0;
-    for (i=0; i<nprocs; i++) if (recv_size[i]) { nprocs_recv++; }
-    nprocs_send = 0;
-    for (i=0; i<nprocs; i++) if (send_size[i]) { nprocs_send++; }
-    
-  /* receiver side data structures */
-    rdispls = (int *) ADIOI_Malloc( nprocs * sizeof(int) );
-    rtail = 0;
-    for (i=0; i<nprocs; i++) { rdispls[i] = rtail; rtail += recv_size[i]; }
-
-        /* data buffer */
-    all_recv_buf = (char *) ADIOI_Malloc( rtail );
-
-  /* sender side data structures */
-    sdispls = (int *) ADIOI_Malloc( nprocs * sizeof(int) );
-    stail = 0;
-    for (i=0; i<nprocs; i++) { sdispls[i] = stail; stail += send_size[i]; }
-
-        /* data buffer */
-    all_send_buf = (char *) ADIOI_Malloc( stail );
-    if (buftype_is_contig) {
-	for (i=0; i<nprocs; i++)
-	{
-	    if (send_size[i]) {
-		sbuf_ptr = all_send_buf + sdispls[i];
-		memcpy( sbuf_ptr, buf + buf_idx[i], send_size[i] );
-		buf_idx[i] += send_size[i];
-	    }
-	} 
-    } else {
-	send_buf = (char **) ADIOI_Malloc( nprocs * sizeof(char *) );
-	for (i=0; i<nprocs; i++)
-	    send_buf[i] = all_send_buf + sdispls[i];
-	ADIOI_Fill_send_buffer_nosend(fd, buf, flat_buf, send_buf,
-                           offset_list, len_list, send_size, 
-			   send_req,
-                           sent_to_proc, nprocs, myrank, 
-                           contig_access_count,
-                           min_st_offset, fd_size, fd_start, fd_end, 
-                           send_buf_idx, curr_to_proc, done_to_proc, iter,
-                           buftype_extent);
-    }
-
-  /* alltoallv */
-    MPI_Alltoallv( 
-            all_send_buf, send_size, sdispls, MPI_BYTE,
-            all_recv_buf, recv_size, rdispls, MPI_BYTE,
-            fd->comm ); 
-
-  /* data sieving pre-read */
-  /* To avoid a read-modify-write, check if there are holes in the 
-     data to be written. For this, merge the (sorted) offset lists
-     others_req using a heap-merge. */
-
-    sum = 0;
-    for (i=0; i<nprocs; i++) sum += count[i];
-    srt_off = (ADIO_Offset *) ADIOI_Malloc((sum+1)*sizeof(ADIO_Offset));
-    srt_len = (int *) ADIOI_Malloc((sum+1)*sizeof(int));
-
-    ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
-                     nprocs, nprocs_recv, sum);
-
-    /* check if there are any holes */
-    *hole = 0;
-    /* See if there are holes before the first request or after the last request*/
-    if((srt_off[0] > off) || 
-       ((srt_off[sum-1] + srt_len[sum-1]) < (off + size)))
-    {
-       *hole = 1;
-    }
-    else /* See if there are holes between the requests, if there are more than one */
-      for (i=0; i<sum-1; i++)
-        if (srt_off[i]+srt_len[i] < srt_off[i+1]) {
-            *hole = 1;
-            break;
-        }
-
-    ADIOI_Free(srt_off);
-    ADIOI_Free(srt_len);
-
-    if (nprocs_recv) {
-        if (*hole) {
-            ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
-                            ADIO_EXPLICIT_OFFSET, off, &status, &err);
-            /* --BEGIN ERROR HANDLING-- */
-            if (err != MPI_SUCCESS) {
-                *error_code = MPIO_Err_create_code(err,
-                                                   MPIR_ERR_RECOVERABLE, myname,
-                                                   __LINE__, MPI_ERR_IO,
-                                                   "**ioRMWrdwr", 0);
-                return;
-            }
-            /* --END ERROR HANDLING-- */
-        }
-    }
-    
-  /* scatter all_recv_buf into 4M cb_buffer */
-    tmp_len = (int *) ADIOI_Malloc(nprocs*sizeof(int));
-    for (i=0; i<nprocs; i++)
-    {
-        if (recv_size[i]) {
-            if (partial_recv[i]) {
-                k = start_pos[i] + count[i] - 1;
-                tmp_len[i] = others_req[i].lens[k];
-                others_req[i].lens[k] = partial_recv[i];
-            }
-
-            sbuf_ptr = all_recv_buf + rdispls[i];
-            for (j=0; j<count[i]; j++) {
-                ADIOI_ENSURE_AINT_FITS_IN_PTR(others_req[i].mem_ptrs[ start_pos[i]+j ]);
-                to_ptr = (char *) ADIOI_AINT_CAST_TO_VOID_PTR ( others_req[i].mem_ptrs[ start_pos[i]+j ] );
-                len    =           others_req[i].lens[     start_pos[i]+j ]  ;
-                memcpy( to_ptr, sbuf_ptr, len );
-                sbuf_ptr += len;
-            }
-
-	    /* restore */
-            if (partial_recv[i]) {
-                k = start_pos[i] + count[i] - 1;
-                others_req[i].lens[k] = tmp_len[i];
-            }
-	    
-        }
-    }
-    
-    ADIOI_Free( tmp_len );
-    ADIOI_Free( all_send_buf );
-    ADIOI_Free( all_recv_buf );
-    ADIOI_Free(sdispls);
-    ADIOI_Free(rdispls);
-    return; 
-}   
-
-static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlist_node
-                           *flat_buf, char **send_buf, ADIO_Offset 
-                           *offset_list, ADIO_Offset *len_list, int *send_size, 
-                           MPI_Request *requests, int *sent_to_proc, 
-                           int nprocs, int myrank, 
-                           int contig_access_count, 
-                           ADIO_Offset min_st_offset, ADIO_Offset fd_size,
-                           ADIO_Offset *fd_start, ADIO_Offset *fd_end, 
-                           int *send_buf_idx, int *curr_to_proc, 
-                           int *done_to_proc, int iter,
-                           MPI_Aint buftype_extent)
-{
-/* this function is only called if buftype is not contig */
-
-    int i, p, flat_buf_idx;
-    ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size;
-    int jj, n_buftypes;
-    ADIO_Offset off, len, rem_len, user_buf_idx;
-
-/*  curr_to_proc[p] = amount of data sent to proc. p that has already
-    been accounted for so far
-    done_to_proc[p] = amount of data already sent to proc. p in 
-    previous iterations
-    user_buf_idx = current location in user buffer 
-    send_buf_idx[p] = current location in send_buf of proc. p  */
-
-    for (i=0; i < nprocs; i++) {
-	send_buf_idx[i] = curr_to_proc[i] = 0;
-	done_to_proc[i] = sent_to_proc[i];
-    }
-    jj = 0;
-
-    user_buf_idx = flat_buf->indices[0];
-    flat_buf_idx = 0;
-    n_buftypes = 0;
-    flat_buf_sz = flat_buf->blocklens[0];
-
-    /* flat_buf_idx = current index into flattened buftype
-       flat_buf_sz = size of current contiguous component in 
-	                 flattened buf */
-
-    for (i=0; i<contig_access_count; i++) { 
-	off     = offset_list[i];
-	rem_len = len_list[i];
-
-	/*this request may span the file domains of more than one process*/
-  while (rem_len != 0) {
-	    len = rem_len;
-	    /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
-	     * longer than the single region that processor "p" is responsible
-	     * for.
-	     */
-	    p = ADIOI_BGL_Calc_aggregator(fd,
-				      off,
-				      min_st_offset,
-				      &len,
-				      fd_size,
-				      fd_start,
-				      fd_end);
-
-	    if (send_buf_idx[p] < send_size[p]) {
-		if (curr_to_proc[p]+len > done_to_proc[p]) {
-		    if (done_to_proc[p] > curr_to_proc[p]) {
-			size = ADIOI_MIN(curr_to_proc[p] + len - 
-                                done_to_proc[p], send_size[p]-send_buf_idx[p]);
-			buf_incr = done_to_proc[p] - curr_to_proc[p];
-			ADIOI_BUF_INCR
-      ADIOI_Assert((curr_to_proc[p] + len - done_to_proc[p]) == (unsigned)(curr_to_proc[p] + len - done_to_proc[p]));
-		        buf_incr = curr_to_proc[p] + len - done_to_proc[p];
-      ADIOI_Assert((done_to_proc[p] + size) == (unsigned)(done_to_proc[p] + size));
-			curr_to_proc[p] = done_to_proc[p] + size;
-		        ADIOI_BUF_COPY
-		    }
-		    else {
-			size = ADIOI_MIN(len,send_size[p]-send_buf_idx[p]);
-			buf_incr = len;
-      ADIOI_Assert((curr_to_proc[p] + size) == (unsigned)((ADIO_Offset)curr_to_proc[p] + size));
-			curr_to_proc[p] += size;
-			ADIOI_BUF_COPY
-		    }
-		    /* moved to alltoallv */
-		    /*
-		    if (send_buf_idx[p] == send_size[p]) {
-			MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p, 
-				myrank+p+100*iter, fd->comm, requests+jj);
-			jj++;
-		    }
-		    */
-		}
-		else {
-        ADIOI_Assert((curr_to_proc[p] + len) == (unsigned)((ADIO_Offset)curr_to_proc[p] + len));
-		    curr_to_proc[p] += (int)len;
-		    buf_incr = len;
-		    ADIOI_BUF_INCR
-		}
-	    }
-	    else {
-		buf_incr = len;
-		ADIOI_BUF_INCR
-            }
-	    off     += len;
-	    rem_len -= len;
-	}
-    }
-    for (i=0; i < nprocs; i++) 
-	if (send_size[i]) sent_to_proc[i] = curr_to_proc[i];
-}
diff --git a/src/mpi/romio/adio/ad_bgl/ad_bgl_write.c b/src/mpi/romio/adio/ad_bgl/ad_bgl_write.c
deleted file mode 100644
index 3121035..0000000
--- a/src/mpi/romio/adio/ad_bgl/ad_bgl_write.c
+++ /dev/null
@@ -1,611 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bgl_write.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "ad_bgl.h"
-#include "adio_extern.h"
-
-#include "ad_bgl_tuning.h"
-
-#ifdef AGGREGATION_PROFILE
-#include "mpe.h"
-#endif
-
-void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count, 
-                     MPI_Datatype datatype, int file_ptr_type,
-		     ADIO_Offset offset, ADIO_Status *status, int *error_code)
-{
-    MPI_Count err=-1, datatype_size;
-    ADIO_Offset len;
-    static char myname[] = "ADIOI_BGL_WRITECONTIG";
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5036, 0, NULL);
-#endif
-#if BGL_PROFILE
-		/* timing */
-		double io_time, io_time2;
-
-		if (bglmpio_timing) { 
-		    io_time = MPI_Wtime(); 
-		    bglmpio_prof_cw[ BGLMPIO_CIO_DATA_SIZE ] += len;
-		}
-#endif
-			  
-    MPI_Type_size_x(datatype, &datatype_size);
-    len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
-    ADIOI_Assert(len == (unsigned int) len); /* write takes an unsigned int parm */
-
-#if BGL_PROFILE
-
-    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
-        	if (bglmpio_timing2) io_time2 = MPI_Wtime();
-	if (fd->fp_sys_posn != offset)
-	    lseek(fd->fd_sys, offset, SEEK_SET);
-        	if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
-	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-        	if (bglmpio_timing2) io_time2 = MPI_Wtime();
-	err = write(fd->fd_sys, buf, (unsigned int)len);
-        	if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
-	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
-	fd->fp_sys_posn = offset + err;
-	/* individual file pointer not updated */        
-    }
-    else { /* write from curr. location of ind. file pointer */
-	offset = fd->fp_ind;
-	        if (bglmpio_timing2) io_time2 = MPI_Wtime();
-	if (fd->fp_sys_posn != fd->fp_ind)
-	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
-        	if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
-	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-        	if (bglmpio_timing2) io_time2 = MPI_Wtime();
-	err = write(fd->fd_sys, buf, (unsigned int)len);
-        	if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
-	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
-	fd->fp_ind += err;
-	fd->fp_sys_posn = fd->fp_ind;
-    }
-
-#else	/* BGL_PROFILE */
-
-    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
-	if (fd->fp_sys_posn != offset)
-	    lseek(fd->fd_sys, offset, SEEK_SET);
-	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-	err = write(fd->fd_sys, buf, (unsigned int)len);
-	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
-	fd->fp_sys_posn = offset + err;
-	/* individual file pointer not updated */        
-    }
-    else { /* write from curr. location of ind. file pointer */
-	offset = fd->fp_ind;
-	if (fd->fp_sys_posn != fd->fp_ind)
-	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
-	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-	err = write(fd->fd_sys, buf, (unsigned int)len);
-	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
-	fd->fp_ind += err;
-	fd->fp_sys_posn = fd->fp_ind;
-    }
-
-#endif	/* BGL_PROFILE */
-
-#if BGL_PROFILE
-		if (bglmpio_timing) bglmpio_prof_cw[ BGLMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
-#endif
-
-    /* --BEGIN ERROR HANDLING-- */
-    if (err == -1) {
-	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
-					   myname, __LINE__, MPI_ERR_IO,
-					   "**io",
-					   "**io %s", strerror(errno));
-	return;
-    }
-    /* --END ERROR HANDLING-- */
-
-#ifdef HAVE_STATUS_SET_BYTES
-    MPIR_Status_set_bytes(status, datatype, err);
-#endif
-
-    *error_code = MPI_SUCCESS;
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5037, 0, NULL);
-#endif
-}
-
-
-#define ADIOI_BUFFERED_WRITE \
-{ \
-    if (req_off >= writebuf_off + writebuf_len) { \
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
-	err = write(fd->fd_sys, writebuf, writebuf_len); \
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
-        if (err == -1) err_flag = 1; \
-	writebuf_off = req_off; \
-        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
-	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
-	err = read(fd->fd_sys, writebuf, writebuf_len); \
-        if (err == -1) { \
-            *error_code = MPIO_Err_create_code(MPI_SUCCESS, \
-					       MPIR_ERR_RECOVERABLE, myname, \
-					       __LINE__, MPI_ERR_IO, \
-					       "**ioRMWrdwr", 0); \
-	    return; \
-        } \
-    } \
-    write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
-    ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off));\
-    memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
-    while (write_sz != req_len) { \
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
-	err = write(fd->fd_sys, writebuf, writebuf_len); \
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
-        if (err == -1) err_flag = 1; \
-        req_len -= write_sz; \
-        userbuf_off += write_sz; \
-        writebuf_off += writebuf_len; \
-        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
-	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
-	err = read(fd->fd_sys, writebuf, writebuf_len); \
-        if (err == -1) { \
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS, \
-					       MPIR_ERR_RECOVERABLE, myname, \
-					       __LINE__, MPI_ERR_IO, \
-					       "**ioRMWrdwr", 0); \
-	    return; \
-        } \
-        write_sz = ADIOI_MIN(req_len, writebuf_len); \
-        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
-    } \
-}
-
-
-/* this macro is used when filetype is contig and buftype is not contig.
-   it does not do a read-modify-write and does not lock*/
-#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
-{ \
-    if (req_off >= writebuf_off + writebuf_len) { \
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
-	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
-	err = write(fd->fd_sys, writebuf, writebuf_len); \
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
-        if (err == -1) err_flag = 1; \
-	writebuf_off = req_off; \
-        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
-    } \
-    write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
-    ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off));\
-    memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
-    while (write_sz != req_len) { \
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
-	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
-	err = write(fd->fd_sys, writebuf, writebuf_len); \
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
-        if (err == -1) err_flag = 1; \
-        req_len -= write_sz; \
-        userbuf_off += write_sz; \
-        writebuf_off += writebuf_len; \
-        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
-        write_sz = ADIOI_MIN(req_len, writebuf_len); \
-        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
-    } \
-}
-
-
-
-void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
-                       MPI_Datatype datatype, int file_ptr_type,
-                       ADIO_Offset offset, ADIO_Status *status, int
-                       *error_code)
-{
-/* offset is in units of etype relative to the filetype. */
-
-
-
-    ADIOI_Flatlist_node *flat_buf, *flat_file;
-    ADIO_Offset i_offset, sum, size_in_filetype;
-    int i, j, k, err=-1, st_index=0;
-    int n_etypes_in_filetype;
-    ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
-    ADIO_Offset abs_off_in_filetype=0;
-    MPI_Count filetype_size, etype_size, buftype_size;
-    MPI_Aint filetype_extent, buftype_extent; 
-    int buf_count, buftype_is_contig, filetype_is_contig;
-    ADIO_Offset userbuf_off;
-    ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
-    char *writebuf, *value;
-    unsigned bufsize, writebuf_len, max_bufsize, write_sz;
-    int err_flag=0, info_flag;
-    ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len;
-    static char myname[] = "ADIOI_BGL_WRITESTRIDED";
-
-    if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
-    	/* if user has disabled data sieving on reads, use naive
-	 * approach instead.
-	 */
-      /*FPRINTF(stderr, "ADIOI_GEN_WriteStrided_naive(%d):\n", __LINE__);*/
-      ADIOI_GEN_WriteStrided_naive(fd, 
-				    buf,
-				    count,
-				    datatype,
-				    file_ptr_type,
-				    offset,
-				    status,
-				    error_code);
-    	return;
-    }
-    /*FPRINTF(stderr, "%s(%d):\n",myname, __LINE__);*/
-
-    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
-    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
-
-    MPI_Type_size_x(fd->filetype, &filetype_size);
-    if ( ! filetype_size ) {
-#ifdef HAVE_STATUS_SET_BYTES
-	MPIR_Status_set_bytes(status, datatype, 0);
-#endif
-	*error_code = MPI_SUCCESS; 
-	return;
-    }
-
-    MPI_Type_extent(fd->filetype, &filetype_extent);
-    MPI_Type_size_x(datatype, &buftype_size);
-    MPI_Type_extent(datatype, &buftype_extent);
-    etype_size = fd->etype_size;
-
-    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
-    bufsize = buftype_size * count;
-
-/* get max_bufsize from the info object. */
-
-    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
-    ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, 
-                 &info_flag);
-    max_bufsize = atoi(value);
-    ADIOI_Free(value);
-
-    if (!buftype_is_contig && filetype_is_contig) {
-
-/* noncontiguous in memory, contiguous in file. */
-
-	ADIOI_Flatten_datatype(datatype);
-	flat_buf = ADIOI_Flatlist;
-	while (flat_buf->type != datatype) flat_buf = flat_buf->next;
-
-        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : 
-                 fd->disp + etype_size * offset;
-
-        start_off = off;
-	end_offset = off + bufsize - 1;
-        writebuf_off = off;
-        writebuf = (char *) ADIOI_Malloc(max_bufsize);
-        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
-
-/* if atomicity is true, lock the region to be accessed */
-        if (fd->atomicity) 
-            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
-
-        for (j=0; j<count; j++) 
-        {
-          int i;
-            for (i=0; i<flat_buf->count; i++) {
-                userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
-		req_off = off;
-		req_len = flat_buf->blocklens[i];
-		ADIOI_BUFFERED_WRITE_WITHOUT_READ
-                off += flat_buf->blocklens[i];
-            }
-        }
-
-        /* write the buffer out finally */
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); 
-	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
-	err = write(fd->fd_sys, writebuf, writebuf_len); 
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
-        if (err == -1) err_flag = 1; 
-
-        if (fd->atomicity) 
-            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
-
-	ADIOI_Free(writebuf); /* malloced in the buffered_write macro */
-
-        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
-	if (err_flag) {
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_IO, "**io",
-					       "**io %s", strerror(errno));
-	}
-	else *error_code = MPI_SUCCESS;
-    }
-
-    else {  /* noncontiguous in file */
-
-/* filetype already flattened in ADIO_Open */
-	flat_file = ADIOI_Flatlist;
-	while (flat_file->type != fd->filetype) flat_file = flat_file->next;
-	disp = fd->disp;
-
-	if (file_ptr_type == ADIO_INDIVIDUAL) {
-	/* Wei-keng reworked type processing to be a bit more efficient */
-            offset       = fd->fp_ind - disp;
-            n_filetypes  = (offset - flat_file->indices[0]) / filetype_extent;
-            offset      -= (ADIO_Offset)n_filetypes * filetype_extent;
-            /* now offset is local to this extent */
-
-            /* find the block where offset is located, skip blocklens[i]==0 */
-            for (i=0; i<flat_file->count; i++) {
-                ADIO_Offset dist;
-                if (flat_file->blocklens[i] == 0) continue;
-                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
-                /* fwr_size is from offset to the end of block i */
-                if (dist == 0) {
-                    i++;
-                    offset   = flat_file->indices[i];
-                    fwr_size = flat_file->blocklens[i];
-                    break;
-                }
-                if (dist > 0) {
-                    fwr_size = dist;
-                    break;
-                }
-            }
-            st_index = i;  /* starting index in flat_file->indices[] */
-            offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
-	}
-	else {
-    int i;
-	    n_etypes_in_filetype = filetype_size/etype_size;
-	    n_filetypes = offset / n_etypes_in_filetype;
-	    etype_in_filetype = offset % n_etypes_in_filetype;
-	    size_in_filetype = etype_in_filetype * etype_size;
- 
-	    sum = 0;
-	    for (i=0; i<flat_file->count; i++) {
-		sum += flat_file->blocklens[i];
-		if (sum > size_in_filetype) {
-		    st_index = i;
-		    fwr_size = sum - size_in_filetype;
-		    abs_off_in_filetype = flat_file->indices[i] +
-			size_in_filetype - (sum - flat_file->blocklens[i]);
-		    break;
-		}
-	    }
-
-	    /* abs. offset in bytes in the file */
-	    offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + 
-		    abs_off_in_filetype;
-	}
-
-        start_off = offset;
-        /* Wei-keng Liao:write request is within single flat_file contig block*/
-	/* this could happen, for example, with subarray types that are
-	 * actually fairly contiguous */
-        if (buftype_is_contig && bufsize <= fwr_size) {
-            ADIO_WriteContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
-                             offset, status, error_code);
-
-	    if (file_ptr_type == ADIO_INDIVIDUAL) {
-                /* update MPI-IO file pointer to point to the first byte 
-		 * that can be accessed in the fileview. */
-                fd->fp_ind = offset + bufsize;
-                if (bufsize == fwr_size) {
-                    do {
-                        st_index++;
-                        if (st_index == flat_file->count) {
-                            st_index = 0;
-                            n_filetypes++;
-                        }
-                    } while (flat_file->blocklens[st_index] == 0);
-                    fd->fp_ind = disp + flat_file->indices[st_index]
-                               + (ADIO_Offset)n_filetypes*filetype_extent;
-                }
-            }
-	    fd->fp_sys_posn = -1;   /* set it to null. */ 
-#ifdef HAVE_STATUS_SET_BYTES
-	    MPIR_Status_set_bytes(status, datatype, bufsize);
-#endif 
-            return;
-        }
-
-       /* Calculate end_offset, the last byte-offset that will be accessed.
-         e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/
-
-	st_fwr_size = fwr_size;
-	st_n_filetypes = n_filetypes;
-	i_offset = 0;
-	j = st_index;
-	off = offset;
-	fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
-	while (i_offset < bufsize) {
-	    i_offset += fwr_size;
-	    end_offset = off + fwr_size - 1;
-
-            j = (j+1) % flat_file->count;
-            n_filetypes += (j == 0) ? 1 : 0;
-            while (flat_file->blocklens[j]==0) {
-                j = (j+1) % flat_file->count;
-                n_filetypes += (j == 0) ? 1 : 0;
-            }
-
-	    off = disp + flat_file->indices[j] + 
-		    n_filetypes*(ADIO_Offset)filetype_extent;
-	    fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
-	}
-
-/* if atomicity is true, lock the region to be accessed */
-        if (fd->atomicity) 
-            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
-
-        /* initial read for the read-modify-write */
-        writebuf_off = offset;
-        writebuf = (char *) ADIOI_Malloc(max_bufsize);
-        writebuf_len = (unsigned)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
-	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); 
-	err = read(fd->fd_sys, writebuf, writebuf_len); 
-        if (err == -1) {
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE,
-					       myname, __LINE__,
-					       MPI_ERR_IO,
-					       "ADIOI_BGL_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.", 0);
-	    return;
-        } 
-
-	if (buftype_is_contig && !filetype_is_contig) {
-
-/* contiguous in memory, noncontiguous in file. should be the most
-   common case. */
-
-	    i_offset = 0;
-	    j = st_index;
-	    off = offset;
-	    n_filetypes = st_n_filetypes;
-	    fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
-	    while (i_offset < bufsize) {
-                if (fwr_size) { 
-                    /* TYPE_UB and TYPE_LB can result in 
-                       fwr_size = 0. save system call in such cases */ 
-		    /* lseek(fd->fd_sys, off, SEEK_SET);
-		    err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/
-
-		    req_off = off;
-		    req_len = fwr_size;
-		    userbuf_off = i_offset;
-		    ADIOI_BUFFERED_WRITE
-		}
-		i_offset += fwr_size;
-
-                if (off + fwr_size < disp + flat_file->indices[j] +
-                   flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
-                       off += fwr_size;
-                /* did not reach end of contiguous block in filetype.
-                   no more I/O needed. off is incremented by fwr_size. */
-                else {
-                    j = (j+1) % flat_file->count;
-                    n_filetypes += (j == 0) ? 1 : 0;
-                    while (flat_file->blocklens[j]==0) {
-                        j = (j+1) % flat_file->count;
-                        n_filetypes += (j == 0) ? 1 : 0;
-                    }
-		    off = disp + flat_file->indices[j] + 
-                                    n_filetypes*(ADIO_Offset)filetype_extent;
-		    fwr_size = ADIOI_MIN(flat_file->blocklens[j], 
-				    bufsize-i_offset);
-		}
-	    }
-	}
-	else {
-/* noncontiguous in memory as well as in file */
-
-	    ADIOI_Flatten_datatype(datatype);
-	    flat_buf = ADIOI_Flatlist;
-	    while (flat_buf->type != datatype) flat_buf = flat_buf->next;
-
-	    k = num = buf_count = 0;
-	    i_offset = flat_buf->indices[0];
-	    j = st_index;
-	    off = offset;
-	    n_filetypes = st_n_filetypes;
-	    fwr_size = st_fwr_size;
-	    bwr_size = flat_buf->blocklens[0];
-
-	    while (num < bufsize) {
-		size = ADIOI_MIN(fwr_size, bwr_size);
-		if (size) {
-		    /* lseek(fd->fd_sys, off, SEEK_SET);
-		    err = write(fd->fd_sys, ((char *) buf) + i_offset, size); */
-
-		    req_off = off;
-		    req_len = size;
-		    userbuf_off = i_offset;
-		    ADIOI_BUFFERED_WRITE
-		}
-
-		new_fwr_size = fwr_size;
-		new_bwr_size = bwr_size;
-
-		if (size == fwr_size) {
-/* reached end of contiguous block in file */
- 		    j = (j+1) % flat_file->count;
- 		    n_filetypes += (j == 0) ? 1 : 0;
- 		    while (flat_file->blocklens[j]==0) {
- 			j = (j+1) % flat_file->count;
- 			n_filetypes += (j == 0) ? 1 : 0;
-		    }
-
-		    off = disp + flat_file->indices[j] + 
-                                  n_filetypes*(ADIO_Offset)filetype_extent;
-
-		    new_fwr_size = flat_file->blocklens[j];
-		    if (size != bwr_size) {
-			i_offset += size;
-			new_bwr_size -= size;
-		    }
-		}
-
-		if (size == bwr_size) {
-/* reached end of contiguous block in memory */
-
-		    k = (k + 1)%flat_buf->count;
-		    buf_count++;
-		    i_offset = (ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
-			flat_buf->indices[k]; 
-		    new_bwr_size = flat_buf->blocklens[k];
-		    if (size != fwr_size) {
-			off += size;
-			new_fwr_size -= size;
-		    }
-		}
-		num += size;
-		fwr_size = new_fwr_size;
-                bwr_size = new_bwr_size;
-	    }
-	}
-
-        /* write the buffer out finally */	
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); 
-	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
-	err = write(fd->fd_sys, writebuf, writebuf_len); 
-
-        if (!(fd->atomicity))
-	    ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
-	else ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
-
-        if (err == -1) err_flag = 1; 
-
-	ADIOI_Free(writebuf); /* malloced in the buffered_write macro */
-
-	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
-	if (err_flag) {
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_IO, "**io",
-					       "**io %s", strerror(errno));
-	}
-	else *error_code = MPI_SUCCESS;
-    }
-
-    fd->fp_sys_posn = -1;   /* set it to null. */
-
-#ifdef HAVE_STATUS_SET_BYTES
-    MPIR_Status_set_bytes(status, datatype, bufsize);
-/* This is a temporary way of filling in status. The right way is to 
-   keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
-#endif
-
-    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
-}
diff --git a/src/mpi/romio/adio/ad_bglockless/Makefile.mk b/src/mpi/romio/adio/ad_bglockless/Makefile.mk
deleted file mode 100644
index 8c96bd1..0000000
--- a/src/mpi/romio/adio/ad_bglockless/Makefile.mk
+++ /dev/null
@@ -1,17 +0,0 @@
-## -*- Mode: Makefile; -*-
-## vim: set ft=automake :
-##
-## (C) 2011 by Argonne National Laboratory.
-##     See COPYRIGHT in top-level directory.
-##
-
-if BUILD_AD_BGLOCKLESS
-
-noinst_HEADERS += adio/ad_bglockless/ad_bglockless.h
-
-romio_other_sources +=                          \
-    adio/ad_bglockless/ad_bglockless.c          \
-    adio/ad_bglockless/ad_bglockless_features.c
-
-endif BUILD_AD_BGLOCKLESS
-
diff --git a/src/mpi/romio/adio/ad_bglockless/ad_bglockless.c b/src/mpi/romio/adio/ad_bglockless/ad_bglockless.c
deleted file mode 100644
index 512f0e9..0000000
--- a/src/mpi/romio/adio/ad_bglockless/ad_bglockless.c
+++ /dev/null
@@ -1,44 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
-/* 
- *
- *   Copyright (C) 2001 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "../ad_bg/ad_bg.h"
-#include "ad_bglockless.h"
-
-/* adioi.h has the ADIOI_Fns_struct define */
-#include "adioi.h"
-
-struct ADIOI_Fns_struct ADIO_BGLOCKLESS_operations = {
-    ADIOI_BG_Open, /* Open */
-    ADIOI_GEN_OpenColl, /* Collective open */
-    ADIOI_GEN_ReadContig, /* ReadContig */
-    ADIOI_GEN_WriteContig, /* WriteContig */
-    ADIOI_BG_ReadStridedColl, /* ReadStridedColl */
-    ADIOI_BG_WriteStridedColl, /* WriteStridedColl */
-    ADIOI_GEN_SeekIndividual, /* SeekIndividual */
-    ADIOI_GEN_Fcntl, /* Fcntl */
-    ADIOI_BG_SetInfo, /* SetInfo */
-    ADIOI_GEN_ReadStrided, /* ReadStrided */
-    ADIOI_NOLOCK_WriteStrided, /* WriteStrided */
-    ADIOI_BG_Close, /* Close */
-#ifdef ROMIO_HAVE_WORKING_AIO
-    ADIOI_GEN_IreadContig, /* IreadContig */
-    ADIOI_GEN_IwriteContig, /* IwriteContig */
-#else
-    ADIOI_FAKE_IreadContig, /* IreadContig */
-    ADIOI_FAKE_IwriteContig, /* IwriteContig */
-#endif
-    ADIOI_GEN_IODone, /* ReadDone */
-    ADIOI_GEN_IODone, /* WriteDone */
-    ADIOI_GEN_IOComplete, /* ReadComplete */
-    ADIOI_GEN_IOComplete, /* WriteComplete */
-    ADIOI_GEN_IreadStrided, /* IreadStrided */
-    ADIOI_GEN_IwriteStrided, /* IwriteStrided */
-    ADIOI_BG_Flush, /* Flush */
-    ADIOI_GEN_Resize, /* Resize */
-    ADIOI_GEN_Delete, /* Delete */
-    ADIOI_BGLOCKLESS_Feature  /* Features */
-};
diff --git a/src/mpi/romio/adio/ad_bglockless/ad_bglockless.h b/src/mpi/romio/adio/ad_bglockless/ad_bglockless.h
deleted file mode 100644
index cf753bc..0000000
--- a/src/mpi/romio/adio/ad_bglockless/ad_bglockless.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
-/* 
- *
- *   Copyright (C) 2008 Uchicago Argonne LLC
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#ifndef AD_BGLOCKLESS_INCLUDE
-#define AD_BGLOCKLESS_INCLUDE
-
-int ADIOI_BGLOCKLESS_Feature(ADIO_File fd, int flag);
-
-#endif
-
diff --git a/src/mpi/romio/adio/ad_bglockless/ad_bglockless_features.c b/src/mpi/romio/adio/ad_bglockless/ad_bglockless_features.c
deleted file mode 100644
index 784f726..0000000
--- a/src/mpi/romio/adio/ad_bglockless/ad_bglockless_features.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
-/*
- *
- *  (C) 2008 by Argonne National Laboratory.
- *      See COPYRIGHT in top-level directory.
- */
-/* begin_generated_IBM_copyright_prolog                             */
-/*                                                                  */
-/* This is an automatically generated copyright prolog.             */
-/* After initializing,  DO NOT MODIFY OR MOVE                       */
-/*  --------------------------------------------------------------- */
-/*                                                                  */
-/* Licensed Materials - Property of IBM                             */
-/* Blue Gene/Q                                                      */
-/* (C) Copyright IBM Corp.  2011, 2012                              */
-/* US Government Users Restricted Rights - Use, duplication or      */      
-/*   disclosure restricted by GSA ADP Schedule Contract with IBM    */
-/*   Corp.                                                          */
-/*                                                                  */
-/* This software is available to you under the Eclipse Public       */
-/* License (EPL).                                                   */
-/*                                                                  */
-/*  --------------------------------------------------------------- */
-/*                                                                  */
-/* end_generated_IBM_copyright_prolog                               */
-#include "adio.h"
-
-int ADIOI_BGLOCKLESS_Feature(ADIO_File fd, int flag)
-{
-	switch(flag) {
-		case ADIO_SCALABLE_OPEN:
-			return 1;
-		case ADIO_SHARED_FP:
-		case ADIO_LOCKS:
-		case ADIO_SEQUENTIAL:
-		case ADIO_DATA_SIEVING_WRITES:
-		default:
-			return 0;
-	}
-}

http://git.mpich.org/mpich.git/commitdiff/88ccf46760a07a6a1eb83a2c3401c6372fade946

commit 88ccf46760a07a6a1eb83a2c3401c6372fade946
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Mon Mar 10 14:29:15 2014 -0500

    bg to gpfs new files

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.c
new file mode 100644
index 0000000..4af0504
--- /dev/null
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.c
@@ -0,0 +1,837 @@
+/* ---------------------------------------------------------------- */
+/* (C)Copyright IBM Corp.  2007, 2008                               */
+/* ---------------------------------------------------------------- */
+/**
+ * \file ad_gpfs_aggrs.c
+ * \brief The externally used function from this file is declared in ad_gpfs_aggrs.h
+ */
+
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *   Copyright (C) 1997-2001 University of Chicago.
+ *   See COPYRIGHT notice in top-level directory.
+ */
+
+
+#include "adio.h"
+#include "adio_cb_config_list.h"
+#include "ad_gpfs.h"
+
+#ifdef AGGREGATION_PROFILE
+#include "mpe.h"
+#endif
+
+
+#ifdef USE_DBG_LOGGING
+  #define AGG_DEBUG 1
+#endif
+
+#ifndef TRACE_ERR
+#  define TRACE_ERR(format...)
+#endif
+
+/* Comments copied from common:
+ * This file contains four functions:
+ *
+ * ADIOI_Calc_aggregator()
+ * ADIOI_Calc_file_domains()
+ * ADIOI_Calc_my_req()
+ * ADIOI_Calc_others_req()
+ *
+ * The last three of these were originally in ad_read_coll.c, but they are
+ * also shared with ad_write_coll.c.  I felt that they were better kept with
+ * the rest of the shared aggregation code.
+ */
+
+/* Discussion of values available from above:
+ *
+ * ADIO_Offset st_offsets[0..nprocs-1]
+ * ADIO_Offset end_offsets[0..nprocs-1]
+ *    These contain a list of start and end offsets for each process in
+ *    the communicator.  For example, an access at loc 10, size 10 would
+ *    have a start offset of 10 and end offset of 19.
+ * int nprocs
+ *    number of processors in the collective I/O communicator
+ * ADIO_Offset min_st_offset
+ * ADIO_Offset fd_start[0..nprocs_for_coll-1]
+ *    starting location of "file domain"; region that a given process will
+ *    perform aggregation for (i.e. actually do I/O)
+ * ADIO_Offset fd_end[0..nprocs_for_coll-1]
+ *    start + size - 1 roughly, but it can be less, or 0, in the case of
+ *    uneven distributions
+ */
+
+/* Description from common/ad_aggregate.c.  (Does it completely apply to bg?)
+ * ADIOI_Calc_aggregator()
+ *
+ * The intention here is to implement a function which provides basically
+ * the same functionality as in Rajeev's original version of
+ * ADIOI_Calc_my_req().  He used a ceiling division approach to assign the
+ * file domains, and we use the same approach here when calculating the
+ * location of an offset/len in a specific file domain.  Further we assume
+ * this same distribution when calculating the rank_index, which is later
+ *  used to map to a specific process rank in charge of the file domain.
+ *
+ * A better (i.e. more general) approach would be to use the list of file
+ * domains only.  This would be slower in the case where the
+ * original ceiling division was used, but it would allow for arbitrary
+ * distributions of regions to aggregators.  We'd need to know the
+ * nprocs_for_coll in that case though, which we don't have now.
+ *
+ * Note a significant difference between this function and Rajeev's old code:
+ * this code doesn't necessarily return a rank in the range
+ * 0..nprocs_for_coll; instead you get something in 0..nprocs.  This is a
+ * result of the rank mapping; any set of ranks in the communicator could be
+ * used now.
+ *
+ * Returns an integer representing a rank in the collective I/O communicator.
+ *
+ * The "len" parameter is also modified to indicate the amount of data
+ * actually available in this file domain.
+ */
+/*
+ * This is a more general aggregator search function that does not rely on the assumption
+ * that each aggregator hosts a file domain of the same size
+ */
+int ADIOI_GPFS_Calc_aggregator(ADIO_File fd,
+			      ADIO_Offset off,
+			      ADIO_Offset min_off,
+			      ADIO_Offset *len,
+			      ADIO_Offset fd_size,
+			      ADIO_Offset *fd_start,
+			      ADIO_Offset *fd_end)
+{
+    int rank_index, rank;
+    ADIO_Offset avail_bytes;
+    TRACE_ERR("Entering ADIOI_GPFS_Calc_aggregator\n");
+
+    ADIOI_GPFS_assert ( (off <= fd_end[fd->hints->cb_nodes-1] && off >= min_off && fd_start[0] >= min_off ) );
+
+    /* binary search --> rank_index is returned */
+    int ub = fd->hints->cb_nodes;
+    int lb = 0;
+    /* get an index into our array of aggregators */
+    /* Common code for striping - bg doesn't use it but it's
+       here to make diff'ing easier.
+    rank_index = (int) ((off - min_off + fd_size)/ fd_size - 1);
+
+    if (fd->hints->striping_unit > 0) {
+        * wkliao: implementation for file domain alignment
+           fd_start[] and fd_end[] have been aligned with file lock
+	   boundaries when returned from ADIOI_Calc_file_domains() so cannot
+	   just use simple arithmetic as above *
+        rank_index = 0;
+        while (off > fd_end[rank_index]) rank_index++;
+    }
+    bg does its own striping below
+    */
+    rank_index = fd->hints->cb_nodes / 2;
+    while ( off < fd_start[rank_index] || off > fd_end[rank_index] ) {
+	if ( off > fd_end  [rank_index] ) {
+	    lb = rank_index;
+	    rank_index = (rank_index + ub) / 2;
+	}
+	else
+	if ( off < fd_start[rank_index] ) {
+	    ub = rank_index;
+	    rank_index = (rank_index + lb) / 2;
+	}
+    }
+    /* we index into fd_end with rank_index, and fd_end was allocated to be no
+     * bigger than fd->hints->cb_nodes.   If we ever violate that, we're
+     * overrunning arrays.  Obviously, we should never ever hit this abort */
+    if (rank_index >= fd->hints->cb_nodes || rank_index < 0) {
+        FPRINTF(stderr, "Error in ADIOI_Calc_aggregator(): rank_index(%d) >= fd->hints->cb_nodes (%d) fd_size=%lld off=%lld\n",
+			rank_index,fd->hints->cb_nodes,fd_size,off);
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    /* DBG_FPRINTF ("ADIOI_GPFS_Calc_aggregator: rank_index = %d\n",
+       rank_index ); */
+
+    /*
+     * remember here that even in Rajeev's original code it was the case that
+     * different aggregators could end up with different amounts of data to
+     * aggregate.  here we use fd_end[] to make sure that we know how much
+     * data this aggregator is working with.
+     *
+     * the +1 is to take into account the end vs. length issue.
+     */
+    avail_bytes = fd_end[rank_index] + 1 - off;
+    if (avail_bytes < *len && avail_bytes > 0) {
+        /* this file domain only has part of the requested contig. region */
+
+        *len = avail_bytes;
+    }
+
+    /* map our index to a rank */
+    /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
+    rank = fd->hints->ranklist[rank_index];
+    TRACE_ERR("Leaving ADIOI_GPFS_Calc_aggregator\n");
+
+    return rank;
+}
+
+/*
+ * Compute a dynamic access range based file domain partition among I/O aggregators,
+ * which align to the GPFS block size
+ * Divide the I/O workload among "nprocs_for_coll" processes. This is
+ * done by (logically) dividing the file into file domains (FDs); each
+ * process may directly access only its own file domain.
+ * Additional effort is to make sure that each I/O aggregator get
+ * a file domain that aligns to the GPFS block size.  So, there will
+ * not be any false sharing of GPFS file blocks among multiple I/O nodes.
+ *
+ * The common version of this now accepts a min_fd_size and striping_unit.
+ * It doesn't seem necessary here (using GPFS block sizes) but keep it in mind
+ * (e.g. we could pass striping unit instead of using fs_ptr->blksize).
+ */
+void ADIOI_GPFS_Calc_file_domains(ADIO_File fd,
+	                              ADIO_Offset *st_offsets,
+                                      ADIO_Offset *end_offsets,
+                                      int          nprocs,
+                                      int          nprocs_for_coll,
+                                      ADIO_Offset *min_st_offset_ptr,
+                                      ADIO_Offset **fd_start_ptr,
+                                      ADIO_Offset **fd_end_ptr,
+                                      ADIO_Offset *fd_size_ptr,
+                                      void        *fs_ptr)
+{
+    ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
+    int i, aggr;
+    TRACE_ERR("Entering ADIOI_GPFS_Calc_file_domains\n");
+    blksize_t blksize;
+
+#ifdef AGGREGATION_PROFILE
+    MPE_Log_event (5004, 0, NULL);
+#endif
+
+#   if AGG_DEBUG
+    static char myname[] = "ADIOI_GPFS_Calc_file_domains";
+    DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n",
+	    myname,__LINE__,nprocs_for_coll);
+#   endif
+    if (fd->blksize <= 0)
+	/* default to 1M if blksize unset */
+	fd->blksize = 1048576;
+    blksize = fd->blksize;
+
+#   if AGG_DEBUG
+    DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
+#   endif
+/* find min of start offsets and max of end offsets of all processes */
+    min_st_offset  = st_offsets [0];
+    max_end_offset = end_offsets[0];
+    for (i=1; i<nprocs; i++) {
+        min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
+        max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
+    }
+
+    /* DBG_FPRINTF(stderr, "_calc_file_domains, min_st_offset, max_
+       = %qd, %qd\n", min_st_offset, max_end_offset );*/
+
+    /* determine the "file domain (FD)" of each process, i.e., the portion of
+       the file that will be "owned" by each process */
+
+    ADIO_Offset gpfs_ub       = (max_end_offset +blksize-1) / blksize * blksize - 1;
+    ADIO_Offset gpfs_lb       = min_st_offset / blksize * blksize;
+    ADIO_Offset gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset;
+    ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize;
+    ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;
+
+    int         naggs    = nprocs_for_coll;
+
+    /* Tweak the file domains so that no fd is smaller than a threshold.  We
+     * have to strike a balance between efficiency and parallelism: somewhere
+     * between 10k processes sending 32-byte requests and one process sending a
+     * 320k request is a (system-dependent) sweet spot
+
+    This is from the common code - the new min_fd_size parm that we didn't implement.
+    (And common code uses a different declaration of fd_size so beware)
+
+    if (fd_size < min_fd_size)
+        fd_size = min_fd_size;
+    */
+    fd_size              = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
+    *fd_start_ptr        = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
+    *fd_end_ptr          = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
+    fd_start             = *fd_start_ptr;
+    fd_end               = *fd_end_ptr;
+
+    /* each process will have a file domain of some number of gpfs blocks, but
+     * the division of blocks is not likely to be even.  Some file domains will
+     * be "large" and others "small"
+     *
+     * Example: consider  17 blocks distributed over 3 aggregators.
+     * nb_cn_small = 17/3 = 5
+     * naggs_large = 17 - 3*(17/3) = 17 - 15  = 2
+     * naggs_small = 3 - 2 = 1
+     *
+     * and you end up with file domains of {5-blocks, 6-blocks, 6-blocks}
+     *
+     * what about (relatively) small files?  say, a file of 1000 blocks
+     * distributed over 2064 aggregators:
+     * nb_cn_small = 1000/2064 = 0
+     * naggs_large = 1000 - 2064*(1000/2064) = 1000
+     * naggs_small = 2064 - 1000 = 1064
+     * and you end up with domains of {0, 0, 0, ... 1, 1, 1 ...}
+     *
+     * it might be a good idea instead of having all the zeros up front, to
+     * "mix" those zeros into the fd_size array.  that way, no pset/bridge-set
+     * is left with zero work.  In fact, even if the small file domains aren't
+     * zero, it's probably still a good idea to mix the "small" file domains
+     * across the fd_size array to keep the io nodes in balance */
+
+
+    ADIO_Offset n_gpfs_blk    = fd_gpfs_range / blksize;
+    ADIO_Offset nb_cn_small   = n_gpfs_blk/naggs;
+    ADIO_Offset naggs_large   = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
+    ADIO_Offset naggs_small   = naggs - naggs_large;
+
+#ifdef BGQPLATFORM
+    if (gpfsmpio_balancecontig == 1) {
+	/* File domains blocks are assigned to aggregators in a breadth-first
+	 * fashion relative to the ions - additionally, file domains on the
+	 * aggregators sharing the same bridgeset and ion have contiguous
+	 * offsets. */
+
+	// initialize everything to small
+	for (i=0; i<naggs; i++)
+	    fd_size[i] = nb_cn_small     * blksize;
+
+	// go thru and distribute the large across the bridges
+
+	/* bridgelistoffset: agg rank list offsets using the bridgelist - each
+	 * entry is created by adding up the indexes for the aggs from all
+	 * previous bridges */
+	int *bridgelistoffset =
+	    (int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
+	/* tmpbridgelistnum: copy of the bridgelistnum whose entries can be
+	 * decremented to keep track of bridge assignments during the actual
+	 * large block assignments to the agg rank list*/
+	int *tmpbridgelistnum =
+	    (int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
+
+	int j;
+	for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++) {
+	    int k, bridgerankoffset = 0;
+	    for (k=0;k<j;k++) {
+		bridgerankoffset += fd->hints->fs_hints.bg.bridgelistnum[k];
+	    }
+	    bridgelistoffset[j] = bridgerankoffset;
+	}
+
+	for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++)
+	    tmpbridgelistnum[j] = fd->hints->fs_hints.bg.bridgelistnum[j];
+	int bridgeiter = 0;
+
+	/* distribute the large blocks across the aggs going breadth-first
+	 * across the bridgelist - this distributes the fd sizes across the
+	 * ions, so later in the file domain assignment when it iterates thru
+	 * the ranklist the offsets will be contiguous within the bridge and
+	 * ion as well */
+	for (j=0;j<naggs_large;j++) {
+	    int foundbridge = 0;
+	    while (!foundbridge) {
+		if (tmpbridgelistnum[bridgeiter] > 0) {
+		    foundbridge = 1;
+		    /*
+		       printf("bridgeiter is %d tmpbridgelistnum[bridgeiter] is %d bridgelistoffset[bridgeiter] is %d\n",bridgeiter,tmpbridgelistnum[bridgeiter],bridgelistoffset[bridgeiter]);
+		       printf("naggs is %d bridgeiter is %d bridgelistoffset[bridgeiter] is %d tmpbridgelistnum[bridgeiter] is %d\n",naggs, bridgeiter,bridgelistoffset[bridgeiter],tmpbridgelistnum[bridgeiter]);
+		       printf("naggs is %d bridgeiter is %d setting fd_size[%d]\n",naggs, bridgeiter,bridgelistoffset[bridgeiter]+(fd->hints->bridgelistnum[bridgeiter]-tmpbridgelistnum[bridgeiter]));
+		     */
+		    fd_size[bridgelistoffset[bridgeiter]+(fd->hints->fs_hints.bg.bridgelistnum[bridgeiter]-tmpbridgelistnum[bridgeiter])] =
+			(nb_cn_small+1) * blksize;
+		    tmpbridgelistnum[bridgeiter]--;
+		}
+		if (bridgeiter == (fd->hints->fs_hints.bg.numbridges-1))
+		    bridgeiter = 0;
+		else
+		    bridgeiter++;
+	    }
+	}
+	ADIOI_Free(tmpbridgelistnum);
+	ADIOI_Free(bridgelistoffset);
+
+    } else {
+	/* BG/L- and BG/P-style distribution of file domains: simple allocation of
+	 * file domins to each aggregator */
+	for (i=0; i<naggs; i++) {
+	    if (i < naggs_small) {
+		fd_size[i] = nb_cn_small     * blksize;
+	    } else {
+		fd_size[i] = (nb_cn_small+1) * blksize;
+	    }
+	}
+    }
+#ifdef balancecontigtrace
+    int myrank;
+    MPI_Comm_rank(fd->comm,&myrank);
+    if (myrank == 0) {
+      fprintf(stderr,"naggs_small is %d nb_cn_small is %d\n",naggs_small,nb_cn_small);
+	for (i=0; i<naggs; i++) {
+	    fprintf(stderr,"fd_size[%d] set to %d agg rank is %d\n",i,fd_size[i],fd->hints->ranklist[i]);
+	}
+    }
+#endif
+
+#else // not BGQ platform
+	for (i=0; i<naggs; i++) {
+	    if (i < naggs_small) {
+		fd_size[i] = nb_cn_small     * blksize;
+	    } else {
+		fd_size[i] = (nb_cn_small+1) * blksize;
+	    }
+	}
+
+#endif
+
+
+#   if AGG_DEBUG
+     DBG_FPRINTF(stderr,"%s(%d): "
+                   "gpfs_ub       %llu, "
+                   "gpfs_lb       %llu, "
+                   "gpfs_ub_rdoff %llu, "
+                   "gpfs_lb_rdoff %llu, "
+                   "fd_gpfs_range %llu, "
+                   "n_gpfs_blk    %llu, "
+                   "nb_cn_small   %llu, "
+                   "naggs_large   %llu, "
+                   "naggs_small   %llu, "
+                   "\n",
+                   myname,__LINE__,
+                   gpfs_ub      ,
+                   gpfs_lb      ,
+                   gpfs_ub_rdoff,
+                   gpfs_lb_rdoff,
+                   fd_gpfs_range,
+                   n_gpfs_blk   ,
+                   nb_cn_small  ,
+                   naggs_large  ,
+                   naggs_small
+                   );
+#   endif
+
+    fd_size[0]       -= gpfs_lb_rdoff;
+    fd_size[naggs-1] -= gpfs_ub_rdoff;
+
+    /* compute the file domain for each aggr */
+    ADIO_Offset offset = min_st_offset;
+    for (aggr=0; aggr<naggs; aggr++) {
+        fd_start[aggr] = offset;
+        fd_end  [aggr] = offset + fd_size[aggr] - 1;
+        offset += fd_size[aggr];
+    }
+
+    *fd_size_ptr = fd_size[0];
+    *min_st_offset_ptr = min_st_offset;
+
+#ifdef AGGREGATION_PROFILE
+    MPE_Log_event (5005, 0, NULL);
+#endif
+    ADIOI_Free (fd_size);
+    TRACE_ERR("Leaving ADIOI_GPFS_Calc_file_domains\n");
+}
+
+/*
+ * ADIOI_GPFS_Calc_my_req() - GPFS override of ADIOI_Calc_my_req(), whose
+ * default implementation is specific to static file domain partitioning.
+ *
+ * Calculates what portions of the access requests of this process are
+ * located in the file domains of various processes (including this one).
+ * Each access is cut into pieces by repeatedly asking
+ * ADIOI_GPFS_Calc_aggregator() which process holds the next byte and how
+ * much of the remaining length that file domain provides.
+ *
+ * param[in]  fd                        File handle, forwarded to
+ *                                        ADIOI_GPFS_Calc_aggregator()
+ * param[in]  offset_list               Starting offset of each contiguous
+ *                                        access of this process
+ * param[in]  len_list                  Length of each contiguous access;
+ *                                        zero-length entries are skipped
+ * param[in]  contig_access_count       Number of entries in offset_list and
+ *                                        len_list
+ * param[in]  min_st_offset, fd_start, fd_end, fd_size
+ *                                      File domain description, forwarded
+ *                                        unchanged to ADIOI_GPFS_Calc_aggregator()
+ * param[in]  nprocs                    Length of every per-process array
+ *                                        allocated here
+ * param[out] count_my_req_procs_ptr    Number of processes whose file domain
+ *                                        holds at least one piece
+ * param[out] count_my_req_per_proc_ptr Allocated here; entry i is the no. of
+ *                                        contig. pieces in process i's file
+ *                                        domain
+ * param[out] my_req_ptr                Allocated here; entry i holds the
+ *                                        offsets and lengths of the pieces in
+ *                                        process i's file domain
+ * param[out] buf_idx_ptr               Allocated here; see buf_idx below
+ *
+ * None of the arrays handed back through the out parameters is freed here.
+ */
+void ADIOI_GPFS_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
+			   int contig_access_count, ADIO_Offset
+			   min_st_offset, ADIO_Offset *fd_start,
+			   ADIO_Offset *fd_end, ADIO_Offset fd_size,
+			   int nprocs,
+			   int *count_my_req_procs_ptr,
+			   int **count_my_req_per_proc_ptr,
+			   ADIOI_Access **my_req_ptr,
+			   int **buf_idx_ptr)
+/* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets?
+   They are used as memory buffer indices so it seems like the 2G limit is in effect */
+{
+    int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
+    int i, l, proc;
+    ADIO_Offset fd_len, rem_len, curr_idx, off;
+    ADIOI_Access *my_req;
+    TRACE_ERR("Entering ADIOI_GPFS_Calc_my_req\n");
+
+#ifdef AGGREGATION_PROFILE
+    MPE_Log_event (5024, 0, NULL);
+#endif
+    /* count_my_req_per_proc[i] gives the no. of contig. requests of this
+     * process in process i's file domain. calloc initializes to zero.
+     * The array has nprocs entries so that it can be used in an
+     * MPI_Alltoall later on. */
+    *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int));
+    count_my_req_per_proc = *count_my_req_per_proc_ptr;
+
+    /* buf_idx is relevant only if buftype_is_contig.
+     * buf_idx[i] gives the index into user_buf where data received
+     * from proc. i should be placed. This allows receives to be done
+     * without extra buffer. This can't be done if buftype is not contig.
+     * -1 marks "no data for this process seen yet". */
+    buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int));
+    for (i=0; i < nprocs; i++) buf_idx[i] = -1;
+
+    /* one pass just to calculate how much space to allocate for my_req;
+     * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
+     */
+    for (i=0; i < contig_access_count; i++) {
+        /* short circuit offset/len processing if len == 0
+         * (zero-byte read/write) */
+        if (len_list[i] == 0)
+            continue;
+
+        off = offset_list[i];
+        rem_len = len_list[i];
+        do {
+            /* pass in everything that is still outstanding;
+             * ADIOI_GPFS_Calc_aggregator() modifies fd_len to the amount
+             * that is available from the file domain holding "off" */
+            fd_len = rem_len;
+            proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len,
+                                              fd_size, fd_start, fd_end);
+            count_my_req_per_proc[proc]++;
+
+            off += fd_len;      /* point to first remaining byte */
+            rem_len -= fd_len;  /* reduce remaining length by amount from fd */
+        } while (rem_len > 0);
+    }
+
+    /* now allocate space for my_req, offset, and len */
+    *my_req_ptr = (ADIOI_Access *)
+        ADIOI_Malloc(nprocs*sizeof(ADIOI_Access));
+    my_req = *my_req_ptr;
+
+    count_my_req_procs = 0;
+    for (i=0; i < nprocs; i++) {
+        if (count_my_req_per_proc[i]) {
+            my_req[i].offsets = (ADIO_Offset *)
+                ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
+            my_req[i].lens = (int *)
+                ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(int));
+            count_my_req_procs++;
+        }
+        else {
+            /* no pieces for this process: leave well-defined pointers, the
+             * same way ADIOI_GPFS_Calc_others_req() does for others_req */
+            my_req[i].offsets = NULL;
+            my_req[i].lens    = NULL;
+        }
+        my_req[i].count = 0;  /* will be incremented where needed later */
+    }
+
+    /* now fill in my_req: same walk as the counting pass above, so every
+     * piece lands in the slot that was counted for it */
+    curr_idx = 0;
+    for (i=0; i<contig_access_count; i++) {
+        /* short circuit offset/len processing if len == 0
+         * (zero-byte read/write) */
+        if (len_list[i] == 0)
+            continue;
+
+        off = offset_list[i];
+        rem_len = len_list[i];
+        do {
+            fd_len = rem_len;
+            proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len,
+                                              fd_size, fd_start, fd_end);
+
+            /* first piece for this process: record the running byte count,
+             * which must fit the int-typed buf_idx */
+            if (buf_idx[proc] == -1) {
+                ADIOI_Assert(curr_idx == (int) curr_idx);
+                buf_idx[proc] = (int) curr_idx;
+            }
+
+            /* store the offset and len information in an array of
+             * structures, my_req. Each structure contains the offsets and
+             * lengths located in that process's FD, and the associated
+             * count. */
+            l = my_req[proc].count;
+            my_req[proc].offsets[l] = off;
+            ADIOI_Assert(fd_len == (int) fd_len);
+            my_req[proc].lens[l] = (int) fd_len;
+            my_req[proc].count++;
+
+            curr_idx += fd_len;
+            off += fd_len;
+            rem_len -= fd_len;
+        } while (rem_len > 0);
+    }
+
+#ifdef AGG_DEBUG
+    for (i=0; i<nprocs; i++) {
+        if (count_my_req_per_proc[i] > 0) {
+            DBG_FPRINTF(stderr, "data needed from %d (count = %d):\n", i,
+                    my_req[i].count);
+            for (l=0; l < my_req[i].count; l++) {
+                DBG_FPRINTF(stderr, "   off[%d] = %lld, len[%d] = %d\n", l,
+                        my_req[i].offsets[l], l, my_req[i].lens[l]);
+            }
+        }
+        DBG_FPRINTF(stderr, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
+    }
+#endif
+
+    *count_my_req_procs_ptr = count_my_req_procs;
+    *buf_idx_ptr = buf_idx;
+#ifdef AGGREGATION_PROFILE
+    MPE_Log_event (5025, 0, NULL);
+#endif
+    TRACE_ERR("Leaving ADIOI_GPFS_Calc_my_req\n");
+}
+
+/*
+ * ADIOI_GPFS_Calc_others_req (ADIOI_Calc_others_req copied to bg and switched
+ * to all to all for performance)
+ *
+ * Determines what requests of other processes lie in this process's file
+ * domain.  Three collectives on fd->comm are issued: one MPI_Alltoall of the
+ * per-process request counts, then one MPI_Alltoallv for the offsets and one
+ * for the lengths.  The per-process arrays are allocated separately, so each
+ * MPI_Alltoallv addresses them as element displacements from the
+ * lowest-addressed array.
+ *
+ * param[in]  fd                         File handle; fd->comm is the
+ *                                         communicator of the exchanges
+ * param[in]  count_my_req_procs         Number of processes whose file domain my
+ *                                         request touches (not used here).
+ * param[in]  count_my_req_per_proc      count_my_req_per_proc[i] gives the no. of
+ *                                         contig. requests of this process in
+ *                                         process i's file domain.
+ * param[in]  my_req                     A structure defining my request
+ * param[in]  nprocs                     Number of nodes in the block
+ * param[in]  myrank                     Rank of this node (not used here)
+ * param[out] count_others_req_procs_ptr Number of processes whose requests lie in
+ *                                         my process's file domain (including my
+ *                                         process itself)
+ * param[out] others_req_ptr             Array of other process' requests that lie
+ *                                         in my process's file domain; allocated
+ *                                         here and not freed here
+ */
+void ADIOI_GPFS_Calc_others_req(ADIO_File fd, int count_my_req_procs,
+				int *count_my_req_per_proc,
+				ADIOI_Access *my_req,
+				int nprocs, int myrank,
+				int *count_others_req_procs_ptr,
+				ADIOI_Access **others_req_ptr)
+{
+    TRACE_ERR("Entering ADIOI_GPFS_Calc_others_req\n");
+
+    /* count_others_req_procs = number of processes whose requests lie in
+     * this process's file domain (including this process itself)
+     * count_others_req_per_proc[i] indicates how many separate contiguous
+     * requests of proc. i lie in this process's file domain. */
+    int *count_others_req_per_proc, count_others_req_procs;
+    int i;
+    ADIOI_Access *others_req;
+
+    /* Parameters for MPI_Alltoallv */
+    int *scounts, *sdispls, *rcounts, *rdispls;
+    MPIR_Upint disp;
+
+    /* Parameters for MPI_Alltoallv.  These are the buffers, which
+     * are later computed to be the lowest address of all buffers
+     * to be sent/received for offsets and lengths.  NULL means that no
+     * buffer has been seen yet, and is also what gets passed to
+     * MPI_Alltoallv when there is nothing to send or receive.
+     */
+    void *sendBufForOffsets = NULL,
+         *sendBufForLens    = NULL,
+         *recvBufForOffsets = NULL,
+         *recvBufForLens    = NULL;
+
+    /* first find out how much to send/recv and from/to whom */
+#ifdef AGGREGATION_PROFILE
+    MPE_Log_event (5026, 0, NULL);
+#endif
+    /* Send 1 int to each process.  count_my_req_per_proc[i] is the number of
+     * requests that my process will do to the file domain owned by process[i].
+     * Receive 1 int from each process.  count_others_req_per_proc[i] is the number of
+     * requests that process[i] will do to the file domain owned by my process.
+     */
+    count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
+    MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
+                 count_others_req_per_proc, 1, MPI_INT, fd->comm);
+
+    /* Allocate storage for an array of other nodes' accesses of our
+     * node's file domain.  Also allocate storage for the alltoallv
+     * parameters.
+     */
+    *others_req_ptr = (ADIOI_Access *)
+        ADIOI_Malloc(nprocs*sizeof(ADIOI_Access));
+    others_req = *others_req_ptr;
+
+    scounts = ADIOI_Malloc(nprocs*sizeof(int));
+    sdispls = ADIOI_Malloc(nprocs*sizeof(int));
+    rcounts = ADIOI_Malloc(nprocs*sizeof(int));
+    rdispls = ADIOI_Malloc(nprocs*sizeof(int));
+
+    /* If process[i] has any requests in my file domain,
+     *   initialize an ADIOI_Access structure that will describe each request
+     *   from process[i].  The offsets, lengths, and buffer pointers still need
+     *   to be obtained to complete the setting of this structure.
+     */
+    count_others_req_procs = 0;
+    for (i=0; i<nprocs; i++) {
+        if (count_others_req_per_proc[i]) {
+            others_req[i].count = count_others_req_per_proc[i];
+
+            others_req[i].offsets = (ADIO_Offset *)
+                ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(ADIO_Offset));
+            others_req[i].lens = (int *)
+                ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(int));
+            others_req[i].mem_ptrs = (MPI_Aint *)
+                ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(MPI_Aint));
+
+            /* track the lowest receive addresses seen so far */
+            if ( recvBufForOffsets == NULL ||
+                 (MPIR_Upint)others_req[i].offsets < (MPIR_Upint)recvBufForOffsets )
+                recvBufForOffsets = others_req[i].offsets;
+            if ( recvBufForLens == NULL ||
+                 (MPIR_Upint)others_req[i].lens < (MPIR_Upint)recvBufForLens )
+                recvBufForLens = others_req[i].lens;
+
+            count_others_req_procs++;
+        }
+        else {
+            others_req[i].count    = 0;
+            others_req[i].offsets  = NULL;
+            others_req[i].lens     = NULL;
+            others_req[i].mem_ptrs = NULL;
+        }
+    }
+
+    /* Now send the calculated offsets and lengths to respective processes */
+
+    /************************/
+    /* Exchange the offsets */
+    /************************/
+
+    /* Determine the lowest sendBufForOffsets/Lens */
+    for (i=0; i<nprocs; i++) {
+        if ( !my_req[i].count )
+            continue;
+
+        if ( sendBufForOffsets == NULL ||
+             (MPIR_Upint)my_req[i].offsets < (MPIR_Upint)sendBufForOffsets )
+            sendBufForOffsets = my_req[i].offsets;
+
+        if ( sendBufForLens == NULL ||
+             (MPIR_Upint)my_req[i].lens < (MPIR_Upint)sendBufForLens )
+            sendBufForLens = my_req[i].lens;
+    }
+
+    /* Calculate the displacements from the sendBufForOffsets/Lens.
+     * MPI_Alltoallv takes int displacements counted in elements, so every
+     * pointer difference is checked to fit before it is narrowed. */
+    for (i=0; i<nprocs; i++) {
+        /* Send these offsets to process i.*/
+        scounts[i] = count_my_req_per_proc[i];
+        if ( scounts[i] == 0 )
+            sdispls[i] = 0;
+        else {
+            disp = ( (MPIR_Upint)my_req[i].offsets -
+                     (MPIR_Upint)sendBufForOffsets ) /
+                   (MPIR_Upint)sizeof(ADIO_Offset);
+            ADIOI_Assert((int) disp >= 0 && (MPIR_Upint)(int) disp == disp);
+            sdispls[i] = (int) disp;
+        }
+
+        /* Receive these offsets from process i.*/
+        rcounts[i] = count_others_req_per_proc[i];
+        if ( rcounts[i] == 0 )
+            rdispls[i] = 0;
+        else {
+            disp = ( (MPIR_Upint)others_req[i].offsets -
+                     (MPIR_Upint)recvBufForOffsets ) /
+                   (MPIR_Upint)sizeof(ADIO_Offset);
+            ADIOI_Assert((int) disp >= 0 && (MPIR_Upint)(int) disp == disp);
+            rdispls[i] = (int) disp;
+        }
+    }
+
+    /* Exchange the offsets */
+    MPI_Alltoallv(sendBufForOffsets,
+                  scounts, sdispls, ADIO_OFFSET,
+                  recvBufForOffsets,
+                  rcounts, rdispls, ADIO_OFFSET,
+                  fd->comm);
+
+    /************************/
+    /* Exchange the lengths */
+    /************************/
+
+    for (i=0; i<nprocs; i++) {
+        /* Send these lengths to process i.*/
+        scounts[i] = count_my_req_per_proc[i];
+        if ( scounts[i] == 0 )
+            sdispls[i] = 0;
+        else {
+            disp = ( (MPIR_Upint)my_req[i].lens -
+                     (MPIR_Upint)sendBufForLens ) /
+                   (MPIR_Upint)sizeof(int);
+            ADIOI_Assert((int) disp >= 0 && (MPIR_Upint)(int) disp == disp);
+            sdispls[i] = (int) disp;
+        }
+
+        /* Receive these lengths from process i. */
+        rcounts[i] = count_others_req_per_proc[i];
+        if ( rcounts[i] == 0 )
+            rdispls[i] = 0;
+        else {
+            disp = ( (MPIR_Upint)others_req[i].lens -
+                     (MPIR_Upint)recvBufForLens ) /
+                   (MPIR_Upint)sizeof(int);
+            ADIOI_Assert((int) disp >= 0 && (MPIR_Upint)(int) disp == disp);
+            rdispls[i] = (int) disp;
+        }
+    }
+
+    /* Exchange the lengths */
+    MPI_Alltoallv(sendBufForLens,
+                  scounts, sdispls, MPI_INT,
+                  recvBufForLens,
+                  rcounts, rdispls, MPI_INT,
+                  fd->comm);
+
+    /* Clean up */
+    ADIOI_Free(count_others_req_per_proc);
+    ADIOI_Free (scounts);
+    ADIOI_Free (sdispls);
+    ADIOI_Free (rcounts);
+    ADIOI_Free (rdispls);
+
+    *count_others_req_procs_ptr = count_others_req_procs;
+#ifdef AGGREGATION_PROFILE
+    MPE_Log_event (5027, 0, NULL);
+#endif
+    TRACE_ERR("Leaving ADIOI_GPFS_Calc_others_req\n");
+}
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.h b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.h
new file mode 100644
index 0000000..d257cb2
--- /dev/null
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.h
@@ -0,0 +1,89 @@
+/* ---------------------------------------------------------------- */
+/* (C)Copyright IBM Corp.  2007, 2008                               */
+/* ---------------------------------------------------------------- */
+/**
+ * \file ad_gpfs_aggrs.h
+ * \brief Declarations of the GPFS-specific aggregation routines.
+ */
+
+/*
+ * File: ad_gpfs_aggrs.h
+ *
+ * Declares functions specific for GPFS parallel I/O solution. The implemented optimizations are:
+ * 	. Aligned file-domain partitioning, integrated in 7/28/2005
+ *
+ * In addition, following optimizations are planned:
+ * 	. Integrating multiple file-domain partitioning schemes
+ *	  (corresponding to Alok Choudhary's persistent file domain work).
+ */
+
+#ifndef AD_GPFS_AGGRS_H_
+#define AD_GPFS_AGGRS_H_
+
+#include "adio.h"
+#include <sys/stat.h>
+
+#if !defined(GPFS_SUPER_MAGIC)
+  #define GPFS_SUPER_MAGIC (0x47504653)
+#endif
+
+    /* Overrides ADIOI_Calc_file_domains() to apply 'aligned file domain
+     * partitioning'.  The domain of aggregator 0 is reported through
+     * fd_size_ptr and the minimum starting offset through
+     * min_st_offset_ptr. */
+    void ADIOI_GPFS_Calc_file_domains(ADIO_File fd,
+	                                  ADIO_Offset *st_offsets,
+				          ADIO_Offset *end_offsets,
+				          int          nprocs,
+				          int          nprocs_for_coll,
+				          ADIO_Offset *min_st_offset_ptr,
+				          ADIO_Offset **fd_start_ptr,
+				          ADIO_Offset **fd_end_ptr,
+				          ADIO_Offset *fd_size_ptr,
+                  void        *fs_ptr);
+
+    /* Overrides ADIOI_Calc_aggregator(), whose default implementation is
+       specific to static file domain partitioning.  Returns the process
+       holding offset "off"; *len is passed in as the length of the access
+       and is modified to the amount available from that file domain. */
+    int ADIOI_GPFS_Calc_aggregator(ADIO_File fd,
+				  ADIO_Offset off,
+				  ADIO_Offset min_off,
+				  ADIO_Offset *len,
+				  ADIO_Offset fd_size,
+				  ADIO_Offset *fd_start,
+				  ADIO_Offset *fd_end);
+
+    /* Overrides ADIOI_Calc_my_req(), whose default implementation is
+       specific to static file domain partitioning.  Calculates what
+       portions of the access requests of this process are located in the
+       file domains of various processes (including this one). */
+    void ADIOI_GPFS_Calc_my_req ( ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
+				 int contig_access_count, ADIO_Offset
+				 min_st_offset, ADIO_Offset *fd_start,
+				 ADIO_Offset *fd_end, ADIO_Offset fd_size,
+				 int nprocs,
+				 int *count_my_req_procs_ptr,
+				 int **count_my_req_per_proc_ptr,
+				 ADIOI_Access **my_req_ptr,
+				 int **buf_idx_ptr);
+
+    /*
+     * ADIOI_GPFS_Calc_others_req - GPFS version of ADIOI_Calc_others_req
+     *
+     * param[in]  fd                         File handle
+     * param[in]  count_my_req_procs         Number of processes whose file domain my
+     *                                         request touches.
+     * param[in]  count_my_req_per_proc      count_my_req_per_proc[i] gives the no. of
+     *                                         contig. requests of this process in
+     *                                         process i's file domain.
+     * param[in]  my_req                     A structure defining my request
+     * param[in]  nprocs                     Number of nodes in the block
+     * param[in]  myrank                     Rank of this node
+     * param[out] count_others_req_procs_ptr Number of processes whose requests lie in
+     *                                         my process's file domain (including my
+     *                                         process itself)
+     * param[out] others_req_ptr             Array of other process' requests that lie
+     *                                         in my process's file domain
+     */
+     void ADIOI_GPFS_Calc_others_req(ADIO_File fd, int count_my_req_procs,
+				    int *count_my_req_per_proc,
+				    ADIOI_Access *my_req,
+				    int nprocs, int myrank,
+				    int *count_others_req_procs_ptr,
+				    ADIOI_Access **others_req_ptr);
+
+
+#endif  /* AD_GPFS_AGGRS_H_ */
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_getsh.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_getsh.c
new file mode 100644
index 0000000..f68771d
--- /dev/null
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_getsh.c
@@ -0,0 +1,84 @@
+/* ---------------------------------------------------------------- */
+/* (C)Copyright IBM Corp.  2007, 2008                               */
+/* ---------------------------------------------------------------- */
+/**
+ * \file ad_gpfs_getsh.c
+ * \brief Reads and advances the shared file pointer for GPFS.
+ */
+
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *   Copyright (C) 1997 University of Chicago.
+ *   See COPYRIGHT notice in top-level directory.
+ */
+
+#include "ad_gpfs.h"
+
+/* Hands back, through shared_fp, the current location of the shared_fp in
+   terms of the no. of etypes relative to the current view, and advances the
+   stored shared_fp by incr, the number of etypes to be accessed in the read
+   or write following this function.
+
+   The value lives in the first sizeof(ADIO_Offset) bytes of the file behind
+   fd->shared_fp_fd, which is opened on first use.  The read and the
+   write-back happen under one write lock on those bytes.  *error_code is
+   MPI_SUCCESS on success and an MPI_ERR_IO-class code when a seek, read or
+   write reports -1. */
+
+void ADIOI_GPFS_Get_shared_fp(ADIO_File fd, ADIO_Offset incr, ADIO_Offset *shared_fp,
+			 int *error_code)
+{
+    static char myname[] = "ADIOI_BG_GET_SHARED_FP";
+    MPI_Comm self_dup;
+    ADIO_Offset advanced_fp;
+    int rc;
+
+    if (fd->shared_fp_fd != ADIO_FILE_NULL) {
+        /* file already open: a failing seek or read is a real error */
+        ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
+
+        rc = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
+        if (rc == 0)
+            rc = read(fd->shared_fp_fd->fd_sys, shared_fp,
+                      sizeof(ADIO_Offset));
+
+        if (rc == -1) {
+            ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
+            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+                                               MPIR_ERR_RECOVERABLE, myname,
+                                               __LINE__, MPI_ERR_IO, "**io",
+                                               "**io %s", strerror(errno));
+            return;
+        }
+    }
+    else {
+        /* first use: open the shared file pointer file on a private dup of
+           MPI_COMM_SELF */
+        MPI_Comm_dup(MPI_COMM_SELF, &self_dup);
+        fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF,
+                                     self_dup,
+                                     fd->shared_fp_fname,
+                                     fd->file_system,
+                                     fd->fns,
+                                     ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE,
+                                     0,
+                                     MPI_BYTE,
+                                     MPI_BYTE,
+                                     MPI_INFO_NULL,
+                                     ADIO_PERM_NULL,
+                                     error_code);
+        if (*error_code != MPI_SUCCESS)
+            return;
+
+        /* if the file is empty, the read below may return error
+           (reading beyond end of file). In that case, shared_fp = 0,
+           set here, is the correct value, so the result is not checked. */
+        *shared_fp = 0;
+        ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
+        rc = read(fd->shared_fp_fd->fd_sys, shared_fp, sizeof(ADIO_Offset));
+    }
+
+    /* write the advanced pointer back while the lock is still held */
+    advanced_fp = *shared_fp + incr;
+
+    rc = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
+    if (rc == 0)
+        rc = write(fd->shared_fp_fd->fd_sys, &advanced_fp, sizeof(ADIO_Offset));
+
+    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
+
+    if (rc != -1) {
+        *error_code = MPI_SUCCESS;
+        return;
+    }
+    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+                                       myname, __LINE__, MPI_ERR_IO,
+                                       "**io",
+                                       "**io %s", strerror(errno));
+}
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_setsh.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_setsh.c
new file mode 100644
index 0000000..f169776
--- /dev/null
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_setsh.c
@@ -0,0 +1,68 @@
+/* ---------------------------------------------------------------- */
+/* (C)Copyright IBM Corp.  2007, 2008                               */
+/* ---------------------------------------------------------------- */
+/**
+ * \file ad_gpfs_setsh.c
+ * \brief Sets the shared file pointer for GPFS.
+ */
+
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *   Copyright (C) 1997 University of Chicago.
+ *   See COPYRIGHT notice in top-level directory.
+ */
+
+#include "ad_gpfs.h"
+
+/* set the shared file pointer to "offset" etypes relative to the current
+   view */
+
+/*
+This looks very similar to ADIOI_GEN_Set_shared_fp, except this
+function avoids locking the file twice.  The generic version does
+
+Write lock
+ADIO_WriteContig
+Unlock
+
+For BG, ADIOI_BG_WriteContig does a lock before writing to disable
+caching. To avoid the lock being called twice, this version for BG does
+
+Write lock
+Lseek
+Write
+Unlock
+
+The file behind fd->shared_fp_fd is opened on first use.  *error_code is
+always written before returning: MPI_SUCCESS on success, the code from
+ADIO_Open if opening failed, or an MPI_ERR_IO-class code if the seek or the
+write reports -1.
+*/
+
+void ADIOI_GPFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code)
+{
+    int err;
+    MPI_Comm dupcommself;
+    static char myname[] = "ADIOI_BG_SET_SHARED_FP";
+
+    if (fd->shared_fp_fd == ADIO_FILE_NULL) {
+        MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
+        fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF, dupcommself,
+                                     fd->shared_fp_fname,
+                                     fd->file_system, fd->fns,
+                                     ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE,
+                                     0, MPI_BYTE, MPI_BYTE, MPI_INFO_NULL,
+                                     ADIO_PERM_NULL, error_code);
+
+        /* *error_code is only meaningful here, right after ADIO_Open wrote
+           it; when the file was already open the caller's value must not
+           be read, it may never have been initialized */
+        if (*error_code != MPI_SUCCESS) return;
+    }
+
+    ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
+    /* only write if the seek to the start succeeded, so the offset can
+       never be stored at the wrong position */
+    err = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
+    if (err == 0) {
+        err = write(fd->shared_fp_fd->fd_sys, &offset, sizeof(ADIO_Offset));
+    }
+    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
+
+    if (err == -1) {
+        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+                                           myname, __LINE__, MPI_ERR_IO,
+                                           "**io",
+                                           "**io %s", strerror(errno));
+    }
+    else *error_code = MPI_SUCCESS;
+}

http://git.mpich.org/mpich.git/commitdiff/d4b3106d7ba6372fde66dc7d3476064edb9f803b

commit d4b3106d7ba6372fde66dc7d3476064edb9f803b
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Fri Mar 21 11:24:35 2014 -0500

    ad_bg to ad_gpfs major reorganization
    
    reconfiguration changes from bg to gpfs with platformspec; removal of
    lockless

diff --git a/src/mpi/romio/adio/ad_gpfs/Makefile.mk b/src/mpi/romio/adio/ad_gpfs/Makefile.mk
index 0ab6f43..d04a8b7 100644
--- a/src/mpi/romio/adio/ad_gpfs/Makefile.mk
+++ b/src/mpi/romio/adio/ad_gpfs/Makefile.mk
@@ -5,29 +5,30 @@
 ##     See COPYRIGHT in top-level directory.
 ##
 
-if BUILD_AD_BG
+if BUILD_AD_GPFS
 
 AM_CPPFLAGS += -DBGL_OPTIM_STEP1_2=1 -DBGL_OPTIM_STEP1_1=1
 
 noinst_HEADERS +=                                                    \
-    adio/ad_bg/ad_bg_aggrs.h                                         \
-    adio/ad_bg/ad_gpfs.h                                               \
-    adio/ad_bg/ad_bg_pset.h                                          \
-    adio/ad_bg/ad_bg_tuning.h
+    adio/ad_gpfs/ad_bg_aggrs.h                                         \
+    adio/ad_gpfs/ad_gpfs_aggrs.h                                         \
+    adio/ad_gpfs/ad_gpfs.h                                               \
+    adio/ad_gpfs/ad_bg_pset.h                                          \
+    adio/ad_gpfs/ad_gpfs_tuning.h
 
 romio_other_sources +=                                               \
-    adio/ad_bg/ad_bg_aggrs.c                                         \
-    adio/ad_bg/ad_gpfs_close.c                                         \
-    adio/ad_bg/ad_gpfs_flush.c                                         \
-    adio/ad_bg/ad_bg_hints.c                                         \
-    adio/ad_bg/ad_bg_pset.c                                          \
-    adio/ad_bg/ad_bg_tuning.c                                        \
-    adio/ad_bg/ad_gpfs.c                                               \
-    adio/ad_bg/ad_gpfs_fcntl.c                                         \
-    adio/ad_bg/ad_gpfs_getsh.c                                         \
-    adio/ad_bg/ad_gpfs_open.c                                          \
-    adio/ad_bg/ad_gpfs_rdcoll.c                                        \
-    adio/ad_bg/ad_gpfs_setsh.c                                         \
-    adio/ad_bg/ad_gpfs_wrcoll.c
+    adio/ad_gpfs/ad_bg_aggrs.c                                         \
+    adio/ad_gpfs/ad_gpfs_aggrs.c                                         \
+    adio/ad_gpfs/ad_gpfs_close.c                                         \
+    adio/ad_gpfs/ad_gpfs_flush.c                                         \
+    adio/ad_gpfs/ad_bg_hints.c                                         \
+    adio/ad_gpfs/ad_bg_pset.c                                          \
+    adio/ad_gpfs/ad_gpfs_tuning.c                                        \
+    adio/ad_gpfs/ad_gpfs.c                                               \
+    adio/ad_gpfs/ad_gpfs_getsh.c                                         \
+    adio/ad_gpfs/ad_gpfs_open.c                                          \
+    adio/ad_gpfs/ad_gpfs_rdcoll.c                                        \
+    adio/ad_gpfs/ad_gpfs_setsh.c                                         \
+    adio/ad_gpfs/ad_gpfs_wrcoll.c
 
-endif BUILD_AD_BG
+endif BUILD_AD_GPFS
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.c b/src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.c
index 76c4af6..58f005c 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.c
@@ -17,7 +17,7 @@
 
 #include "adio.h"
 #include "adio_cb_config_list.h"
-#include "ad_bg.h"
+#include "ad_gpfs.h"
 #include "ad_bg_pset.h"
 #include "ad_bg_aggrs.h"
 #ifdef AGGREGATION_PROFILE
@@ -112,8 +112,8 @@ ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset)
 
   /* Send the info of IO proxy CN to all processes and keep the info in fd->hints struct.
      Declared in adio_cb_config_list.h */
-    ADIOI_cb_bcast_rank_map(fd);		
-    if (bgmpio_balancecontig == 1) { /* additionally need to send bridgelist,
+    ADIOI_cb_bcast_rank_map(fd);
+    if (gpfsmpio_balancecontig == 1) { /* additionally need to send bridgelist,
 					bridgelistnum and numbridges to all
 					ranks */
 	if (r != 0) {
@@ -253,7 +253,7 @@ ADIOI_BG_compute_agg_ranklist_serial_do (const ADIOI_BG_ConfInfo_t *confInfo,
        {
          for(j = 0; j < numAggs; j++)
          {
-           ADIOI_BG_assert(nextAggr<aggTotal);
+           ADIOI_GPFS_assert(nextAggr<aggTotal);
            aggList[nextAggr] = bridgelist[procIndex+j*distance+1].rank;
            TRACE_ERR("agglist[%d] -> bridgelist[%d] = %d\n", nextAggr, procIndex+j*distance+1,aggList[nextAggr]);
            if(aggList[nextAggr]==lastBridge) /* can't have bridge in the list twice */
@@ -346,7 +346,7 @@ ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
       DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
     }
 #   endif
-    if (bgmpio_balancecontig == 1) {
+    if (gpfsmpio_balancecontig == 1) {
 	/* what comes out of this code block is the agg ranklist sorted by
 	 * bridge set and ion id with associated bridge info stored in the
 	 * hints structure for later access during file domain assignment */
@@ -527,777 +527,3 @@ ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
     TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial\n");
     return;
 }
-
-/* Description from common/ad_aggregate.c.  (Does it completely apply to bg?)
- * ADIOI_Calc_aggregator()
- *
- * The intention here is to implement a function which provides basically 
- * the same functionality as in Rajeev's original version of 
- * ADIOI_Calc_my_req().  He used a ceiling division approach to assign the 
- * file domains, and we use the same approach here when calculating the
- * location of an offset/len in a specific file domain.  Further we assume
- * this same distribution when calculating the rank_index, which is later
- *  used to map to a specific process rank in charge of the file domain.
- *
- * A better (i.e. more general) approach would be to use the list of file
- * domains only.  This would be slower in the case where the
- * original ceiling division was used, but it would allow for arbitrary
- * distributions of regions to aggregators.  We'd need to know the 
- * nprocs_for_coll in that case though, which we don't have now.
- *
- * Note a significant difference between this function and Rajeev's old code:
- * this code doesn't necessarily return a rank in the range
- * 0..nprocs_for_coll; instead you get something in 0..nprocs.  This is a
- * result of the rank mapping; any set of ranks in the communicator could be
- * used now.
- *
- * Returns an integer representing a rank in the collective I/O communicator.
- *
- * The "len" parameter is also modified to indicate the amount of data
- * actually available in this file domain.
- */
-/* 
- * This is more general aggregator search function which does not base on the assumption
- * that each aggregator hosts the file domain with the same size 
- */
-int ADIOI_BG_Calc_aggregator(ADIO_File fd,
-			      ADIO_Offset off,
-			      ADIO_Offset min_off,
-			      ADIO_Offset *len,
-			      ADIO_Offset fd_size,
-			      ADIO_Offset *fd_start,
-			      ADIO_Offset *fd_end)
-{
-    int rank_index, rank;
-    ADIO_Offset avail_bytes;
-    TRACE_ERR("Entering ADIOI_BG_Calc_aggregator\n");
-
-    ADIOI_BG_assert ( (off <= fd_end[fd->hints->cb_nodes-1] && off >= min_off && fd_start[0] >= min_off ) );
-
-    /* binary search --> rank_index is returned */
-    int ub = fd->hints->cb_nodes;
-    int lb = 0;
-    /* get an index into our array of aggregators */
-    /* Common code for striping - bg doesn't use it but it's
-       here to make diff'ing easier.
-    rank_index = (int) ((off - min_off + fd_size)/ fd_size - 1);
-
-    if (fd->hints->striping_unit > 0) {
-        * wkliao: implementation for file domain alignment
-           fd_start[] and fd_end[] have been aligned with file lock
-	   boundaries when returned from ADIOI_Calc_file_domains() so cannot
-	   just use simple arithmatic as above *
-        rank_index = 0;
-        while (off > fd_end[rank_index]) rank_index++;
-    } 
-    bg does it's own striping below 
-    */
-    rank_index = fd->hints->cb_nodes / 2;
-    while ( off < fd_start[rank_index] || off > fd_end[rank_index] ) {
-	if ( off > fd_end  [rank_index] ) {
-	    lb = rank_index;
-	    rank_index = (rank_index + ub) / 2;
-	}
-	else 
-	if ( off < fd_start[rank_index] ) {
-	    ub = rank_index;
-	    rank_index = (rank_index + lb) / 2;
-	}
-    }
-    /* we index into fd_end with rank_index, and fd_end was allocated to be no
-     * bigger than fd->hins->cb_nodes.   If we ever violate that, we're
-     * overrunning arrays.  Obviously, we should never ever hit this abort */
-    if (rank_index >= fd->hints->cb_nodes || rank_index < 0) {
-        FPRINTF(stderr, "Error in ADIOI_Calc_aggregator(): rank_index(%d) >= fd->hints->cb_nodes (%d) fd_size=%lld off=%lld\n",
-			rank_index,fd->hints->cb_nodes,fd_size,off);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    /* DBG_FPRINTF ("ADIOI_BG_Calc_aggregator: rank_index = %d\n",
-       rank_index ); */
-
-    /* 
-     * remember here that even in Rajeev's original code it was the case that
-     * different aggregators could end up with different amounts of data to
-     * aggregate.  here we use fd_end[] to make sure that we know how much
-     * data this aggregator is working with.  
-     *
-     * the +1 is to take into account the end vs. length issue.
-     */
-    avail_bytes = fd_end[rank_index] + 1 - off;
-    if (avail_bytes < *len && avail_bytes > 0) {
-        /* this file domain only has part of the requested contig. region */
-
-        *len = avail_bytes;
-    }
-
-    /* map our index to a rank */
-    /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
-    rank = fd->hints->ranklist[rank_index];
-    TRACE_ERR("Leaving ADIOI_BG_Calc_aggregator\n");
-
-    return rank;
-}
-
-/* 
- * Compute a dynamic access range based file domain partition among I/O aggregators,
- * which align to the GPFS block size
- * Divide the I/O workload among "nprocs_for_coll" processes. This is
- * done by (logically) dividing the file into file domains (FDs); each
- * process may directly access only its own file domain. 
- * Additional effort is to make sure that each I/O aggregator get
- * a file domain that aligns to the GPFS block size.  So, there will 
- * not be any false sharing of GPFS file blocks among multiple I/O nodes. 
- *  
- * The common version of this now accepts a min_fd_size and striping_unit. 
- * It doesn't seem necessary here (using GPFS block sizes) but keep it in mind
- * (e.g. we could pass striping unit instead of using fs_ptr->blksize). 
- */
-void ADIOI_BG_GPFS_Calc_file_domains(ADIO_File fd,
-	                              ADIO_Offset *st_offsets,
-                                      ADIO_Offset *end_offsets,
-                                      int          nprocs,
-                                      int          nprocs_for_coll,
-                                      ADIO_Offset *min_st_offset_ptr,
-                                      ADIO_Offset **fd_start_ptr,
-                                      ADIO_Offset **fd_end_ptr,
-                                      ADIO_Offset *fd_size_ptr,
-                                      void        *fs_ptr)
-{
-    ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
-    int i, aggr;
-    TRACE_ERR("Entering ADIOI_BG_GPFS_Calc_file_domains\n");
-    blksize_t blksize;
-
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5004, 0, NULL);
-#endif
-
-#   if AGG_DEBUG
-    static char myname[] = "ADIOI_BG_GPFS_Calc_file_domains";
-    DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n", 
-	    myname,__LINE__,nprocs_for_coll);
-#   endif
-    if (fd->blksize <= 0)
-	/* default to 1M if blksize unset */
-	fd->blksize = 1048576;
-    blksize = fd->blksize;
-
-#   if AGG_DEBUG
-    DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
-#   endif
-/* find min of start offsets and max of end offsets of all processes */
-    min_st_offset  = st_offsets [0];
-    max_end_offset = end_offsets[0];
-    for (i=1; i<nprocs; i++) {
-        min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
-        max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
-    }
-
-    /* DBG_FPRINTF(stderr, "_calc_file_domains, min_st_offset, max_
-       = %qd, %qd\n", min_st_offset, max_end_offset );*/
-
-    /* determine the "file domain (FD)" of each process, i.e., the portion of
-       the file that will be "owned" by each process */
-
-    ADIO_Offset gpfs_ub       = (max_end_offset +blksize-1) / blksize * blksize - 1;
-    ADIO_Offset gpfs_lb       = min_st_offset / blksize * blksize;
-    ADIO_Offset gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset;
-    ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize;
-    ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;
-
-    int         naggs    = nprocs_for_coll;
-
-    /* Tweak the file domains so that no fd is smaller than a threshold.  We
-     * have to strike a balance between efficency and parallelism: somewhere
-     * between 10k processes sending 32-byte requests and one process sending a
-     * 320k request is a (system-dependent) sweet spot 
-     
-    This is from the common code - the new min_fd_size parm that we didn't implement. 
-    (And common code uses a different declaration of fd_size so beware) 
-     
-    if (fd_size < min_fd_size)
-        fd_size = min_fd_size;
-    */
-    fd_size              = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
-    *fd_start_ptr        = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
-    *fd_end_ptr          = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
-    fd_start             = *fd_start_ptr;
-    fd_end               = *fd_end_ptr;
-
-    /* each process will have a file domain of some number of gpfs blocks, but
-     * the division of blocks is not likely to be even.  Some file domains will
-     * be "large" and others "small"
-     *
-     * Example: consider  17 blocks distributed over 3 aggregators.
-     * nb_cn_small = 17/3 = 5
-     * naggs_large = 17 - 3*(17/3) = 17 - 15  = 2
-     * naggs_small = 3 - 2 = 1
-     *
-     * and you end up with file domains of {5-blocks, 6-blocks, 6-blocks}
-     *
-     * what about (relatively) small files?  say, a file of 1000 blocks
-     * distributed over 2064 aggregators:
-     * nb_cn_small = 1000/2064 = 0
-     * naggs_large = 1000 - 2064*(1000/2064) = 1000
-     * naggs_small = 2064 - 1000 = 1064
-     * and you end up with domains of {0, 0, 0, ... 1, 1, 1 ...}
-     *
-     * it might be a good idea instead of having all the zeros up front, to
-     * "mix" those zeros into the fd_size array.  that way, no pset/bridge-set
-     * is left with zero work.  In fact, even if the small file domains aren't
-     * zero, it's probably still a good idea to mix the "small" file domains
-     * across the fd_size array to keep the io nodes in balance */
-
-
-    ADIO_Offset n_gpfs_blk    = fd_gpfs_range / blksize;
-    ADIO_Offset nb_cn_small   = n_gpfs_blk/naggs;
-    ADIO_Offset naggs_large   = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
-    ADIO_Offset naggs_small   = naggs - naggs_large;
-
-    if (bgmpio_balancecontig == 1) {
-	/* File domains blocks are assigned to aggregators in a breadth-first
-	 * fashion relative to the ions - additionally, file domains on the
-	 * aggregators sharing the same bridgeset and ion have contiguous
-	 * offsets. */
-
-	// initialize everything to small
-	for (i=0; i<naggs; i++)
-	    fd_size[i] = nb_cn_small     * blksize;
-
-	// go thru and distribute the large across the bridges
-
-	/* bridelistoffset: agg rank list offsets using the bridgelist - each
-	 * entry is created by adding up the indexes for the aggs from all
-	 * previous bridges */
-	int *bridgelistoffset =
-	    (int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
-	/* tmpbridgelistnum: copy of the bridgelistnum whose entries can be
-	 * decremented to keep track of bridge assignments during the actual
-	 * large block assignments to the agg rank list*/
-	int *tmpbridgelistnum =
-	    (int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
-
-	int j;
-	for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++) {
-	    int k, bridgerankoffset = 0;
-	    for (k=0;k<j;k++) {
-		bridgerankoffset += fd->hints->fs_hints.bg.bridgelistnum[k];
-	    }
-	    bridgelistoffset[j] = bridgerankoffset;
-	}
-
-	for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++)
-	    tmpbridgelistnum[j] = fd->hints->fs_hints.bg.bridgelistnum[j];
-	int bridgeiter = 0;
-
-	/* distribute the large blocks across the aggs going breadth-first
-	 * across the bridgelist - this distributes the fd sizes across the
-	 * ions, so later in the file domain assignment when it iterates thru
-	 * the ranklist the offsets will be contiguous within the bridge and
-	 * ion as well */
-	for (j=0;j<naggs_large;j++) {
-	    int foundbridge = 0;
-	    while (!foundbridge) {
-		if (tmpbridgelistnum[bridgeiter] > 0) {
-		    foundbridge = 1;
-		    /*
-		       printf("bridgeiter is %d tmpbridgelistnum[bridgeiter] is %d bridgelistoffset[bridgeiter] is %d\n",bridgeiter,tmpbridgelistnum[bridgeiter],bridgelistoffset[bridgeiter]);
-		       printf("naggs is %d bridgeiter is %d bridgelistoffset[bridgeiter] is %d tmpbridgelistnum[bridgeiter] is %d\n",naggs, bridgeiter,bridgelistoffset[bridgeiter],tmpbridgelistnum[bridgeiter]);
-		       printf("naggs is %d bridgeiter is %d setting fd_size[%d]\n",naggs, bridgeiter,bridgelistoffset[bridgeiter]+(fd->hints->bridgelistnum[bridgeiter]-tmpbridgelistnum[bridgeiter]));
-		     */
-		    fd_size[bridgelistoffset[bridgeiter]+(fd->hints->fs_hints.bg.bridgelistnum[bridgeiter]-tmpbridgelistnum[bridgeiter])] =
-			(nb_cn_small+1) * blksize;
-		    tmpbridgelistnum[bridgeiter]--;
-		}
-		if (bridgeiter == (fd->hints->fs_hints.bg.numbridges-1))
-		    bridgeiter = 0;
-		else
-		    bridgeiter++;
-	    }
-	}
-	ADIOI_Free(tmpbridgelistnum);
-	ADIOI_Free(bridgelistoffset);
-
-    } else {
-	/* BG/L- and BG/P-style distribution of file domains: simple allocation of
-	 * file domins to each aggregator */
-	for (i=0; i<naggs; i++) {
-	    if (i < naggs_small) {
-		fd_size[i] = nb_cn_small     * blksize;
-	    } else {
-		fd_size[i] = (nb_cn_small+1) * blksize;
-	    }
-	}
-    }
-#ifdef balancecontigtrace
-    int myrank;
-    MPI_Comm_rank(fd->comm,&myrank);
-    if (myrank == 0) {
-      fprintf(stderr,"naggs_small is %d nb_cn_small is %d\n",naggs_small,nb_cn_small);
-	for (i=0; i<naggs; i++) {
-	    fprintf(stderr,"fd_size[%d] set to %d agg rank is %d\n",i,fd_size[i],fd->hints->ranklist[i]);
-	}
-    }
-#endif
-
-#   if AGG_DEBUG
-     DBG_FPRINTF(stderr,"%s(%d): "
-                   "gpfs_ub       %llu, "
-                   "gpfs_lb       %llu, "
-                   "gpfs_ub_rdoff %llu, "
-                   "gpfs_lb_rdoff %llu, "
-                   "fd_gpfs_range %llu, "
-                   "n_gpfs_blk    %llu, "
-                   "nb_cn_small   %llu, "
-                   "naggs_large   %llu, "
-                   "naggs_small   %llu, "
-                   "\n",
-                   myname,__LINE__,
-                   gpfs_ub      ,
-                   gpfs_lb      ,
-                   gpfs_ub_rdoff,
-                   gpfs_lb_rdoff,
-                   fd_gpfs_range,
-                   n_gpfs_blk   ,
-                   nb_cn_small  ,
-                   naggs_large  ,
-                   naggs_small
-                   );
-#   endif
-
-    fd_size[0]       -= gpfs_lb_rdoff;
-    fd_size[naggs-1] -= gpfs_ub_rdoff;
-
-    /* compute the file domain for each aggr */
-    ADIO_Offset offset = min_st_offset;
-    for (aggr=0; aggr<naggs; aggr++) {
-        fd_start[aggr] = offset;
-        fd_end  [aggr] = offset + fd_size[aggr] - 1;
-        offset += fd_size[aggr];
-    }
-
-    *fd_size_ptr = fd_size[0];
-    *min_st_offset_ptr = min_st_offset;
-
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5005, 0, NULL);
-#endif
-    ADIOI_Free (fd_size);
-    TRACE_ERR("Leaving ADIOI_BG_GPFS_Calc_file_domains\n");
-}
-
-/* 
- * When a process is an IO aggregator, this will return its index in the aggrs list.
- * Otherwise, this will return -1 
- */
-int ADIOI_BG_Aggrs_index( ADIO_File fd, int myrank )
-{
-    int i;
-    for (i=0; i<fd->hints->cb_nodes; i++) 
-	if (fd->hints->ranklist[i] == myrank) return i;
-    return -1;
-}
-
-/* 
- * ADIOI_BG_Calc_my_req() overrides ADIOI_Calc_my_req for the default implementation 
- * is specific for static file domain partitioning.
- *
- * ADIOI_Calc_my_req() - calculate what portions of the access requests
- * of this process are located in the file domains of various processes
- * (including this one)
- */
-void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list, 
-			   int contig_access_count, ADIO_Offset 
-			   min_st_offset, ADIO_Offset *fd_start,
-			   ADIO_Offset *fd_end, ADIO_Offset fd_size,
-			   int nprocs,
-			   int *count_my_req_procs_ptr,
-			   int **count_my_req_per_proc_ptr,
-			   ADIOI_Access **my_req_ptr,
-			   int **buf_idx_ptr)
-/* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets? 
-   They are used as memory buffer indices so it seems like the 2G limit is in effect */
-{
-    int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
-    int i, l, proc;
-    ADIO_Offset fd_len, rem_len, curr_idx, off;
-    ADIOI_Access *my_req;
-    TRACE_ERR("Entering ADIOI_BG_Calc_my_req\n");
-
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5024, 0, NULL);
-#endif
-    *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int)); 
-    count_my_req_per_proc = *count_my_req_per_proc_ptr;
-/* count_my_req_per_proc[i] gives the no. of contig. requests of this
-   process in process i's file domain. calloc initializes to zero.
-   I'm allocating memory of size nprocs, so that I can do an 
-   MPI_Alltoall later on.*/
-
-    buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int));
-/* buf_idx is relevant only if buftype_is_contig.
-   buf_idx[i] gives the index into user_buf where data received
-   from proc. i should be placed. This allows receives to be done
-   without extra buffer. This can't be done if buftype is not contig. */
-   
-    /* initialize buf_idx to -1 */
-    for (i=0; i < nprocs; i++) buf_idx[i] = -1;
-
-    /* one pass just to calculate how much space to allocate for my_req;
-     * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
-     */
-    for (i=0; i < contig_access_count; i++) {
-	/* short circuit offset/len processing if len == 0 
-	 * 	(zero-byte  read/write */
-	if (len_list[i] == 0) 
-		continue;
-	off = offset_list[i];
-	fd_len = len_list[i];
-	/* note: we set fd_len to be the total size of the access.  then
-	 * ADIOI_Calc_aggregator() will modify the value to return the 
-	 * amount that was available from the file domain that holds the
-	 * first part of the access.
-	 */
-  /* BES */
-	proc = ADIOI_BG_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, 
-				     fd_start, fd_end);
-	count_my_req_per_proc[proc]++;
-
-	/* figure out how much data is remaining in the access (i.e. wasn't 
-	 * part of the file domain that had the starting byte); we'll take 
-	 * care of this data (if there is any) in the while loop below.
-	 */
-	rem_len = len_list[i] - fd_len;
-
-	while (rem_len > 0) {
-	    off += fd_len; /* point to first remaining byte */
-	    fd_len = rem_len; /* save remaining size, pass to calc */
-	    proc = ADIOI_BG_Calc_aggregator(fd, off, min_st_offset, &fd_len, 
-					 fd_size, fd_start, fd_end);
-
-	    count_my_req_per_proc[proc]++;
-	    rem_len -= fd_len; /* reduce remaining length by amount from fd */
-	}
-    }
-
-/* now allocate space for my_req, offset, and len */
-
-    *my_req_ptr = (ADIOI_Access *)
-	ADIOI_Malloc(nprocs*sizeof(ADIOI_Access)); 
-    my_req = *my_req_ptr;
-
-    count_my_req_procs = 0;
-    for (i=0; i < nprocs; i++) {
-	if (count_my_req_per_proc[i]) {
-	    my_req[i].offsets = (ADIO_Offset *)
-		ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
-	    my_req[i].lens = (int *)
-		ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(int));
-	    count_my_req_procs++;
-	}	    
-	my_req[i].count = 0;  /* will be incremented where needed
-				      later */
-    }
-
-/* now fill in my_req */
-    curr_idx = 0;
-    for (i=0; i<contig_access_count; i++) { 
-	/* short circuit offset/len processing if len == 0 
-	 * 	(zero-byte  read/write */
-	if (len_list[i] == 0)
-		continue;
-	off = offset_list[i];
-	fd_len = len_list[i];
-	proc = ADIOI_BG_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, 
-				     fd_start, fd_end);
-
-	/* for each separate contiguous access from this process */
-	if (buf_idx[proc] == -1)
-  {
-    ADIOI_Assert(curr_idx == (int) curr_idx);
-    buf_idx[proc] = (int) curr_idx;
-  }
-
-	l = my_req[proc].count;
-	curr_idx += fd_len;
-
-	rem_len = len_list[i] - fd_len;
-
-	/* store the proc, offset, and len information in an array
-         * of structures, my_req. Each structure contains the 
-         * offsets and lengths located in that process's FD, 
-	 * and the associated count. 
-	 */
-	my_req[proc].offsets[l] = off;
-  ADIOI_Assert(fd_len == (int) fd_len);
-	my_req[proc].lens[l] = (int) fd_len;
-	my_req[proc].count++;
-
-	while (rem_len > 0) {
-	    off += fd_len;
-	    fd_len = rem_len;
-	    proc = ADIOI_BG_Calc_aggregator(fd, off, min_st_offset, &fd_len, 
-					 fd_size, fd_start, fd_end);
-
-	    if (buf_idx[proc] == -1) 
-      {
-        ADIOI_Assert(curr_idx == (int) curr_idx);
-        buf_idx[proc] = (int) curr_idx;
-      }
-
-	    l = my_req[proc].count;
-	    curr_idx += fd_len;
-	    rem_len -= fd_len;
-
-	    my_req[proc].offsets[l] = off;
-      ADIOI_Assert(fd_len == (int) fd_len);
-	    my_req[proc].lens[l] = (int) fd_len;
-	    my_req[proc].count++;
-	}
-    }
-
-
-
-#ifdef AGG_DEBUG
-    for (i=0; i<nprocs; i++) {
-	if (count_my_req_per_proc[i] > 0) {
-	    DBG_FPRINTF(stderr, "data needed from %d (count = %d):\n", i, 
-		    my_req[i].count);
-	    for (l=0; l < my_req[i].count; l++) {
-		DBG_FPRINTF(stderr, "   off[%d] = %lld, len[%d] = %d\n", l,
-			my_req[i].offsets[l], l, my_req[i].lens[l]);
-	    }
-	}
-	DBG_FPRINTF(stderr, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
-    }
-#endif
-
-    *count_my_req_procs_ptr = count_my_req_procs;
-    *buf_idx_ptr = buf_idx;
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5025, 0, NULL);
-#endif
-    TRACE_ERR("Leaving ADIOI_BG_Calc_my_req\n");
-}
-
-/*
- * ADIOI_Calc_others_req (copied to bg and switched to all to all for performance)
- *
- * param[in]  count_my_req_procs        Number of processes whose file domain my
- *                                        request touches.
- * param[in]  count_my_req_per_proc     count_my_req_per_proc[i] gives the no. of 
- *                                        contig. requests of this process in 
- *                                        process i's file domain.
- * param[in]  my_req                    A structure defining my request
- * param[in]  nprocs                    Number of nodes in the block
- * param[in]  myrank                    Rank of this node
- * param[out] count_others_req_proc_ptr Number of processes whose requests lie in
- *                                        my process's file domain (including my 
- *                                        process itself)
- * param[out] others_req_ptr            Array of other process' requests that lie
- *                                        in my process's file domain
- */
-void ADIOI_BG_Calc_others_req(ADIO_File fd, int count_my_req_procs, 
-				int *count_my_req_per_proc,
-				ADIOI_Access *my_req, 
-				int nprocs, int myrank,
-				int *count_others_req_procs_ptr,
-				ADIOI_Access **others_req_ptr)  
-{
-    TRACE_ERR("Entering ADIOI_BG_Calc_others_req\n");
-/* determine what requests of other processes lie in this process's
-   file domain */
-
-/* count_others_req_procs = number of processes whose requests lie in
-   this process's file domain (including this process itself) 
-   count_others_req_per_proc[i] indicates how many separate contiguous
-   requests of proc. i lie in this process's file domain. */
-
-    int *count_others_req_per_proc, count_others_req_procs;
-    int i;
-    ADIOI_Access *others_req;
-    
-    /* Parameters for MPI_Alltoallv */
-    int *scounts, *sdispls, *rcounts, *rdispls;
-
-    /* Parameters for MPI_Alltoallv.  These are the buffers, which
-     * are later computed to be the lowest address of all buffers
-     * to be sent/received for offsets and lengths.  Initialize to
-     * the highest possible address which is the current minimum.
-     */
-    void *sendBufForOffsets=(void*)0xFFFFFFFFFFFFFFFF, 
-	 *sendBufForLens   =(void*)0xFFFFFFFFFFFFFFFF, 
-	 *recvBufForOffsets=(void*)0xFFFFFFFFFFFFFFFF, 
-	 *recvBufForLens   =(void*)0xFFFFFFFFFFFFFFFF; 
-
-/* first find out how much to send/recv and from/to whom */
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5026, 0, NULL);
-#endif
-    /* Send 1 int to each process.  count_my_req_per_proc[i] is the number of 
-     * requests that my process will do to the file domain owned by process[i].
-     * Receive 1 int from each process.  count_others_req_per_proc[i] is the number of
-     * requests that process[i] will do to the file domain owned by my process.
-     */
-    count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
-/*     cora2a1=timebase(); */
-/*for(i=0;i<nprocs;i++) ?*/
-    MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
-		 count_others_req_per_proc, 1, MPI_INT, fd->comm);
-
-/*     total_cora2a+=timebase()-cora2a1; */
-
-    /* Allocate storage for an array of other nodes' accesses of our
-     * node's file domain.  Also allocate storage for the alltoallv
-     * parameters.
-     */
-    *others_req_ptr = (ADIOI_Access *)
-	ADIOI_Malloc(nprocs*sizeof(ADIOI_Access)); 
-    others_req = *others_req_ptr;
-
-    scounts = ADIOI_Malloc(nprocs*sizeof(int));
-    sdispls = ADIOI_Malloc(nprocs*sizeof(int));
-    rcounts = ADIOI_Malloc(nprocs*sizeof(int));
-    rdispls = ADIOI_Malloc(nprocs*sizeof(int));
-
-    /* If process[i] has any requests in my file domain,
-     *   initialize an ADIOI_Access structure that will describe each request
-     *   from process[i].  The offsets, lengths, and buffer pointers still need
-     *   to be obtained to complete the setting of this structure.
-     */
-    count_others_req_procs = 0;
-    for (i=0; i<nprocs; i++) {
-	if (count_others_req_per_proc[i]) 
-  {
-	    others_req[i].count = count_others_req_per_proc[i];
-
-	    others_req[i].offsets = (ADIO_Offset *)
-		ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(ADIO_Offset));
-	    others_req[i].lens = (int *)
-		ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(int)); 
-
-	    if ( (MPIR_Upint)others_req[i].offsets < (MPIR_Upint)recvBufForOffsets )
-		recvBufForOffsets = others_req[i].offsets;
-	    if ( (MPIR_Upint)others_req[i].lens < (MPIR_Upint)recvBufForLens )
-		recvBufForLens = others_req[i].lens;
-
-	    others_req[i].mem_ptrs = (MPI_Aint *)
-		ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(MPI_Aint)); 
-
-	    count_others_req_procs++;
-	}
-	else 
-	{
-	    others_req[i].count = 0;
-	    others_req[i].offsets = NULL;
-	    others_req[i].lens    = NULL;
-	}
-    }
-    /* If no recv buffer was allocated in the loop above, make it NULL */
-    if ( recvBufForOffsets == (void*)0xFFFFFFFFFFFFFFFF) recvBufForOffsets = NULL;
-    if ( recvBufForLens    == (void*)0xFFFFFFFFFFFFFFFF) recvBufForLens    = NULL;
-    
-    /* Now send the calculated offsets and lengths to respective processes */
-
-    /************************/
-    /* Exchange the offsets */
-    /************************/
-
-    /* Determine the lowest sendBufForOffsets/Lens */
-    for (i=0; i<nprocs; i++)
-    {
-	if ( (my_req[i].count) &&
-	     ((MPIR_Upint)my_req[i].offsets <= (MPIR_Upint)sendBufForOffsets) )
-       {
-	  sendBufForOffsets = my_req[i].offsets;
-    }
-	   
-	if ( (my_req[i].count) &&
-	     ((MPIR_Upint)my_req[i].lens <= (MPIR_Upint)sendBufForLens) )
-       {
-	    sendBufForLens = my_req[i].lens;
-      }
-    }
-
-    /* If no send buffer was found in the loop above, make it NULL */
-    if ( sendBufForOffsets == (void*)0xFFFFFFFFFFFFFFFF) sendBufForOffsets = NULL;
-    if ( sendBufForLens    == (void*)0xFFFFFFFFFFFFFFFF) sendBufForLens    = NULL;
-
-    /* Calculate the displacements from the sendBufForOffsets/Lens */
-    for (i=0; i<nprocs; i++)
-    {
-	/* Send these offsets to process i.*/
-	scounts[i] = count_my_req_per_proc[i];
-	if ( scounts[i] == 0 )
-	    sdispls[i] = 0;
-	else
-  	  sdispls[i] =  (int)
-	                ( ( (MPIR_Upint)my_req[i].offsets - 
-			   (MPIR_Upint)sendBufForOffsets ) / 
-			  (MPIR_Upint)sizeof(ADIO_Offset) );
-
-	/* Receive these offsets from process i.*/
-	rcounts[i] = count_others_req_per_proc[i];
-	if ( rcounts[i] == 0 )
-	    rdispls[i] = 0;
-	else
-	    rdispls[i] = (int)
-	                 ( ( (MPIR_Upint)others_req[i].offsets - 
-			     (MPIR_Upint)recvBufForOffsets ) / 
-			   (MPIR_Upint)sizeof(ADIO_Offset) );
-    }
-
-    /* Exchange the offsets */
-    MPI_Alltoallv(sendBufForOffsets,
-		  scounts, sdispls, ADIO_OFFSET,
-		  recvBufForOffsets,
-		  rcounts, rdispls, ADIO_OFFSET,
-		  fd->comm);
-
-    /************************/
-    /* Exchange the lengths */
-    /************************/
-
-    for (i=0; i<nprocs; i++)
-    {
-	/* Send these lengths to process i.*/
-	scounts[i] = count_my_req_per_proc[i];
-	if ( scounts[i] == 0 )
-	    sdispls[i] = 0;
-	else
-	  sdispls[i] = (int)
-	               ( ( (MPIR_Upint)my_req[i].lens - 
-			   (MPIR_Upint)sendBufForLens ) / 
-			 (MPIR_Upint) sizeof(int) );
-	
-	/* Receive these offsets from process i. */
-	rcounts[i] = count_others_req_per_proc[i];
-	if ( rcounts[i] == 0 )
-	    rdispls[i] = 0;
-	else
-	    rdispls[i] = (int)
-	                 ( ( (MPIR_Upint)others_req[i].lens - 
-			     (MPIR_Upint)recvBufForLens ) / 
-			   (MPIR_Upint) sizeof(int) );
-    }
-
-    /* Exchange the lengths */
-    MPI_Alltoallv(sendBufForLens,
-		  scounts, sdispls, MPI_INT,
-		  recvBufForLens,
-		  rcounts, rdispls, MPI_INT,
-		  fd->comm);
-
-    /* Clean up */
-    ADIOI_Free(count_others_req_per_proc);
-    ADIOI_Free (scounts);
-    ADIOI_Free (sdispls);
-    ADIOI_Free (rcounts);
-    ADIOI_Free (rdispls);
-
-    *count_others_req_procs_ptr = count_others_req_procs;
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5027, 0, NULL);
-#endif
-    TRACE_ERR("Leaving ADIOI_BG_Calc_others_req\n");
-}
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.h b/src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.h
index 0fd76f4..a5322a0 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.h
+++ b/src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.h
@@ -30,66 +30,4 @@
     /* generate a list of I/O aggregators that utilizes BG-PSET orginization. */
     int ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset);
 
-    /* overriding ADIOI_Calc_file_domains() to apply 'aligned file domain partitioning'. */
-    void ADIOI_BG_GPFS_Calc_file_domains(ADIO_File fd,
-	                                  ADIO_Offset *st_offsets,
-				          ADIO_Offset *end_offsets,
-				          int          nprocs,
-				          int          nprocs_for_coll,
-				          ADIO_Offset *min_st_offset_ptr,
-				          ADIO_Offset **fd_start_ptr,
-				          ADIO_Offset **fd_end_ptr,
-				          ADIO_Offset *fd_size_ptr,
-                  void        *fs_ptr);
-
-    /* a utilitiy function for debugging */
-    int ADIOI_BG_Aggrs_index(ADIO_File fd, int myrank );
-
-    /* overriding ADIOI_Calc_aggregator() for the default implementation is specific for 
-       static file domain partitioning */
-    int ADIOI_BG_Calc_aggregator(ADIO_File fd,
-				  ADIO_Offset off,
-				  ADIO_Offset min_off,
-				  ADIO_Offset *len,
-				  ADIO_Offset fd_size,
-				  ADIO_Offset *fd_start,
-				  ADIO_Offset *fd_end);
-
-    /* overriding ADIOI_Calc_my_req for the default implementation is specific for 
-       static file domain partitioning */
-    void ADIOI_BG_Calc_my_req ( ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
-				 int contig_access_count, ADIO_Offset
-				 min_st_offset, ADIO_Offset *fd_start,
-				 ADIO_Offset *fd_end, ADIO_Offset fd_size,
-				 int nprocs,
-				 int *count_my_req_procs_ptr,
-				 int **count_my_req_per_proc_ptr,
-				 ADIOI_Access **my_req_ptr,
-				 int **buf_idx_ptr);
-
-    /*
-     * ADIOI_Calc_others_req
-     *
-     * param[in]  count_my_req_procs        Number of processes whose file domain my
-     *                                        request touches.
-     * param[in]  count_my_req_per_proc     count_my_req_per_proc[i] gives the no. of 
-     *                                        contig. requests of this process in 
-     *                                        process i's file domain.
-     * param[in]  my_req                    A structure defining my request
-     * param[in]  nprocs                    Number of nodes in the block
-     * param[in]  myrank                    Rank of this node
-     * param[out] count_others_req_proc_ptr Number of processes whose requests lie in
-     *                                        my process's file domain (including my 
-     *                                        process itself)
-     * param[out] others_req_ptr            Array of other process' requests that lie
-     *                                        in my process's file domain
-     */
-     void ADIOI_BG_Calc_others_req(ADIO_File fd, int count_my_req_procs, 
-				    int *count_my_req_per_proc,
-				    ADIOI_Access *my_req, 
-				    int nprocs, int myrank,
-				    int *count_others_req_procs_ptr,
-				    ADIOI_Access **others_req_ptr);
-
-
 #endif  /* AD_BG_AGGRS_H_ */
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_bg_hints.c b/src/mpi/romio/adio/ad_gpfs/ad_bg_hints.c
index da86206..68163fa 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_bg_hints.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_bg_hints.c
@@ -16,7 +16,7 @@
 #include "adio_extern.h"
 #include "hint_fns.h"
 
-#include "ad_bg.h"
+#include "ad_gpfs.h"
 #include "ad_bg_pset.h"
 #include "ad_bg_aggrs.h"
 
@@ -25,9 +25,9 @@
 #define   ADIOI_BG_IND_WR_BUFFER_SIZE_DFLT	"4194304"
 #define   ADIOI_BG_NAGG_IN_PSET_HINT_NAME	"bg_nodes_pset"
 /** \page mpiio_vars MPIIO Configuration
- *  
- * BlueGene MPIIO configuration and performance tuning. Used by ad_bg and ad_bglockless ADIO's.
- *  
+ *
+ * BlueGene MPIIO configuration and performance tuning. Used by ad_gpfs ADIO.
+ *
  * \section hint_sec Hints
  * - bg_nodes_pset - Specify how many aggregators to use per pset.
  *   This hint will override the cb_nodes hint based on BlueGene psets.
@@ -70,14 +70,14 @@ void ADIOI_BG_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
      */
 
     value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
-    ADIOI_BG_assert ((value != NULL));
+    ADIOI_GPFS_assert ((value != NULL));
 
     /* initialize info and hints to default values if they haven't been
      * previously initialized
      */
     if (!fd->hints->initialized) {
 
-	ad_bg_get_env_vars();
+	ad_gpfs_get_env_vars();
 	did_anything = 1;
 
 	/* buffer size for collective I/O */
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_bg_pset.c b/src/mpi/romio/adio/ad_gpfs/ad_bg_pset.c
index 3e842da..6470969 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_bg_pset.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_bg_pset.c
@@ -14,7 +14,7 @@
 
 /* #define TRACE_ON */
 #include <stdlib.h>
-#include "ad_bg.h"
+#include "ad_gpfs.h"
 #include "ad_bg_pset.h"
 #include <spi/include/kernel/process.h>
 #include <firmware/include/personality.h>
@@ -31,7 +31,7 @@ ADIOI_BG_ProcInfo_t *
 ADIOI_BG_ProcInfo_new()
 {
     ADIOI_BG_ProcInfo_t *p = (ADIOI_BG_ProcInfo_t *) ADIOI_Malloc (sizeof(ADIOI_BG_ProcInfo_t));
-    ADIOI_BG_assert ((p != NULL));
+    ADIOI_GPFS_assert ((p != NULL));
     return p;
 }
 
@@ -39,7 +39,7 @@ ADIOI_BG_ProcInfo_t *
 ADIOI_BG_ProcInfo_new_n( int n )
 {
     ADIOI_BG_ProcInfo_t *p = (ADIOI_BG_ProcInfo_t *) ADIOI_Malloc (n * sizeof(ADIOI_BG_ProcInfo_t));
-    ADIOI_BG_assert ((p != NULL));
+    ADIOI_GPFS_assert ((p != NULL));
     return p;
 }
 
@@ -53,7 +53,7 @@ ADIOI_BG_ConfInfo_t *
 ADIOI_BG_ConfInfo_new ()
 {
     ADIOI_BG_ConfInfo_t *p = (ADIOI_BG_ConfInfo_t *) ADIOI_Malloc (sizeof(ADIOI_BG_ConfInfo_t));
-    ADIOI_BG_assert ((p != NULL));
+    ADIOI_GPFS_assert ((p != NULL));
     return p;
 }
 
@@ -260,9 +260,9 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
             
       conf->nAggrs = n_aggrs;
       /*    First pass gets nAggrs = -1 */
-      if(conf->nAggrs <=0) 
-         conf->nAggrs = bgmpio_bg_nagg_pset;
-      if(conf->ioMinSize <= conf->nAggrs) 
+      if(conf->nAggrs <=0)
+         conf->nAggrs = gpfsmpio_bg_nagg_pset;
+      if(conf->ioMinSize <= conf->nAggrs)
         conf->nAggrs = ADIOI_MAX(1,conf->ioMinSize-1); /* not including bridge itself */
 /*      if(conf->nAggrs > conf->numBridgeRanks) 
          conf->nAggrs = conf->numBridgeRanks; 
@@ -273,7 +273,7 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
       TRACE_ERR("Maximum ranks under a bridge rank: %d, minimum: %d, nAggrs: %d, numBridgeRanks: %d pset dflt: %d naggrs: %d ratio: %f\n", maxcompute, mincompute, conf->nAggrs, conf->numBridgeRanks, ADIOI_BG_NAGG_PSET_DFLT, conf->nAggrs, conf->aggRatio);
    }
 
-   ADIOI_BG_assert((bridgerank != -1));
+   ADIOI_GPFS_assert((bridgerank != -1));
    proc->bridgeRank = bridgerank;
    proc->iamBridge = iambridge;
    TRACE_ERR("Rank %d has bridge set index %d (bridge rank: %d) with %d other ranks, ioNodeIndex: %d\n", rank,  proc->ioNodeIndex, bridgerank, proc->myIOSize, proc->ioNodeIndex);
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_bg_tuning.c b/src/mpi/romio/adio/ad_gpfs/ad_bg_tuning.c
deleted file mode 100644
index 8aea5a9..0000000
--- a/src/mpi/romio/adio/ad_gpfs/ad_bg_tuning.c
+++ /dev/null
@@ -1,263 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bg_tuning.c
- * \brief Defines ad_bg performance tuning
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 2008 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-/*---------------------------------------------------------------------
- * ad_bg_tuning.c
- *
- * defines global variables and functions for performance tuning and 
- * functional debugging.
- *---------------------------------------------------------------------*/
-
-#include "ad_bg_tuning.h"
-#include "mpi.h"
-
-#if !defined(PVFS2_SUPER_MAGIC)
-  #define PVFS2_SUPER_MAGIC (0x20030528)
-#endif
-
-
-int 	bgmpio_timing;
-int 	bgmpio_timing2;
-int     bgmpio_timing_cw_level;
-int 	bgmpio_comm;
-int 	bgmpio_tunegather;
-int 	bgmpio_tuneblocking;
-long    bglocklessmpio_f_type;
-int     bgmpio_bg_nagg_pset;
-int     bgmpio_pthreadio;
-int     bgmpio_p2pcontig;
-int	bgmpio_balancecontig;
-int     bgmpio_devnullio;
-
-double	bgmpio_prof_cw    [BGMPIO_CIO_LAST];
-double	bgmpio_prof_cr    [BGMPIO_CIO_LAST];
-
-/* set internal variables for tuning environment variables */
-/** \page mpiio_vars MPIIO Configuration
-  \section env_sec Environment Variables
- * - BGMPIO_COMM - Define how data is exchanged on collective
- *   reads and writes.  Possible values:
- *   - 0 - Use MPI_Alltoallv.
- *   - 1 - Use MPI_Isend/MPI_Irecv.
- *   - Default is 0.
- *
- * - BGMPIO_TIMING - collect timing breakdown for MPI I/O collective calls.
- *   Possible values:
- *   - 0 - Do not collect/report timing.
- *   - 1 - Collect/report timing.
- *   - Default is 0.
- *
- * - BGMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
- *   for aggregator collective i/o.  Possible values:
- *   - 0 - Use two MPI_Allgather's to collect starting and ending offsets.
- *   - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
- *   - Default is 1.
- *
- * - BGMPIO_TUNEBLOCKING - Tune how aggregate file domains are 
- *   calculated (block size).  Possible values:
- *   - 0 - Evenly calculate file domains across aggregators.  Also use 
- *   MPI_Isend/MPI_Irecv to exchange domain information.
- *   - 1 - Align file domains with the underlying file system's block size.  Also use 
- *   MPI_Alltoallv to exchange domain information.
- *   - Default is 1.
- *
- * - BGLOCKLESSMPIO_F_TYPE - Specify a filesystem type that should run
- *   the ad_bglockless driver.   NOTE: Using romio prefixes (such as
- *   "bg:" or "bglockless:") on a file name will override this environment
- *   variable.  Possible values:
- *   - 0xnnnnnnnn - Any valid file system type (or "magic number") from
- *                  statfs() field f_type.
- *   - The default is 0x20030528 (PVFS2_SUPER_MAGIC)
- *
- * - BGMPIO_NAGG_PSET - Specify a ratio of "I/O aggregators" to use for each
- *   compute group (compute nodes + i/o nodes).    Possible values:
- *   - any integer
- *   - Default is 8
- *
- * - BGMPIO_PTHREADIO - Enables a very simple form of asyncronous io where a
- *   pthread is spawned to do the posix writes while the main thread does the
- *   data aggregation - useful for large files where multiple rounds are
- *   required (more that the cb_buffer_size of data per aggregator).   User
- *   must ensure there is hw resource available for the thread to run.  I
- *   am sure there is a better way to do this involving comm threads - this is
- *   just a start.  NOTE: For some reason the stats collected when this is
- *   enabled misses some of the data so the data sizes are off a bit - this is
- *   a statistical issue only, the data is still accurately written out
- *
- * - BGMPIO_P2PCONTIG -  Does simple point-to-point communication between the
- *   aggregator and the procs that feed it.  Performance could be enhanced by a
- *   one-sided put algorithm.  Current implementation allows only 1 round of
- *   data.  Useful/allowed only when:
- * 1.) The datatype is contiguous.
- * 2.) The offsets are increasing in rank-order.
- * 3.) There are no gaps between the offsets.
- * 4.) No single rank has a data size which spans multiple file domains.
- *
- * - BGMPIO_BALANCECONTIG -  File domain blocks are assigned to aggregators in
- *   a breadth-first fashion relative to the ions - additionally, file domains
- *   on the aggregators sharing the same bridgeset and ion have contiguous
- *   offsets.  The breadth-first assignment improves performance in the case of
- *   a relatively small file of size less than the gpfs block size multiplied
- *   by the number of ions. Files: ad_bg_wrcoll.c ad_bg_aggrs.c.  Possible Values
- *   - 0 - assign file domain blocks in the traditional manner
- *   - 1 - if there are variable sized file domain blocks, spread them out
- *         (balance) across bridge nodes
- *
- * - BGMPIO_DEVNULLIO - do everything *except* write to / read from the file
- *   system. When experimenting with different two-phase I/O strategies, it's
- *   helpful to remove the highly variable file system from the experiment.
- *   - 0 (disabled) or 1 (enabled)
- *   - Default is 0
- *
- */
-
-void ad_bg_get_env_vars() {
-    char *x, *dummy;
-
-    bgmpio_comm   = 0;
-	x = getenv( "BGMPIO_COMM"         ); 
-	if (x) bgmpio_comm         = atoi(x);
-    bgmpio_timing = 0;
-	x = getenv( "BGMPIO_TIMING"       ); 
-	if (x) bgmpio_timing       = atoi(x);
-    bgmpio_tunegather = 1;
-	x = getenv( "BGMPIO_TUNEGATHER"   ); 
-	if (x) bgmpio_tunegather   = atoi(x);
-    bgmpio_tuneblocking = 1;
-    x = getenv( "BGMPIO_TUNEBLOCKING" ); 
-    if (x) bgmpio_tuneblocking = atoi(x);
-    bglocklessmpio_f_type = PVFS2_SUPER_MAGIC;
-    x = getenv( "BGLOCKLESSMPIO_F_TYPE" ); 
-    if (x) bglocklessmpio_f_type = strtol(x,&dummy,0);
-    DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n",
-            bglocklessmpio_f_type,bglocklessmpio_f_type);
-    /* note: this value will be 'sanity checked' in ADIOI_BG_persInfo_init(),
-     * when we know a bit more about what "largest possible value" and
-     * "smallest possible value" should be */
-    bgmpio_bg_nagg_pset = ADIOI_BG_NAGG_PSET_DFLT;
-    x = getenv("BGMPIO_NAGG_PSET");
-    if (x) bgmpio_bg_nagg_pset = atoi(x);
-
-    bgmpio_pthreadio = 0;
-    x = getenv( "BGMPIO_PTHREADIO" );
-    if (x) bgmpio_pthreadio = atoi(x);
-
-    bgmpio_p2pcontig = 0;
-    x = getenv( "BGMPIO_P2PCONTIG" );
-    if (x) bgmpio_p2pcontig = atoi(x);
-
-    bgmpio_balancecontig = 0;
-    x = getenv( "BGMPIO_BALANCECONTIG" );
-    if (x) bgmpio_balancecontig = atoi(x);
-
-    bgmpio_devnullio = 0;
-    x = getenv( "BGMPIO_DEVNULLIO" );
-    if (x) bgmpio_devnullio = atoi(x);
-}
-
-/* report timing breakdown for MPI I/O collective call */
-void ad_bg_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs )
-{
-    int i;
-
-    if (bgmpio_timing) {
-	/* Timing across the whole communicator is a little bit interesting,
-	 * but what is *more* interesting is if we single out the aggregators
-	 * themselves.  non-aggregators spend a lot of time in "exchange" not
-	 * exchanging data, but blocked because they are waiting for
-	 * aggregators to finish writing.  If we focus on just the aggregator
-	 * processes we will get a more clear picture about the data exchange
-	 * vs. i/o time breakdown */
-
-	/* if deferred open enabled, we could use the aggregator communicator */
-	MPI_Comm agg_comm;
-	int nr_aggs, agg_rank;
-	MPI_Comm_split(fd->comm, (fd->is_agg ? 1 : MPI_UNDEFINED), 0, &agg_comm);
-	if(agg_comm != MPI_COMM_NULL) {
-	    MPI_Comm_size(agg_comm, &nr_aggs);
-	    MPI_Comm_rank(agg_comm, &agg_rank);
-	}
-
-	double *bgmpio_prof_org = bgmpio_prof_cr;
-	if (rw) bgmpio_prof_org = bgmpio_prof_cw;
-
-	double bgmpio_prof_avg[ BGMPIO_CIO_LAST ];
-	double bgmpio_prof_max[ BGMPIO_CIO_LAST ];
-	
-	if( agg_comm != MPI_COMM_NULL) {
-	    MPI_Reduce( bgmpio_prof_org, bgmpio_prof_avg, BGMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, agg_comm);
-	    MPI_Reduce( bgmpio_prof_org, bgmpio_prof_max, BGMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, agg_comm);
-	}
-	if (agg_comm != MPI_COMM_NULL && agg_rank == 0) {
-
-	    for (i=0; i<BGMPIO_CIO_LAST; i++) bgmpio_prof_avg[i] /= nr_aggs;
-
-	    bgmpio_prof_avg[ BGMPIO_CIO_B_POSI_RW  ] =
-		bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nr_aggs /
-		bgmpio_prof_max[ BGMPIO_CIO_T_POSI_RW  ];
-	    bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_RW  ] =
-		bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nr_aggs /
-		bgmpio_prof_max[ BGMPIO_CIO_T_MPIO_RW  ];
-
-	    bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_CRW ] =
-		bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nr_aggs /
-		bgmpio_prof_max[ BGMPIO_CIO_T_MPIO_CRW ];
-
-	    fprintf(stderr,"TIMING-%1s,", (rw ? "W" : "R") );
-	    fprintf(stderr,"SIZE: %12.4lld , ", (long long int)(bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nr_aggs));
-	    fprintf(stderr,"SEEK-avg: %10.3f , ",
-		    bgmpio_prof_avg[ BGMPIO_CIO_T_SEEK ]     );
-	    fprintf(stderr,"SEEK-max: %10.3f , ",
-		    bgmpio_prof_max[ BGMPIO_CIO_T_SEEK ]     );
-	    fprintf(stderr,"LOCAL-avg: %10.3f , ",
-		    bgmpio_prof_avg[ BGMPIO_CIO_T_LCOMP ]    );
-	    fprintf(stderr,"GATHER-max: %10.3f , ",
-		    bgmpio_prof_max[ BGMPIO_CIO_T_GATHER ]   );
-	    fprintf(stderr,"PATTERN-avg: %10.3f , ",
-		    bgmpio_prof_avg[ BGMPIO_CIO_T_PATANA ]   );
-	    fprintf(stderr,"FILEDOMAIN-avg: %10.3f , ",
-		    bgmpio_prof_avg[ BGMPIO_CIO_T_FD_PART ]  );
-	    fprintf(stderr,"MYREQ-avg: %10.3f , ",
-		    bgmpio_prof_avg[ BGMPIO_CIO_T_MYREQ ]    );
-	    fprintf(stderr,"OTHERREQ-max: %10.3f , ",
-		    bgmpio_prof_max[ BGMPIO_CIO_T_OTHREQ ]   );
-	    fprintf(stderr,"EXCHANGE-max: %10.3f , ",
-		    bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH ]    );
-	    fprintf(stderr, "EXCHANGE-RECV_EXCH-max: %10.3f , ",
-		    bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH_RECV_EXCH]  );
-	    fprintf(stderr, "EXCHANGE-SETUP-max: %10.3f , ",
-		    bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH_SETUP]  );
-	    fprintf(stderr, "EXCHANGE-NET-max: %10.3f , ",
-		    bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH_NET]  );
-	    fprintf(stderr, "EXCHANGE-SORT-max: %10.3f , ",
-		    bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH_SORT]  );
-	    fprintf(stderr, "EXCHANGE-SIEVE-max: %10.3f , ",
-		    bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH_SIEVE]  );
-	    fprintf(stderr,"POSIX-TIME-avg: %10.3f , ",
-		    bgmpio_prof_avg[ BGMPIO_CIO_T_POSI_RW ]  );
-	    fprintf(stderr,"MPIIO-CONTIG-TIME-avg: %10.3f , ",
-		    bgmpio_prof_avg[ BGMPIO_CIO_T_MPIO_RW ]  );
-	    fprintf(stderr,"MPIIO-STRIDED-TIME-avg: %10.3f , ",
-		    bgmpio_prof_avg[ BGMPIO_CIO_T_MPIO_CRW ] );
-	    fprintf(stderr,"POSIX-BW-avg: %10.3f , ",
-		    bgmpio_prof_avg[ BGMPIO_CIO_B_POSI_RW ]  );
-	    fprintf(stderr,"MPI-BW-avg: %10.3f , ",
-		    bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_RW ]  );
-	    fprintf(stderr,"MPI-BW-collective-avg: %10.3f\n ",
-		    bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_CRW ] );
-	}
-	if (agg_comm != MPI_COMM_NULL) MPI_Comm_free(&agg_comm);
-    }
-
-}
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_bg_tuning.h b/src/mpi/romio/adio/ad_gpfs/ad_bg_tuning.h
deleted file mode 100644
index 39ab047..0000000
--- a/src/mpi/romio/adio/ad_gpfs/ad_bg_tuning.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bg_tuning.h
- * \brief ???
- */
-
-/*---------------------------------------------------------------------
- * ad_bg_tuning.h
- *
- * declares global variables and macros for performance tuning and 
- * functional debugging.
- *---------------------------------------------------------------------*/
-
-#ifndef AD_BG_TUNING_H_
-#define AD_BG_TUNING_H_
-
-#include "adio.h"
-
-#define ADIOI_BG_assert( a ) if (!(a)) { \
-                                fprintf( stderr, "AD_BG_assert, file=%s, line=%d\n", __FILE__, __LINE__ ); \
-                                MPI_Abort( MPI_COMM_WORLD, 1 ); \
-                           }
-
-
-/*-----------------------------------------
- *  Global variables for the control of
- *  1.  timing
- *  2.  select specific optimizations
- *-----------------------------------------*/
-
-/* timing fields */
-enum {
-    BGMPIO_CIO_DATA_SIZE=0,	
-    BGMPIO_CIO_T_SEEK,		
-    BGMPIO_CIO_T_LCOMP,	/* time for ADIOI_Calc_my_off_len(), local */
-    BGMPIO_CIO_T_GATHER,	/* time for previous MPI_Allgather, now Allreduce */
-    BGMPIO_CIO_T_PATANA,	/* time for a quick test if access is contiguous or not, local */
-    BGMPIO_CIO_T_FD_PART,	/* time for file domain partitioning, local */
-    BGMPIO_CIO_T_MYREQ,	/* time for ADIOI_BG_Calc_my_req(), local */
-    BGMPIO_CIO_T_OTHREQ,	/* time for ADIOI_Calc_others_req(), short Alltoall */
-    BGMPIO_CIO_T_DEXCH,	/* time for I/O data exchange */
-    /* the next DEXCH_* timers capture finer-grained portions of T_DEXCH */
-    BGMPIO_CIO_T_DEXCH_RECV_EXCH,/* time for each process to exchange recieve
-				    size info with everyone else */
-    BGMPIO_CIO_T_DEXCH_SETUP,	/* time for setup portion of I/O data exchange */
-    BGMPIO_CIO_T_DEXCH_NET,	/* time for network portion of I/O data exchange */
-    BGMPIO_CIO_T_DEXCH_SORT, 	/* time to sort requesst in I/O data exchange */
-    BGMPIO_CIO_T_DEXCH_SIEVE, 	/* time for read portion of RMW in two phase */
-    BGMPIO_CIO_T_POSI_RW,
-    BGMPIO_CIO_B_POSI_RW,
-    BGMPIO_CIO_T_MPIO_RW,	/* time for ADIOI_BG_WriteContig() */
-    BGMPIO_CIO_B_MPIO_RW,
-    BGMPIO_CIO_T_MPIO_CRW,	/* time for ADIOI_BG_WriteStridedColl() */
-    BGMPIO_CIO_B_MPIO_CRW,
-    BGMPIO_CIO_LAST
-};
-
-extern double 	bgmpio_prof_cw    [BGMPIO_CIO_LAST];
-extern double 	bgmpio_prof_cr    [BGMPIO_CIO_LAST];
-
-
-/* corresponds to environment variables to select optimizations and timing level */
-extern int 	bgmpio_timing;
-extern int      bgmpio_timing_cw_level;
-extern int 	bgmpio_comm;
-extern int 	bgmpio_tunegather;
-extern int 	bgmpio_tuneblocking;
-extern long bglocklessmpio_f_type;
-extern int      bgmpio_pthreadio;
-extern int      bgmpio_p2pcontig;
-extern int  bgmpio_balancecontig;
-extern int      bgmpio_devnullio;
-
-/* Default is, well, kind of complicated. Blue Gene /L and /P had "psets": one
- * i/o node and all compute nodes wired to it.  On Blue Gene /Q that
- * relationship is a lot more fluid.  There are still I/O nodes, and compute
- * nodes are assigned to an i/o node, but there are two routes to the i/o node,
- * via compute nodes designated as "bridge nodes".  In this code, what we used
- * to call a "pset" is actually "compute nodes associated with and including a
- * bridge node".  So, "nAgg" is roughly "number of aggregators per bridge", but
- * look closely at ADIOI_BG_persInfo_init() for the details */
-
-#define ADIOI_BG_NAGG_PSET_DFLT 64
-
-extern int     bgmpio_bg_nagg_pset;
-
-
-/* set internal variables for tuning environment variables */
-void ad_bg_get_env_vars(void);
-
-/* report timing breakdown for MPI I/O collective call */
-void ad_bg_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs );
-
-/* note: 	
- *   T := timing; 
- * CIO := collective I/O 
- */
-#define BGMPIO_T_CIO_RESET( RW ) \
-	{ \
-	  int i; \
-	  for ( i = 0; i < BGMPIO_CIO_LAST; i ++ ) \
-	    bgmpio_prof_c##RW [ i ] = 0; \
-	}
-
-#define BGMPIO_T_CIO_REPORT( RW, FD, MYRANK, NPROCS ) \
-	ad_bg_timing_crw_report ( RW, FD, MYRANK, NPROCS ); \
-
-#define BGMPIO_T_CIO_SET_GET(RW, ISSET, ISGET, VAR1, VAR2 ) \
-         {\
-	 double temp = MPI_Wtime(); \
-	 if ( ISSET ) bgmpio_prof_c##RW [ VAR1 ] = temp; \
-	 if ( ISGET ) bgmpio_prof_c##RW [ VAR2 ] = temp - bgmpio_prof_c##RW [ VAR2 ] ;\
-	 }
-
-#endif  /* AD_BG_TUNING_H_ */
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs.c
index 4a3904e..e241cc9 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs.c
@@ -2,7 +2,7 @@
 /* (C)Copyright IBM Corp.  2007, 2008                               */
 /* ---------------------------------------------------------------- */
 /**
- * \file ad_bg.c
+ * \file ad_gpfs.c
  * \brief ???
  */
 
@@ -11,25 +11,28 @@
  *   Copyright (C) 2001 University of Chicago. 
  *   See COPYRIGHT notice in top-level directory.
  */
-#define BG_OPTIM_STEP1_1 1
-#include "ad_bg.h"
+#include "ad_gpfs.h"
 
 /* adioi.h has the ADIOI_Fns_struct define */
 #include "adioi.h"
 
-struct ADIOI_Fns_struct ADIO_BG_operations = {
-    ADIOI_BG_Open, /* Open */
+struct ADIOI_Fns_struct ADIO_GPFS_operations = {
+    ADIOI_GPFS_Open, /* Open */
     ADIOI_GEN_OpenColl, /* Collective open */
     ADIOI_GEN_ReadContig, /* ReadContig */
     ADIOI_GEN_WriteContig, /* WriteContig */
-    ADIOI_BG_ReadStridedColl, /* ReadStridedColl */
-    ADIOI_BG_WriteStridedColl, /* WriteStridedColl */
+    ADIOI_GPFS_ReadStridedColl, /* ReadStridedColl */
+    ADIOI_GPFS_WriteStridedColl, /* WriteStridedColl */
     ADIOI_GEN_SeekIndividual, /* SeekIndividual */
-    ADIOI_BG_Fcntl, /* Fcntl */
+    ADIOI_GEN_Fcntl, /* Fcntl */
+#ifdef BGQPLATFORM
     ADIOI_BG_SetInfo, /* SetInfo */
+#else
+    ADIOI_GEN_SetInfo, /* SetInfo */
+#endif
     ADIOI_GEN_ReadStrided, /* ReadStrided */
     ADIOI_GEN_WriteStrided, /* WriteStrided */
-    ADIOI_BG_Close, /* Close */
+    ADIOI_GPFS_Close, /* Close */
 #ifdef ROMIO_HAVE_WORKING_AIO
 #warning Consider BG support for NFS before enabling this.
     ADIOI_GEN_IreadContig, /* IreadContig */
@@ -44,7 +47,7 @@ struct ADIOI_Fns_struct ADIO_BG_operations = {
     ADIOI_GEN_IOComplete, /* WriteComplete */
     ADIOI_GEN_IreadStrided, /* IreadStrided */
     ADIOI_GEN_IwriteStrided, /* IwriteStrided */
-    ADIOI_BG_Flush, /* Flush */
+    ADIOI_GPFS_Flush, /* Flush */
     ADIOI_GEN_Resize, /* Resize */
     ADIOI_GEN_Delete, /* Delete */
     ADIOI_GEN_Feature, /* Features */
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs.h b/src/mpi/romio/adio/ad_gpfs/ad_gpfs.h
index 763c8de..d7db201 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs.h
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs.h
@@ -2,7 +2,7 @@
 /* (C)Copyright IBM Corp.  2007, 2008                               */
 /* ---------------------------------------------------------------- */
 /**
- * \file ad_bg.h
+ * \file ad_gpfs.h
  * \brief ???
  */
 
@@ -12,8 +12,8 @@
  *   See COPYRIGHT notice in top-level directory.
  */
 
-#ifndef AD_BG_INCLUDE
-#define AD_BG_INCLUDE
+#ifndef AD_GPFS_INCLUDE
+#define AD_GPFS_INCLUDE
 
 #include <unistd.h>
 #include <stdlib.h>
@@ -28,70 +28,49 @@
 #include <aio.h>
 #endif
 
-#if 0 
-int ADIOI_BG_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
-		  int wr, void *handle);
-#endif
 
-void ADIOI_BG_Open(ADIO_File fd, int *error_code);
+void ADIOI_GPFS_Open(ADIO_File fd, int *error_code);
 
-void ADIOI_BG_Close(ADIO_File fd, int *error_code);
+void ADIOI_GPFS_Close(ADIO_File fd, int *error_code);
 
-void ADIOI_BG_ReadContig(ADIO_File fd, void *buf, int count, 
+void ADIOI_GPFS_ReadContig(ADIO_File fd, void *buf, int count,
                       MPI_Datatype datatype, int file_ptr_type,
                      ADIO_Offset offset, ADIO_Status *status, int
 		     *error_code);
-void ADIOI_BG_WriteContig(ADIO_File fd, const void *buf, int count, 
+void ADIOI_GPFS_WriteContig(ADIO_File fd, const void *buf, int count,
                       MPI_Datatype datatype, int file_ptr_type,
                       ADIO_Offset offset, ADIO_Status *status, int
-		      *error_code);   
-#if 0
-void ADIOI_BG_IwriteContig(ADIO_File fd, void *buf, int count, 
-                      MPI_Datatype datatype, int file_ptr_type,
-                      ADIO_Offset offset, ADIO_Request *request, int
-		      *error_code);   
-void ADIOI_BG_IreadContig(ADIO_File fd, void *buf, int count, 
-                      MPI_Datatype datatype, int file_ptr_type,
-                      ADIO_Offset offset, ADIO_Request *request, int
-		      *error_code);   
-int ADIOI_BG_ReadDone(ADIO_Request *request, ADIO_Status *status, int
-		       *error_code);
-int ADIOI_BG_WriteDone(ADIO_Request *request, ADIO_Status *status, int
-		       *error_code);
-void ADIOI_BG_ReadComplete(ADIO_Request *request, ADIO_Status *status, int
-		       *error_code); 
-void ADIOI_BG_WriteComplete(ADIO_Request *request, ADIO_Status *status,
-			int *error_code); 
-#endif
-void ADIOI_BG_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int
-		*error_code); 
+		      *error_code);
+
+#ifdef BGQPLATFORM
 void ADIOI_BG_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
+#endif
 
-void ADIOI_BG_WriteStrided(ADIO_File fd, const void *buf, int count,
+void ADIOI_GPFS_WriteStrided(ADIO_File fd, const void *buf, int count,
 		       MPI_Datatype datatype, int file_ptr_type,
 		       ADIO_Offset offset, ADIO_Status *status, int
 		       *error_code);
-void ADIOI_BG_ReadStrided(ADIO_File fd, void *buf, int count,
+void ADIOI_GPFS_ReadStrided(ADIO_File fd, void *buf, int count,
 		       MPI_Datatype datatype, int file_ptr_type,
 		       ADIO_Offset offset, ADIO_Status *status, int
 		       *error_code);
 
-void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
+void ADIOI_GPFS_ReadStridedColl(ADIO_File fd, void *buf, int count,
                                MPI_Datatype datatype, int file_ptr_type,
                                ADIO_Offset offset, ADIO_Status *status, int
                                *error_code);
 
-void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
+void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
                        MPI_Datatype datatype, int file_ptr_type,
                        ADIO_Offset offset, ADIO_Status *status, int
                        *error_code);
 
-void ADIOI_BG_Get_shared_fp(ADIO_File fd, ADIO_Offset size, ADIO_Offset *shared_fp, int *error_code);
-void ADIOI_BG_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
+void ADIOI_GPFS_Get_shared_fp(ADIO_File fd, ADIO_Offset size, ADIO_Offset *shared_fp, int *error_code);
+void ADIOI_GPFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
 
-void ADIOI_BG_Flush(ADIO_File fd, int *error_code);
+void ADIOI_GPFS_Flush(ADIO_File fd, int *error_code);
 
-#include "ad_bg_tuning.h"
+#include "ad_gpfs_tuning.h"
 
 
 #endif
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_close.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_close.c
index cb30f72..d82e0e5 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_close.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_close.c
@@ -2,7 +2,7 @@
 /* (C)Copyright IBM Corp.  2007, 2008                               */
 /* ---------------------------------------------------------------- */
 /**
- * \file ad_bg_close.c
+ * \file ad_gpfs_close.c
  * \brief ???
  */
 
@@ -12,13 +12,12 @@
  *   See COPYRIGHT notice in top-level directory.
  */
 
-#include "ad_bg.h"
-#include "ad_bg_aggrs.h"
+#include "ad_gpfs_tuning.h"
 
-void ADIOI_BG_Close(ADIO_File fd, int *error_code)
+void ADIOI_GPFS_Close(ADIO_File fd, int *error_code)
 {
   int err, derr=0;
-  static char myname[] = "ADIOI_BG_CLOSE";
+  static char myname[] = "ADIOI_GPFS_CLOSE";
 
 #ifdef PROFILE
   MPE_Log_event(9, 0, "start close");
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_fcntl.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_fcntl.c
deleted file mode 100644
index 6bf4267..0000000
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_fcntl.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bg_fcntl.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "ad_bg.h"
-#include "adio_extern.h"
-/* #ifdef MPISGI
-#include "mpisgi2.h"
-#endif */
-
-void ADIOI_BG_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
-		     int *error_code)
-{
-    static char myname[] = "ADIOI_BG_FCNTL";
-
-    switch(flag) {
-    case ADIO_FCNTL_GET_FSIZE:
-	fcntl_struct->fsize = lseek(fd->fd_sys, 0, SEEK_END);
-	if (fd->fp_sys_posn != -1) 
-	     lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
-	if (fcntl_struct->fsize == -1) {
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_IO, "**io",
-					       "**io %s", strerror(errno));
-	}
-	else *error_code = MPI_SUCCESS;
-	break;
-
-    case ADIO_FCNTL_SET_DISKSPACE:
-	ADIOI_GEN_Prealloc(fd, fcntl_struct->diskspace, error_code);
-	break;
-
-    case ADIO_FCNTL_SET_ATOMICITY:
-	fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1;
-	*error_code = MPI_SUCCESS;
-	break;
-
-	/* --BEGIN ERROR HANDLING-- */
-    default:
-	*error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					   MPIR_ERR_RECOVERABLE,
-					   myname, __LINE__,
-					   MPI_ERR_ARG,
-					   "**flag", "**flag %d", flag);
-	/* --END ERROR HANDLING-- */
-    }
-}
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_flush.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_flush.c
index ebc15d1..8d9603e 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_flush.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_flush.c
@@ -2,7 +2,7 @@
 /* (C)Copyright IBM Corp.  2007, 2008                               */
 /* ---------------------------------------------------------------- */
 /**
- * \file ad_bg_flush.c
+ * \file ad_gpfs_flush.c
  * \brief Scalable flush based on underlying filesystem and psets
  */
 
@@ -13,14 +13,12 @@
  *   See COPYRIGHT notice in top-level directory.
  */
 
-#include "ad_bg.h"
-#include "ad_bg_aggrs.h"
+#include "ad_gpfs.h"
 
-void ADIOI_BG_Flush(ADIO_File fd, int *error_code)
+void ADIOI_GPFS_Flush(ADIO_File fd, int *error_code)
 {
     int err=0;
-    static char myname[] = "ADIOI_BG_FLUSH";
-
+    static char myname[] = "ADIOI_GPFS_FLUSH";
 
     int rank;
 
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_getsh.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_getsh.c
deleted file mode 100644
index f18db25..0000000
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_getsh.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bg_getsh.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "ad_bg.h"
-
-/* returns the current location of the shared_fp in terms of the
-   no. of etypes relative to the current view, and also increments the
-   shared_fp by the number of etypes to be accessed (incr) in the read
-   or write following this function. */
-
-void ADIOI_BG_Get_shared_fp(ADIO_File fd, ADIO_Offset incr, ADIO_Offset *shared_fp,
-			 int *error_code)
-{
-    ADIO_Offset new_fp;
-    int err;
-    MPI_Comm dupcommself;
-    static char myname[] = "ADIOI_BG_GET_SHARED_FP";
-
-    if (fd->shared_fp_fd == ADIO_FILE_NULL) {
-	MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
-	fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF, 
-				     dupcommself,
-				     fd->shared_fp_fname, 
-				     fd->file_system,
-				     fd->fns,
-				     ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE, 
-				     0, 
-				     MPI_BYTE, 
-				     MPI_BYTE, 
-				     MPI_INFO_NULL, 
-				     ADIO_PERM_NULL, 
-				     error_code);
-	if (*error_code != MPI_SUCCESS) return;
-	*shared_fp = 0;
-	ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-	err = read(fd->shared_fp_fd->fd_sys, shared_fp, sizeof(ADIO_Offset));
-        /* if the file is empty, the above read may return error
-           (reading beyond end of file). In that case, shared_fp = 0, 
-           set above, is the correct value. */
-    }
-    else {
-	ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-
-	err = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
-	if (err == 0) {
-	    err = read(fd->shared_fp_fd->fd_sys, shared_fp,
-		       sizeof(ADIO_Offset));
-	}
-	if (err == -1) {
-	    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_IO, "**io",
-					       "**io %s", strerror(errno));
-	    return;
-	}
-    }
-
-    new_fp = *shared_fp + incr;
-
-    err = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
-    if (err == 0) {
-	err = write(fd->shared_fp_fd->fd_sys, &new_fp, sizeof(ADIO_Offset));
-    }
-    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-    if (err == -1) {
-	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
-					   myname, __LINE__, MPI_ERR_IO,
-					   "**io",
-					   "**io %s", strerror(errno));
-    }
-    else *error_code = MPI_SUCCESS;
-}
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_open.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_open.c
index aef9f06..6225dd9 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_open.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_open.c
@@ -2,7 +2,7 @@
 /* (C)Copyright IBM Corp.  2007, 2008                               */
 /* ---------------------------------------------------------------- */
 /**
- * \file ad_bg_open.c
+ * \file ad_gpfs_open.c
  * \brief ???
  */
 
@@ -12,19 +12,18 @@
  *   See COPYRIGHT notice in top-level directory.
  */
 
-#include "ad_bg.h"
-#include "ad_bg_aggrs.h"
+#include "ad_gpfs_tuning.h"
 
 #include <sys/statfs.h>
 #include <sys/vfs.h>
 
-void ADIOI_BG_Open(ADIO_File fd, int *error_code)
+void ADIOI_GPFS_Open(ADIO_File fd, int *error_code)
 {
   int perm, old_mask, amode, rank, rc;
-  static char myname[] = "ADIOI_BG_OPEN";
+  static char myname[] = "ADIOI_GPFS_OPEN";
 
   /* set internal variables for tuning environment variables */
-  ad_bg_get_env_vars();    
+  ad_gpfs_get_env_vars();
 
   if (fd->perm == ADIO_PERM_NULL)  {
     old_mask = umask(022);
@@ -54,7 +53,7 @@ void ADIOI_BG_Open(ADIO_File fd, int *error_code)
   DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,errno);
   fd->fd_direct = -1;
 
-  if (bgmpio_devnullio == 1) {
+  if (gpfsmpio_devnullio == 1) {
       fd->null_fd = open("/dev/null", O_RDWR);
   } else {
       fd->null_fd = -1;
@@ -78,14 +77,14 @@ void ADIOI_BG_Open(ADIO_File fd, int *error_code)
 
 	MPI_Comm_rank(fd->comm, &rank);
 	if ((rank == fd->hints->ranklist[0]) || (fd->comm == MPI_COMM_SELF)) {
-	    struct stat64 bg_stat;
+	    struct stat64 gpfs_stat;
 	    /* Get the (real) underlying file system block size */
-	    rc = stat64(fd->filename, &bg_stat);
+	    rc = stat64(fd->filename, &gpfs_stat);
 	    if (rc >= 0)
 	    {
-		fd->blksize = bg_stat.st_blksize;
+		fd->blksize = gpfs_stat.st_blksize;
 		DBGV_FPRINTF(stderr,"Successful stat '%s'.  Blocksize=%ld\n",
-			fd->filename,bg_stat.st_blksize);
+			fd->filename,gpfs_stat.st_blksize);
 	    }
 	    else
 	    {
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
index 94348e3..eae5563 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
@@ -2,7 +2,7 @@
 /* (C)Copyright IBM Corp.  2007, 2008                               */
 /* ---------------------------------------------------------------- */
 /**
- * \file ad_bg_rdcoll.c
+ * \file ad_gpfs_rdcoll.c
  * \brief ???
  */
 
@@ -15,10 +15,8 @@
 
 #include "adio.h"
 #include "adio_extern.h"
-#include "ad_bg.h"
-#include "ad_bg_pset.h"
-#include "ad_bg_aggrs.h"
-
+#include "ad_gpfs.h"
+#include "ad_gpfs_aggrs.h"
 
 #ifdef PROFILE
 #include "mpe.h"
@@ -90,7 +88,7 @@ extern void ADIOI_Calc_my_off_len(ADIO_File fd, int bufcount, MPI_Datatype
 
 
 
-void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
+void ADIOI_GPFS_ReadStridedColl(ADIO_File fd, void *buf, int count,
 			       MPI_Datatype datatype, int file_ptr_type,
 			       ADIO_Offset offset, ADIO_Status *status, int
 			       *error_code)
@@ -115,12 +113,12 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
     ADIO_Offset start_offset, end_offset, orig_fp, fd_size, min_st_offset, off;
     ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
 	*fd_end = NULL, *end_offsets = NULL;
-    ADIO_Offset *bg_offsets0 = NULL, *bg_offsets = NULL;
+    ADIO_Offset *gpfs_offsets0 = NULL, *gpfs_offsets = NULL;
     int  ii;
     ADIO_Offset *len_list = NULL;
     int *buf_idx = NULL;
 
-    BGMPIO_T_CIO_RESET( r)
+    GPFSMPIO_T_CIO_RESET( r);
 
 #ifdef HAVE_STATUS_SET_BYTES
     MPI_Count bufsize, size;
@@ -145,8 +143,8 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
     nprocs_for_coll = fd->hints->cb_nodes;
     orig_fp = fd->fp_ind;
 
-    BGMPIO_T_CIO_SET_GET( r, 1, 0, BGMPIO_CIO_T_MPIO_CRW, BGMPIO_CIO_LAST)
-    BGMPIO_T_CIO_SET_GET( r, 1, 0, BGMPIO_CIO_T_LCOMP, BGMPIO_CIO_LAST )
+    GPFSMPIO_T_CIO_SET_GET( r, 1, 0, GPFSMPIO_CIO_T_MPIO_CRW, GPFSMPIO_CIO_LAST)
+    GPFSMPIO_T_CIO_SET_GET( r, 1, 0, GPFSMPIO_CIO_T_LCOMP, GPFSMPIO_CIO_LAST )
 
     /* only check for interleaving if cb_read isn't disabled */
     if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
@@ -158,9 +156,9 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
 
 	ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
 			      &offset_list, &len_list, &start_offset,
-			      &end_offset, &contig_access_count); 
-    
-    BGMPIO_T_CIO_SET_GET( r, 1, 1, BGMPIO_CIO_T_GATHER, BGMPIO_CIO_T_LCOMP )
+			      &end_offset, &contig_access_count);
+
+    GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_GATHER, GPFSMPIO_CIO_T_LCOMP )
 
 #ifdef RDCOLL_DEBUG
     for (i=0; i<contig_access_count; i++) {
@@ -176,24 +174,24 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
 	st_offsets   = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
 	end_offsets  = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
 
-    if (bgmpio_tunegather) {
-	    bg_offsets0 = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
-	    bg_offsets  = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
+    if (gpfsmpio_tunegather) {
+	    gpfs_offsets0 = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
+	    gpfs_offsets  = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
 	    for (ii=0; ii<nprocs; ii++)  {
-		bg_offsets0[ii*2]   = 0;
-		bg_offsets0[ii*2+1] = 0;
+		gpfs_offsets0[ii*2]   = 0;
+		gpfs_offsets0[ii*2+1] = 0;
 	    }
-	    bg_offsets0[myrank*2]   = start_offset;
-	    bg_offsets0[myrank*2+1] =   end_offset;
+	    gpfs_offsets0[myrank*2]   = start_offset;
+	    gpfs_offsets0[myrank*2+1] =   end_offset;
 
-	MPI_Allreduce( bg_offsets0, bg_offsets, nprocs*2, ADIO_OFFSET, MPI_MAX, fd->comm );
+	MPI_Allreduce( gpfs_offsets0, gpfs_offsets, nprocs*2, ADIO_OFFSET, MPI_MAX, fd->comm );
 
 	    for (ii=0; ii<nprocs; ii++)  {
-		st_offsets [ii] = bg_offsets[ii*2]  ;
-		end_offsets[ii] = bg_offsets[ii*2+1];
+		st_offsets [ii] = gpfs_offsets[ii*2]  ;
+		end_offsets[ii] = gpfs_offsets[ii*2+1];
 	    }
-	    ADIOI_Free( bg_offsets0 );
-	    ADIOI_Free( bg_offsets  );
+	    ADIOI_Free( gpfs_offsets0 );
+	    ADIOI_Free( gpfs_offsets  );
     } else {
         MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
                       ADIO_OFFSET, fd->comm);
@@ -201,7 +199,7 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
                       ADIO_OFFSET, fd->comm);
     }
 
-    BGMPIO_T_CIO_SET_GET( r, 1, 1, BGMPIO_CIO_T_PATANA, BGMPIO_CIO_T_GATHER )
+    GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_PATANA, GPFSMPIO_CIO_T_GATHER )
 
 	/* are the accesses of different processes interleaved? */
 	for (i=1; i<nprocs; i++)
@@ -243,7 +241,7 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
 	return;
     }
 
-    BGMPIO_T_CIO_SET_GET( r, 1, 1, BGMPIO_CIO_T_FD_PART, BGMPIO_CIO_T_PATANA )
+    GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_FD_PART, GPFSMPIO_CIO_T_PATANA )
 
     /* We're going to perform aggregation of I/O.  Here we call
      * ADIOI_Calc_file_domains() to determine what processes will handle I/O
@@ -261,8 +259,8 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
      * needs to be mapped to an actual rank in the communicator later.
      *
      */
-    if (bgmpio_tuneblocking)
-    ADIOI_BG_GPFS_Calc_file_domains(fd, st_offsets, end_offsets, nprocs,
+    if (gpfsmpio_tuneblocking)
+    ADIOI_GPFS_Calc_file_domains(fd, st_offsets, end_offsets, nprocs,
 			    nprocs_for_coll, &min_st_offset,
 			    &fd_start, &fd_end, &fd_size, fd->fs_ptr);
     else
@@ -272,8 +270,8 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
 			    fd->hints->min_fdomain_size, &fd_size, 
 			    fd->hints->striping_unit);
 
-    BGMPIO_T_CIO_SET_GET( r, 1, 1, BGMPIO_CIO_T_MYREQ, BGMPIO_CIO_T_FD_PART );
-    if (bgmpio_p2pcontig==1) {
+    GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_MYREQ, GPFSMPIO_CIO_T_FD_PART );
+    if (gpfsmpio_p2pcontig==1) {
 	/* For some simple yet common(?) workloads, full-on two-phase I/O is
 	 * overkill.  We can establish sub-groups of processes and their
 	 * aggregator, and then these sub-groups will carry out a simplified
@@ -289,11 +287,11 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
 	if (inOrderAndNoGaps && buftype_is_contig) {
 	    /* if these conditions exist then execute the P2PContig code else
 	     * execute the original code */
-	    P2PContigReadAggregation(fd, buf, 
+	    P2PContigReadAggregation(fd, buf,
 		    error_code, st_offsets, end_offsets, fd_start, fd_end);
 
 	    /* NOTE: we are skipping the rest of two-phase in this path */
-            BGMPIO_T_CIO_REPORT( 0, fd, myrank, nprocs)
+            GPFSMPIO_T_CIO_REPORT( 0, fd, myrank, nprocs)
 	    return;
 	}
     }
@@ -310,8 +308,8 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
      * buf_idx[] - array of locations into which data can be directly moved;
      *     this is only valid for contiguous buffer case
      */
-    if (bgmpio_tuneblocking)
-    ADIOI_BG_Calc_my_req(fd, offset_list, len_list, contig_access_count,
+    if (gpfsmpio_tuneblocking)
+    ADIOI_GPFS_Calc_my_req(fd, offset_list, len_list, contig_access_count,
 		      min_st_offset, fd_start, fd_end, fd_size,
 		      nprocs, &count_my_req_procs, 
 		      &count_my_req_per_proc, &my_req,
@@ -323,7 +321,7 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
 		      &count_my_req_per_proc, &my_req,
 		      &buf_idx);
 
-    BGMPIO_T_CIO_SET_GET( r, 1, 1, BGMPIO_CIO_T_OTHREQ, BGMPIO_CIO_T_MYREQ )
+    GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_OTHREQ, GPFSMPIO_CIO_T_MYREQ )
 
     /* perform a collective communication in order to distribute the
      * data calculated above.  fills in the following:
@@ -332,11 +330,11 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
      * count_others_req_per_proc[] - number of separate contiguous
      *     requests from proc i lie in this process's file domain.
      */
-    if (bgmpio_tuneblocking)
-    ADIOI_BG_Calc_others_req(fd, count_my_req_procs, 
-			  count_my_req_per_proc, my_req, 
-			  nprocs, myrank, &count_others_req_procs, 
-			  &others_req); 
+    if (gpfsmpio_tuneblocking)
+    ADIOI_GPFS_Calc_others_req(fd, count_my_req_procs,
+			  count_my_req_per_proc, my_req,
+			  nprocs, myrank, &count_others_req_procs,
+			  &others_req);
 
     else
     ADIOI_Calc_others_req(fd, count_my_req_procs, 
@@ -344,7 +342,7 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
 			  nprocs, myrank, &count_others_req_procs, 
 			  &others_req); 
 
-    BGMPIO_T_CIO_SET_GET( r, 1, 1, BGMPIO_CIO_T_DEXCH, BGMPIO_CIO_T_OTHREQ )
+    GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_DEXCH, GPFSMPIO_CIO_T_OTHREQ )
 
     /* my_req[] and count_my_req_per_proc aren't needed at this point, so 
      * let's free the memory 
@@ -367,10 +365,10 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
 			len_list, contig_access_count, min_st_offset,
 			fd_size, fd_start, fd_end, buf_idx, error_code);
 
-    BGMPIO_T_CIO_SET_GET( r, 0, 1, BGMPIO_CIO_LAST, BGMPIO_CIO_T_DEXCH )
-    BGMPIO_T_CIO_SET_GET( r, 0, 1, BGMPIO_CIO_LAST, BGMPIO_CIO_T_MPIO_CRW )
+    GPFSMPIO_T_CIO_SET_GET( r, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_DEXCH )
+    GPFSMPIO_T_CIO_SET_GET( r, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_MPIO_CRW )
 
-    BGMPIO_T_CIO_REPORT( 0, fd, myrank, nprocs)
+    GPFSMPIO_T_CIO_REPORT( 0, fd, myrank, nprocs)
 
     if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
 
@@ -658,17 +656,17 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
 #ifdef PROFILE
         MPE_Log_event(7, 0, "start communication");
 #endif
-	if (bgmpio_comm == 1)
+	if (gpfsmpio_comm == 1)
 	ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
 			    send_size, recv_size, count, 
        			    start_pos, partial_send, recd_from_proc, nprocs,
 			    myrank, 
 			    buftype_is_contig, contig_access_count,
 			    min_st_offset, fd_size, fd_start, fd_end,
-			    others_req, 
-                            m, buftype_extent, buf_idx); 
-        else    
-	if (bgmpio_comm == 0) {
+			    others_req,
+                            m, buftype_extent, buf_idx);
+        else
+	if (gpfsmpio_comm == 0) {
         ADIOI_R_Exchange_data_alltoallv(fd, buf, flat_buf, offset_list, len_list,
                             send_size, recv_size, count,
                             start_pos, partial_send, recd_from_proc, nprocs,
@@ -707,7 +705,7 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
     for (m=ntimes; m<max_ntimes; m++) 
 /* nothing to send, but check for recv. */
 
-	if (bgmpio_comm == 1)
+	if (gpfsmpio_comm == 1)
 	ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
 			    send_size, recv_size, count, 
 			    start_pos, partial_send, recd_from_proc, nprocs,
@@ -717,7 +715,7 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
 			    others_req, m,
                             buftype_extent, buf_idx); 
         else    /* strncmp( env_switch, "alltoall", 8 ) == 0 */
-	if (bgmpio_comm == 0)
+	if (gpfsmpio_comm == 0)
         ADIOI_R_Exchange_data_alltoallv(fd, buf, flat_buf, offset_list, len_list,
                             send_size, recv_size, count, 
                             start_pos, partial_send, recd_from_proc, nprocs,
@@ -984,7 +982,7 @@ static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
 	     * longer than the single region that processor "p" is responsible
 	     * for.
 	     */
-	    p = ADIOI_BG_Calc_aggregator(fd,
+	    p = ADIOI_GPFS_Calc_aggregator(fd,
 				      off,
 				      min_st_offset,
 				      &len,
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_setsh.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_setsh.c
deleted file mode 100644
index 4d61420..0000000
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_setsh.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bg_setsh.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "ad_bg.h"
-
-/* set the shared file pointer to "offset" etypes relative to the current 
-   view */
-
-/*
-This looks very similar to ADIOI_GEN_Set_shared_fp, except this 
-function avoids locking the file twice.  The generic version does
-
-Write lock
-ADIO_WriteContig
-Unlock
-
-For BG, ADIOI_BG_WriteContig does a lock before writing to disable
-caching. To avoid the lock being called twice, this version for BG does
-
-Write lock
-Lseek
-Write
-Unlock 
-
-*/
-
-void ADIOI_BG_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code)
-{
-    int err;
-    MPI_Comm dupcommself;
-    static char myname[] = "ADIOI_BG_SET_SHARED_FP";
-
-    if (fd->shared_fp_fd == ADIO_FILE_NULL) {
-	MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
-	fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF, dupcommself,
-				     fd->shared_fp_fname, 
-				     fd->file_system, fd->fns,
-				     ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE, 
-				     0, MPI_BYTE, MPI_BYTE, MPI_INFO_NULL, 
-				     ADIO_PERM_NULL, error_code);
-    }
-
-    if (*error_code != MPI_SUCCESS) return;
-
-    ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-    lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
-    err = write(fd->shared_fp_fd->fd_sys, &offset, sizeof(ADIO_Offset));
-    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
-
-    if (err == -1) {
-	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
-					   myname, __LINE__, MPI_ERR_IO,
-					   "**io",
-					   "**io %s", strerror(errno));
-    }
-    else *error_code = MPI_SUCCESS;
-}
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
index 0e0b53a..f583dc4 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
@@ -2,7 +2,7 @@
 /* (C)Copyright IBM Corp.  2007, 2008                               */
 /* ---------------------------------------------------------------- */
 /**
- * \file ad_bg_wrcoll.c
+ * \file ad_gpfs_wrcoll.c
  * \brief ???
  */
 
@@ -14,10 +14,8 @@
 
 #include "adio.h"
 #include "adio_extern.h"
-#include "ad_bg.h"
-#include "ad_bg_pset.h"
-#include "ad_bg_aggrs.h"
-
+#include "ad_gpfs.h"
+#include "ad_gpfs_aggrs.h"
 
 #ifdef AGGREGATION_PROFILE
 #include "mpe.h"
@@ -96,7 +94,7 @@ static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
                       int nprocs, int nprocs_recv, int total_elements);
 
 
-void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
+void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
                        MPI_Datatype datatype, int file_ptr_type,
                        ADIO_Offset offset, ADIO_Status *status, int
                        *error_code)
@@ -121,28 +119,16 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
     ADIO_Offset orig_fp, start_offset, end_offset, fd_size, min_st_offset, off;
     ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
 	*fd_end = NULL, *end_offsets = NULL;
-    ADIO_Offset *bg_offsets0 = NULL, *bg_offsets = NULL;
+    ADIO_Offset *gpfs_offsets0 = NULL, *gpfs_offsets = NULL;
     int  ii;
 
     int *buf_idx = NULL;
     ADIO_Offset *len_list = NULL;
-    BGMPIO_T_CIO_RESET( w )
-#if 0
-    /* From common code - not implemented for bg.*/
-    int old_error, tmp_error;
-#endif
+    GPFSMPIO_T_CIO_RESET( w )
 #ifdef PROFILE
 	MPE_Log_event(13, 0, "start computation");
 #endif
 
-#if 0
-/*   From common code - not implemented for bg. */
-     if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) { 
-	ADIOI_IOStridedColl (fd, buf, count, ADIOI_WRITE, datatype, 
-			file_ptr_type, offset, status, error_code);
-	return;
-    }
-#endif
     MPI_Comm_size(fd->comm, &nprocs);
     MPI_Comm_rank(fd->comm, &myrank);
 
@@ -152,8 +138,8 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
     nprocs_for_coll = fd->hints->cb_nodes;
     orig_fp = fd->fp_ind;
 
-    BGMPIO_T_CIO_SET_GET( w, 1, 0, BGMPIO_CIO_T_MPIO_CRW, BGMPIO_CIO_LAST)
-    BGMPIO_T_CIO_SET_GET( w, 1, 0, BGMPIO_CIO_T_LCOMP, BGMPIO_CIO_LAST )
+    GPFSMPIO_T_CIO_SET_GET( w, 1, 0, GPFSMPIO_CIO_T_MPIO_CRW, GPFSMPIO_CIO_LAST)
+    GPFSMPIO_T_CIO_SET_GET( w, 1, 0, GPFSMPIO_CIO_T_LCOMP, GPFSMPIO_CIO_LAST )
 
 
     /* only check for interleaving if cb_write isn't disabled */
@@ -168,7 +154,7 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 			      &offset_list, &len_list, &start_offset,
 			      &end_offset, &contig_access_count); 
 
-    BGMPIO_T_CIO_SET_GET( w, 1, 1, BGMPIO_CIO_T_GATHER, BGMPIO_CIO_T_LCOMP )
+    GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_GATHER, GPFSMPIO_CIO_T_LCOMP )
 
 	/* each process communicates its start and end offsets to other 
 	   processes. The result is an array each of start and end offsets stored
@@ -177,24 +163,24 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 	st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
 	end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
 
-    if (bgmpio_tunegather) {
-            bg_offsets0 = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
-            bg_offsets  = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
+    if (gpfsmpio_tunegather) {
+            gpfs_offsets0 = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
+            gpfs_offsets  = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
             for (ii=0; ii<nprocs; ii++)  {
-                bg_offsets0[ii*2]   = 0;
-                bg_offsets0[ii*2+1] = 0;
+                gpfs_offsets0[ii*2]   = 0;
+                gpfs_offsets0[ii*2+1] = 0;
             }
-            bg_offsets0[myrank*2]   = start_offset;
-            bg_offsets0[myrank*2+1] =   end_offset;
+            gpfs_offsets0[myrank*2]   = start_offset;
+            gpfs_offsets0[myrank*2+1] =   end_offset;
 
-        MPI_Allreduce( bg_offsets0, bg_offsets, nprocs*2, ADIO_OFFSET, MPI_MAX, fd->comm );
+        MPI_Allreduce( gpfs_offsets0, gpfs_offsets, nprocs*2, ADIO_OFFSET, MPI_MAX, fd->comm );
 
             for (ii=0; ii<nprocs; ii++)  {
-                st_offsets [ii] = bg_offsets[ii*2]  ;
-                end_offsets[ii] = bg_offsets[ii*2+1];
+                st_offsets [ii] = gpfs_offsets[ii*2]  ;
+                end_offsets[ii] = gpfs_offsets[ii*2+1];
             }
-            ADIOI_Free( bg_offsets0 );
-            ADIOI_Free( bg_offsets  );
+            ADIOI_Free( gpfs_offsets0 );
+            ADIOI_Free( gpfs_offsets  );
     } else {
 	MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
 		      ADIO_OFFSET, fd->comm);
@@ -202,7 +188,7 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 		      ADIO_OFFSET, fd->comm);
     }
 
-    BGMPIO_T_CIO_SET_GET(w, 1, 1, BGMPIO_CIO_T_PATANA, BGMPIO_CIO_T_GATHER )
+    GPFSMPIO_T_CIO_SET_GET(w, 1, 1, GPFSMPIO_CIO_T_PATANA, GPFSMPIO_CIO_T_GATHER )
 
 	/* are the accesses of different processes interleaved? */
 	for (i=1; i<nprocs; i++)
@@ -246,14 +232,14 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 	return;
     }
 
-    BGMPIO_T_CIO_SET_GET( w, 1, 1, BGMPIO_CIO_T_FD_PART, BGMPIO_CIO_T_PATANA )
-	
+    GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_FD_PART, GPFSMPIO_CIO_T_PATANA )
+
 /* Divide the I/O workload among "nprocs_for_coll" processes. This is
    done by (logically) dividing the file into file domains (FDs); each
    process may directly access only its own file domain. */
 
-    if (bgmpio_tuneblocking)
-    ADIOI_BG_GPFS_Calc_file_domains(fd, st_offsets, end_offsets, nprocs,
+    if (gpfsmpio_tuneblocking)
+    ADIOI_GPFS_Calc_file_domains(fd, st_offsets, end_offsets, nprocs,
 			    nprocs_for_coll, &min_st_offset,
 			    &fd_start, &fd_end, &fd_size, fd->fs_ptr);   
     else
@@ -263,9 +249,9 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 			    fd->hints->min_fdomain_size, &fd_size,
 			    fd->hints->striping_unit);   
 
-    BGMPIO_T_CIO_SET_GET( w, 1, 1, BGMPIO_CIO_T_MYREQ, BGMPIO_CIO_T_FD_PART );
+    GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_MYREQ, GPFSMPIO_CIO_T_FD_PART );
 
-    if (bgmpio_p2pcontig==1) {
+    if (gpfsmpio_p2pcontig==1) {
 	/* For some simple yet common(?) workloads, full-on two-phase I/O is overkill.  We can establish sub-groups of processes and their aggregator, and then these sub-groups will carry out a simplified two-phase over that sub-group.
 	 *
 	 * First verify that the filetype is contig and the offsets are
@@ -281,7 +267,7 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 	    P2PContigWriteAggregation(fd, buf, 
 		    error_code, st_offsets, end_offsets, fd_start, fd_end);
 	    /* NOTE: we are skipping the rest of two-phase in this path */
-            BGMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
+            GPFSMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
 	    return;
 	}
     }
@@ -289,8 +275,8 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 /* calculate what portions of the access requests of this process are
    located in what file domains */
 
-    if (bgmpio_tuneblocking)
-    ADIOI_BG_Calc_my_req(fd, offset_list, len_list, contig_access_count,
+    if (gpfsmpio_tuneblocking)
+    ADIOI_GPFS_Calc_my_req(fd, offset_list, len_list, contig_access_count,
 		      min_st_offset, fd_start, fd_end, fd_size,
 		      nprocs, &count_my_req_procs, 
 		      &count_my_req_per_proc, &my_req,
@@ -300,10 +286,10 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 		      min_st_offset, fd_start, fd_end, fd_size,
 		      nprocs, &count_my_req_procs, 
 		      &count_my_req_per_proc, &my_req,
-		      &buf_idx); 
+		      &buf_idx);
+
+    GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_OTHREQ, GPFSMPIO_CIO_T_MYREQ )
 
-    BGMPIO_T_CIO_SET_GET( w, 1, 1, BGMPIO_CIO_T_OTHREQ, BGMPIO_CIO_T_MYREQ )
-	
 /* based on everyone's my_req, calculate what requests of other
    processes lie in this process's file domain.
    count_others_req_procs = number of processes whose requests lie in
@@ -311,8 +297,8 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
    count_others_req_per_proc[i] indicates how many separate contiguous
    requests of proc. i lie in this process's file domain. */
 
-    if (bgmpio_tuneblocking)
-	ADIOI_BG_Calc_others_req(fd, count_my_req_procs,
+    if (gpfsmpio_tuneblocking)
+	ADIOI_GPFS_Calc_others_req(fd, count_my_req_procs,
 			      count_my_req_per_proc, my_req,
 			      nprocs, myrank,
 			      &count_others_req_procs, &others_req);
@@ -320,9 +306,9 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
     ADIOI_Calc_others_req(fd, count_my_req_procs, 
 			  count_my_req_per_proc, my_req, 
 			  nprocs, myrank,
-			  &count_others_req_procs, &others_req); 
-    
-    BGMPIO_T_CIO_SET_GET( w, 1, 1, BGMPIO_CIO_T_DEXCH, BGMPIO_CIO_T_OTHREQ )
+			  &count_others_req_procs, &others_req);
+
+    GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_DEXCH, GPFSMPIO_CIO_T_OTHREQ )
 
     ADIOI_Free(count_my_req_per_proc);
     for (i=0; i < nprocs; i++) {
@@ -339,52 +325,11 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 			len_list, contig_access_count, min_st_offset,
 			fd_size, fd_start, fd_end, buf_idx, error_code);
 
-    BGMPIO_T_CIO_SET_GET( w, 0, 1, BGMPIO_CIO_LAST, BGMPIO_CIO_T_DEXCH )
-    BGMPIO_T_CIO_SET_GET( w, 0, 1, BGMPIO_CIO_LAST, BGMPIO_CIO_T_MPIO_CRW )
-
-    BGMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
-#if 0
-    /* From common code - not implemented for bg.
-     * 
-     * If this collective write is followed by an independent write,
-     * it's possible to have those subsequent writes on other processes
-     * race ahead and sneak in before the read-modify-write completes.
-     * We carry out a collective communication at the end here so no one
-     * can start independent i/o before collective I/O completes. 
-     *
-     * need to do some gymnastics with the error codes so that if something
-     * went wrong, all processes report error, but if a process has a more
-     * specific error code, we can still have that process report the
-     * additional information */
-
-    old_error = *error_code;
-    if (*error_code != MPI_SUCCESS) *error_code = MPI_ERR_IO;
-
-     /* optimization: if only one process performing i/o, we can perform
-     * a less-expensive Bcast  */
-#ifdef ADIOI_MPE_LOGGING
-    MPE_Log_event( ADIOI_MPE_postwrite_a, 0, NULL );
-#endif
-    if (fd->hints->cb_nodes == 1) 
-	    MPI_Bcast(error_code, 1, MPI_INT, 
-			    fd->hints->ranklist[0], fd->comm);
-    else {
-	    tmp_error = *error_code;
-	    MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT, 
-			    MPI_MAX, fd->comm);
-    }
-#ifdef ADIOI_MPE_LOGGING
-    MPE_Log_event( ADIOI_MPE_postwrite_b, 0, NULL );
-#endif
-#ifdef AGGREGATION_PROFILE
-	MPE_Log_event (5012, 0, NULL);
-#endif
-
-    if ( (old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO) )
-	    *error_code = old_error;
+    GPFSMPIO_T_CIO_SET_GET( w, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_DEXCH )
+    GPFSMPIO_T_CIO_SET_GET( w, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_MPIO_CRW )
 
+    GPFSMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
 
-#endif
 /* free all memory allocated for collective I/O */
     if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
 
@@ -478,7 +423,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
     coll_bufsize = atoi(value);
     ADIOI_Free(value);
 
-    if (bgmpio_pthreadio == 1){
+    if (gpfsmpio_pthreadio == 1){
 	/* ROMIO will spawn an additional thread. both threads use separate
 	 * halves of the collective buffer*/
 	coll_bufsize = coll_bufsize/2;
@@ -511,7 +456,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
 		  fd->comm); 
 
     write_buf = fd->io_buf;
-    if (bgmpio_pthreadio == 1) {
+    if (gpfsmpio_pthreadio == 1) {
 	write_buf2 = fd->io_buf + coll_bufsize;
     }
 
@@ -570,7 +515,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
     done = 0;
     off = st_loc;
 
-    if(bgmpio_pthreadio == 1)
+    if(gpfsmpio_pthreadio == 1)
 	io_thread = pthread_self();
 
 #ifdef PROFILE
@@ -663,22 +608,22 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
 	MPE_Log_event(14, 0, "end computation");
 	MPE_Log_event(7, 0, "start communication");
 #endif
-        if (bgmpio_comm == 1)
-	ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list, 
-                            len_list, send_size, recv_size, off, size, count, 
-                            start_pos, partial_recv, 
-                            sent_to_proc, nprocs, myrank, 
+        if (gpfsmpio_comm == 1)
+	ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
+                            len_list, send_size, recv_size, off, size, count,
+                            start_pos, partial_recv,
+                            sent_to_proc, nprocs, myrank,
 			    buftype_is_contig, contig_access_count,
 			    min_st_offset, fd_size, fd_start, fd_end,
 			    others_req, send_buf_idx, curr_to_proc,
                             done_to_proc, &hole, m, buftype_extent, buf_idx,
 			    error_code); 
 	else
-        if (bgmpio_comm == 0)
-	ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list, 
-                            len_list, send_size, recv_size, off, size, count, 
-                            start_pos, partial_recv, 
-                            sent_to_proc, nprocs, myrank, 
+        if (gpfsmpio_comm == 0)
+	ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list,
+                            len_list, send_size, recv_size, off, size, count,
+                            start_pos, partial_recv,
+                            sent_to_proc, nprocs, myrank,
 			    buftype_is_contig, contig_access_count,
 			    min_st_offset, fd_size, fd_start, fd_end,
 			    others_req, send_buf_idx, curr_to_proc,
@@ -698,7 +643,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
 	    sprintf(round, "two-phase-round=%d", m);
 	    setenv("LIBIOLOG_EXTRA_INFO", round, 1);
       ADIOI_Assert(size == (int)size);
-	    if (bgmpio_pthreadio == 1) {
+	    if (gpfsmpio_pthreadio == 1) {
 		/* there is no such thing as "invalid pthread identifier", so
 		 * we'll use pthread_self() instead.  Before we do I/O we want
 		 * to complete I/O from any previous iteration -- but only a
@@ -734,7 +679,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
 	off += size;
 	done += size;
     }
-    if (bgmpio_pthreadio == 1) {
+    if (gpfsmpio_pthreadio == 1) {
 	if ( !pthread_equal(io_thread, pthread_self()) ) {
 	    pthread_join(io_thread, &thread_ret);
 	    *error_code = *(int *)thread_ret;
@@ -747,22 +692,22 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
 #endif
     for (m=ntimes; m<max_ntimes; m++) 
 	/* nothing to recv, but check for send. */
-        if (bgmpio_comm == 1)
-	ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list, 
-                            len_list, send_size, recv_size, off, size, count, 
-                            start_pos, partial_recv, 
-                            sent_to_proc, nprocs, myrank, 
+        if (gpfsmpio_comm == 1)
+	ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
+                            len_list, send_size, recv_size, off, size, count,
+                            start_pos, partial_recv,
+                            sent_to_proc, nprocs, myrank,
 			    buftype_is_contig, contig_access_count,
 			    min_st_offset, fd_size, fd_start, fd_end,
 			    others_req, send_buf_idx, 
                             curr_to_proc, done_to_proc, &hole, m, 
                             buftype_extent, buf_idx, error_code); 
 	else
-        if (bgmpio_comm == 0)
-	ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list, 
-                            len_list, send_size, recv_size, off, size, count, 
-                            start_pos, partial_recv, 
-                            sent_to_proc, nprocs, myrank, 
+        if (gpfsmpio_comm == 0)
+	ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list,
+                            len_list, send_size, recv_size, off, size, count,
+                            start_pos, partial_recv,
+                            sent_to_proc, nprocs, myrank,
 			    buftype_is_contig, contig_access_count,
 			    min_st_offset, fd_size, fd_start, fd_end,
 			    others_req, send_buf_idx, 
@@ -1136,7 +1081,7 @@ static void ADIOI_Fill_send_buffer(ADIO_File fd, const void *buf, ADIOI_Flatlist
 	     * longer than the single region that processor "p" is responsible
 	     * for.
 	     */
-	    p = ADIOI_BG_Calc_aggregator(fd,
+	    p = ADIOI_GPFS_Calc_aggregator(fd,
 				      off,
 				      min_st_offset,
 				      &len,
@@ -1346,13 +1291,12 @@ static void ADIOI_W_Exchange_data_alltoallv(
     static char myname[] = "ADIOI_W_EXCHANGE_DATA";
     double io_time;
 
-
     io_time = MPI_Wtime();
   /* exchange recv_size info so that each process knows how much to
      send to whom. */
     MPI_Alltoall(recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fd->comm);
 
-    bgmpio_prof_cw[BGMPIO_CIO_T_DEXCH_RECV_EXCH] += MPI_Wtime() - io_time;
+    gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_RECV_EXCH] += MPI_Wtime() - io_time;
     io_time = MPI_Wtime();
     
     nprocs_recv = 0;
@@ -1399,7 +1343,7 @@ static void ADIOI_W_Exchange_data_alltoallv(
 	ADIOI_Free(send_buf);
     }
 
-    bgmpio_prof_cw[BGMPIO_CIO_T_DEXCH_SETUP] += MPI_Wtime() - io_time;
+    gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SETUP] += MPI_Wtime() - io_time;
 
     io_time = MPI_Wtime();
   /* alltoallv */
@@ -1411,7 +1355,7 @@ static void ADIOI_W_Exchange_data_alltoallv(
     ADIOI_Free( all_send_buf );
     ADIOI_Free(sdispls);
 
-    bgmpio_prof_cw[BGMPIO_CIO_T_DEXCH_NET] += MPI_Wtime() - io_time;
+    gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_NET] += MPI_Wtime() - io_time;
     io_time = MPI_Wtime();
   /* data sieving pre-read */
   /* To avoid a read-modify-write, check if there are holes in the 
@@ -1444,7 +1388,7 @@ static void ADIOI_W_Exchange_data_alltoallv(
     ADIOI_Free(srt_off);
     ADIOI_Free(srt_len);
 
-    bgmpio_prof_cw[BGMPIO_CIO_T_DEXCH_SORT] += MPI_Wtime() - io_time;
+    gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SORT] += MPI_Wtime() - io_time;
     io_time = MPI_Wtime();
     if (nprocs_recv) {
         if (*hole) {
@@ -1461,7 +1405,7 @@ static void ADIOI_W_Exchange_data_alltoallv(
             /* --END ERROR HANDLING-- */
         }
     }
-    bgmpio_prof_cw[BGMPIO_CIO_T_DEXCH_SIEVE] += MPI_Wtime() - io_time;
+    gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SIEVE] += MPI_Wtime() - io_time;
 
   /* scater all_recv_buf into 4M cb_buffer */
     tmp_len = (int *) ADIOI_Malloc(nprocs*sizeof(int));
@@ -1550,7 +1494,7 @@ static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, const void *buf, ADIOI_F
 	     * longer than the single region that processor "p" is responsible
 	     * for.
 	     */
-	    p = ADIOI_BG_Calc_aggregator(fd,
+	    p = ADIOI_GPFS_Calc_aggregator(fd,
 				      off,
 				      min_st_offset,
 				      &len,
diff --git a/src/mpi/romio/adio/ad_testfs/ad_testfs_hints.c b/src/mpi/romio/adio/ad_testfs/ad_testfs_hints.c
index d57c080..a6c1be9 100644
--- a/src/mpi/romio/adio/ad_testfs/ad_testfs_hints.c
+++ b/src/mpi/romio/adio/ad_testfs/ad_testfs_hints.c
@@ -23,10 +23,5 @@ void ADIOI_TESTFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
     FPRINTF(stdout, "[%d/%d]    calling ADIOI_GEN_SetInfo\n", 
 	    myrank, nprocs);
 
-#ifdef ROMIO_BGL   /* BlueGene support for pvfs through ufs */
-    /* BlueGene hack: force testfs to mimic BlueGene hints */
-    ADIOI_BGL_SetInfo(fd, users_info, error_code);
-#else
     ADIOI_GEN_SetInfo(fd, users_info, error_code);
-#endif
 }
diff --git a/src/mpi/romio/adio/common/ad_fstype.c b/src/mpi/romio/adio/common/ad_fstype.c
index 365ce48..26ad614 100644
--- a/src/mpi/romio/adio/common/ad_fstype.c
+++ b/src/mpi/romio/adio/common/ad_fstype.c
@@ -204,32 +204,8 @@ static void ADIO_FileSysType_parentdir(const char *filename, char **dirnamep)
 }
 #endif /* ROMIO_NTFS */
 
-#if defined(ROMIO_BGL) || defined(ROMIO_BG)
-		    /* BlueGene support for lockless i/o (necessary for PVFS.
-		      possibly beneficial for others, unless data sieving
-		      writes desired) */
-
-/* BlueGene environment variables can override lockless selection.*/
-#ifdef ROMIO_BG
-extern void ad_bg_get_env_vars();
-#else
-extern void ad_bgl_get_env_vars();
-#endif
-extern long bglocklessmpio_f_type;
-
-static void check_for_lockless_exceptions(long stat_type, int *fstype)
-{
-    /* exception for lockless file systems.  (PVFS2 is the default in ad_bgl_tuning.)
-     * The BGLOCKLESS_F_TYPE environment variable will override it by specifying 
-     * the appropriate file system magic number here. 
-     */ 
-    if (stat_type == bglocklessmpio_f_type) 
-      /* use lock-free driver on bluegene to support specified fs (defaults to pvfs2) */
-      *fstype = ADIO_BGLOCKLESS; 
-}
-#endif
 /*
- ADIO_FileSysType_fncall - determines the file system type for a given file 
+ ADIO_FileSysType_fncall - determines the file system type for a given file
  using a system-dependent function call
 
 Input Parameters:
@@ -361,28 +337,12 @@ static void ADIO_FileSysType_fncall(const char *filename, int *fstype, int *erro
     }
 # endif
 
-#ifdef ROMIO_BG
-/* The BlueGene generic ADIO is also a special case. */
-    ad_bg_get_env_vars();
 
-    *fstype = ADIO_BG;
-    check_for_lockless_exceptions(fsbuf.f_type, fstype);
-    *error_code = MPI_SUCCESS;
-    return;
-#endif
-
-#  ifdef ROMIO_BGL 
-    /* BlueGene is a special case: all file systems are AD_BGL, except for
-     * certain exceptions */
+#ifdef ROMIO_GPFS
 
-    /* Bluegene needs to read enviroment variables before selecting the file system*/
-    ad_bgl_get_env_vars();
-
-    *fstype = ADIO_BGL;
-    check_for_lockless_exceptions(fsbuf.f_type, fstype);
-    *error_code = MPI_SUCCESS;
+    *fstype = ADIO_GPFS;
     return;
-#  endif
+#endif
 
     /* FPRINTF(stderr, "%d\n", fsbuf.f_type);*/
 # ifdef NFS_SUPER_MAGIC
@@ -598,15 +558,8 @@ static void ADIO_FileSysType_prefix(const char *filename, int *fstype, int *erro
     {
 	*fstype = ADIO_LUSTRE;
     }
-    else if (!strncmp(filename, "bgl:", 4) || !strncmp(filename, "BGL:", 4)) {
-	*fstype = ADIO_BGL;
-    }
-    else if (!strncmp(filename, "bg:", 3) || !strncmp(filename, "BG:", 3)) {
-	*fstype = ADIO_BG;
-    }
-    else if (!strncmp(filename, "bglockless:", 11) || 
-	    !strncmp(filename, "BGLOCKLESS:", 11)) {
-	*fstype = ADIO_BGLOCKLESS;
+    else if (!strncmp(filename, "gpfs:", 5) || !strncmp(filename, "GPFS:", 5)) {
+	*fstype = ADIO_GPFS;
     }
     else {
 #ifdef ROMIO_NTFS
@@ -843,34 +796,15 @@ void ADIO_ResolveFileType(MPI_Comm comm, const char *filename, int *fstype,
 	*ops = &ADIO_TESTFS_operations;
 #endif
     }
-    if (file_system == ADIO_BGL) {
-#ifndef ROMIO_BGL
-	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, 
-					   myname, __LINE__, MPI_ERR_IO, 
-					   "**iofstypeunsupported", 0);
-	return;
-#else
-	*ops = &ADIO_BGL_operations;
-#endif
-    }
-    if (file_system == ADIO_BG) {
-#ifndef ROMIO_BG
+
+    if (file_system == ADIO_GPFS) {
+#ifndef ROMIO_GPFS
 	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
 					myname, __LINE__, MPI_ERR_IO,
 					"**iofstypeunsupported", 0);
 	return;
 #else
-	*ops = &ADIO_BG_operations;
-#endif
-    }
-    if (file_system == ADIO_BGLOCKLESS) {
-#ifndef ROMIO_BGLOCKLESS
-	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, 
-					   myname, __LINE__, MPI_ERR_IO, 
-					   "**iofstypeunsupported", 0);
-	return;
-#else
-	*ops = &ADIO_BGLOCKLESS_operations;
+	*ops = &ADIO_GPFS_operations;
 #endif
     }
 
diff --git a/src/mpi/romio/adio/common/ad_get_sh_fp.c b/src/mpi/romio/adio/common/ad_get_sh_fp.c
index c73b499..786a4b3 100644
--- a/src/mpi/romio/adio/common/ad_get_sh_fp.c
+++ b/src/mpi/romio/adio/common/ad_get_sh_fp.c
@@ -6,11 +6,8 @@
 
 #include "adio.h"
 
-#ifdef ROMIO_BGL
-void ADIOI_BGL_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset *shared_fp, int *error_code);
-#endif
-#ifdef ROMIO_BG
-void ADIOI_BG_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset *shared_fp, int *error_code);
+#ifdef ROMIO_GPFS
+void ADIOI_GPFS_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset *shared_fp, int *error_code);
 #endif
 
 /* returns the current location of the shared_fp in terms of the
@@ -42,17 +39,9 @@ void ADIO_Get_shared_fp(ADIO_File fd, ADIO_Offset incr, ADIO_Offset *shared_fp,
     }
 #endif
 
-#ifdef ROMIO_BGL
-    /* BGLOCKLESS won't support shared fp */
-    if (fd->file_system == ADIO_BGL) {
-	ADIOI_BGL_Get_shared_fp(fd, incr, shared_fp, error_code);
-	return;
-    }
-#endif
-#ifdef ROMIO_BG
-    /* BGLOCKLESS won't support shared fp */
-    if (fd->file_system == ADIO_BG) {
-	ADIOI_BG_Get_shared_fp(fd, incr, shared_fp, error_code);
+#ifdef ROMIO_GPFS
+    if (fd->file_system == ADIO_GPFS) {
+	ADIOI_GPFS_Get_shared_fp(fd, incr, shared_fp, error_code);
 	return;
     }
 #endif
diff --git a/src/mpi/romio/adio/common/ad_read.c b/src/mpi/romio/adio/common/ad_read.c
index e61841f..93bb7d1 100644
--- a/src/mpi/romio/adio/common/ad_read.c
+++ b/src/mpi/romio/adio/common/ad_read.c
@@ -13,8 +13,8 @@
 #ifdef AGGREGATION_PROFILE
 #include "mpe.h"
 #endif
-#ifdef ROMIO_BG
-# include "adio/ad_bg/ad_bg_tuning.h"
+#ifdef ROMIO_GPFS
+# include "adio/ad_gpfs/ad_gpfs_tuning.h"
 #endif
 
 void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count, 
@@ -37,19 +37,19 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
     MPI_Type_size_x(datatype, &datatype_size);
     len = datatype_size * (ADIO_Offset)count;
 
-#ifdef ROMIO_BG
-    if (bgmpio_timing) {
+#ifdef ROMIO_GPFS
+    if (gpfsmpio_timing) {
 	io_time = MPI_Wtime();
-	bgmpio_prof_cr[ BGMPIO_CIO_DATA_SIZE ] += len;
+	gpfsmpio_prof_cr[ GPFSMPIO_CIO_DATA_SIZE ] += len;
     }
 #endif
 
     if (file_ptr_type == ADIO_INDIVIDUAL) {
 	offset = fd->fp_ind;
     }
- 
-#if ROMIO_BG
-    if (bgmpio_timing) io_time2 = MPI_Wtime();
+
+#ifdef ROMIO_GPFS
+    if (gpfsmpio_timing) io_time2 = MPI_Wtime();
 #endif
     p=buf;
     while (bytes_xfered < len) {
@@ -57,9 +57,11 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
 	MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
 #endif
 	rd_count = len - bytes_xfered;
-	if (bgmpio_devnullio)
+#ifdef ROMIO_GPFS
+	if (gpfsmpio_devnullio)
 	    err = pread(fd->null_fd, p, rd_count, offset+bytes_xfered);
 	else
+#endif
 	    err = pread(fd->fd_sys, p, rd_count, offset+bytes_xfered);
 	/* --BEGIN ERROR HANDLING-- */
 	if (err == -1) {
@@ -83,8 +85,8 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
 	bytes_xfered += err;
 	p += err;
     }
-#if ROMIO_BG
-    if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
+#ifdef ROMIO_GPFS
+    if (gpfsmpio_timing) gpfsmpio_prof_cr[ GPFSMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
 #endif
     fd->fp_sys_posn = offset + bytes_xfered;
 
@@ -102,5 +104,7 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
 #ifdef AGGREGATION_PROFILE
     MPE_Log_event (5035, 0, NULL);
 #endif
-    if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
+#ifdef ROMIO_GPFS
+    if (gpfsmpio_timing) gpfsmpio_prof_cr[ GPFSMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
+#endif
 }
diff --git a/src/mpi/romio/adio/common/ad_set_sh_fp.c b/src/mpi/romio/adio/common/ad_set_sh_fp.c
index ba6affd..77bcc6c 100644
--- a/src/mpi/romio/adio/common/ad_set_sh_fp.c
+++ b/src/mpi/romio/adio/common/ad_set_sh_fp.c
@@ -5,8 +5,8 @@
  */
 
 #include "adio.h"
-#ifdef ROMIO_BG
-void ADIOI_BG_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
+#ifdef ROMIO_GPFS
+void ADIOI_GPFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
 #endif
 
 /* set the shared file pointer to "offset" etypes relative to the current 
@@ -26,17 +26,9 @@ void ADIO_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code)
     }
 #endif
 
-#ifdef ROMIO_BGL
-    /* BGLOCKLESS won't support shared fp */
-    if (fd->file_system == ADIO_BGL) {
-	ADIOI_BGL_Set_shared_fp(fd, offset, error_code);
-	return;
-    }
-#endif
-#ifdef ROMIO_BG
-    /* BGLOCKLESS won't support shared fp */
-    if (fd->file_system == ADIO_BG) {
-	ADIOI_BG_Set_shared_fp(fd, offset, error_code);
+#ifdef ROMIO_GPFS
+    if (fd->file_system == ADIO_GPFS) {
+	ADIOI_GPFS_Set_shared_fp(fd, offset, error_code);
 	return;
     }
 #endif
diff --git a/src/mpi/romio/adio/common/ad_write.c b/src/mpi/romio/adio/common/ad_write.c
index 2a9a57f..45c2eec 100644
--- a/src/mpi/romio/adio/common/ad_write.c
+++ b/src/mpi/romio/adio/common/ad_write.c
@@ -14,8 +14,8 @@
 #include "mpe.h"
 #endif
 
-#ifdef ROMIO_BG
-#include "adio/ad_bg/ad_bg_tuning.h"
+#ifdef ROMIO_GPFS
+#include "adio/ad_gpfs/ad_gpfs_tuning.h"
 #endif
 
 
@@ -40,10 +40,10 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
     MPI_Type_size_x(datatype, &datatype_size);
     len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
 
-#ifdef ROMIO_BG
-    if (bgmpio_timing) {
+#ifdef ROMIO_GPFS
+    if (gpfsmpio_timing) {
 	io_time = MPI_Wtime();
-	bgmpio_prof_cw[ BGMPIO_CIO_DATA_SIZE ] += len;
+	gpfsmpio_prof_cw[ GPFSMPIO_CIO_DATA_SIZE ] += len;
     }
 #endif
 
@@ -51,8 +51,8 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
 	offset = fd->fp_ind;
     }
 
-#ifdef ROMIO_BG
-    if (bgmpio_timing) io_time2 = MPI_Wtime();
+#ifdef ROMIO_GPFS
+    if (gpfsmpio_timing) io_time2 = MPI_Wtime();
 #endif
     p = (char *)buf;
     while (bytes_xfered < len) {
@@ -60,9 +60,11 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
 	MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
 #endif
 	wr_count = len - bytes_xfered;
-	if (bgmpio_devnullio)
+#ifdef ROMIO_GPFS
+	if (gpfsmpio_devnullio)
 	    err = pwrite(fd->null_fd, p, wr_count, offset+bytes_xfered);
 	else
+#endif
 	    err = pwrite(fd->fd_sys, p, wr_count, offset+bytes_xfered);
 	/* --BEGIN ERROR HANDLING-- */
 	if (err == -1) {
@@ -82,8 +84,8 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
 	p += err;
     }
 
-#ifdef ROMIO_BG
-    if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
+#ifdef ROMIO_GPFS
+    if (gpfsmpio_timing) gpfsmpio_prof_cw[ GPFSMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
 #endif
     fd->fp_sys_posn = offset + bytes_xfered;
 
@@ -91,8 +93,8 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
 	fd->fp_ind += bytes_xfered; 
     }
 
-#ifdef ROMIO_BG
-    if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
+#ifdef ROMIO_GPFS
+    if (gpfsmpio_timing) gpfsmpio_prof_cw[ GPFSMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
 #endif
 
 #ifdef HAVE_STATUS_SET_BYTES
diff --git a/src/mpi/romio/adio/common/cb_config_list.c b/src/mpi/romio/adio/common/cb_config_list.c
index 9db91af..cfd29f7 100644
--- a/src/mpi/romio/adio/common/cb_config_list.c
+++ b/src/mpi/romio/adio/common/cb_config_list.c
@@ -688,7 +688,7 @@ static int get_max_procs(int cb_nodes)
  *
  * Returns a token of types defined at top of this file.
  */
-#if defined(ROMIO_BGL) || defined(ROMIO_BG)
+#if defined(ROMIO_GPFS)
 /* On BlueGene, the ',' character shows up in get_processor_name, so we have to
  * use a different delimiter */
 #define COLON ':'
diff --git a/src/mpi/romio/adio/common/p2p_aggregation.c b/src/mpi/romio/adio/common/p2p_aggregation.c
index a4ec07c..53df38e 100644
--- a/src/mpi/romio/adio/common/p2p_aggregation.c
+++ b/src/mpi/romio/adio/common/p2p_aggregation.c
@@ -1,8 +1,7 @@
 #include "adio.h"
 #include "adio_extern.h"
-//#include "ad_bg.h"
 #include <mpix.h>
-#include "../ad_bg/ad_bg_tuning.h"
+#include "../ad_gpfs/ad_gpfs_tuning.h"
 
 void P2PContigWriteAggregation(ADIO_File fd,
 	const void *buf,
@@ -33,7 +32,7 @@ void P2PContigWriteAggregation(ADIO_File fd,
 
     int naggs = fd->hints->cb_nodes;
     int coll_bufsize = fd->hints->cb_buffer_size;
-    if (bgmpio_pthreadio == 1) {
+    if (gpfsmpio_pthreadio == 1) {
 	/* split buffer in half for a kind of double buffering with the threads*/
 	coll_bufsize = fd->hints->cb_buffer_size/2;
     }
@@ -155,7 +154,7 @@ void P2PContigWriteAggregation(ADIO_File fd,
 
     int currentWriteBuf = 0;
     int useIOBuffer = 0;
-    if (bgmpio_pthreadio && (numberOfRounds>1)) {
+    if (gpfsmpio_pthreadio && (numberOfRounds>1)) {
 	useIOBuffer = 1;
 	io_thread = pthread_self();
     }
@@ -171,7 +170,7 @@ void P2PContigWriteAggregation(ADIO_File fd,
     int *mpiRequestMapPerProc = (int *)ADIOI_Malloc(numSourceProcs * sizeof(int));
 
     endTimeBase = MPI_Wtime();
-    bgmpio_prof_cw[BGMPIO_CIO_T_MYREQ] += (endTimeBase-startTimeBase);
+    gpfsmpio_prof_cw[GPFSMPIO_CIO_T_MYREQ] += (endTimeBase-startTimeBase);
     startTimeBase = MPI_Wtime();
 
     /* each iteration of this loop writes a coll_bufsize portion of the file
@@ -249,7 +248,7 @@ void P2PContigWriteAggregation(ADIO_File fd,
 
 	}
 
-	bgmpio_prof_cw[BGMPIO_CIO_T_DEXCH_SETUP] += (endTimeBase-startTimeBase);
+	gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SETUP] += (endTimeBase-startTimeBase);
 	startTimeBase = MPI_Wtime();
 
 	// the aggs receive the data from the source procs
@@ -297,7 +296,7 @@ void P2PContigWriteAggregation(ADIO_File fd,
 	}
 
 	endTimeBase = MPI_Wtime();
-	bgmpio_prof_cw[BGMPIO_CIO_T_DEXCH_NET] += (endTimeBase-startTimeBase);
+	gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_NET] += (endTimeBase-startTimeBase);
 	// the aggs now write the data
 	if (numDataRecvToWaitFor > 0) {
 
@@ -351,7 +350,7 @@ void P2PContigWriteAggregation(ADIO_File fd,
     } // for-loop roundIter
 
     endTimeBase=MPI_Wtime();
-    bgmpio_prof_cw[BGMPIO_CIO_T_DEXCH] += (endTimeBase-startTimeBase);
+    gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH] += (endTimeBase-startTimeBase);
 
     if (useIOBuffer) { // thread writer cleanup
 
@@ -414,7 +413,7 @@ void P2PContigReadAggregation(ADIO_File fd,
 
     int naggs = fd->hints->cb_nodes;
     int coll_bufsize = fd->hints->cb_buffer_size;
-    if (bgmpio_pthreadio == 1)
+    if (gpfsmpio_pthreadio == 1)
 	/* share buffer between working threads */
 	coll_bufsize = coll_bufsize/2;
 
@@ -551,13 +550,13 @@ void P2PContigReadAggregation(ADIO_File fd,
 
     int currentReadBuf = 0;
     int useIOBuffer = 0;
-    if (bgmpio_pthreadio && (numberOfRounds>1)) {
+    if (gpfsmpio_pthreadio && (numberOfRounds>1)) {
 	useIOBuffer = 1;
 	io_thread = pthread_self();
     }
 
     endTimeBase = MPI_Wtime();
-    bgmpio_prof_cw[BGMPIO_CIO_T_MYREQ] += (endTimeBase-startTimeBase);
+    gpfsmpio_prof_cw[GPFSMPIO_CIO_T_MYREQ] += (endTimeBase-startTimeBase);
 
 
     // each iteration of this loop reads a coll_bufsize portion of the file domain
diff --git a/src/mpi/romio/adio/include/adio.h b/src/mpi/romio/adio/include/adio.h
index e9a56a8..fb1b22b 100644
--- a/src/mpi/romio/adio/include/adio.h
+++ b/src/mpi/romio/adio/include/adio.h
@@ -294,10 +294,11 @@ typedef struct {
 #define ADIO_PANFS               161   /* Panasas FS */
 #define ADIO_GRIDFTP             162   /* Globus GridFTP */
 #define ADIO_LUSTRE              163   /* Lustre */
-#define ADIO_BGL                 164   /* IBM BGL */
-#define ADIO_BGLOCKLESS          165   /* IBM BGL (lock-free) */
+// #define ADIO_BGL                 164   /* IBM BGL */
+// #define ADIO_BGLOCKLESS          165   /* IBM BGL (lock-free) */
 #define ADIO_ZOIDFS              167   /* ZoidFS: the I/O forwarding fs */
-#define ADIO_BG                  168
+//#define ADIO_BG                  168
+#define ADIO_GPFS                  168
 
 #define ADIO_SEEK_SET            SEEK_SET
 #define ADIO_SEEK_CUR            SEEK_CUR
diff --git a/src/mpi/romio/adio/include/adioi_fs_proto.h b/src/mpi/romio/adio/include/adioi_fs_proto.h
index 65f0183..e3af917 100644
--- a/src/mpi/romio/adio/include/adioi_fs_proto.h
+++ b/src/mpi/romio/adio/include/adioi_fs_proto.h
@@ -74,19 +74,9 @@ extern struct ADIOI_Fns_struct ADIO_TESTFS_operations;
 /* prototypes are in adio/ad_testfs/ad_testfs.h */
 #endif
 
-#ifdef ROMIO_BGL
-extern struct ADIOI_Fns_struct ADIO_BGL_operations;
-/* prototypes are in adio/ad_bgl/ad_bgl.h */
-#endif
-
-#ifdef ROMIO_BG
-extern struct ADIOI_Fns_struct ADIO_BG_operations;
-/* prototypes are in adio/ad_bg/ad_bg.h */
-#endif
-
-#ifdef ROMIO_BGLOCKLESS
-extern struct ADIOI_Fns_struct ADIO_BGLOCKLESS_operations;
-/* no extra prototypes for this fs at this time */
+#ifdef ROMIO_GPFS
+extern struct ADIOI_Fns_struct ADIO_GPFS_operations;
+/* prototypes are in adio/ad_gpfs/ad_gpfs.h */
 #endif
 
 #ifdef ROMIO_GRIDFTP
diff --git a/src/mpi/romio/configure.ac b/src/mpi/romio/configure.ac
index e3b07c7..d614fbe 100644
--- a/src/mpi/romio/configure.ac
+++ b/src/mpi/romio/configure.ac
@@ -201,13 +201,6 @@ fi
 # start with the set of file systems that the user asked for
 # FILE_SYSTEM=$with_file_system
 FILE_SYSTEM=`echo $with_file_system | sed -e 's/:.*$//'`
-changequote(<<,>>)
-file_system_args=`echo $with_file_system | sed -e 's/^[^:]*//' -e 's/^://'`
-changequote([,])
-if test "$file_system_args" = "BGQ" ; then
-    AC_DEFINE(FSPLATFORM,BGQ,BGQ platform)
-fi
-echo "with_file_system is :"$with_file_system": file_system_args is :"$file_system_args": FILE_SYSTEM is :"$FILE_SYSTEM": FSPLATFORM is :"$FSPLATFORM":"
 
 # Check if Make is working
 PAC_PROG_MAKE
@@ -800,9 +793,24 @@ fi
 if test -n "$file_system_ufs"; then
     AC_DEFINE(ROMIO_UFS,1,[Define for ROMIO with UFS])
 fi
+
+changequote(<<,>>)
+file_system_args=`echo $with_file_system | sed -e 's/^[^:]*//' -e 's/^://'`
+changequote([,])
+
 if test -n "$file_system_gpfs"; then
     AC_DEFINE(ROMIO_GPFS,1,[Define for ROMIO with GPFS])
 fi
+
+if test "$file_system_args" = "BGQ" -a -n "$file_system_gpfs"; then
+    AC_DEFINE(BGQPLATFORM,1,BGQ platform)
+fi
+if test "$file_system_args" = "PE" -a -n "$file_system_gpfs"; then
+    AC_DEFINE(PEPLATFORM,1,PE platform)
+fi
+
+# echo "with_file_system is :"$with_file_system": file_system_args is :"$file_system_args": FILE_SYSTEM is :"$FILE_SYSTEM":"
+
 if test -n "$file_system_hfs"; then
     AC_DEFINE(ROMIO_HFS,1,[Define for ROMIO with HFS])
 fi

http://git.mpich.org/mpich.git/commitdiff/bc1ae63767f19ba4e9de97612eb59348c6ca2c61

commit bc1ae63767f19ba4e9de97612eb59348c6ca2c61
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Tue Mar 4 22:17:35 2014 -0600

    makefile and autoconf changes

diff --git a/src/mpi/romio/adio/Makefile.mk b/src/mpi/romio/adio/Makefile.mk
index f7494b2..caca412 100644
--- a/src/mpi/romio/adio/Makefile.mk
+++ b/src/mpi/romio/adio/Makefile.mk
@@ -21,9 +21,7 @@ noinst_HEADERS +=                      \
     adio/include/mpiu_greq.h           \
     adio/include/nopackage.h
 
-include $(top_srcdir)/adio/ad_bg/Makefile.mk
-include $(top_srcdir)/adio/ad_bgl/Makefile.mk
-include $(top_srcdir)/adio/ad_bglockless/Makefile.mk
+include $(top_srcdir)/adio/ad_gpfs/Makefile.mk
 include $(top_srcdir)/adio/ad_gridftp/Makefile.mk
 include $(top_srcdir)/adio/ad_hfs/Makefile.mk
 include $(top_srcdir)/adio/ad_lustre/Makefile.mk
diff --git a/src/mpi/romio/adio/ad_gpfs/Makefile.mk b/src/mpi/romio/adio/ad_gpfs/Makefile.mk
index ade29c5..0ab6f43 100644
--- a/src/mpi/romio/adio/ad_gpfs/Makefile.mk
+++ b/src/mpi/romio/adio/ad_gpfs/Makefile.mk
@@ -11,23 +11,23 @@ AM_CPPFLAGS += -DBGL_OPTIM_STEP1_2=1 -DBGL_OPTIM_STEP1_1=1
 
 noinst_HEADERS +=                                                    \
     adio/ad_bg/ad_bg_aggrs.h                                         \
-    adio/ad_bg/ad_bg.h                                               \
+    adio/ad_bg/ad_gpfs.h                                               \
     adio/ad_bg/ad_bg_pset.h                                          \
     adio/ad_bg/ad_bg_tuning.h
 
 romio_other_sources +=                                               \
     adio/ad_bg/ad_bg_aggrs.c                                         \
-    adio/ad_bg/ad_bg_close.c                                         \
-    adio/ad_bg/ad_bg_flush.c                                         \
+    adio/ad_bg/ad_gpfs_close.c                                         \
+    adio/ad_bg/ad_gpfs_flush.c                                         \
     adio/ad_bg/ad_bg_hints.c                                         \
     adio/ad_bg/ad_bg_pset.c                                          \
     adio/ad_bg/ad_bg_tuning.c                                        \
-    adio/ad_bg/ad_bg.c                                               \
-    adio/ad_bg/ad_bg_fcntl.c                                         \
-    adio/ad_bg/ad_bg_getsh.c                                         \
-    adio/ad_bg/ad_bg_open.c                                          \
-    adio/ad_bg/ad_bg_rdcoll.c                                        \
-    adio/ad_bg/ad_bg_setsh.c                                         \
-    adio/ad_bg/ad_bg_wrcoll.c
+    adio/ad_bg/ad_gpfs.c                                               \
+    adio/ad_bg/ad_gpfs_fcntl.c                                         \
+    adio/ad_bg/ad_gpfs_getsh.c                                         \
+    adio/ad_bg/ad_gpfs_open.c                                          \
+    adio/ad_bg/ad_gpfs_rdcoll.c                                        \
+    adio/ad_bg/ad_gpfs_setsh.c                                         \
+    adio/ad_bg/ad_gpfs_wrcoll.c
 
 endif BUILD_AD_BG
diff --git a/src/mpi/romio/configure.ac b/src/mpi/romio/configure.ac
index 6257d59..e3b07c7 100644
--- a/src/mpi/romio/configure.ac
+++ b/src/mpi/romio/configure.ac
@@ -44,7 +44,7 @@ fi
 
 AC_CONFIG_HEADER(adio/include/romioconf.h)
 AH_TOP([/*
- *  (C) 2008 by Argonne National Laboratory.
+ *  (C) 2011 by Argonne National Laboratory.
  *      See COPYRIGHT in top-level directory.
  */
 #ifndef ROMIOCONF_H_INCLUDED
@@ -77,6 +77,7 @@ AC_SUBST([MPI_H_INCLUDE])
 
 TEST_LIBNAME=""
 FILE_SYSTEM=""
+
 # Do not set variables to empty that may be communicated from the
 # outside environment (e.g., MPI_LIB, MPI_BIN_DIR, LIBNAME)
 DEBUG=no
@@ -140,7 +141,7 @@ dnl An m4 macro for use with m4_foreach_w and friends.  You should modify this
 dnl list if you want to add a known file system.  The list is just whitespace
 dnl separated, so you can use newlines and tabs as well.
 m4_define([known_filesystems_m4_w],
-          [nfs ufs pfs pvfs pvfs2 testfs xfs panfs gridftp lustre bg bgl bglockless zoidfs hfs piofs sfs])dnl
+          [nfs ufs pfs pvfs pvfs2 testfs xfs panfs gridftp lustre gpfs zoidfs hfs piofs sfs])dnl
 dnl
 dnl An m4 macro for use with m4_foreach and friends.  Expands to a quoted list of
 dnl quoted elements.  A bit easier to use without unintended expansion than the
@@ -198,8 +199,15 @@ if test -n "$with_mpi"; then
 fi
 
 # start with the set of file systems that the user asked for
-FILE_SYSTEM=$with_file_system
-
+# FILE_SYSTEM=$with_file_system
+FILE_SYSTEM=`echo $with_file_system | sed -e 's/:.*$//'`
+changequote(<<,>>)
+file_system_args=`echo $with_file_system | sed -e 's/^[^:]*//' -e 's/^://'`
+changequote([,])
+if test "$file_system_args" = "BGQ" ; then
+    AC_DEFINE(FSPLATFORM,BGQ,BGQ platform)
+fi
+echo "with_file_system is :"$with_file_system": file_system_args is :"$file_system_args": FILE_SYSTEM is :"$FILE_SYSTEM": FSPLATFORM is :"$FSPLATFORM":"
 
 # Check if Make is working
 PAC_PROG_MAKE
@@ -792,24 +800,8 @@ fi
 if test -n "$file_system_ufs"; then
     AC_DEFINE(ROMIO_UFS,1,[Define for ROMIO with UFS])
 fi
-if test -n "$file_system_bgl"; then	
-    AC_DEFINE(ROMIO_BGL,1,[Define for ROMIO with BGL])
-fi
-if test -n "$file_system_bg"; then
-    AC_DEFINE(ROMIO_BG,1,[Define for ROMIO with BG])
-fi
-if test -n "$file_system_bglockless"; then
-    if test -n "$file_system_bgl"; then
-        AC_DEFINE(ROMIO_BGLOCKLESS,1,[Define for lock-free ROMIO with BGL])
-    fi
-
-    if test -n "$file_system_bg"; then
-        AC_DEFINE(ROMIO_BGLOCKLESS,1,[Define for lock-free ROMIO with BG])
-    fi
-
-    if test -n "$ROMIO_BGLOCKLESS"; then
-        AC_MSG_ERROR("bglockless requested without [bgl|bg]")
-    fi
+if test -n "$file_system_gpfs"; then
+    AC_DEFINE(ROMIO_GPFS,1,[Define for ROMIO with GPFS])
 fi
 if test -n "$file_system_hfs"; then
     AC_DEFINE(ROMIO_HFS,1,[Define for ROMIO with HFS])
@@ -958,7 +950,7 @@ if test -n "$file_system_gridftp"; then
 	AC_DEFINE(ROMIO_GRIDFTP, 1, [Define for ROMIO with gridftp])
 fi
 
-AS_IF([test -n "$file_system_bgl" -o -n "$file_system_bg"],
+AS_IF([test -n "$file_system_gpfs"],
     [SYSDEP_INC=-I${prefix}/include], [SYSDEP_INC=])
 
 # Check for presence and characteristics of async. I/O calls if
@@ -1245,6 +1237,7 @@ if test "$have_aio_h" = "yes" -o "$have_sys_aio_h" = "yes" -o "x$disable_aio" =
     )
 	
 fi
+
 # End of aio-related tests
 
 #

http://git.mpich.org/mpich.git/commitdiff/614819fd401b2c8452a5d33937a6f7761ece6a93

commit 614819fd401b2c8452a5d33937a6f7761ece6a93
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Tue Mar 4 22:10:46 2014 -0600

    move ad_bg to ad_gpfs files and directories

diff --git a/src/mpi/romio/adio/ad_bg/.gitignore b/src/mpi/romio/adio/ad_gpfs/.gitignore
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/.gitignore
rename to src/mpi/romio/adio/ad_gpfs/.gitignore
diff --git a/src/mpi/romio/adio/ad_bg/Makefile.mk b/src/mpi/romio/adio/ad_gpfs/Makefile.mk
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/Makefile.mk
rename to src/mpi/romio/adio/ad_gpfs/Makefile.mk
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.c b/src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.c
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg_aggrs.c
rename to src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.c
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.h b/src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.h
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg_aggrs.h
rename to src/mpi/romio/adio/ad_gpfs/ad_bg_aggrs.h
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_hints.c b/src/mpi/romio/adio/ad_gpfs/ad_bg_hints.c
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg_hints.c
rename to src/mpi/romio/adio/ad_gpfs/ad_bg_hints.c
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_pset.c b/src/mpi/romio/adio/ad_gpfs/ad_bg_pset.c
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg_pset.c
rename to src/mpi/romio/adio/ad_gpfs/ad_bg_pset.c
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_pset.h b/src/mpi/romio/adio/ad_gpfs/ad_bg_pset.h
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg_pset.h
rename to src/mpi/romio/adio/ad_gpfs/ad_bg_pset.h
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c b/src/mpi/romio/adio/ad_gpfs/ad_bg_tuning.c
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
rename to src/mpi/romio/adio/ad_gpfs/ad_bg_tuning.c
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h b/src/mpi/romio/adio/ad_gpfs/ad_bg_tuning.h
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
rename to src/mpi/romio/adio/ad_gpfs/ad_bg_tuning.h
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs.c
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg.c
rename to src/mpi/romio/adio/ad_gpfs/ad_gpfs.c
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg.h b/src/mpi/romio/adio/ad_gpfs/ad_gpfs.h
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg.h
rename to src/mpi/romio/adio/ad_gpfs/ad_gpfs.h
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_close.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_close.c
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg_close.c
rename to src/mpi/romio/adio/ad_gpfs/ad_gpfs_close.c
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_fcntl.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_fcntl.c
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg_fcntl.c
rename to src/mpi/romio/adio/ad_gpfs/ad_gpfs_fcntl.c
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_flush.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_flush.c
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg_flush.c
rename to src/mpi/romio/adio/ad_gpfs/ad_gpfs_flush.c
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_getsh.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_getsh.c
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg_getsh.c
rename to src/mpi/romio/adio/ad_gpfs/ad_gpfs_getsh.c
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_open.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_open.c
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg_open.c
rename to src/mpi/romio/adio/ad_gpfs/ad_gpfs_open.c
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
rename to src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_setsh.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_setsh.c
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg_setsh.c
rename to src/mpi/romio/adio/ad_gpfs/ad_gpfs_setsh.c
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
similarity index 100%
rename from src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
rename to src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c

http://git.mpich.org/mpich.git/commitdiff/283629cd960f01957b99b8a6254af47a2fedcb1d

commit 283629cd960f01957b99b8a6254af47a2fedcb1d
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Mon Jan 27 19:46:51 2014 +0000

    romio configure: if/fi unbalanced
    
    astonishingly, the blue gene L(!) condition lacked a closing 'fi' but we
    never noticed since async I/O never worked on blue gene.  Use the AS_IF
    macro to make this less likely to recur in the future.

diff --git a/src/mpi/romio/configure.ac b/src/mpi/romio/configure.ac
index 4bf99dd..6257d59 100644
--- a/src/mpi/romio/configure.ac
+++ b/src/mpi/romio/configure.ac
@@ -958,10 +958,8 @@ if test -n "$file_system_gridftp"; then
 	AC_DEFINE(ROMIO_GRIDFTP, 1, [Define for ROMIO with gridftp])
 fi
 
-if test -n "$file_system_bgl" -o -n "$file_system_bg"; then
-    SYSDEP_INC=-I${prefix}/include
-else
-    SYSDEP_INC=
+AS_IF([test -n "$file_system_bgl" -o -n "$file_system_bg"],
+    [SYSDEP_INC=-I${prefix}/include], [SYSDEP_INC=])
 
 # Check for presence and characteristics of async. I/O calls if
 # not disabled.
@@ -1247,7 +1245,6 @@ if test "$have_aio_h" = "yes" -o "$have_sys_aio_h" = "yes" -o "x$disable_aio" =
     )
 	
 fi
-fi
 # End of aio-related tests
 
 #

http://git.mpich.org/mpich.git/commitdiff/e8b5dfdbb9765dde0c17f37735d6f4381cd59d13

commit e8b5dfdbb9765dde0c17f37735d6f4381cd59d13
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Wed Feb 26 16:25:48 2014 -0600

    Significant simplification of ad_bg_open
    
    In order to accommodate deferred open, we can't do *any* collective
    operations in ad_bg_open.  Any collectives have to happen one level up
    at ADIOI_GEN_Opencoll.
    
    We already promoted fs blksize in a prior patch, and simplified
    "scalable sync" in another patch, so when we remove the collective call
    (bcast of blocksize and fs type), we can also remove the "is it ok to
    scalable sync"? (because it will always be ok) and the "are we an
    fsync-aggregator" logic because now only the first io aggregator will be
    such an aggregator.

diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.h b/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.h
index 35faf16..0fd76f4 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.h
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.h
@@ -27,15 +27,6 @@
   #define GPFS_SUPER_MAGIC (0x47504653)
 #endif
 
-    /* File system (BG) specific information - 
-         hung off of ADIOI_FileD file descriptor (fd->fs_ptr) at open */
-    typedef struct ADIOI_BG_fs_s {
-      int         fsync_aggr; /* "fsync aggregation" flags (below) */
-#define ADIOI_BG_FSYNC_AGGREGATION_DISABLED  0x00
-#define ADIOI_BG_FSYNC_AGGREGATION_ENABLED   0x01
-#define ADIOI_BG_FSYNC_AGGREGATOR            0x10 /* This rank is an aggregator */
-    }  ADIOI_BG_fs;
-
     /* generate a list of I/O aggregators that utilizes BG-PSET orginization. */
     int ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset);
 
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_open.c b/src/mpi/romio/adio/ad_bg/ad_bg_open.c
index 1ca1595..aef9f06 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_open.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_open.c
@@ -18,189 +18,9 @@
 #include <sys/statfs.h>
 #include <sys/vfs.h>
 
-/* COPIED FROM ad_fstype.c since it is static in that file
-
- ADIO_FileSysType_parentdir - determines a string pathname for the
- parent directory of a given filename.
-
-Input Parameters:
-. filename - pointer to file name character array
-
-Output Parameters:
-. dirnamep - pointer to location in which to store a pointer to a string
-
- Note that the caller should free the memory located at the pointer returned
- after the string is no longer needed.
-*/
-
-#ifndef PATH_MAX
-#define PATH_MAX 65535
-#endif
-
-/* In a strict ANSI environment, S_ISLNK may not be defined.  Fix that
-   here.  We assume that S_ISLNK is *always* defined as a macro.  If
-   that is not universally true, then add a test to the romio
-   configure that trys to link a program that references S_ISLNK */
-#if !defined(S_ISLNK) 
-#    if defined(S_IFLNK)
-     /* Check for the link bit */
-#    define S_ISLNK(mode) ((mode) & S_IFLNK)
-#    else
-     /* no way to check if it is a link, so say false */
-#    define S_ISLNK(mode) 0   
-#    endif
-#endif /* !(S_ISLNK) */
-
-/* ADIO_FileSysType_parentdir
- *
- * Returns pointer to string in dirnamep; that string is allocated with
- * strdup and must be free()'d.
- */
-static void ADIO_FileSysType_parentdir(char *filename, char **dirnamep)
-{
-    int err;
-    char *dir = NULL, *slash;
-    struct stat statbuf;
-    
-    err = lstat(filename, &statbuf);
-
-    if (err || (!S_ISLNK(statbuf.st_mode))) {
-	/* no such file, or file is not a link; these are the "normal"
-	 * cases where we can just return the parent directory.
-	 */
-	dir = ADIOI_Strdup(filename);
-    }
-    else {
-	/* filename is a symlink.  we've presumably already tried
-	 * to stat it and found it to be missing (dangling link),
-	 * but this code doesn't care if the target is really there
-	 * or not.
-	 */
-	int namelen;
-	char *linkbuf;
-
-	linkbuf = ADIOI_Malloc(PATH_MAX+1);
-	namelen = readlink(filename, linkbuf, PATH_MAX+1);
-	if (namelen == -1) {
-	    /* something strange has happened between the time that
-	     * we determined that this was a link and the time that
-	     * we attempted to read it; punt and use the old name.
-	     */
-	    dir = ADIOI_Strdup(filename);
-	}
-	else {
-	    /* successfully read the link */
-	    linkbuf[namelen] = '\0'; /* readlink doesn't null terminate */
-	    dir = ADIOI_Strdup(linkbuf);
-	    ADIOI_Free(linkbuf);
-	}
-    }
-
-    slash = strrchr(dir, '/');
-    if (!slash) ADIOI_Strncpy(dir, ".", 2);
-    else {
-	if (slash == dir) *(dir + 1) = '\0';
-	else *slash = '\0';
-    }
-
-    *dirnamep = dir;
-    return;
-}
-
-static void scaleable_stat(ADIO_File fd)
-{
-    struct stat64 bg_stat;
-    struct statfs bg_statfs;
-    int rank, rc;
-    char * dir;
-    long buf[2];
-    MPI_Comm_rank(fd->comm, &rank);
-
-    if ((rank == fd->hints->ranklist[0]) || (fd->comm == MPI_COMM_SELF)) {
-	/* Get the (real) underlying file system block size */
-	rc = stat64(fd->filename, &bg_stat);
-	if (rc >= 0)
-	{
-	    buf[0] = bg_stat.st_blksize;
-	    DBGV_FPRINTF(stderr,"Successful stat '%s'.  Blocksize=%ld\n",
-		    fd->filename,bg_stat.st_blksize);
-	}
-	else
-	{
-	    DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
-		    fd->filename,rc,errno);
-	}
-	/* Get the (real) underlying file system type so we can 
-	 * plan our fsync scaling strategy */
-	rc = statfs(fd->filename,&bg_statfs);
-	if (rc >= 0)
-	{
-	    DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#lX\n",
-		    fd->filename,bg_statfs.f_type);
-	    buf[1] = bg_statfs.f_type;
-	}
-	else
-	{
-	    DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",
-		    fd->filename,rc,errno);
-	    ADIO_FileSysType_parentdir(fd->filename, &dir);
-	    rc = statfs(dir,&bg_statfs);
-	    if (rc >= 0)
-	    {
-		DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#lX\n",dir,bg_statfs.f_type);
-		buf[1] = bg_statfs.f_type;
-	    }
-	    else
-	    {
-		/* Hmm.  Guess we'll assume the worst-case, that it's not GPFS
-		 * or BGLOCKLESSMPIO_F_TYPE (default PVFS2) below */
-		buf[1] = -1; /* bogus magic number */
-		DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",dir,rc,errno);
-	    }
-	    free(dir);
-	}
-    }
-    /* now we can broadcast the stat/statfs data to everyone else */
-    if (fd->comm != MPI_COMM_SELF) { /* if indep open, there's no one to talk to*/
-	if (fd->agg_comm != MPI_COMM_NULL) /* deferred open: only a subset of
-					      processes participate */
-	    MPI_Bcast(buf, 2, MPI_LONG, 0, fd->agg_comm);
-	else
-	    MPI_Bcast(buf, 2, MPI_LONG, fd->hints->ranklist[0], fd->comm);
-    }
-    bg_stat.st_blksize = buf[0];
-    bg_statfs.f_type = buf[1];
-
-    /* data from stat64 */
-    /* store the blksize in the file system specific storage */
-    fd->blksize = bg_stat.st_blksize;
-
-    /* data from statfs */
-   if ((bg_statfs.f_type == GPFS_SUPER_MAGIC) ||
-       (bg_statfs.f_type == bglocklessmpio_f_type))
-   {
-      ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr = 
-            ADIOI_BG_FSYNC_AGGREGATION_ENABLED;
-
-      /* Only one rank is an "fsync aggregator" because only one 
-      * fsync is needed */
-      if (rank == fd->hints->ranklist[0])
-      {
-         ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr |= 
-            ADIOI_BG_FSYNC_AGGREGATOR;
-         DBG_FPRINTF(stderr,"fsync aggregator %d\n",rank);
-      }
-      else 
-         ; /* aggregation enabled but this rank is not an aggregator*/
-   }
-   else
-      ; /* Other filesystems default to no fsync aggregation */
-}
-
-
 void ADIOI_BG_Open(ADIO_File fd, int *error_code)
 {
-  int perm, old_mask, amode;
+  int perm, old_mask, amode, rank, rc;
   static char myname[] = "ADIOI_BG_OPEN";
 
   /* set internal variables for tuning environment variables */
@@ -246,27 +66,39 @@ void ADIOI_BG_Open(ADIO_File fd, int *error_code)
     if(fd->fd_sys != -1)
     {
 
-        /* Initialize the ad_bg file system specific information */
-        ADIOI_BG_assert(fd->fs_ptr == NULL);
-        fd->fs_ptr = (ADIOI_BG_fs*) ADIOI_Malloc(sizeof(ADIOI_BG_fs));
-
         fd->blksize = 1048576; /* default to 1M */
 
-        /* default is no fsync aggregation */
-        ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr = 
-	    ADIOI_BG_FSYNC_AGGREGATION_DISABLED; 
-
-
 #ifdef ADIOI_MPE_LOGGING
         MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
 #endif
-        scaleable_stat(fd);
+	/* in this fs-specific routine, we might not be called over entire
+	 * communicator (deferred open).  Collect statistics on one process.
+	 * ADIOI_GEN_Opencoll (common-code caller) will take care of the
+	 * broadcast */
+
+	MPI_Comm_rank(fd->comm, &rank);
+	if ((rank == fd->hints->ranklist[0]) || (fd->comm == MPI_COMM_SELF)) {
+	    struct stat64 bg_stat;
+	    /* Get the (real) underlying file system block size */
+	    rc = stat64(fd->filename, &bg_stat);
+	    if (rc >= 0)
+	    {
+		fd->blksize = bg_stat.st_blksize;
+		DBGV_FPRINTF(stderr,"Successful stat '%s'.  Blocksize=%ld\n",
+			fd->filename,bg_stat.st_blksize);
+	    }
+	    else
+	    {
+		DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
+			fd->filename,rc,errno);
+	    }
+	}
+	/* all other ranks have incorrect fd->blocksize, but ADIOI_GEN_Opencoll
+	 * will take care of that in both standard and deferred-open case */
+
 #ifdef ADIOI_MPE_LOGGING
         MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
 #endif
-	/* file domain code will get terribly confused in a hard-to-debug way
-	 * if gpfs blocksize not sensible */
-        ADIOI_BG_assert( fd->blksize > 0);
     }
 
   if (fd->fd_sys == -1)  {
diff --git a/src/mpi/romio/adio/common/ad_opencoll.c b/src/mpi/romio/adio/common/ad_opencoll.c
index ce746b9..81dc0be 100644
--- a/src/mpi/romio/adio/common/ad_opencoll.c
+++ b/src/mpi/romio/adio/common/ad_opencoll.c
@@ -80,6 +80,7 @@ void ADIOI_GEN_OpenColl(ADIO_File fd, int rank,
 	     * (not all do)*/
 	    MPI_Bcast(&(fd->blksize), 1, MPI_LONG, fd->hints->ranklist[0], fd->comm);
 	    *error_code = MPI_SUCCESS;
+	    ADIOI_Assert(fd->blksize > 0);
 	    return;
         }
     }
@@ -114,6 +115,9 @@ void ADIOI_GEN_OpenColl(ADIO_File fd, int rank,
     /* broadcast a bit of information (blocksize for now) to all proceses in
      * communicator, not just those who participated in open */
     MPI_Bcast(&(fd->blksize), 1, MPI_LONG, fd->hints->ranklist[0], fd->comm);
+    /* file domain code will get terribly confused in a hard-to-debug way if
+     * gpfs blocksize not sensible */
+    ADIOI_Assert( fd->blksize > 0);
     /* for deferred open: this process has opened the file (because if we are
      * not an aggregaor and we are doing deferred open, we returned earlier)*/
     fd->is_open = 1;

http://git.mpich.org/mpich.git/commitdiff/030fd0f12b1648851c8773ff31db5cd128e63445

commit 030fd0f12b1648851c8773ff31db5cd128e63445
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Wed Feb 26 11:14:39 2014 -0600

    Rework "scalable flush" logic
    
    If deferred open is enabled, the logic that says if we should do a
    scalable flush and which processes should do the flush won't propagate
    to the non-aggregator processes.  Replace old way of doing things with a
    simpler stat-from-first-aggregator approach.

diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_flush.c b/src/mpi/romio/adio/ad_bg/ad_bg_flush.c
index 2848f7c..ebc15d1 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_flush.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_flush.c
@@ -18,73 +18,56 @@
 
 void ADIOI_BG_Flush(ADIO_File fd, int *error_code)
 {
-  int err=0;
-  static char myname[] = "ADIOI_BG_FLUSH";
+    int err=0;
+    static char myname[] = "ADIOI_BG_FLUSH";
 
 
-  if(((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr & ADIOI_BG_FSYNC_AGGREGATION_ENABLED)
-  {
     int rank;
- 
-    /* Barrier so we can collectively do fewer fsync's */
-    MPI_Barrier(fd->comm);
-  
+
     MPI_Comm_rank(fd->comm, &rank);
-  
-    /* All ranks marked as "fsync aggregators" should fsync. 
-       (We currently only do one fsync on rank 0 but this is general 
-       enough to support >1 aggregator using allreduce to get the
-       results instead of simply bcast'ing the results from rank 0.)*/
-    if(((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr & ADIOI_BG_FSYNC_AGGREGATOR)
-    {
-      err = fsync(fd->fd_sys);
-      DBG_FPRINTF(stderr,"aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
-      /* We want errno, not the return code if it failed */
-      if (err == -1) err = errno;
-      else err = 0;
+
+    /* the old logic about who is an fsync aggregator and who is not fell down
+     * when deferred open was enabled.  Instead, make this look more like
+     * ad_pvfs2_flush.  If one day the I/O aggregators have something they need
+     * to flush, we can consult the 'fd->hints->ranklist[]' array.  For now, a
+     * flush from one process should suffice */
+
+    /* ensure all other proceses are done writing. On many platforms MPI_Reduce
+     * is fastest because it has the lightest constraints. On Blue Gene, BARRIER
+     * is optimized  */
+    MPI_Barrier(fd->comm);
+
+    if (rank == fd->hints->ranklist[0]) {
+	err = fsync(fd->fd_sys);
+	DBG_FPRINTF(stderr,"aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
+	/* We want errno, not the return code if it failed */
+	if (err == -1) err = errno;
+	else err = 0;
     }
-    /* Just pick an errno (using unsigned MPI_MAX) from any failures */
+    /* Just pick an errno (using unsigned MPI_MAX) from any failures.  We use
+     * MPI_Allreduce in case one day we wish to fsync from more than one
+     * process */
     MPI_Allreduce( MPI_IN_PLACE, (unsigned*)&err, 1, MPI_UNSIGNED, MPI_MAX, fd->comm);
     DBGV_FPRINTF(stderr,"aggregation result:fsync %s, errno %#X,\n",fd->filename, err);
 
     if (err) /* if it's non-zero, it must be an errno */
     {
-      errno = err;
-      err = -1;
+	errno = err;
+	err = -1;
     }
-  }
-  else /* Non-aggregated fsync */
-  {
-#ifdef USE_DBG_LOGGING
-    int rank;
-#endif
-    err = fsync(fd->fd_sys);
-#ifdef USE_DBG_LOGGING
-    MPI_Comm_rank(fd->comm, &rank);
 
-    if(rank == 0)
+    /* --BEGIN ERROR HANDLING-- */
+    if (err == -1)
     {
-        DBG_FPRINTF(stderr,"no aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
+	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+		myname, __LINE__, MPI_ERR_IO,
+		"**io",
+		"**io %s", strerror(errno));
+	DBGT_FPRINTF(stderr,"fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
+	return;
     }
-    else
-    {
-        DBGV_FPRINTF(stderr,"no aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
-    }
-#endif
-  }
-
-  /* --BEGIN ERROR HANDLING-- */
-  if (err == -1)
-  {
-    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
-                                       myname, __LINE__, MPI_ERR_IO,
-                                       "**io",
-                                       "**io %s", strerror(errno));
-    DBGT_FPRINTF(stderr,"fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
-    return;
-  }
-  /* --END ERROR HANDLING-- */
+    /* --END ERROR HANDLING-- */
 
-  *error_code = MPI_SUCCESS;
+    *error_code = MPI_SUCCESS;
 }
 

http://git.mpich.org/mpich.git/commitdiff/1ce0fe811842913d79dfb3f316b7e5f8caca771f

commit 1ce0fe811842913d79dfb3f316b7e5f8caca771f
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Tue Feb 25 12:20:18 2014 -0600

    additional broadcast in open for blocksize
    
    some file systems (e.g. bluegene) might stat the file and wish to inform
    all processes about some bit of underlying file system information (e.g.
    blocksize).  In the deferred open case, not all processes participate in
    the lowest, fs-specific open, so let's broadcast here in common code.

diff --git a/src/mpi/romio/adio/common/ad_opencoll.c b/src/mpi/romio/adio/common/ad_opencoll.c
index 2bea36e..ce746b9 100644
--- a/src/mpi/romio/adio/common/ad_opencoll.c
+++ b/src/mpi/romio/adio/common/ad_opencoll.c
@@ -61,6 +61,9 @@ void ADIOI_GEN_OpenColl(ADIO_File fd, int rank,
 		   access_mode ^= ADIO_EXCL;
        }
     }
+    fd->blksize = 1024*1024*4; /* this large default value should be good for
+				 most file systems.  any ROMIO driver is free
+				 to stat the file and find an optimial value */
 
     /* if we are doing deferred open, non-aggregators should return now */
     if (fd->hints->deferred_open ) {
@@ -69,7 +72,14 @@ void ADIOI_GEN_OpenColl(ADIO_File fd, int rank,
              * restore access_mode that non-aggregators get the right
              * value from get_amode */
             fd->access_mode = orig_amode_excl;
-            *error_code = MPI_SUCCESS;
+	    /* In file-system specific open, a driver might collect some
+	     * information via stat().  Deferred open means not every process
+	     * participates in fs-specific open, but they all participate in
+	     * this open call.  Broadcast a bit of information in case
+	     * lower-level file system driver (e.g. 'bluegene') collected it
+	     * (not all do)*/
+	    MPI_Bcast(&(fd->blksize), 1, MPI_LONG, fd->hints->ranklist[0], fd->comm);
+	    *error_code = MPI_SUCCESS;
 	    return;
         }
     }
@@ -101,6 +111,9 @@ void ADIOI_GEN_OpenColl(ADIO_File fd, int rank,
     /* if we turned off EXCL earlier, then we should turn it back on */
     if (fd->access_mode != orig_amode_excl) fd->access_mode = orig_amode_excl;
 
+    /* broadcast a bit of information (blocksize for now) to all proceses in
+     * communicator, not just those who participated in open */
+    MPI_Bcast(&(fd->blksize), 1, MPI_LONG, fd->hints->ranklist[0], fd->comm);
     /* for deferred open: this process has opened the file (because if we are
      * not an aggregaor and we are doing deferred open, we returned earlier)*/
     fd->is_open = 1;

http://git.mpich.org/mpich.git/commitdiff/fdc4cb6f8227adb5ebee35edcb5fe3dfc281b438

commit fdc4cb6f8227adb5ebee35edcb5fe3dfc281b438
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Tue Feb 25 11:37:43 2014 -0600

    Promote blocksize to ADIOI_FileD struct
    
    "file system blocksize" seems like one of those generic-enough values we
    should keep track of in the ADIOI_FileD structure.  This promotion will
    make some deferred-open fixes easier, too.

diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.c b/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.c
index 455e41d..76c4af6 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.c
@@ -666,6 +666,7 @@ void ADIOI_BG_GPFS_Calc_file_domains(ADIO_File fd,
     ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
     int i, aggr;
     TRACE_ERR("Entering ADIOI_BG_GPFS_Calc_file_domains\n");
+    blksize_t blksize;
 
 #ifdef AGGREGATION_PROFILE
     MPE_Log_event (5004, 0, NULL);
@@ -676,9 +677,11 @@ void ADIOI_BG_GPFS_Calc_file_domains(ADIO_File fd,
     DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n", 
 	    myname,__LINE__,nprocs_for_coll);
 #   endif
-    __blksize_t blksize = 1048576; /* default to 1M */
-    if(fs_ptr && ((ADIOI_BG_fs*)fs_ptr)->blksize) /* ignore null ptr or 0 blksize */
-      blksize = ((ADIOI_BG_fs*)fs_ptr)->blksize;
+    if (fd->blksize <= 0)
+	/* default to 1M if blksize unset */
+	fd->blksize = 1048576;
+    blksize = fd->blksize;
+
 #   if AGG_DEBUG
     DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
 #   endif
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.h b/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.h
index 41e45a3..35faf16 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.h
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.h
@@ -30,7 +30,6 @@
     /* File system (BG) specific information - 
          hung off of ADIOI_FileD file descriptor (fd->fs_ptr) at open */
     typedef struct ADIOI_BG_fs_s {
-      __blksize_t blksize;
       int         fsync_aggr; /* "fsync aggregation" flags (below) */
 #define ADIOI_BG_FSYNC_AGGREGATION_DISABLED  0x00
 #define ADIOI_BG_FSYNC_AGGREGATION_ENABLED   0x01
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_open.c b/src/mpi/romio/adio/ad_bg/ad_bg_open.c
index 0713c42..1ca1595 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_open.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_open.c
@@ -173,7 +173,7 @@ static void scaleable_stat(ADIO_File fd)
 
     /* data from stat64 */
     /* store the blksize in the file system specific storage */
-    ((ADIOI_BG_fs*)fd->fs_ptr)->blksize = bg_stat.st_blksize;
+    fd->blksize = bg_stat.st_blksize;
 
     /* data from statfs */
    if ((bg_statfs.f_type == GPFS_SUPER_MAGIC) ||
@@ -250,7 +250,7 @@ void ADIOI_BG_Open(ADIO_File fd, int *error_code)
         ADIOI_BG_assert(fd->fs_ptr == NULL);
         fd->fs_ptr = (ADIOI_BG_fs*) ADIOI_Malloc(sizeof(ADIOI_BG_fs));
 
-        ((ADIOI_BG_fs*)fd->fs_ptr)->blksize = 1048576; /* default to 1M */
+        fd->blksize = 1048576; /* default to 1M */
 
         /* default is no fsync aggregation */
         ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr = 
@@ -266,7 +266,7 @@ void ADIOI_BG_Open(ADIO_File fd, int *error_code)
 #endif
 	/* file domain code will get terribly confused in a hard-to-debug way
 	 * if gpfs blocksize not sensible */
-        ADIOI_BG_assert( ((ADIOI_BG_fs*)fd->fs_ptr)->blksize > 0);
+        ADIOI_BG_assert( fd->blksize > 0);
     }
 
   if (fd->fd_sys == -1)  {
diff --git a/src/mpi/romio/adio/include/adio.h b/src/mpi/romio/adio/include/adio.h
index 2c1ecf2..e9a56a8 100644
--- a/src/mpi/romio/adio/include/adio.h
+++ b/src/mpi/romio/adio/include/adio.h
@@ -196,6 +196,8 @@ typedef struct ADIOI_FileD {
     unsigned d_mem;          /* data buffer memory alignment */
     unsigned d_miniosz;      /* min xfer size, xfer size multiple,
                                 and file seek offset alignment */
+    blksize_t blksize;       /* some optimizations benefit from knowing
+				underlying block size */
     ADIO_Offset fp_ind;      /* individual file pointer in MPI-IO (in bytes)*/
     ADIO_Offset fp_sys_posn; /* current location of the system file-pointer
                                 in bytes */

http://git.mpich.org/mpich.git/commitdiff/87102f400cab7635ae95ddec6cb67fedcf34d131

commit 87102f400cab7635ae95ddec6cb67fedcf34d131
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Thu Jan 30 14:05:28 2014 -0600

    option to read/write to /dev/null
    
    Useful for situations like evaluating various collective I/O approaches.
    Reading/writing /dev/null eliminates file system variability.

diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_close.c b/src/mpi/romio/adio/ad_bg/ad_bg_close.c
index 0ad5218..cb30f72 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_close.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_close.c
@@ -24,6 +24,9 @@ void ADIOI_BG_Close(ADIO_File fd, int *error_code)
   MPE_Log_event(9, 0, "start close");
 #endif
 
+  if (fd->null_fd >= 0)
+      close(fd->null_fd);
+
   err = close(fd->fd_sys);
   if (fd->fd_direct >= 0)
   {
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_open.c b/src/mpi/romio/adio/ad_bg/ad_bg_open.c
index 7afa14d..0713c42 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_open.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_open.c
@@ -234,6 +234,12 @@ void ADIOI_BG_Open(ADIO_File fd, int *error_code)
   DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,errno);
   fd->fd_direct = -1;
 
+  if (bgmpio_devnullio == 1) {
+      fd->null_fd = open("/dev/null", O_RDWR);
+  } else {
+      fd->null_fd = -1;
+  }
+
   if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
     fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
 
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
index b33bd90..8aea5a9 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
@@ -38,6 +38,7 @@ int     bgmpio_bg_nagg_pset;
 int     bgmpio_pthreadio;
 int     bgmpio_p2pcontig;
 int	bgmpio_balancecontig;
+int     bgmpio_devnullio;
 
 double	bgmpio_prof_cw    [BGMPIO_CIO_LAST];
 double	bgmpio_prof_cr    [BGMPIO_CIO_LAST];
@@ -112,6 +113,13 @@ double	bgmpio_prof_cr    [BGMPIO_CIO_LAST];
  *   - 0 - assign file domain blocks in the traditional manner
  *   - 1 - if there are variable sized file domain blocks, spread them out
  *         (balance) across bridge nodes
+ *
+ * - BGMPIO_DEVNULLIO - do everything *except* write to / read from the file
+ *   system. When experimenting with different two-phase I/O strategies, it's
+ *   helpful to remove the highly variable file system from the experiment.
+ *   - 0 (disabled) or 1 (enabled)
+ *   - Default is 0
+ *
  */
 
 void ad_bg_get_env_vars() {
@@ -152,6 +160,10 @@ void ad_bg_get_env_vars() {
     bgmpio_balancecontig = 0;
     x = getenv( "BGMPIO_BALANCECONTIG" );
     if (x) bgmpio_balancecontig = atoi(x);
+
+    bgmpio_devnullio = 0;
+    x = getenv( "BGMPIO_DEVNULLIO" );
+    if (x) bgmpio_devnullio = atoi(x);
 }
 
 /* report timing breakdown for MPI I/O collective call */
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
index e6ca648..39ab047 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
@@ -71,6 +71,7 @@ extern long bglocklessmpio_f_type;
 extern int      bgmpio_pthreadio;
 extern int      bgmpio_p2pcontig;
 extern int  bgmpio_balancecontig;
+extern int      bgmpio_devnullio;
 
 /* Default is, well, kind of complicated. Blue Gene /L and /P had "psets": one
  * i/o node and all compute nodes wired to it.  On Blue Gene /Q that
diff --git a/src/mpi/romio/adio/common/ad_read.c b/src/mpi/romio/adio/common/ad_read.c
index 28fa128..e61841f 100644
--- a/src/mpi/romio/adio/common/ad_read.c
+++ b/src/mpi/romio/adio/common/ad_read.c
@@ -57,7 +57,10 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
 	MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
 #endif
 	rd_count = len - bytes_xfered;
-	err = pread(fd->fd_sys, p, rd_count, offset+bytes_xfered);
+	if (bgmpio_devnullio)
+	    err = pread(fd->null_fd, p, rd_count, offset+bytes_xfered);
+	else
+	    err = pread(fd->fd_sys, p, rd_count, offset+bytes_xfered);
 	/* --BEGIN ERROR HANDLING-- */
 	if (err == -1) {
 	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
diff --git a/src/mpi/romio/adio/common/ad_write.c b/src/mpi/romio/adio/common/ad_write.c
index dcfa74c..2a9a57f 100644
--- a/src/mpi/romio/adio/common/ad_write.c
+++ b/src/mpi/romio/adio/common/ad_write.c
@@ -60,7 +60,10 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
 	MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
 #endif
 	wr_count = len - bytes_xfered;
-	err = pwrite(fd->fd_sys, p, wr_count, offset+bytes_xfered);
+	if (bgmpio_devnullio)
+	    err = pwrite(fd->null_fd, p, wr_count, offset+bytes_xfered);
+	else
+	    err = pwrite(fd->fd_sys, p, wr_count, offset+bytes_xfered);
 	/* --BEGIN ERROR HANDLING-- */
 	if (err == -1) {
 	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
diff --git a/src/mpi/romio/adio/include/adio.h b/src/mpi/romio/adio/include/adio.h
index b370f29..2c1ecf2 100644
--- a/src/mpi/romio/adio/include/adio.h
+++ b/src/mpi/romio/adio/include/adio.h
@@ -187,6 +187,7 @@ typedef struct ADIOI_Hints_struct ADIOI_Hints;
 typedef struct ADIOI_FileD {
     int cookie;              /* for error checking */
     FDTYPE fd_sys;              /* system file descriptor */
+    FDTYPE null_fd;          /* the null-device file descriptor: debug only (obviously)*/
     int fd_direct;           /* On XFS, this is used for direct I/O; 
                                 fd_sys is used for buffered I/O */
     int direct_read;         /* flag; 1 means use direct read */

http://git.mpich.org/mpich.git/commitdiff/35d0c5b45aacff992cc72c9bbf3735a54ddfabe5

commit 35d0c5b45aacff992cc72c9bbf3735a54ddfabe5
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Tue Feb 4 14:36:27 2014 -0600

    balancecontig: topology-aware aggregator selection
    
    Two features in this change:
    - selection of file domains can result in some i/o nodes with more work
      than others (or some with no work at all), so distribute file domains
      with some awareness of i/o nodes
    
    - since we have some awareness of I/O nodes, select processes that are
      closest to those i/o nodes.

diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.c b/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.c
index 4872c22..455e41d 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.c
@@ -13,6 +13,7 @@
  */
 
 /*#define TRACE_ON */
+// #define balancecontigtrace 1
 
 #include "adio.h"
 #include "adio_cb_config_list.h"
@@ -112,6 +113,34 @@ ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset)
   /* Send the info of IO proxy CN to all processes and keep the info in fd->hints struct.
      Declared in adio_cb_config_list.h */
     ADIOI_cb_bcast_rank_map(fd);		
+    if (bgmpio_balancecontig == 1) { /* additionally need to send bridgelist,
+					bridgelistnum and numbridges to all
+					ranks */
+	if (r != 0) {
+	    fd->hints->fs_hints.bg.bridgelist =
+		ADIOI_Malloc(fd->hints->cb_nodes*sizeof(int));
+	    if (fd->hints->fs_hints.bg.bridgelist == NULL) {
+		/* NEED TO HANDLE ENOMEM */
+	    }
+	}
+	MPI_Bcast(fd->hints->fs_hints.bg.bridgelist, fd->hints->cb_nodes, MPI_INT, 0,
+		fd->comm);
+
+	if (r != 0) {
+	    fd->hints->fs_hints.bg.bridgelistnum =
+		ADIOI_Malloc(fd->hints->cb_nodes*sizeof(int));
+	    if (fd->hints->fs_hints.bg.bridgelistnum == NULL) {
+		/* NEED TO HANDLE ENOMEM */
+	    }
+	}
+	MPI_Bcast(fd->hints->fs_hints.bg.bridgelistnum, fd->hints->cb_nodes,
+		MPI_INT, 0, fd->comm);
+
+	MPI_Bcast(&fd->hints->fs_hints.bg.numbridges, 1, MPI_INT, 0,
+		fd->comm);
+
+    }
+
 
     ADIOI_BG_persInfo_free( confInfo, procInfo );
     TRACE_ERR("Leaving ADIOI_BG_gen_agg_ranklist\n");
@@ -317,16 +346,184 @@ ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
       DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
     }
 #   endif
+    if (bgmpio_balancecontig == 1) {
+	/* what comes out of this code block is the agg ranklist sorted by
+	 * bridge set and ion id with associated bridge info stored in the
+	 * hints structure for later access during file domain assignment */
+
+	// sort the agg ranklist by bridges
+	int *interleavedbridgeranklist = (int *) ADIOI_Malloc (naggs * sizeof(int)); // resorted agg rank list
+	int *bridgelist = (int *) ADIOI_Malloc (naggs * sizeof(int)); // list of all bride ranks
+	/* each entry here is the number of aggregators associated with the
+	 * bridge rank of the same index in bridgelist */
+	int *bridgelistnum = (int *) ADIOI_Malloc (naggs * sizeof(int));
+
+	int numbridges = 0;
+
+	int i;
+	for (i=0;i<naggs;i++)
+	    bridgelistnum[i] = 0;
+	int *summaryranklistionids = (int *) ADIOI_Malloc (naggs * sizeof(int));
+	for (i=0;i<naggs;i++)
+	    summaryranklistionids[i] = -1;
+
+	/* build the bridgelist and bridgelistnum data by going thru each agg
+	 * entry and find the associated bridge list index - at the end we will
+	 * know how many aggs belong to each bridge */
+	for (i=0;i<naggs;i++) {
+	    int aggbridgerank = all_procInfo[tmp_ranklist[i]].bridgeRank;
+	    int ionid = all_procInfo[tmp_ranklist[i]].ionID;
+	    int foundrank = 0;
+	    int summaryranklistbridgeindex = 0;
+	    int j;
+	    for (j=0;(j<numbridges && !foundrank);j++) {
+		if (bridgelist[j] == aggbridgerank) {
+		    foundrank = 1;
+		    summaryranklistbridgeindex = j;
+		}
+		else
+		    summaryranklistbridgeindex++;
+	    }
+	    if (!foundrank) {
+		bridgelist[summaryranklistbridgeindex] = aggbridgerank;
+		if (summaryranklistionids[summaryranklistbridgeindex] == -1)
+		    summaryranklistionids[summaryranklistbridgeindex] = aggbridgerank;
+		else if (summaryranklistionids[summaryranklistbridgeindex] > aggbridgerank)
+		    summaryranklistionids[summaryranklistbridgeindex] = aggbridgerank;
+		numbridges++;
+	    }
 
-  /* copy the ranklist of IO aggregators to fd->hints */
-    if(fd->hints->ranklist != NULL) ADIOI_Free (fd->hints->ranklist);
+	    bridgelistnum[summaryranklistbridgeindex]++;
+	}
+
+	// resort bridgelist and bridgelistnum by io node minimum bridge rank
+	int x;
+	for (x=0;x<numbridges;x++) {
+	    for (i=0;i<(numbridges-1);i++) {
+		if (summaryranklistionids[i] > summaryranklistionids[i+1]) {
+		    int tmpionid = summaryranklistionids[i];
+		    summaryranklistionids[i] = summaryranklistionids[i+1];
+		    summaryranklistionids[i+1] = tmpionid;
+		    int tmpbridgerank = bridgelist[i];
+		    bridgelist[i] = bridgelist[i+1];
+		    bridgelist[i+1] = tmpbridgerank;
+		    int tmpbridgeranknum = bridgelistnum[i];
+		    bridgelistnum[i] = bridgelistnum[i+1];
+		    bridgelistnum[i+1] = tmpbridgeranknum;
+		}
+	    }
+	}
+
+	// for each ion make sure bridgelist is in rank order
+	int startSortIndex = -1;
+	int endSortIndex = -1;
+	int currentBridgeIndex = -1;
+
+	while (endSortIndex < numbridges) {
+	    int currentIonId = summaryranklistionids[currentBridgeIndex];
+	    startSortIndex = currentBridgeIndex;
+	    while ((summaryranklistionids[currentBridgeIndex] == currentIonId) &&
+		    (currentBridgeIndex < numbridges))
+		currentBridgeIndex++;
+	    endSortIndex = currentBridgeIndex;
+	    int x;
+	    for (x=startSortIndex;x<endSortIndex;x++) {
+		for (i=startSortIndex;i<(endSortIndex-1);i++) {
+		    if (bridgelist[i] > bridgelist[i+1]) {
+			int tmpbridgerank = bridgelist[i];
+			bridgelist[i] = bridgelist[i+1];
+			bridgelist[i+1] = tmpbridgerank;
+			int tmpbridgeranknum = bridgelistnum[i];
+			bridgelistnum[i] = bridgelistnum[i+1];
+			bridgelistnum[i+1] = tmpbridgeranknum;
+		    }
+		}
+	    }
+	}
 
-    fd->hints->cb_nodes = naggs;
-    fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
-    memcpy( fd->hints->ranklist, tmp_ranklist, naggs*sizeof(int) );
+	/* populate interleavedbridgeranklist - essentially the agg rank list
+	 * is now sorted by the bridge node and ion minimum bridge rank */
+	int currentrankoffset = 0;
+	for (i=0;i<numbridges;i++) {
+	    int bridgerankiter = 0;
+	    int *thisBridgeAggList = (int *) ADIOI_Malloc (naggs * sizeof(int));
+	    int numAggsForThisBridge = 0;
+
+	    int k;
+	    for (k=0;k<naggs;k++) {
+		int aggbridgerank = all_procInfo[tmp_ranklist[k]].bridgeRank;
+		if (aggbridgerank == bridgelist[i]) {
+		    thisBridgeAggList[numAggsForThisBridge] = tmp_ranklist[k];
+		    numAggsForThisBridge++;
+		}
+	    }
 
-  /* */
-    ADIOI_Free( tmp_ranklist );
+	    // sort thisBridgeAggList
+	    int x;
+	    for (x=0;x<numAggsForThisBridge;x++) {
+		int n;
+		for (n=0;n<(numAggsForThisBridge-1);n++) {
+		    if (thisBridgeAggList[n] > thisBridgeAggList[n+1]) {
+			int tmpthisBridgeAggList = thisBridgeAggList[n];
+			thisBridgeAggList[n] = thisBridgeAggList[n+1];
+			thisBridgeAggList[n+1] = tmpthisBridgeAggList;
+		    }
+		}
+	    }
+	    int n;
+	    for (n=0;n<numAggsForThisBridge;n++) {
+		interleavedbridgeranklist[currentrankoffset] = thisBridgeAggList[n];
+		currentrankoffset++;
+	    }
+	    ADIOI_Free(thisBridgeAggList);
+	}
+
+#ifdef balancecontigtrace
+	fprintf(stderr,"Interleaved aggregator list:\n");
+	for (i=0;i<naggs;i++) {
+	    fprintf(stderr,"Agg: %d Agg rank: %d with bridge rank %d\n",i,interleavedbridgeranklist[i],all_procInfo[interleavedbridgeranklist[i]].bridgeRank);
+	}
+	fprintf(stderr,"Bridges list:\n");
+	for (i=0;i<numbridges;i++) {
+	    fprintf(stderr,"bridge %d ion id %d rank %d num %d\n",i,summaryranklistionids[i],bridgelist[i],bridgelistnum[i]);
+	}
+
+#endif
+	/* copy the ranklist of IO aggregators to fd->hints */
+	if(fd->hints->ranklist != NULL)
+	    ADIOI_Free (fd->hints->ranklist);
+	if(fd->hints->fs_hints.bg.bridgelist != NULL)
+	    ADIOI_Free (fd->hints->fs_hints.bg.bridgelist);
+	if(fd->hints->fs_hints.bg.bridgelistnum != NULL)
+	    ADIOI_Free (fd->hints->fs_hints.bg.bridgelistnum);
+
+	fd->hints->cb_nodes = naggs;
+	fd->hints->fs_hints.bg.numbridges = numbridges;
+	fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
+	memcpy( fd->hints->ranklist, interleavedbridgeranklist, naggs*sizeof(int) );
+
+	fd->hints->fs_hints.bg.bridgelist = (int *) ADIOI_Malloc (naggs * sizeof(int));
+	memcpy( fd->hints->fs_hints.bg.bridgelist, bridgelist, naggs*sizeof(int) );
+
+	fd->hints->fs_hints.bg.bridgelistnum = (int *) ADIOI_Malloc (naggs * sizeof(int));
+	memcpy( fd->hints->fs_hints.bg.bridgelistnum, bridgelistnum, naggs*sizeof(int) );
+
+	ADIOI_Free(summaryranklistionids);
+	ADIOI_Free( tmp_ranklist );
+	ADIOI_Free( bridgelistnum );
+	ADIOI_Free( bridgelist );
+	ADIOI_Free( interleavedbridgeranklist );
+    }  else {
+	/* classic topology-agnostic copy of the ranklist of IO aggregators to
+	 * fd->hints */
+	if(fd->hints->ranklist != NULL) ADIOI_Free (fd->hints->ranklist);
+
+	fd->hints->cb_nodes = naggs;
+	fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
+	memcpy( fd->hints->ranklist, tmp_ranklist, naggs*sizeof(int) );
+
+	ADIOI_Free( tmp_ranklist );
+    }
     TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial\n");
     return;
 }
@@ -455,7 +652,8 @@ int ADIOI_BG_Calc_aggregator(ADIO_File fd,
  * It doesn't seem necessary here (using GPFS block sizes) but keep it in mind
  * (e.g. we could pass striping unit instead of using fs_ptr->blksize). 
  */
-void ADIOI_BG_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
+void ADIOI_BG_GPFS_Calc_file_domains(ADIO_File fd,
+	                              ADIO_Offset *st_offsets,
                                       ADIO_Offset *end_offsets,
                                       int          nprocs,
                                       int          nprocs_for_coll,
@@ -553,13 +751,91 @@ void ADIOI_BG_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
     ADIO_Offset naggs_large   = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
     ADIO_Offset naggs_small   = naggs - naggs_large;
 
-    for (i=0; i<naggs; i++) {
-	if (i < naggs_small) {
+    if (bgmpio_balancecontig == 1) {
+	/* File domains blocks are assigned to aggregators in a breadth-first
+	 * fashion relative to the ions - additionally, file domains on the
+	 * aggregators sharing the same bridgeset and ion have contiguous
+	 * offsets. */
+
+	// initialize everything to small
+	for (i=0; i<naggs; i++)
 	    fd_size[i] = nb_cn_small     * blksize;
-	} else {
-	    fd_size[i] = (nb_cn_small+1) * blksize;
+
+	// go thru and distribute the large across the bridges
+
+	/* bridgelistoffset: agg rank list offsets using the bridgelist - each
+	 * entry is created by adding up the indexes for the aggs from all
+	 * previous bridges */
+	int *bridgelistoffset =
+	    (int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
+	/* tmpbridgelistnum: copy of the bridgelistnum whose entries can be
+	 * decremented to keep track of bridge assignments during the actual
+	 * large block assignments to the agg rank list*/
+	int *tmpbridgelistnum =
+	    (int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
+
+	int j;
+	for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++) {
+	    int k, bridgerankoffset = 0;
+	    for (k=0;k<j;k++) {
+		bridgerankoffset += fd->hints->fs_hints.bg.bridgelistnum[k];
+	    }
+	    bridgelistoffset[j] = bridgerankoffset;
+	}
+
+	for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++)
+	    tmpbridgelistnum[j] = fd->hints->fs_hints.bg.bridgelistnum[j];
+	int bridgeiter = 0;
+
+	/* distribute the large blocks across the aggs going breadth-first
+	 * across the bridgelist - this distributes the fd sizes across the
+	 * ions, so later in the file domain assignment when it iterates thru
+	 * the ranklist the offsets will be contiguous within the bridge and
+	 * ion as well */
+	for (j=0;j<naggs_large;j++) {
+	    int foundbridge = 0;
+	    while (!foundbridge) {
+		if (tmpbridgelistnum[bridgeiter] > 0) {
+		    foundbridge = 1;
+		    /*
+		       printf("bridgeiter is %d tmpbridgelistnum[bridgeiter] is %d bridgelistoffset[bridgeiter] is %d\n",bridgeiter,tmpbridgelistnum[bridgeiter],bridgelistoffset[bridgeiter]);
+		       printf("naggs is %d bridgeiter is %d bridgelistoffset[bridgeiter] is %d tmpbridgelistnum[bridgeiter] is %d\n",naggs, bridgeiter,bridgelistoffset[bridgeiter],tmpbridgelistnum[bridgeiter]);
+		       printf("naggs is %d bridgeiter is %d setting fd_size[%d]\n",naggs, bridgeiter,bridgelistoffset[bridgeiter]+(fd->hints->bridgelistnum[bridgeiter]-tmpbridgelistnum[bridgeiter]));
+		     */
+		    fd_size[bridgelistoffset[bridgeiter]+(fd->hints->fs_hints.bg.bridgelistnum[bridgeiter]-tmpbridgelistnum[bridgeiter])] =
+			(nb_cn_small+1) * blksize;
+		    tmpbridgelistnum[bridgeiter]--;
+		}
+		if (bridgeiter == (fd->hints->fs_hints.bg.numbridges-1))
+		    bridgeiter = 0;
+		else
+		    bridgeiter++;
+	    }
+	}
+	ADIOI_Free(tmpbridgelistnum);
+	ADIOI_Free(bridgelistoffset);
+
+    } else {
+	/* BG/L- and BG/P-style distribution of file domains: simple allocation of
+	 * file domains to each aggregator */
+	for (i=0; i<naggs; i++) {
+	    if (i < naggs_small) {
+		fd_size[i] = nb_cn_small     * blksize;
+	    } else {
+		fd_size[i] = (nb_cn_small+1) * blksize;
+	    }
+	}
+    }
+#ifdef balancecontigtrace
+    int myrank;
+    MPI_Comm_rank(fd->comm,&myrank);
+    if (myrank == 0) {
+      fprintf(stderr,"naggs_small is %d nb_cn_small is %d\n",naggs_small,nb_cn_small);
+	for (i=0; i<naggs; i++) {
+	    fprintf(stderr,"fd_size[%d] set to %d agg rank is %d\n",i,fd_size[i],fd->hints->ranklist[i]);
 	}
     }
+#endif
 
 #   if AGG_DEBUG
      DBG_FPRINTF(stderr,"%s(%d): "
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.h b/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.h
index af156fc..41e45a3 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.h
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.h
@@ -41,7 +41,8 @@
     int ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset);
 
     /* overriding ADIOI_Calc_file_domains() to apply 'aligned file domain partitioning'. */
-    void ADIOI_BG_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
+    void ADIOI_BG_GPFS_Calc_file_domains(ADIO_File fd,
+	                                  ADIO_Offset *st_offsets,
 				          ADIO_Offset *end_offsets,
 				          int          nprocs,
 				          int          nprocs_for_coll,
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_pset.h b/src/mpi/romio/adio/ad_bg/ad_bg_pset.h
index b1ed6bc..832c01e 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_pset.h
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_pset.h
@@ -21,6 +21,7 @@
 typedef struct {
    int ioNodeIndex; /* similar to psetNum on BGL/BGP */
    int rank; /* my rank */
+   int ionID;  /* ion id this cn is using */
 /*   int myCoords[5]; */
    int bridgeRank; /* my bridge node (or proxy) rank */
    unsigned char coreID;
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c b/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
index b9ddf32..94348e3 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
@@ -262,7 +262,7 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
      *
      */
     if (bgmpio_tuneblocking)
-    ADIOI_BG_GPFS_Calc_file_domains(st_offsets, end_offsets, nprocs,
+    ADIOI_BG_GPFS_Calc_file_domains(fd, st_offsets, end_offsets, nprocs,
 			    nprocs_for_coll, &min_st_offset,
 			    &fd_start, &fd_end, &fd_size, fd->fs_ptr);
     else
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
index a6f1ba4..b33bd90 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
@@ -37,6 +37,7 @@ long    bglocklessmpio_f_type;
 int     bgmpio_bg_nagg_pset;
 int     bgmpio_pthreadio;
 int     bgmpio_p2pcontig;
+int	bgmpio_balancecontig;
 
 double	bgmpio_prof_cw    [BGMPIO_CIO_LAST];
 double	bgmpio_prof_cr    [BGMPIO_CIO_LAST];
@@ -101,7 +102,17 @@ double	bgmpio_prof_cr    [BGMPIO_CIO_LAST];
  * 2.) The offsets are increasing in rank-order.
  * 3.) There are no gaps between the offsets.
  * 4.) No single rank has a data size which spans multiple file domains.
-*/
+ *
+ * - BGMPIO_BALANCECONTIG -  File domain blocks are assigned to aggregators in
+ *   a breadth-first fashion relative to the ions - additionally, file domains
+ *   on the aggregators sharing the same bridgeset and ion have contiguous
+ *   offsets.  The breadth-first assignment improves performance in the case of
+ *   a relatively small file of size less than the gpfs block size multiplied
+ *   by the number of ions. Files: ad_bg_wrcoll.c ad_bg_aggrs.c.  Possible Values
+ *   - 0 - assign file domain blocks in the traditional manner
+ *   - 1 - if there are variable sized file domain blocks, spread them out
+ *         (balance) across bridge nodes
+ */
 
 void ad_bg_get_env_vars() {
     char *x, *dummy;
@@ -138,6 +149,9 @@ void ad_bg_get_env_vars() {
     x = getenv( "BGMPIO_P2PCONTIG" );
     if (x) bgmpio_p2pcontig = atoi(x);
 
+    bgmpio_balancecontig = 0;
+    x = getenv( "BGMPIO_BALANCECONTIG" );
+    if (x) bgmpio_balancecontig = atoi(x);
 }
 
 /* report timing breakdown for MPI I/O collective call */
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
index 80400ba..e6ca648 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
@@ -70,6 +70,7 @@ extern int 	bgmpio_tuneblocking;
 extern long bglocklessmpio_f_type;
 extern int      bgmpio_pthreadio;
 extern int      bgmpio_p2pcontig;
+extern int  bgmpio_balancecontig;
 
 /* Default is, well, kind of complicated. Blue Gene /L and /P had "psets": one
  * i/o node and all compute nodes wired to it.  On Blue Gene /Q that
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c b/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
index 2f80bb0..0e0b53a 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
@@ -253,7 +253,7 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
    process may directly access only its own file domain. */
 
     if (bgmpio_tuneblocking)
-    ADIOI_BG_GPFS_Calc_file_domains(st_offsets, end_offsets, nprocs,
+    ADIOI_BG_GPFS_Calc_file_domains(fd, st_offsets, end_offsets, nprocs,
 			    nprocs_for_coll, &min_st_offset,
 			    &fd_start, &fd_end, &fd_size, fd->fs_ptr);   
     else
diff --git a/src/mpi/romio/adio/include/adioi.h b/src/mpi/romio/adio/include/adioi.h
index 702cc20..205a9cb 100644
--- a/src/mpi/romio/adio/include/adioi.h
+++ b/src/mpi/romio/adio/include/adioi.h
@@ -74,6 +74,13 @@ struct ADIOI_Hints_struct {
 			unsigned read_chunk_sz; /* chunk size for direct reads */
 			unsigned write_chunk_sz; /* chunk size for direct writes */
 		} xfs;
+	struct {
+	    int *bridgelist; /* list of all bride ranks */
+	    int *bridgelistnum; /* each entry here is the number of aggregators
+				   associated with the bridge rank of the same
+				   index in bridgelist */
+	    int numbridges; /* total number of bridges */
+	} bg;
     } fs_hints;
 
 };

http://git.mpich.org/mpich.git/commitdiff/917af7dca45c7ec63eef8532b6401371f27e64a3

commit 917af7dca45c7ec63eef8532b6401371f27e64a3
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Tue Mar 4 10:31:50 2014 -0600

    additional logging information
    
    robl's got a one-off logger.  can pass extra information to it with an
    environment variable.  probably not useful in general.

diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c b/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
index 65722f8..b9ddf32 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
@@ -638,6 +638,9 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
         MPE_Log_event(14, 0, "end computation");
 #endif
 	if (flag) {
+	    char round[50];
+	    sprintf(round, "two-phase-round=%d", m);
+	    setenv("LIBIOLOG_EXTRA_INFO", round, 1);
       ADIOI_Assert(size == (int)size);
 	    ADIO_ReadContig(fd, read_buf+for_curr_iter, (int)size, MPI_BYTE,
 			    ADIO_EXPLICIT_OFFSET, off, &status, error_code);
@@ -735,6 +738,8 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
     ADIOI_Free(recv_size);
     ADIOI_Free(recd_from_proc);
     ADIOI_Free(start_pos);
+
+    unsetenv("LIBIOLOG_EXTRA_INFO");
 }
 
 static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c b/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
index 935a1fd..2f80bb0 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
@@ -694,6 +694,9 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
 	    if (count[i]) flag = 1;
 
 	if (flag) {
+	    char round[50];
+	    sprintf(round, "two-phase-round=%d", m);
+	    setenv("LIBIOLOG_EXTRA_INFO", round, 1);
       ADIOI_Assert(size == (int)size);
 	    if (bgmpio_pthreadio == 1) {
 		/* there is no such thing as "invalid pthread identifier", so
@@ -780,6 +783,8 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
     ADIOI_Free(send_buf_idx);
     ADIOI_Free(curr_to_proc);
     ADIOI_Free(done_to_proc);
+
+    unsetenv("LIBIOLOG_EXTRA_INFO");
 }
 
 
@@ -895,6 +900,8 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf
 
     if (nprocs_recv) {
 	if (*hole) {
+	    char * stuff = "data-sieve-in-two-phase";
+	    setenv("LIBIOLOG_EXTRA_INFO", stuff, 1);
 	    ADIO_ReadContig(fd, write_buf, size, MPI_BYTE, 
 			    ADIO_EXPLICIT_OFFSET, off, &status, &err);
 	    /* --BEGIN ERROR HANDLING-- */
@@ -906,6 +913,7 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf
 		return;
 	    } 
 	    /* --END ERROR HANDLING-- */
+	    unsetenv("LIBIOLOG_EXTRA_INFO");
 	}
     }
 

http://git.mpich.org/mpich.git/commitdiff/7ec40e90620e04b7abc699c8ae75facc41eaa4f8

commit 7ec40e90620e04b7abc699c8ae75facc41eaa4f8
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Tue Mar 4 10:31:07 2014 -0600

    subset peer-to-peer two phase
    
    For certain workloads, MPI processes will only speak to one aggregator.
    In those cases, we will restrict communication to just point-to-point
    among those processes and their aggregator.  Sometimes called
    "p2pcontig" optimization.

diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c b/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
index d3f37af..65722f8 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
@@ -272,7 +272,31 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
 			    fd->hints->min_fdomain_size, &fd_size, 
 			    fd->hints->striping_unit);
 
-    BGMPIO_T_CIO_SET_GET( r, 1, 1, BGMPIO_CIO_T_MYREQ, BGMPIO_CIO_T_FD_PART )
+    BGMPIO_T_CIO_SET_GET( r, 1, 1, BGMPIO_CIO_T_MYREQ, BGMPIO_CIO_T_FD_PART );
+    if (bgmpio_p2pcontig==1) {
+	/* For some simple yet common(?) workloads, full-on two-phase I/O is
+	 * overkill.  We can establish sub-groups of processes and their
+	 * aggregator, and then these sub-groups will carry out a simplified
+	 * two-phase over that sub-group.
+	 *
+	 * First verify that the filetype is contig and the offsets are
+	 * increasing in rank order*/
+	int i, inOrderAndNoGaps = 1;
+	for (i=0;i<(nprocs-1);i++) {
+	    if (end_offsets[i] != (st_offsets[i+1]-1))
+		inOrderAndNoGaps = 0;
+	}
+	if (inOrderAndNoGaps && buftype_is_contig) {
+	    /* if these conditions exist then execute the P2PContig code else
+	     * execute the original code */
+	    P2PContigReadAggregation(fd, buf, 
+		    error_code, st_offsets, end_offsets, fd_start, fd_end);
+
+	    /* NOTE: we are skipping the rest of two-phase in this path */
+            BGMPIO_T_CIO_REPORT( 0, fd, myrank, nprocs)
+	    return;
+	}
+    }
 
     /* calculate where the portions of the access requests of this process 
      * are located in terms of the file domains.  this could be on the same
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
index b21ac13..a6f1ba4 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
@@ -36,6 +36,7 @@ int 	bgmpio_tuneblocking;
 long    bglocklessmpio_f_type;
 int     bgmpio_bg_nagg_pset;
 int     bgmpio_pthreadio;
+int     bgmpio_p2pcontig;
 
 double	bgmpio_prof_cw    [BGMPIO_CIO_LAST];
 double	bgmpio_prof_cr    [BGMPIO_CIO_LAST];
@@ -91,6 +92,15 @@ double	bgmpio_prof_cr    [BGMPIO_CIO_LAST];
  *   just a start.  NOTE: For some reason the stats collected when this is
  *   enabled misses some of the data so the data sizes are off a bit - this is
  *   a statistical issue only, the data is still accurately written out
+ *
+ * - BGMPIO_P2PCONTIG -  Does simple point-to-point communication between the
+ *   aggregator and the procs that feed it.  Performance could be enhanced by a
+ *   one-sided put algorithm.  Current implementation allows only 1 round of
+ *   data.  Useful/allowed only when:
+ * 1.) The datatype is contiguous.
+ * 2.) The offsets are increasing in rank-order.
+ * 3.) There are no gaps between the offsets.
+ * 4.) No single rank has a data size which spans multiple file domains.
 */
 
 void ad_bg_get_env_vars() {
@@ -124,6 +134,9 @@ void ad_bg_get_env_vars() {
     x = getenv( "BGMPIO_PTHREADIO" );
     if (x) bgmpio_pthreadio = atoi(x);
 
+    bgmpio_p2pcontig = 0;
+    x = getenv( "BGMPIO_P2PCONTIG" );
+    if (x) bgmpio_p2pcontig = atoi(x);
 
 }
 
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
index 4055fcc..80400ba 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
@@ -69,6 +69,7 @@ extern int 	bgmpio_tunegather;
 extern int 	bgmpio_tuneblocking;
 extern long bglocklessmpio_f_type;
 extern int      bgmpio_pthreadio;
+extern int      bgmpio_p2pcontig;
 
 /* Default is, well, kind of complicated. Blue Gene /L and /P had "psets": one
  * i/o node and all compute nodes wired to it.  On Blue Gene /Q that
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c b/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
index a29ce80..935a1fd 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
@@ -263,8 +263,29 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 			    fd->hints->min_fdomain_size, &fd_size,
 			    fd->hints->striping_unit);   
 
-    BGMPIO_T_CIO_SET_GET( w, 1, 1, BGMPIO_CIO_T_MYREQ, BGMPIO_CIO_T_FD_PART )
-	
+    BGMPIO_T_CIO_SET_GET( w, 1, 1, BGMPIO_CIO_T_MYREQ, BGMPIO_CIO_T_FD_PART );
+
+    if (bgmpio_p2pcontig==1) {
+	/* For some simple yet common(?) workloads, full-on two-phase I/O is overkill.  We can establish sub-groups of processes and their aggregator, and then these sub-groups will carry out a simplified two-phase over that sub-group.
+	 *
+	 * First verify that the filetype is contig and the offsets are
+	 * increasing in rank order*/
+	int i, inOrderAndNoGaps = 1;
+	for (i=0;i<(nprocs-1);i++) {
+	    if (end_offsets[i] != (st_offsets[i+1]-1))
+		inOrderAndNoGaps = 0;
+	}
+	if (inOrderAndNoGaps && buftype_is_contig) {
+	    /* if these conditions exist then execute the P2PContig code else
+	     * execute the original code */
+	    P2PContigWriteAggregation(fd, buf, 
+		    error_code, st_offsets, end_offsets, fd_start, fd_end);
+	    /* NOTE: we are skipping the rest of two-phase in this path */
+            BGMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
+	    return;
+	}
+    }
+
 /* calculate what portions of the access requests of this process are
    located in what file domains */
 
diff --git a/src/mpi/romio/adio/common/Makefile.mk b/src/mpi/romio/adio/common/Makefile.mk
index f95f90a..043b198 100644
--- a/src/mpi/romio/adio/common/Makefile.mk
+++ b/src/mpi/romio/adio/common/Makefile.mk
@@ -67,5 +67,6 @@ romio_other_sources +=                  \
     adio/common/strfns.c                \
     adio/common/system_hints.c          \
     adio/common/hint_fns.c              \
-    adio/common/ad_threaded_io.c
+    adio/common/ad_threaded_io.c        \
+    adio/common/p2p_aggregation.c
 
diff --git a/src/mpi/romio/adio/common/p2p_aggregation.c b/src/mpi/romio/adio/common/p2p_aggregation.c
new file mode 100644
index 0000000..a4ec07c
--- /dev/null
+++ b/src/mpi/romio/adio/common/p2p_aggregation.c
@@ -0,0 +1,777 @@
+#include "adio.h"
+#include "adio_extern.h"
+//#include "ad_bg.h"
+#include <mpix.h>
+#include "../ad_bg/ad_bg_tuning.h"
+
+void P2PContigWriteAggregation(ADIO_File fd,
+	const void *buf,
+	int *error_code,
+	ADIO_Offset *st_offsets,
+	ADIO_Offset *end_offsets,
+	ADIO_Offset *fd_start,
+	ADIO_Offset* fd_end)
+{
+
+    *error_code = MPI_SUCCESS; // initialize to success
+
+    double startTimeBase,endTimeBase;
+
+    MPI_Status status;
+    pthread_t io_thread;
+    void *thread_ret;
+    ADIOI_IO_ThreadFuncData io_thread_args;
+
+    int nprocs,myrank;
+    MPI_Comm_size(fd->comm, &nprocs);
+    MPI_Comm_rank(fd->comm, &myrank);
+
+    int myAggRank = -1; // if I am an aggregator this is my index into fd->hints->ranklist
+    int iAmUsedAgg = 0;
+
+    startTimeBase = MPI_Wtime();
+
+    int naggs = fd->hints->cb_nodes;
+    int coll_bufsize = fd->hints->cb_buffer_size;
+    if (bgmpio_pthreadio == 1) {
+	/* split buffer in half for a kind of double buffering with the threads*/
+	coll_bufsize = fd->hints->cb_buffer_size/2;
+    }
+
+    int j;
+    for (j=0;j<naggs;j++) {
+	if (fd->hints->ranklist[j] == myrank) {
+	    myAggRank = j;
+	    if (fd_end[j] > fd_start[j]) {
+		iAmUsedAgg = 1;
+	    }
+	}
+    }
+
+    /* determine how much data and to whom I need to send (also record the
+     * offset in the buffer for procs that span file domains) */
+    int *targetAggsForMyData = (int *)ADIOI_Malloc(naggs * sizeof(int));
+    int *dataSizeToSendPerTargetAgg = (int *)ADIOI_Malloc(naggs * sizeof(int));
+    int *bufferOffsetToSendPerTargetAgg = (int *)ADIOI_Malloc(naggs * sizeof(int));
+    int numTargetAggs = 0;
+    int i;
+    for (i=0;i<naggs;i++) {
+	if ( ((st_offsets[myrank] >= fd_start[i]) &&  (st_offsets[myrank] <= fd_end[i])) || ((end_offsets[myrank] >= fd_start[i]) &&  (end_offsets[myrank] <= fd_end[i]))) {
+	    targetAggsForMyData[numTargetAggs] = fd->hints->ranklist[i];
+	    if ( ((st_offsets[myrank] >= fd_start[i]) &&  (st_offsets[myrank] <= fd_end[i])) && ((end_offsets[myrank] >= fd_start[i]) &&  (end_offsets[myrank] <= fd_end[i]))) {
+		dataSizeToSendPerTargetAgg[numTargetAggs] = (end_offsets[myrank] - st_offsets[myrank])+1;
+		bufferOffsetToSendPerTargetAgg[numTargetAggs] = 0;
+	    }
+	    else if ((st_offsets[myrank] >= fd_start[i]) &&  (st_offsets[myrank] <= fd_end[i])) { // starts in this fd and goes past it
+		dataSizeToSendPerTargetAgg[numTargetAggs] = (fd_end[i] - st_offsets[myrank]) +1;
+		bufferOffsetToSendPerTargetAgg[numTargetAggs] = 0;
+	    }
+	    else { // starts in fd before this and ends in it
+		dataSizeToSendPerTargetAgg[numTargetAggs] = (end_offsets[myrank] - fd_start[i]) +1;
+		bufferOffsetToSendPerTargetAgg[numTargetAggs] = fd_start[i]- st_offsets[myrank];
+	    }
+	    numTargetAggs++;
+	}
+    }
+
+    /* these 3 arrays track info on the procs that feed an aggregator */
+    int *sourceProcsForMyData=NULL;
+    int *remainingDataAmountToGetPerProc=NULL;
+    ADIO_Offset *remainingDataOffsetToGetPerProc=NULL;
+
+    int numSourceProcs = 0;
+    int totalDataSizeToGet = 0;
+
+    if (iAmUsedAgg) { /* for the used aggregators figure out how much data I
+			 need from what procs */
+
+	// count numSourceProcs so we know how large to make the arrays
+	int i;
+	for (i=0;i<nprocs;i++)
+	    if ( ((st_offsets[i] >= fd_start[myAggRank]) &&  (st_offsets[i] <= fd_end[myAggRank])) || ((end_offsets[i] >= fd_start[myAggRank]) &&  (end_offsets[i] <= fd_end[myAggRank])))
+		numSourceProcs++;
+
+	sourceProcsForMyData = (int *)ADIOI_Malloc(numSourceProcs * sizeof(int));
+	remainingDataAmountToGetPerProc = (int *)ADIOI_Malloc(numSourceProcs * sizeof(int));
+	remainingDataOffsetToGetPerProc = (ADIO_Offset *)ADIOI_Malloc(numSourceProcs * sizeof(ADIO_Offset));
+
+	/* TODO: here was a spot where the balancecontig code figured out bridge ranks */
+
+	/* everybody has the st_offsets and end_offsets for all ranks so if I am a
+	 * used aggregator go thru them and figure out which ranks have data that
+	 * falls into my file domain assigned to me */
+	numSourceProcs = 0;
+	for (i=0;i<nprocs;i++) {
+	    if ( ((st_offsets[i] >= fd_start[myAggRank]) &&  (st_offsets[i] <= fd_end[myAggRank])) || ((end_offsets[i] >= fd_start[myAggRank]) &&  (end_offsets[i] <= fd_end[myAggRank]))) {
+		sourceProcsForMyData[numSourceProcs] = i;
+		if ( ((st_offsets[i] >= fd_start[myAggRank]) &&  (st_offsets[i] <= fd_end[myAggRank])) && ((end_offsets[i] >= fd_start[myAggRank]) &&  (end_offsets[i] <= fd_end[myAggRank]))) {
+		    remainingDataAmountToGetPerProc[numSourceProcs] = (end_offsets[i] - st_offsets[i])+1;
+		    remainingDataOffsetToGetPerProc[numSourceProcs] = st_offsets[i];
+		}
+		else if ((st_offsets[i] >= fd_start[myAggRank]) &&  (st_offsets[i] <= fd_end[myAggRank])) {// starts in this fd and goes past it
+		    remainingDataAmountToGetPerProc[numSourceProcs] = (fd_end[myAggRank] - st_offsets[i]) +1;
+		    remainingDataOffsetToGetPerProc[numSourceProcs] = st_offsets[i];
+		}
+		else { // starts in fd before this and ends in it
+		    remainingDataAmountToGetPerProc[numSourceProcs] = (end_offsets[i] - fd_start[myAggRank]) +1;
+		    remainingDataOffsetToGetPerProc[numSourceProcs] = fd_start[myAggRank];
+		}
+		totalDataSizeToGet += remainingDataAmountToGetPerProc[numSourceProcs];
+#ifdef p2pcontigtrace
+		printf("getting %ld bytes from source proc %d in fd %d with borders %ld to %ld\n",remainingDataAmountToGetPerProc[numSourceProcs],fd->hints->ranklist[myAggRank],i,fd_start[myAggRank],fd_end[myAggRank]);
+#endif
+		numSourceProcs++;
+	    }
+	}
+    }
+
+    int *amountOfDataReqestedByTargetAgg = (int *)ADIOI_Malloc(naggs * sizeof(int));
+    for (i=0;i<numTargetAggs;i++) {
+	amountOfDataReqestedByTargetAgg[i] = 0;
+#ifdef p2pcontigtrace
+	printf("Need to send %ld bytes at buffer offset %d to agg %d\n", dataSizeToSendPerTargetAgg[i],bufferOffsetToSendPerTargetAgg[i],targetAggsForMyData[i]);
+#endif
+    }
+
+    int totalAmountDataSent = 0;
+    int totalAmountDataReceived = 0;
+    MPI_Request *mpiSizeToSendRequest = (MPI_Request *) ADIOI_Malloc(numTargetAggs * sizeof(MPI_Request));
+    MPI_Request *mpiRecvDataRequest = (MPI_Request *) ADIOI_Malloc(numSourceProcs * sizeof(MPI_Request));
+    MPI_Request *mpiSendDataSizeRequest = (MPI_Request *) ADIOI_Malloc(numSourceProcs * sizeof(MPI_Request));
+
+    MPI_Request *mpiSendDataToTargetAggRequest = (MPI_Request *) ADIOI_Malloc(numTargetAggs * sizeof(MPI_Request));
+    MPI_Status mpiWaitAnyStatusFromTargetAggs,mpiWaitAnyStatusFromSourceProcs;
+
+    // use the write buffer allocated in the file_open
+    char *write_buf0 = fd->io_buf;
+    char *write_buf1 = fd->io_buf + coll_bufsize;
+
+    /* start off pointing to the first buffer. If we use the 2nd buffer (threaded
+     * case) we'll swap later */
+    char *write_buf = write_buf0;
+
+    // compute number of rounds
+    ADIO_Offset numberOfRounds = (ADIO_Offset)((((ADIO_Offset)(end_offsets[nprocs-1]-st_offsets[0]))/((ADIO_Offset)((ADIO_Offset)coll_bufsize*(ADIO_Offset)naggs)))) + 1;
+
+    int currentWriteBuf = 0;
+    int useIOBuffer = 0;
+    if (bgmpio_pthreadio && (numberOfRounds>1)) {
+	useIOBuffer = 1;
+	io_thread = pthread_self();
+    }
+
+    ADIO_Offset currentRoundFDStart = 0;
+    ADIO_Offset currentRoundFDEnd = 0;
+
+    if (iAmUsedAgg) {
+	currentRoundFDStart = fd_start[myAggRank];
+    }
+
+    int *dataSizeGottenThisRoundPerProc = (int *)ADIOI_Malloc(numSourceProcs * sizeof(int));
+    int *mpiRequestMapPerProc = (int *)ADIOI_Malloc(numSourceProcs * sizeof(int));
+
+    endTimeBase = MPI_Wtime();
+    bgmpio_prof_cw[BGMPIO_CIO_T_MYREQ] += (endTimeBase-startTimeBase);
+    startTimeBase = MPI_Wtime();
+
+    /* each iteration of this loop writes a coll_bufsize portion of the file
+     * domain */
+    int roundIter;
+    for (roundIter=0;roundIter<numberOfRounds;roundIter++) {
+
+	// determine what offsets define the portion of the file domain the agg is writing this round
+	if (iAmUsedAgg) {
+	    if ((fd_end[myAggRank] - currentRoundFDStart) < coll_bufsize) {
+		currentRoundFDEnd = fd_end[myAggRank];
+	    }
+	    else
+		currentRoundFDEnd = currentRoundFDStart + coll_bufsize - 1;
+#ifdef p2pcontigtrace
+	    printf("currentRoundFDStart is %ld currentRoundFDEnd is %ld within file domeain %ld to %ld\n",currentRoundFDStart,currentRoundFDEnd,fd_start[myAggRank],fd_end[myAggRank]);
+#endif
+	}
+
+	int numRecvToWaitFor = 0;
+	int irecv;
+
+	/* the source procs receive the amount of data the aggs want them to send */
+	int i;
+	startTimeBase = MPI_Wtime();
+	for (i=0;i<numTargetAggs;i++) {
+	    MPI_Irecv(&amountOfDataReqestedByTargetAgg[i],1,
+		    MPI_INT,targetAggsForMyData[i],0,
+		    fd->comm,&mpiSizeToSendRequest[i]);
+	    numRecvToWaitFor++;
+#ifdef p2pcontigtrace
+	    printf("MPI_Irecv from rank %d\n",targetAggsForMyData[i]);
+#endif
+	}
+
+	// the aggs send the amount of data they need to their source procs
+	for (i=0;i<numSourceProcs;i++) {
+	    if ((remainingDataOffsetToGetPerProc[i] >= currentRoundFDStart) && (remainingDataOffsetToGetPerProc[i] <= currentRoundFDEnd)) {
+		if ((remainingDataOffsetToGetPerProc[i] + remainingDataAmountToGetPerProc[i]) <= currentRoundFDEnd)
+		    dataSizeGottenThisRoundPerProc[i] = remainingDataAmountToGetPerProc[i];
+		else
+		    dataSizeGottenThisRoundPerProc[i] = (currentRoundFDEnd - remainingDataOffsetToGetPerProc[i]) +1;
+	    }
+	    else if (((remainingDataOffsetToGetPerProc[i]+remainingDataAmountToGetPerProc[i]) >= currentRoundFDStart) && ((remainingDataOffsetToGetPerProc[i]+remainingDataAmountToGetPerProc[i]) <= currentRoundFDEnd)) {
+		if ((remainingDataOffsetToGetPerProc[i]) >= currentRoundFDStart)
+		    dataSizeGottenThisRoundPerProc[i] = remainingDataAmountToGetPerProc[i];
+		else
+		    dataSizeGottenThisRoundPerProc[i] = (remainingDataOffsetToGetPerProc[i]-currentRoundFDStart) +1;
+	    }
+	    else
+		dataSizeGottenThisRoundPerProc[i] = 0;
+
+#ifdef p2pcontigtrace
+	    printf("dataSizeGottenThisRoundPerProc[%d] set to %d - remainingDataOffsetToGetPerProc is %d remainingDataAmountToGetPerProc is %d currentRoundFDStart is %d currentRoundFDEnd is %d\n",i,dataSizeGottenThisRoundPerProc[i],remainingDataOffsetToGetPerProc[i],remainingDataAmountToGetPerProc[i],currentRoundFDStart,currentRoundFDEnd);
+#endif
+	    MPI_Isend(&dataSizeGottenThisRoundPerProc[i],1,MPI_INT,
+		    sourceProcsForMyData[i],0,fd->comm,&mpiSendDataSizeRequest[i]);
+
+	}
+
+	// the source procs send the requested data to the aggs - only send if requested more than 0 bytes
+	for (i = 0; i < numRecvToWaitFor; i++) {
+	    MPI_Waitany(numRecvToWaitFor,mpiSizeToSendRequest,&irecv,&mpiWaitAnyStatusFromTargetAggs);
+
+#ifdef p2pcontigtrace
+	    printf("was sent request for %d bytes from rank %d irecv index %d\n",amountOfDataReqestedByTargetAgg[irecv],targetAggsForMyData[irecv],irecv);
+#endif
+
+	    if (amountOfDataReqestedByTargetAgg[irecv] > 0) {
+		MPI_Isend(&((char*)buf)[bufferOffsetToSendPerTargetAgg[irecv]],amountOfDataReqestedByTargetAgg[irecv],MPI_BYTE,
+			targetAggsForMyData[irecv],0,fd->comm,&mpiSendDataToTargetAggRequest[irecv]);
+		totalAmountDataSent += amountOfDataReqestedByTargetAgg[irecv];
+		bufferOffsetToSendPerTargetAgg[irecv] += amountOfDataReqestedByTargetAgg[irecv];
+	    }
+
+	}
+
+	bgmpio_prof_cw[BGMPIO_CIO_T_DEXCH_SETUP] += (endTimeBase-startTimeBase);
+	startTimeBase = MPI_Wtime();
+
+	// the aggs receive the data from the source procs
+	int numDataRecvToWaitFor = 0;
+	for (i=0;i<numSourceProcs;i++) {
+
+	    int j, currentWBOffset = 0;
+	    for (j=0;j<i;j++)
+		currentWBOffset += dataSizeGottenThisRoundPerProc[j];
+
+	    // only receive from source procs that will send > 0 count data
+	    if (dataSizeGottenThisRoundPerProc[i] > 0) {
+#ifdef p2pcontigtrace
+		printf("receiving data from rank %d dataSizeGottenThisRoundPerProc is %d currentWBOffset is %d\n",sourceProcsForMyData[i],dataSizeGottenThisRoundPerProc[i],currentWBOffset);
+#endif
+		MPI_Irecv(&((char*)write_buf)[currentWBOffset],dataSizeGottenThisRoundPerProc[i],
+			MPI_BYTE,sourceProcsForMyData[i],0,
+			fd->comm,&mpiRecvDataRequest[numDataRecvToWaitFor]);
+		mpiRequestMapPerProc[numDataRecvToWaitFor] = i;
+		numDataRecvToWaitFor++;
+	    }
+
+#ifdef p2pcontigtrace
+	    printf("MPI_Irecv from rank %d\n",targetAggsForMyData[i]);
+#endif
+	}
+
+	int totalDataReceivedThisRound = 0;
+	for (i = 0; i < numDataRecvToWaitFor; i++) {
+	    MPI_Waitany(numDataRecvToWaitFor,mpiRecvDataRequest,
+		    &irecv,&mpiWaitAnyStatusFromSourceProcs);
+	    totalDataReceivedThisRound +=
+		dataSizeGottenThisRoundPerProc[mpiRequestMapPerProc[irecv]];
+	    totalAmountDataReceived +=
+		dataSizeGottenThisRoundPerProc[mpiRequestMapPerProc[irecv]];
+
+#ifdef p2pcontigtrace
+	    printf("numDataRecvToWaitFor is %d was sent %d bytes data for %d remaining bytes from rank %d irecv index %d\n",numDataRecvToWaitFor,dataSizeGottenThisRoundPerProc[mpiRequestMapPerProc[irecv]],remainingDataAmountToGetPerProc[mpiRequestMapPerProc[irecv]],sourceProcsForMyData[mpiRequestMapPerProc[irecv]],irecv);
+#endif
+	    remainingDataAmountToGetPerProc[mpiRequestMapPerProc[irecv]] -=
+		dataSizeGottenThisRoundPerProc[mpiRequestMapPerProc[irecv]];
+	    remainingDataOffsetToGetPerProc[mpiRequestMapPerProc[irecv]] +=
+		dataSizeGottenThisRoundPerProc[mpiRequestMapPerProc[irecv]];
+
+	}
+
+	endTimeBase = MPI_Wtime();
+	bgmpio_prof_cw[BGMPIO_CIO_T_DEXCH_NET] += (endTimeBase-startTimeBase);
+	// the aggs now write the data
+	if (numDataRecvToWaitFor > 0) {
+
+#ifdef p2pcontigtrace
+	    printf("totalDataReceivedThisRound is %d\n",totalDataReceivedThisRound);
+#endif
+	    if (!useIOBuffer) {
+
+		ADIO_WriteContig(fd, write_buf, (int)totalDataReceivedThisRound,
+			MPI_BYTE, ADIO_EXPLICIT_OFFSET,
+			currentRoundFDStart, &status, error_code);
+	    } else { // use the thread writer
+
+		if(!pthread_equal(io_thread, pthread_self())) {
+		    pthread_join(io_thread, &thread_ret);
+		    *error_code = *(int *)thread_ret;
+		    if (*error_code != MPI_SUCCESS) return;
+		    io_thread = pthread_self();
+
+		}
+		io_thread_args.fd = fd;
+		/* do a little pointer shuffling: background I/O works from one
+		 * buffer while two-phase machinery fills up another */
+
+		if (currentWriteBuf == 0) {
+		    io_thread_args.buf = write_buf0;
+		    currentWriteBuf = 1;
+		    write_buf = write_buf1;
+		}
+		else {
+		    io_thread_args.buf = write_buf1;
+		    currentWriteBuf = 0;
+		    write_buf = write_buf0;
+		}
+		io_thread_args.io_kind = ADIOI_WRITE;
+		io_thread_args.size = totalDataReceivedThisRound;
+		io_thread_args.offset = currentRoundFDStart;
+		io_thread_args.status = status;
+		io_thread_args.error_code = *error_code;
+		if ( (pthread_create(&io_thread, NULL,
+				ADIOI_IO_Thread_Func, &(io_thread_args))) != 0)
+		    io_thread = pthread_self();
+
+	    }
+
+	} // numDataRecvToWaitFor > 0
+
+	if (iAmUsedAgg)
+	    currentRoundFDStart += coll_bufsize;
+
+    } // for-loop roundIter
+
+    endTimeBase=MPI_Wtime();
+    bgmpio_prof_cw[BGMPIO_CIO_T_DEXCH] += (endTimeBase-startTimeBase);
+
+    if (useIOBuffer) { // thread writer cleanup
+
+	if ( !pthread_equal(io_thread, pthread_self()) ) {
+	    pthread_join(io_thread, &thread_ret);
+	    *error_code = *(int *)thread_ret;
+	}
+
+    }
+
+
+
+    if (iAmUsedAgg) {
+	ADIOI_Free(sourceProcsForMyData);
+	ADIOI_Free(remainingDataAmountToGetPerProc);
+	ADIOI_Free(remainingDataOffsetToGetPerProc);
+    }
+
+    ADIOI_Free(targetAggsForMyData);
+    ADIOI_Free(dataSizeToSendPerTargetAgg);
+    ADIOI_Free(bufferOffsetToSendPerTargetAgg);
+    ADIOI_Free(amountOfDataReqestedByTargetAgg);
+    ADIOI_Free(mpiSizeToSendRequest);
+    ADIOI_Free(mpiRecvDataRequest);
+    ADIOI_Free(mpiSendDataSizeRequest);
+    ADIOI_Free(mpiSendDataToTargetAggRequest);
+    ADIOI_Free(dataSizeGottenThisRoundPerProc);
+    ADIOI_Free(mpiRequestMapPerProc);
+
+    /* TODO: still need a barrier here? */
+    MPI_Barrier(fd->comm);
+    return;
+}
+
+void P2PContigReadAggregation(ADIO_File fd,
+	const void *buf,
+	int *error_code,
+	ADIO_Offset *st_offsets,
+	ADIO_Offset *end_offsets,
+	ADIO_Offset *fd_start,
+	ADIO_Offset* fd_end)
+{
+
+    *error_code = MPI_SUCCESS; // initialize to success
+
+    double startTimeBase,endTimeBase;
+
+    MPI_Status status;
+    pthread_t io_thread;
+    void *thread_ret;
+    ADIOI_IO_ThreadFuncData io_thread_args;
+
+    startTimeBase = MPI_Wtime();
+    int nprocs,myrank;
+    MPI_Comm_size(fd->comm, &nprocs);
+    MPI_Comm_rank(fd->comm, &myrank);
+
+    int myAggRank = -1; // if I am an aggregator this is my index into fd->hints->ranklist
+    int iAmUsedAgg = 0;
+
+    int naggs = fd->hints->cb_nodes;
+    int coll_bufsize = fd->hints->cb_buffer_size;
+    if (bgmpio_pthreadio == 1)
+	/* share buffer between working threads */
+	coll_bufsize = coll_bufsize/2;
+
+    int j;
+    for (j=0;j<naggs;j++) {
+	if (fd->hints->ranklist[j] == myrank) {
+	    myAggRank = j;
+	    if (fd_end[j] > fd_start[j]) {
+		iAmUsedAgg = 1;
+	    }
+	}
+    }
+
+    // for my offset range determine how much data and from whom I need to get (also record the offset in the buffer for procs that span file domains)
+    int *sourceAggsForMyData = (int *)ADIOI_Malloc(naggs * sizeof(int));
+    int *dataSizeToGetPerSourceAgg = (int *)ADIOI_Malloc(naggs * sizeof(int));
+    int *bufferOffsetToGetPerSourceAgg = (int *)ADIOI_Malloc(naggs * sizeof(int));
+    int numSourceAggs = 0;
+    int i;
+    for (i=0;i<naggs;i++) {
+	if ( ((st_offsets[myrank] >= fd_start[i]) &&  (st_offsets[myrank] <= fd_end[i])) || ((end_offsets[myrank] >= fd_start[i]) &&  (end_offsets[myrank] <= fd_end[i]))) {
+	    sourceAggsForMyData[numSourceAggs] = fd->hints->ranklist[i];
+	    if ( ((st_offsets[myrank] >= fd_start[i]) &&  (st_offsets[myrank] <= fd_end[i])) && ((end_offsets[myrank] >= fd_start[i]) &&  (end_offsets[myrank] <= fd_end[i]))) {
+		dataSizeToGetPerSourceAgg[numSourceAggs] = (end_offsets[myrank] - st_offsets[myrank])+1;
+		bufferOffsetToGetPerSourceAgg[numSourceAggs] = 0;
+	    }
+	    else if ((st_offsets[myrank] >= fd_start[i]) &&  (st_offsets[myrank] <= fd_end[i])) { // starts in this fd and goes past it
+		dataSizeToGetPerSourceAgg[numSourceAggs] = (fd_end[i] - st_offsets[myrank]) +1;
+		bufferOffsetToGetPerSourceAgg[numSourceAggs] = 0;
+	    }
+	    else { // starts in fd before this and ends in it
+		dataSizeToGetPerSourceAgg[numSourceAggs] = (end_offsets[myrank] - fd_start[i]) +1;
+		bufferOffsetToGetPerSourceAgg[numSourceAggs] = fd_start[i]- st_offsets[myrank];
+	    }
+	    numSourceAggs++;
+	}
+    }
+
+    /* these 3 arrays track info on the procs that are fed from an aggregator -
+     * to sacrifice some performance at setup to save on memory instead of
+     * using max size of nprocs for the arrays could determine exact size first
+     * and then allocate that size */
+    int *targetProcsForMyData=NULL;
+    int *remainingDataAmountToSendPerProc=NULL;
+    ADIO_Offset *remainingDataOffsetToSendPerProc=NULL;
+
+    int numTargetProcs = 0;
+    int totalDataSizeToSend = 0;
+
+    if (iAmUsedAgg) {
+	/* for the used aggregators figure out how much data I need from what procs */
+
+	/* count numTargetProcs so we know how large to make the arrays */
+	for (i=0;i<nprocs;i++)
+	    if ( ((st_offsets[i] >= fd_start[myAggRank]) &&
+			(st_offsets[i] <= fd_end[myAggRank])) ||
+		    ((end_offsets[i] >= fd_start[myAggRank]) &&
+		     (end_offsets[i] <= fd_end[myAggRank]))  )
+		numTargetProcs++;
+
+	targetProcsForMyData =
+	    (int *)ADIOI_Malloc(numTargetProcs * sizeof(int));
+	remainingDataAmountToSendPerProc =
+	    (int *)ADIOI_Malloc(numTargetProcs * sizeof(int));
+	remainingDataOffsetToSendPerProc =
+	    (ADIO_Offset *)ADIOI_Malloc(numTargetProcs * sizeof(ADIO_Offset));
+
+	/* TODO: some balancecontig logic might need to go here */
+
+	/* everybody has the st_offsets and end_offsets for all ranks so if I am a
+	 * used aggregator go thru them and figure out which ranks have data that
+	 * falls into my file domain assigned to me */
+	numTargetProcs = 0;
+	for (i=0;i<nprocs;i++) {
+	    if ( ((st_offsets[i] >= fd_start[myAggRank]) &&  (st_offsets[i] <= fd_end[myAggRank])) || ((end_offsets[i] >= fd_start[myAggRank]) &&  (end_offsets[i] <= fd_end[myAggRank]))) {
+		targetProcsForMyData[numTargetProcs] = i;
+		if ( ((st_offsets[i] >= fd_start[myAggRank]) &&  (st_offsets[i] <= fd_end[myAggRank])) && ((end_offsets[i] >= fd_start[myAggRank]) &&  (end_offsets[i] <= fd_end[myAggRank]))) {
+		    remainingDataAmountToSendPerProc[numTargetProcs] = (end_offsets[i] - st_offsets[i])+1;
+		    remainingDataOffsetToSendPerProc[numTargetProcs] = st_offsets[i];
+		}
+		else if ((st_offsets[i] >= fd_start[myAggRank]) &&  (st_offsets[i] <= fd_end[myAggRank])) {// starts in this fd and goes past it
+		    remainingDataAmountToSendPerProc[numTargetProcs] = (fd_end[myAggRank] - st_offsets[i]) +1;
+		    remainingDataOffsetToSendPerProc[numTargetProcs] = st_offsets[i];
+		}
+		else { // starts in fd before this and ends in it
+		    remainingDataAmountToSendPerProc[numTargetProcs] = (end_offsets[i] - fd_start[myAggRank]) +1;
+		    remainingDataOffsetToSendPerProc[numTargetProcs] = fd_start[myAggRank];
+		}
+		totalDataSizeToSend += remainingDataAmountToSendPerProc[numTargetProcs];
+		numTargetProcs++;
+	    }
+	}
+    }
+
+    int *amountOfDataReqestedFromSourceAgg = (int *)ADIOI_Malloc(naggs * sizeof(int));
+    for (i=0;i<numSourceAggs;i++) {
+	amountOfDataReqestedFromSourceAgg[i] = 0;
+    }
+
+
+    int totalAmountDataSent = 0;
+    int totalAmountDataReceived = 0;
+    MPI_Request *mpiSizeToSendRequest = (MPI_Request *) ADIOI_Malloc(numSourceAggs * sizeof(MPI_Request));
+    MPI_Request *mpiRecvDataFromSourceAggsRequest = (MPI_Request *) ADIOI_Malloc(numSourceAggs * sizeof(MPI_Request));
+    MPI_Request *mpiSendDataSizeRequest = (MPI_Request *) ADIOI_Malloc(numTargetProcs * sizeof(MPI_Request));
+
+    MPI_Request *mpiSendDataToTargetProcRequest = (MPI_Request *) ADIOI_Malloc(numTargetProcs * sizeof(MPI_Request));
+    MPI_Status mpiWaitAnyStatusFromTargetAggs,mpiWaitAnyStatusFromSourceProcs;
+
+    /* use the two-phase buffer allocated in the file_open - no app should ever
+     * be both reading and writing at the same time */
+    char *read_buf0 = fd->io_buf;
+    char *read_buf1 = fd->io_buf + coll_bufsize;
+    /* if threaded i/o selected, we'll do a kind of double buffering */
+    char *read_buf = read_buf0;
+
+    // compute number of rounds
+    ADIO_Offset numberOfRounds = (ADIO_Offset)((((ADIO_Offset)(end_offsets[nprocs-1]-st_offsets[0]))/((ADIO_Offset)((ADIO_Offset)coll_bufsize*(ADIO_Offset)naggs)))) + 1;
+
+#ifdef p2pcontigtrace
+    printf("need to send %d bytes - coll_bufsize is %d\n",totalDataSizeToSend,coll_bufsize);
+#endif
+
+    ADIO_Offset currentRoundFDStart = 0, nextRoundFDStart = 0;
+    ADIO_Offset currentRoundFDEnd = 0, nextRoundFDEnd = 0;
+
+    if (iAmUsedAgg) {
+	currentRoundFDStart = fd_start[myAggRank];
+	nextRoundFDStart = fd_start[myAggRank];
+    }
+
+    int *dataSizeSentThisRoundPerProc = (int *)ADIOI_Malloc(numTargetProcs * sizeof(int));
+    *error_code = MPI_SUCCESS;
+
+    int currentReadBuf = 0;
+    int useIOBuffer = 0;
+    if (bgmpio_pthreadio && (numberOfRounds>1)) {
+	useIOBuffer = 1;
+	io_thread = pthread_self();
+    }
+
+    endTimeBase = MPI_Wtime();
+    bgmpio_prof_cw[BGMPIO_CIO_T_MYREQ] += (endTimeBase-startTimeBase);
+
+
+    // each iteration of this loop reads a coll_bufsize portion of the file domain
+    int roundIter;
+    for (roundIter=0;roundIter<numberOfRounds;roundIter++) {
+
+	int irecv;
+	// determine what offsets define the portion of the file domain the agg is reading this round
+	if (iAmUsedAgg) {
+
+	    currentRoundFDStart = nextRoundFDStart;
+
+	    if (!useIOBuffer || (roundIter == 0)) {
+		int amountDataToReadThisRound;
+		if ((fd_end[myAggRank] - currentRoundFDStart) < coll_bufsize) {
+		    currentRoundFDEnd = fd_end[myAggRank];
+		    amountDataToReadThisRound = ((currentRoundFDEnd-currentRoundFDStart)+1);
+		}
+		else {
+		    currentRoundFDEnd = currentRoundFDStart + coll_bufsize - 1;
+		    amountDataToReadThisRound = coll_bufsize;
+		}
+
+		// read amountDataToReadThisRound bytes starting at currentRoundFDStart
+		ADIO_ReadContig(fd, read_buf,amountDataToReadThisRound,
+			MPI_BYTE, ADIO_EXPLICIT_OFFSET, currentRoundFDStart,
+			&status, error_code);
+
+		endTimeBase = MPI_Wtime();
+	    }
+
+	    if (useIOBuffer) { // use the thread reader for the next round
+		// switch back and forth between the read buffers so that the data aggregation code is disseminating 1 buffer while the thread is reading into the other
+
+		if (roundIter > 0)
+		    currentRoundFDEnd = nextRoundFDEnd;
+
+		if (roundIter < (numberOfRounds-1)) {
+		    nextRoundFDStart += coll_bufsize;
+		    int amountDataToReadNextRound;
+		    if ((fd_end[myAggRank] - nextRoundFDStart) < coll_bufsize) {
+			nextRoundFDEnd = fd_end[myAggRank];
+			amountDataToReadNextRound = ((nextRoundFDEnd-nextRoundFDStart)+1);
+		    }
+		    else {
+			nextRoundFDEnd = nextRoundFDStart + coll_bufsize - 1;
+			amountDataToReadNextRound = coll_bufsize;
+		    }
+
+		    if(!pthread_equal(io_thread, pthread_self())) {
+			pthread_join(io_thread, &thread_ret);
+			*error_code = *(int *)thread_ret;
+			if (*error_code != MPI_SUCCESS) return;
+			io_thread = pthread_self();
+
+		    }
+		    io_thread_args.fd = fd;
+		    /* do a little pointer shuffling: background I/O works from one
+		     * buffer while two-phase machinery fills up another */
+
+		    if (currentReadBuf == 0) {
+			io_thread_args.buf = read_buf0;
+			currentReadBuf = 1;
+			read_buf = read_buf1;
+		    }
+		    else {
+			io_thread_args.buf = read_buf1;
+			currentReadBuf = 0;
+			read_buf = read_buf0;
+		    }
+		    io_thread_args.io_kind = ADIOI_READ;
+		    io_thread_args.size = amountDataToReadNextRound;
+		    io_thread_args.offset = nextRoundFDStart;
+		    io_thread_args.status = status;
+		    io_thread_args.error_code = *error_code;
+		    if ( (pthread_create(&io_thread, NULL,
+				    ADIOI_IO_Thread_Func, &(io_thread_args))) != 0)
+			io_thread = pthread_self();
+
+		}
+		else { // last round
+
+		    if(!pthread_equal(io_thread, pthread_self())) {
+			pthread_join(io_thread, &thread_ret);
+			*error_code = *(int *)thread_ret;
+			if (*error_code != MPI_SUCCESS) return;
+			io_thread = pthread_self();
+
+		    }
+		    if (currentReadBuf == 0) {
+			read_buf = read_buf0;
+		    }
+		    else {
+			read_buf = read_buf1;
+		    }
+
+		}
+	    } // useIOBuffer
+	} // IAmUsedAgg
+
+	/* the source procs receive the amount of data the aggs will be sending them */
+	int i;
+	for (i=0;i<numSourceAggs;i++) {
+	    MPI_Irecv(&amountOfDataReqestedFromSourceAgg[i],1,
+		    MPI_INT,sourceAggsForMyData[i],0,
+		    fd->comm,&mpiSizeToSendRequest[i]);
+	}
+
+	// the aggs send the amount of data they will be sending to their source procs
+	for (i=0;i<numTargetProcs;i++) {
+	    if ((remainingDataOffsetToSendPerProc[i] >= currentRoundFDStart) &&
+		    (remainingDataOffsetToSendPerProc[i] <= currentRoundFDEnd)) {
+		if ((remainingDataOffsetToSendPerProc[i] +
+			    remainingDataAmountToSendPerProc[i]) <= currentRoundFDEnd)
+		    dataSizeSentThisRoundPerProc[i] = remainingDataAmountToSendPerProc[i];
+		else
+		    dataSizeSentThisRoundPerProc[i] =
+			(currentRoundFDEnd - remainingDataOffsetToSendPerProc[i]) +1;
+	    }
+	    else if (((remainingDataOffsetToSendPerProc[i]+
+			    remainingDataAmountToSendPerProc[i]) >=
+			currentRoundFDStart) &&
+		    ((remainingDataOffsetToSendPerProc[i]+
+		      remainingDataAmountToSendPerProc[i]) <= currentRoundFDEnd)) {
+		if ((remainingDataOffsetToSendPerProc[i]) >= currentRoundFDStart)
+		    dataSizeSentThisRoundPerProc[i] = remainingDataAmountToSendPerProc[i];
+		else
+		    dataSizeSentThisRoundPerProc[i] =
+			(remainingDataOffsetToSendPerProc[i]-currentRoundFDStart) +1;
+	    }
+	    else
+		dataSizeSentThisRoundPerProc[i] = 0;
+
+	    MPI_Isend(&dataSizeSentThisRoundPerProc[i],1,MPI_INT,
+		    targetProcsForMyData[i],0,fd->comm,&mpiSendDataSizeRequest[i]);
+
+	}
+
+	/* the source procs get the requested data amount from the aggs and then
+	 * receive that amount of data - only recv if requested more than 0 bytes */
+	int numDataRecvToWaitFor = 0;
+	for (i = 0; i < numSourceAggs; i++) {
+	    MPI_Waitany(numSourceAggs,mpiSizeToSendRequest,
+		    &irecv,&mpiWaitAnyStatusFromTargetAggs);
+	    if (amountOfDataReqestedFromSourceAgg[irecv] > 0) {
+
+		MPI_Irecv(&((char*)buf)[bufferOffsetToGetPerSourceAgg[irecv]],
+			amountOfDataReqestedFromSourceAgg[irecv],MPI_BYTE,
+			sourceAggsForMyData[irecv],0,fd->comm,
+			&mpiRecvDataFromSourceAggsRequest[numDataRecvToWaitFor]);
+		totalAmountDataSent += amountOfDataReqestedFromSourceAgg[irecv];
+		bufferOffsetToGetPerSourceAgg[irecv] += amountOfDataReqestedFromSourceAgg[irecv];
+
+		numDataRecvToWaitFor++;
+	    }
+	}
+
+	// the aggs send the data to the source procs
+	for (i=0;i<numTargetProcs;i++) {
+
+	    int j, currentWBOffset = 0;
+	    for (j=0;j<i;j++)
+		currentWBOffset += dataSizeSentThisRoundPerProc[j];
+
+	    // only send to target procs that will recv > 0 count data
+	    if (dataSizeSentThisRoundPerProc[i] > 0) {
+		MPI_Isend(&((char*)read_buf)[currentWBOffset],
+			dataSizeSentThisRoundPerProc[i],
+			MPI_BYTE,targetProcsForMyData[i],0,
+			fd->comm,&mpiSendDataToTargetProcRequest[i]);
+		remainingDataAmountToSendPerProc[i] -= dataSizeSentThisRoundPerProc[i];
+		remainingDataOffsetToSendPerProc[i] += dataSizeSentThisRoundPerProc[i];
+	    }
+	}
+
+	// wait for the target procs to get their data
+	for (i = 0; i < numDataRecvToWaitFor; i++) {
+	    MPI_Waitany(numDataRecvToWaitFor,mpiRecvDataFromSourceAggsRequest,
+		    &irecv,&mpiWaitAnyStatusFromSourceProcs);
+	}
+
+	nextRoundFDStart = currentRoundFDStart + coll_bufsize;
+
+	MPI_Barrier(fd->comm); // need to sync up the source aggs which did the isend with the target procs which did the irecvs to give the target procs time to get the data before overwriting with next round readcontig
+
+    } // for-loop roundIter
+
+    if (useIOBuffer) { // thread reader cleanup
+
+	if ( !pthread_equal(io_thread, pthread_self()) ) {
+	    pthread_join(io_thread, &thread_ret);
+	    *error_code = *(int *)thread_ret;
+	}
+    }
+
+    if (iAmUsedAgg) {
+	ADIOI_Free(targetProcsForMyData);
+	ADIOI_Free(remainingDataAmountToSendPerProc);
+	ADIOI_Free(remainingDataOffsetToSendPerProc);
+    }
+
+    ADIOI_Free(sourceAggsForMyData);
+    ADIOI_Free(dataSizeToGetPerSourceAgg);
+    ADIOI_Free(bufferOffsetToGetPerSourceAgg);
+    ADIOI_Free(amountOfDataReqestedFromSourceAgg);
+    ADIOI_Free(mpiSizeToSendRequest);
+    ADIOI_Free(mpiRecvDataFromSourceAggsRequest);
+    ADIOI_Free(mpiSendDataSizeRequest);
+    ADIOI_Free(mpiSendDataToTargetProcRequest);
+    ADIOI_Free(dataSizeSentThisRoundPerProc);
+
+    /* TODO: is Barrier here needed? */
+    MPI_Barrier(fd->comm);
+
+    return;
+
+}

http://git.mpich.org/mpich.git/commitdiff/a19edd236c3bfb574c6d49080202a17e3e6a4cbe

commit a19edd236c3bfb574c6d49080202a17e3e6a4cbe
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Sun Jan 26 14:20:47 2014 -0600

    deferred open fixup: broadcast from correct root
    
    in deferred open case, we will have created an "aggregator communicator"
    consisting of i/o aggregators.  the 'ranklist' enumerates ranks in
    fd->comm, but is not meaningful in the aggregator communicator.
    likewise, we do not simply broadcast from '0' in the no-deferred-open
    case because rank 0 might not be an aggregator.

diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_open.c b/src/mpi/romio/adio/ad_bg/ad_bg_open.c
index 3a36a5a..7afa14d 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_open.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_open.c
@@ -116,7 +116,7 @@ static void scaleable_stat(ADIO_File fd)
     long buf[2];
     MPI_Comm_rank(fd->comm, &rank);
 
-    if (rank == fd->hints->ranklist[0]) {
+    if ((rank == fd->hints->ranklist[0]) || (fd->comm == MPI_COMM_SELF)) {
 	/* Get the (real) underlying file system block size */
 	rc = stat64(fd->filename, &bg_stat);
 	if (rc >= 0)
@@ -164,7 +164,7 @@ static void scaleable_stat(ADIO_File fd)
     if (fd->comm != MPI_COMM_SELF) { /* if indep open, there's no one to talk to*/
 	if (fd->agg_comm != MPI_COMM_NULL) /* deferred open: only a subset of
 					      processes participate */
-	    MPI_Bcast(buf, 2, MPI_LONG, fd->hints->ranklist[0], fd->agg_comm);
+	    MPI_Bcast(buf, 2, MPI_LONG, 0, fd->agg_comm);
 	else
 	    MPI_Bcast(buf, 2, MPI_LONG, fd->hints->ranklist[0], fd->comm);
     }

http://git.mpich.org/mpich.git/commitdiff/dde97df0e58d3b8b9ba4f32b4dace0285e90ddbc

commit dde97df0e58d3b8b9ba4f32b4dace0285e90ddbc
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Thu Jan 23 15:52:58 2014 -0600

    romio-timing: even finer-grained timing

diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
index d6a6725..b21ac13 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
@@ -195,6 +195,8 @@ void ad_bg_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs )
 		    bgmpio_prof_max[ BGMPIO_CIO_T_OTHREQ ]   );
 	    fprintf(stderr,"EXCHANGE-max: %10.3f , ",
 		    bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH ]    );
+	    fprintf(stderr, "EXCHANGE-RECV_EXCH-max: %10.3f , ",
+		    bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH_RECV_EXCH]  );
 	    fprintf(stderr, "EXCHANGE-SETUP-max: %10.3f , ",
 		    bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH_SETUP]  );
 	    fprintf(stderr, "EXCHANGE-NET-max: %10.3f , ",
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
index ea50580..4055fcc 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
@@ -41,6 +41,9 @@ enum {
     BGMPIO_CIO_T_MYREQ,	/* time for ADIOI_BG_Calc_my_req(), local */
     BGMPIO_CIO_T_OTHREQ,	/* time for ADIOI_Calc_others_req(), short Alltoall */
     BGMPIO_CIO_T_DEXCH,	/* time for I/O data exchange */
+    /* the next DEXCH_* timers capture finer-grained portions of T_DEXCH */
+    BGMPIO_CIO_T_DEXCH_RECV_EXCH,/* time for each process to exchange receive
+				    size info with everyone else */
     BGMPIO_CIO_T_DEXCH_SETUP,	/* time for setup portion of I/O data exchange */
     BGMPIO_CIO_T_DEXCH_NET,	/* time for network portion of I/O data exchange */
     BGMPIO_CIO_T_DEXCH_SORT, 	/* time to sort requesst in I/O data exchange */
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c b/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
index 3338041..a29ce80 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
@@ -1322,6 +1322,9 @@ static void ADIOI_W_Exchange_data_alltoallv(
   /* exchange recv_size info so that each process knows how much to
      send to whom. */
     MPI_Alltoall(recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fd->comm);
+
+    bgmpio_prof_cw[BGMPIO_CIO_T_DEXCH_RECV_EXCH] += MPI_Wtime() - io_time;
+    io_time = MPI_Wtime();
     
     nprocs_recv = 0;
     for (i=0; i<nprocs; i++) if (recv_size[i]) { nprocs_recv++; }

http://git.mpich.org/mpich.git/commitdiff/0a437100052f921abb2a73c3d806c8828e68bfe3

commit 0a437100052f921abb2a73c3d806c8828e68bfe3
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Tue Jan 21 09:26:24 2014 -0600

    Two-phase I/O with threaded write
    
    Experimental async-with-pthread I/O approach to hiding some of the I/O
    latency/variability from the two-phase collectives.
    
    heavily modified from Paul Coffman's (pkcoffman at us.ibm.com) original work

diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c b/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
index 9fb9128..d3f37af 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
@@ -88,6 +88,8 @@ extern void ADIOI_Calc_my_off_len(ADIO_File fd, int bufcount, MPI_Datatype
 			    ADIO_Offset *end_offset_ptr, int
 			   *contig_access_count_ptr);
 
+
+
 void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
 			       MPI_Datatype datatype, int file_ptr_type,
 			       ADIO_Offset offset, ADIO_Status *status, int
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
index e1b0e35..d6a6725 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
@@ -35,6 +35,7 @@ int 	bgmpio_tunegather;
 int 	bgmpio_tuneblocking;
 long    bglocklessmpio_f_type;
 int     bgmpio_bg_nagg_pset;
+int     bgmpio_pthreadio;
 
 double	bgmpio_prof_cw    [BGMPIO_CIO_LAST];
 double	bgmpio_prof_cr    [BGMPIO_CIO_LAST];
@@ -81,7 +82,17 @@ double	bgmpio_prof_cr    [BGMPIO_CIO_LAST];
  *   - any integer
  *   - Default is 8
  *
+ * - BGMPIO_PTHREADIO - Enables a very simple form of asynchronous io where a
+ *   pthread is spawned to do the posix writes while the main thread does the
+ *   data aggregation - useful for large files where multiple rounds are
+ *   required (more than the cb_buffer_size of data per aggregator).   User
+ *   must ensure there is hw resource available for the thread to run.  I
+ *   am sure there is a better way to do this involving comm threads - this is
+ *   just a start.  NOTE: For some reason the stats collected when this is
+ *   enabled misses some of the data so the data sizes are off a bit - this is
+ *   a statistical issue only, the data is still accurately written out
 */
+
 void ad_bg_get_env_vars() {
     char *x, *dummy;
 
@@ -108,6 +119,12 @@ void ad_bg_get_env_vars() {
     bgmpio_bg_nagg_pset = ADIOI_BG_NAGG_PSET_DFLT;
     x = getenv("BGMPIO_NAGG_PSET");
     if (x) bgmpio_bg_nagg_pset = atoi(x);
+
+    bgmpio_pthreadio = 0;
+    x = getenv( "BGMPIO_PTHREADIO" );
+    if (x) bgmpio_pthreadio = atoi(x);
+
+
 }
 
 /* report timing breakdown for MPI I/O collective call */
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
index 818727d..ea50580 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
@@ -65,6 +65,7 @@ extern int 	bgmpio_comm;
 extern int 	bgmpio_tunegather;
 extern int 	bgmpio_tuneblocking;
 extern long bglocklessmpio_f_type;
+extern int      bgmpio_pthreadio;
 
 /* Default is, well, kind of complicated. Blue Gene /L and /P had "psets": one
  * i/o node and all compute nodes wired to it.  On Blue Gene /Q that
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c b/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
index 5bae065..3338041 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
@@ -26,6 +26,8 @@
 #include "mpe.h"
 #endif
 
+#include <pthread.h>
+
 /* prototypes of functions used for collective writes only. */
 static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
                          datatype, int nprocs, int myrank, ADIOI_Access
@@ -428,7 +430,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
     ADIO_Offset size=0;
     int hole, i, j, m, ntimes, max_ntimes, buftype_is_contig;
     ADIO_Offset st_loc=-1, end_loc=-1, off, done, req_off;
-    char *write_buf=NULL;
+    char *write_buf=NULL, *write_buf2=NULL;
     int *curr_offlen_ptr, *count, *send_size, req_len, *recv_size;
     int *partial_recv, *sent_to_proc, *start_pos, flag;
     int *send_buf_idx, *curr_to_proc, *done_to_proc;
@@ -438,6 +440,9 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
     int info_flag, coll_bufsize;
     char *value;
     static char myname[] = "ADIOI_EXCH_AND_WRITE";
+    pthread_t io_thread;
+    void *thread_ret;
+    ADIOI_IO_ThreadFuncData io_thread_args;
 
     *error_code = MPI_SUCCESS;  /* changed below if error */
     /* only I/O errors are currently reported */
@@ -452,6 +457,11 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
     coll_bufsize = atoi(value);
     ADIOI_Free(value);
 
+    if (bgmpio_pthreadio == 1){
+	/* ROMIO will spawn an additional thread. both threads use separate
+	 * halves of the collective buffer*/
+	coll_bufsize = coll_bufsize/2;
+    }
 
     for (i=0; i < nprocs; i++) {
 	if (others_req[i].count) {
@@ -480,6 +490,9 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
 		  fd->comm); 
 
     write_buf = fd->io_buf;
+    if (bgmpio_pthreadio == 1) {
+	write_buf2 = fd->io_buf + coll_bufsize;
+    }
 
     curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); 
     /* its use is explained below. calloc initializes to 0. */
@@ -536,6 +549,9 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
     done = 0;
     off = st_loc;
 
+    if(bgmpio_pthreadio == 1)
+	io_thread = pthread_self();
+
 #ifdef PROFILE
 	MPE_Log_event(14, 0, "end computation");
 #endif
@@ -658,14 +674,48 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
 
 	if (flag) {
       ADIOI_Assert(size == (int)size);
-	    ADIO_WriteContig(fd, write_buf, (int)size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, 
-                        off, &status, error_code);
-	    if (*error_code != MPI_SUCCESS) return;
+	    if (bgmpio_pthreadio == 1) {
+		/* there is no such thing as "invalid pthread identifier", so
+		 * we'll use pthread_self() instead.  Before we do I/O we want
+		 * to complete I/O from any previous iteration -- but only a
+		 * previous iteration that had I/O work to do (i.e. set 'flag')
+		 */
+		if(!pthread_equal(io_thread, pthread_self())) {
+		    pthread_join(io_thread, &thread_ret);
+		    *error_code = *(int *)thread_ret;
+		    if (*error_code != MPI_SUCCESS) return;
+		    io_thread = pthread_self();
+
+		}
+		io_thread_args.fd = fd;
+		/* do a little pointer shuffling: background I/O works from one
+		 * buffer while two-phase machinery fills up another */
+		io_thread_args.buf = write_buf;
+		ADIOI_SWAP(write_buf, write_buf2, char*);
+		io_thread_args.io_kind = ADIOI_WRITE;
+		io_thread_args.size = size;
+		io_thread_args.offset = off;
+		io_thread_args.status = status;
+		io_thread_args.error_code = *error_code;
+		if ( (pthread_create(&io_thread, NULL,
+			ADIOI_IO_Thread_Func, &(io_thread_args))) != 0)
+		    io_thread = pthread_self();
+	    } else {
+		ADIO_WriteContig(fd, write_buf, (int)size, MPI_BYTE,
+			ADIO_EXPLICIT_OFFSET, off, &status, error_code);
+		if (*error_code != MPI_SUCCESS) return;
+	    }
 	}
 
 	off += size;
 	done += size;
     }
+    if (bgmpio_pthreadio == 1) {
+	if ( !pthread_equal(io_thread, pthread_self()) ) {
+	    pthread_join(io_thread, &thread_ret);
+	    *error_code = *(int *)thread_ret;
+	}
+    }
 
     for (i=0; i<nprocs; i++) count[i] = recv_size[i] = 0;
 #ifdef PROFILE
diff --git a/src/mpi/romio/adio/common/Makefile.mk b/src/mpi/romio/adio/common/Makefile.mk
index 0acb27d..f95f90a 100644
--- a/src/mpi/romio/adio/common/Makefile.mk
+++ b/src/mpi/romio/adio/common/Makefile.mk
@@ -66,5 +66,6 @@ romio_other_sources +=                  \
     adio/common/status_setb.c           \
     adio/common/strfns.c                \
     adio/common/system_hints.c          \
-    adio/common/hint_fns.c
+    adio/common/hint_fns.c              \
+    adio/common/ad_threaded_io.c
 
diff --git a/src/mpi/romio/adio/common/ad_threaded_io.c b/src/mpi/romio/adio/common/ad_threaded_io.c
new file mode 100644
index 0000000..d06d828
--- /dev/null
+++ b/src/mpi/romio/adio/common/ad_threaded_io.c
@@ -0,0 +1,31 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *   Copyright (C) 1997-2001 University of Chicago.
+ *   See COPYRIGHT notice in top-level directory.
+ */
+
+#include "adio.h"
+#include "adio_extern.h"
+
+/* Function for running in another thread to do the file read or write while
+ * the main thread is doing data aggregation - useful only when multiple rounds
+ * are needed due to file size relative to the buffer size and number of
+ * aggregators */
+
+void *ADIOI_IO_Thread_Func(void *vptr_args) {
+    ADIOI_IO_ThreadFuncData *args = (ADIOI_IO_ThreadFuncData*)vptr_args;
+
+    ADIOI_Assert(args->size == (int)(args->size));
+
+    if (args->io_kind == ADIOI_READ) {
+	ADIO_ReadContig(args->fd, args->buf, args->size, MPI_BYTE,
+		ADIO_EXPLICIT_OFFSET, args->offset,
+		&(args->status), &(args->error_code));
+    } else {
+	ADIO_WriteContig(args->fd, args->buf, args->size, MPI_BYTE,
+		ADIO_EXPLICIT_OFFSET, args->offset,
+		&(args->status), &(args->error_code));
+    }
+    pthread_exit(&(args->error_code));
+    return NULL;
+}
diff --git a/src/mpi/romio/adio/include/adioi.h b/src/mpi/romio/adio/include/adioi.h
index ea07491..702cc20 100644
--- a/src/mpi/romio/adio/include/adioi.h
+++ b/src/mpi/romio/adio/include/adioi.h
@@ -197,6 +197,9 @@ struct ADIOI_Fns_struct {
 
 #define ADIOI_MIN(a, b) ((a) < (b) ? (a) : (b))
 #define ADIOI_MAX(a, b) ((a) > (b) ? (a) : (b))
+/* thanks stackoverflow:
+ * http://stackoverflow.com/questions/3982348/implement-generic-swap-macro-in-c */
+#define ADIOI_SWAP(x, y, T) do { T temp##x##y = x; x = y; y = temp##x##y; } while (0);
 
 #define ADIOI_PREALLOC_BUFSZ      16777216    /* buffer size used to 
                                                 preallocate disk space */
@@ -859,5 +862,23 @@ if (MPIR_Ext_dbg_romio_typical_enabled) fprintf
 #define DBG_FPRINTF if (0) fprintf
 #define DBGV_FPRINTF if (0) fprintf
 #endif
+
+/* declarations for threaded I/O */
+/* i/o thread data structure (used when bgmpio_pthreadio is enabled) */
+typedef struct wcThreadFuncData {
+    ADIO_File fd;
+    int io_kind;
+    char *buf;
+    int size;
+    ADIO_Offset offset;
+    ADIO_Status status;
+    int error_code;
+} ADIOI_IO_ThreadFuncData;
+
+void *ADIOI_IO_Thread_Func(void *vptr_args);
+
+
+
+
 #endif
 

http://git.mpich.org/mpich.git/commitdiff/d1e292ca3b0f21b63adc631ef728eb48c0860cab

commit d1e292ca3b0f21b63adc631ef728eb48c0860cab
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Tue Jan 21 09:25:47 2014 -0600

    coll_perf buffer size too small
    
    crank up the size of coll_perf to something not laughably small

diff --git a/src/mpi/romio/test/coll_perf.c b/src/mpi/romio/test/coll_perf.c
index 88b1501..0f9042d 100644
--- a/src/mpi/romio/test/coll_perf.c
+++ b/src/mpi/romio/test/coll_perf.c
@@ -64,9 +64,9 @@ int main(int argc, char **argv)
     ndims = 3;
     order = MPI_ORDER_C;
 
-    array_of_gsizes[0] = 128;
-    array_of_gsizes[1] = 128;
-    array_of_gsizes[2] = 128;
+    array_of_gsizes[0] = 128*17;
+    array_of_gsizes[1] = 128*9;
+    array_of_gsizes[2] = 128*11;
 
     array_of_distribs[0] = MPI_DISTRIBUTE_BLOCK;
     array_of_distribs[1] = MPI_DISTRIBUTE_BLOCK;

http://git.mpich.org/mpich.git/commitdiff/da9d3398de3c2cd339cb43f4755f4067944701db

commit da9d3398de3c2cd339cb43f4755f4067944701db
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Fri Jan 17 16:15:19 2014 -0600

    remove extraneous locks in bluegene driver
    
    The only reason these locks exist is because way back in BGL days someone
    at IBM thought it might be a good idea to have one driver that could
    access both NFS and GPFS.  There was also some concern about a large
    write call getting split up by the i/o forwarder.  Fortunately, MPI-IO
    semantics mean applications that would be harmed by such a split already
    face "undefined" behavior.

diff --git a/src/mpi/romio/adio/ad_bg/Makefile.mk b/src/mpi/romio/adio/ad_bg/Makefile.mk
index 69fef21..ade29c5 100644
--- a/src/mpi/romio/adio/ad_bg/Makefile.mk
+++ b/src/mpi/romio/adio/ad_bg/Makefile.mk
@@ -21,9 +21,7 @@ romio_other_sources +=                                               \
     adio/ad_bg/ad_bg_flush.c                                         \
     adio/ad_bg/ad_bg_hints.c                                         \
     adio/ad_bg/ad_bg_pset.c                                          \
-    adio/ad_bg/ad_bg_read.c                                          \
     adio/ad_bg/ad_bg_tuning.c                                        \
-    adio/ad_bg/ad_bg_write.c                                         \
     adio/ad_bg/ad_bg.c                                               \
     adio/ad_bg/ad_bg_fcntl.c                                         \
     adio/ad_bg/ad_bg_getsh.c                                         \
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg.c b/src/mpi/romio/adio/ad_bg/ad_bg.c
index 3c93eaa..4a3904e 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg.c
@@ -20,15 +20,15 @@
 struct ADIOI_Fns_struct ADIO_BG_operations = {
     ADIOI_BG_Open, /* Open */
     ADIOI_GEN_OpenColl, /* Collective open */
-    ADIOI_BG_ReadContig, /* ReadContig */
-    ADIOI_BG_WriteContig, /* WriteContig */
+    ADIOI_GEN_ReadContig, /* ReadContig */
+    ADIOI_GEN_WriteContig, /* WriteContig */
     ADIOI_BG_ReadStridedColl, /* ReadStridedColl */
     ADIOI_BG_WriteStridedColl, /* WriteStridedColl */
     ADIOI_GEN_SeekIndividual, /* SeekIndividual */
     ADIOI_BG_Fcntl, /* Fcntl */
     ADIOI_BG_SetInfo, /* SetInfo */
-    ADIOI_BG_ReadStrided, /* ReadStrided */
-    ADIOI_BG_WriteStrided, /* WriteStrided */
+    ADIOI_GEN_ReadStrided, /* ReadStrided */
+    ADIOI_GEN_WriteStrided, /* WriteStrided */
     ADIOI_BG_Close, /* Close */
 #ifdef ROMIO_HAVE_WORKING_AIO
 #warning Consider BG support for NFS before enabling this.
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_read.c b/src/mpi/romio/adio/ad_bg/ad_bg_read.c
deleted file mode 100644
index 503c004..0000000
--- a/src/mpi/romio/adio/ad_bg/ad_bg_read.c
+++ /dev/null
@@ -1,523 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bg_read.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "ad_bg.h"
-#include "adio_extern.h"
-
-#include "ad_bg_tuning.h"
-#ifdef AGGREGATION_PROFILE
-#include "mpe.h"
-#endif
-
-void ADIOI_BG_ReadContig(ADIO_File fd, void *buf, int count, 
-                     MPI_Datatype datatype, int file_ptr_type,
-		     ADIO_Offset offset, ADIO_Status *status, int *error_code)
-{
-    MPI_Count err=-1, datatype_size;
-    ADIO_Offset len;
-    static char myname[] = "ADIOI_BG_READCONTIG";
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5034, 0, NULL);
-#endif
-    /* timing */
-    double io_time, io_time2;
-
-    MPI_Type_size_x(datatype, &datatype_size);
-    len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
-    ADIOI_Assert(len == (unsigned int) len); /* read takes an unsigned int parm */
-
-    if (bgmpio_timing) {
-	io_time = MPI_Wtime();
-	bgmpio_prof_cr[ BGMPIO_CIO_DATA_SIZE ] += len;
-    }
-
-    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
-	if (bgmpio_timing) io_time2 = MPI_Wtime();
-	if (fd->fp_sys_posn != offset)
-	    lseek(fd->fd_sys, offset, SEEK_SET);
-	if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
-	if (fd->atomicity)
-	    ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-	else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
-	if (bgmpio_timing) io_time2 = MPI_Wtime();
-	err = read(fd->fd_sys, buf, (unsigned int)len);
-	if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
-	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
-	fd->fp_sys_posn = offset + err;
-	/* individual file pointer not updated */        
-    }
-    else {  /* read from curr. location of ind. file pointer */
-	offset = fd->fp_ind;
-	if (bgmpio_timing) io_time2 = MPI_Wtime();
-	if (fd->fp_sys_posn != fd->fp_ind)
-	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
-	if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
-	if (fd->atomicity)
-	    ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-	else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
-	if (bgmpio_timing) io_time2 = MPI_Wtime();
-	err = read(fd->fd_sys, buf, (unsigned int)len);
-	if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
-	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
-	fd->fp_ind += err;
-	fd->fp_sys_posn = fd->fp_ind;
-    }
-
-    if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
-
-    /* --BEGIN ERROR HANDLING-- */
-    if (err == -1) {
-	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
-					   myname, __LINE__, MPI_ERR_IO,
-					   "**io", "**io %s", strerror(errno));
-	return;
-    }
-    /* --END ERROR HANDLING-- */
-
-#ifdef HAVE_STATUS_SET_BYTES
-    MPIR_Status_set_bytes(status, datatype, err);
-#endif
-
-    *error_code = MPI_SUCCESS;
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5035, 0, NULL);
-#endif
-}
-
-
-#define ADIOI_BUFFERED_READ \
-{ \
-    if (req_off >= readbuf_off + readbuf_len) { \
-	readbuf_off = req_off; \
-	readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));\
-	lseek(fd->fd_sys, readbuf_off, SEEK_SET);\
-        if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\
-        err = read(fd->fd_sys, readbuf, readbuf_len);\
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\
-        if (err == -1) err_flag = 1; \
-    } \
-    while (req_len > readbuf_off + readbuf_len - req_off) { \
-  ADIOI_Assert((readbuf_off + readbuf_len - req_off) == (int) (readbuf_off + readbuf_len - req_off));\
-	partial_read = (int) (readbuf_off + readbuf_len - req_off); \
-	tmp_buf = (char *) ADIOI_Malloc(partial_read); \
-	memcpy(tmp_buf, readbuf+readbuf_len-partial_read, partial_read); \
-	ADIOI_Free(readbuf); \
-	readbuf = (char *) ADIOI_Malloc(partial_read + max_bufsize); \
-	memcpy(readbuf, tmp_buf, partial_read); \
-	ADIOI_Free(tmp_buf); \
-	readbuf_off += readbuf_len-partial_read; \
-	readbuf_len = (unsigned) (partial_read + ADIOI_MIN(max_bufsize, \
-				       end_offset-readbuf_off+1)); \
-	lseek(fd->fd_sys, readbuf_off+partial_read, SEEK_SET);\
-        if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
-        err = read(fd->fd_sys, readbuf+partial_read, readbuf_len-partial_read);\
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
-        if (err == -1) err_flag = 1; \
-    } \
-    ADIOI_Assert(req_len == (size_t)req_len); \
-    memcpy((char *)buf + userbuf_off, readbuf+req_off-readbuf_off, req_len); \
-}
-
-
-void ADIOI_BG_ReadStrided(ADIO_File fd, void *buf, int count,
-                       MPI_Datatype datatype, int file_ptr_type,
-                       ADIO_Offset offset, ADIO_Status *status, int
-                       *error_code)
-{
-/* offset is in units of etype relative to the filetype. */
-
-
-    ADIOI_Flatlist_node *flat_buf, *flat_file;
-    ADIO_Offset i_offset, new_brd_size, brd_size, size;
-    int i, j, k, err=-1, st_index=0;
-    ADIO_Offset frd_size=0, new_frd_size, st_frd_size;
-    MPI_Count num, bufsize;
-    int n_etypes_in_filetype;
-    ADIO_Offset n_filetypes, etype_in_filetype, st_n_filetypes, size_in_filetype;
-    ADIO_Offset abs_off_in_filetype=0;
-    MPI_Count filetype_size, etype_size, buftype_size, partial_read;
-    MPI_Aint filetype_extent, buftype_extent; 
-    int buf_count, buftype_is_contig, filetype_is_contig;
-    ADIO_Offset userbuf_off, req_len, sum;
-    ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off;
-    char *readbuf, *tmp_buf, *value;
-    int err_flag=0, info_flag;
-    unsigned max_bufsize, readbuf_len;
-    static char myname[] = "ADIOI_BG_READSTRIDED";
-
-    if (fd->hints->ds_read == ADIOI_HINT_DISABLE) {
-  /* if user has disabled data sieving on reads, use naive
-	 * approach instead.
-	 */
-      /*FPRINTF(stderr, "ADIOI_GEN_ReadStrided_naive(%d):\n", __LINE__);*/
-      ADIOI_GEN_ReadStrided_naive(fd, 
-				    buf,
-				    count,
-				    datatype,
-				    file_ptr_type,
-				    offset,
-				    status,
-				    error_code);
-    	return;
-    }
-    /*FPRINTF(stderr, "%s(%d):\n",myname, __LINE__);*/
-
-    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
-    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
-
-    MPI_Type_size_x(fd->filetype, &filetype_size);
-    if ( ! filetype_size ) {
-#ifdef HAVE_STATUS_SET_BYTES
-	MPIR_Status_set_bytes(status, datatype, 0);
-#endif
-	*error_code = MPI_SUCCESS; 
-	return;
-    }
-
-    MPI_Type_extent(fd->filetype, &filetype_extent);
-    MPI_Type_size_x(datatype, &buftype_size);
-    MPI_Type_extent(datatype, &buftype_extent);
-    etype_size = fd->etype_size;
-
-    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
-    bufsize = buftype_size * count;
-
-/* get max_bufsize from the info object. */
-
-    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
-    ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, 
-                 &info_flag);
-    max_bufsize = atoi(value);
-    ADIOI_Free(value);
-
-    if (!buftype_is_contig && filetype_is_contig) {
-
-/* noncontiguous in memory, contiguous in file. */
-
-	ADIOI_Flatten_datatype(datatype);
-	flat_buf = ADIOI_Flatlist;
-	while (flat_buf->type != datatype) flat_buf = flat_buf->next;
-
-        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : 
-                 fd->disp + (ADIO_Offset)etype_size * offset;
-
-	start_off = off;
-	end_offset = off + bufsize - 1;
-        readbuf_off = off;
-        readbuf = (char *) ADIOI_Malloc(max_bufsize);
-        readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));
-
-/* if atomicity is true, lock (exclusive) the region to be accessed */
-        if (fd->atomicity)
-            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
-
-	lseek(fd->fd_sys, readbuf_off, SEEK_SET);
-        if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);
-        err = read(fd->fd_sys, readbuf, readbuf_len);
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len);
-        if (err == -1) err_flag = 1;
-
-        for (j=0; j<count; j++) 
-        {
-              for (i=0; i<flat_buf->count; i++) {
-                  userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
-      req_off = off;
-      req_len = flat_buf->blocklens[i];
-      ADIOI_BUFFERED_READ
-                  off += flat_buf->blocklens[i];
-              }
-        }
-
-        if (fd->atomicity)
-            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
-
-        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
-
-	ADIOI_Free(readbuf); /* malloced in the buffered_read macro */
-
-	if (err_flag) {
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_IO, "**io",
-					       "**io %s", strerror(errno));
-	}
-	else *error_code = MPI_SUCCESS;
-    }
-
-    else {  /* noncontiguous in file */
-
-/* filetype already flattened in ADIO_Open */
-	flat_file = ADIOI_Flatlist;
-	while (flat_file->type != fd->filetype) flat_file = flat_file->next;
-	disp = fd->disp;
-
-	if (file_ptr_type == ADIO_INDIVIDUAL) {
-	    /* Wei-keng reworked type processing to be a bit more efficient */
-            offset       = fd->fp_ind - disp;
-            n_filetypes  = (offset - flat_file->indices[0]) / filetype_extent;
-	    offset -= (ADIO_Offset)n_filetypes * filetype_extent;
-	    /* now offset is local to this extent */
-
-            /* find the block where offset is located, skip blocklens[i]==0 */
-            for (i=0; i<flat_file->count; i++) {
-                ADIO_Offset dist;
-                if (flat_file->blocklens[i] == 0) continue;
-                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
-                /* frd_size is from offset to the end of block i */
-		if (dist == 0) {
-		    i++;
-		    offset   = flat_file->indices[i];
-		    frd_size = flat_file->blocklens[i];
-		    break;
-		}
-		if (dist > 0) {
-                    frd_size = dist;
-		    break;
-		}
-	    }
-            st_index = i;  /* starting index in flat_file->indices[] */
-            offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
-	}
-	else {
-	    n_etypes_in_filetype = filetype_size/etype_size;
-	    n_filetypes = offset / n_etypes_in_filetype;
-	    etype_in_filetype = offset % n_etypes_in_filetype;
-	    size_in_filetype = etype_in_filetype * etype_size;
- 
-	    sum = 0;
-	    for (i=0; i<flat_file->count; i++) {
-		sum += flat_file->blocklens[i];
-		if (sum > size_in_filetype) {
-		    st_index = i;
-		    frd_size = sum - size_in_filetype;
-		    abs_off_in_filetype = flat_file->indices[i] +
-			size_in_filetype - (sum - flat_file->blocklens[i]);
-		    break;
-		}
-	    }
-
-	    /* abs. offset in bytes in the file */
-	    offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + 
-		    abs_off_in_filetype;
-	}
-
-        start_off = offset;
-
-	/* Wei-keng Liao: read request is within a single flat_file contig
-	 * block e.g. with subarray types that actually describe the whole
-	 * array */
-	if (buftype_is_contig && bufsize <= frd_size) {
-            ADIO_ReadContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
-                             offset, status, error_code);
-
-	    if (file_ptr_type == ADIO_INDIVIDUAL) {
-                /* update MPI-IO file pointer to point to the first byte that 
-		 * can be accessed in the fileview. */
-		fd->fp_ind = offset + bufsize;
-		if (bufsize == frd_size) {
-		    do {
-			st_index++;
-			if (st_index == flat_file->count) {
-			    st_index = 0;
-			    n_filetypes++;
-			}
-                    } while (flat_file->blocklens[st_index] == 0);
-		    fd->fp_ind = disp + flat_file->indices[st_index]
-                               + n_filetypes*filetype_extent;
-		}
-	    }
-	    fd->fp_sys_posn = -1;   /* set it to null. */ 
-#ifdef HAVE_STATUS_SET_BYTES
-	    MPIR_Status_set_bytes(status, datatype, bufsize);
-#endif 
-            return;
-	}
-
-       /* Calculate end_offset, the last byte-offset that will be accessed.
-         e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/
-
-	st_frd_size = frd_size;
-	st_n_filetypes = n_filetypes;
-	i_offset = 0;
-	j = st_index;
-	off = offset;
-	frd_size = ADIOI_MIN(st_frd_size, bufsize);
-	while (i_offset < bufsize) {
-	    i_offset += frd_size;
-	    end_offset = off + frd_size - 1;
-
-	    j = (j+1) % flat_file->count;
-            n_filetypes += (j == 0) ? 1 : 0;
-            while (flat_file->blocklens[j]==0) {
-		j = (j+1) % flat_file->count;
-		n_filetypes += (j == 0) ? 1 : 0;
-	    }
-	    off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent;
-	    frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
-	}
-
-/* if atomicity is true, lock (exclusive) the region to be accessed */
-        if (fd->atomicity)
-            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
-
-        /* initial read into readbuf */
-	readbuf_off = offset;
-	readbuf = (char *) ADIOI_Malloc(max_bufsize);
-	readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));
-
-	lseek(fd->fd_sys, offset, SEEK_SET);
-        if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, offset, SEEK_SET, readbuf_len);
-        err = read(fd->fd_sys, readbuf, readbuf_len);
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, offset, SEEK_SET, readbuf_len);
-
-        if (err == -1) err_flag = 1;
-
-	if (buftype_is_contig && !filetype_is_contig) {
-
-/* contiguous in memory, noncontiguous in file. should be the most
-   common case. */
-
-	    i_offset = 0;
-	    j = st_index;
-	    off = offset;
-	    n_filetypes = st_n_filetypes;
-	    frd_size = ADIOI_MIN(st_frd_size, bufsize);
-	    while (i_offset < bufsize) {
-                if (frd_size) { 
-                    /* TYPE_UB and TYPE_LB can result in 
-                       frd_size = 0. save system call in such cases */ 
-		    /* lseek(fd->fd_sys, off, SEEK_SET);
-		    err = read(fd->fd_sys, ((char *) buf) + i, frd_size);*/
-
-		    req_off = off;
-		    req_len = frd_size;
-		    userbuf_off = i_offset;
-		    ADIOI_BUFFERED_READ
-		}
-		i_offset += frd_size;
-
-                if (off + frd_size < disp + flat_file->indices[j] +
-                   flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
-                       off += frd_size;
-                /* did not reach end of contiguous block in filetype.
-                   no more I/O needed. off is incremented by frd_size. */
-                else {
-                    j = (j+1) % flat_file->count;
-                    n_filetypes += (j == 0) ? 1 : 0;
-                    while (flat_file->blocklens[j]==0) {
-                        j = (j+1) % flat_file->count;
-                        n_filetypes += (j == 0) ? 1 : 0;
-		    }
-		    off = disp + flat_file->indices[j] + 
-                                        n_filetypes*(ADIO_Offset)filetype_extent;
-		    frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
-		}
-	    }
-	}
-	else {
-/* noncontiguous in memory as well as in file */
-
-	    ADIOI_Flatten_datatype(datatype);
-	    flat_buf = ADIOI_Flatlist;
-	    while (flat_buf->type != datatype) flat_buf = flat_buf->next;
-
-	    k = num = buf_count = 0;
-	    i_offset = flat_buf->indices[0];
-	    j = st_index;
-	    off = offset;
-	    n_filetypes = st_n_filetypes;
-	    frd_size = st_frd_size;
-	    brd_size = flat_buf->blocklens[0];
-
-	    while (num < bufsize) {
-		size = ADIOI_MIN(frd_size, brd_size);
-		if (size) {
-		    /* lseek(fd->fd_sys, off, SEEK_SET);
-		    err = read(fd->fd_sys, ((char *) buf) + i, size); */
-
-		    req_off = off;
-		    req_len = size;
-		    userbuf_off = i_offset;
-		    ADIOI_BUFFERED_READ
-		}
-
-		new_frd_size = frd_size;
-		new_brd_size = brd_size;
-
-		if (size == frd_size) {
-/* reached end of contiguous block in file */
-                    j = (j+1) % flat_file->count;
-                    n_filetypes += (j == 0) ? 1 : 0;
-                    while (flat_file->blocklens[j]==0) {
-                        j = (j+1) % flat_file->count;
-                        n_filetypes += (j == 0) ? 1 : 0;
-		    }
-
-		    off = disp + flat_file->indices[j] + 
-                                              n_filetypes*(ADIO_Offset)filetype_extent;
-
-		    new_frd_size = flat_file->blocklens[j];
-		    if (size != brd_size) {
-			i_offset += size;
-			new_brd_size -= size;
-		    }
-		}
-
-		if (size == brd_size) {
-/* reached end of contiguous block in memory */
-
-		    k = (k + 1)%flat_buf->count;
-		    buf_count++;
-		    i_offset = ((ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
-			flat_buf->indices[k]); 
-		    new_brd_size = flat_buf->blocklens[k];
-		    if (size != frd_size) {
-			off += size;
-			new_frd_size -= size;
-		    }
-		}
-    ADIOI_Assert(((ADIO_Offset)num + size) == (unsigned)(num + size));
-		num += size;
-		frd_size = new_frd_size;
-                brd_size = new_brd_size;
-	    }
-	}
-	
-        if (fd->atomicity)
-            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
-
-	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
-
-	ADIOI_Free(readbuf); /* malloced in the buffered_read macro */
-
-	if (err_flag) {
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_IO, "**io",
-					       "**io %s", strerror(errno));
-	}
-	else *error_code = MPI_SUCCESS;
-    }
-
-    fd->fp_sys_posn = -1;   /* set it to null. */
-
-#ifdef HAVE_STATUS_SET_BYTES
-    MPIR_Status_set_bytes(status, datatype, bufsize);
-/* This is a temporary way of filling in status. The right way is to 
-   keep track of how much data was actually read and placed in buf 
-   by ADIOI_BUFFERED_READ. */
-#endif
-
-    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
-}
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_write.c b/src/mpi/romio/adio/ad_bg/ad_bg_write.c
deleted file mode 100644
index abd4888..0000000
--- a/src/mpi/romio/adio/ad_bg/ad_bg_write.c
+++ /dev/null
@@ -1,579 +0,0 @@
-/* ---------------------------------------------------------------- */
-/* (C)Copyright IBM Corp.  2007, 2008                               */
-/* ---------------------------------------------------------------- */
-/**
- * \file ad_bg_write.c
- * \brief ???
- */
-
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/* 
- *   Copyright (C) 1997 University of Chicago. 
- *   See COPYRIGHT notice in top-level directory.
- */
-
-#include "ad_bg.h"
-#include "adio_extern.h"
-
-#include "ad_bg_tuning.h"
-
-#ifdef AGGREGATION_PROFILE
-#include "mpe.h"
-#endif
-
-void ADIOI_BG_WriteContig(ADIO_File fd, const void *buf, int count, 
-                     MPI_Datatype datatype, int file_ptr_type,
-		     ADIO_Offset offset, ADIO_Status *status, int *error_code)
-{
-    MPI_Count err=-1, datatype_size;
-    ADIO_Offset len;
-    static char myname[] = "ADIOI_BG_WRITECONTIG";
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5036, 0, NULL);
-#endif
-    /* timing */
-    double io_time, io_time2;
-
-    MPI_Type_size_x(datatype, &datatype_size);
-    len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
-    ADIOI_Assert(len == (unsigned int) len); /* write takes an unsigned int parm */
-
-    if (bgmpio_timing) {
-	io_time = MPI_Wtime();
-	bgmpio_prof_cw[ BGMPIO_CIO_DATA_SIZE ] += len;
-    }
-
-    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
-	if (bgmpio_timing) io_time2 = MPI_Wtime();
-	if (fd->fp_sys_posn != offset)
-	    lseek(fd->fd_sys, offset, SEEK_SET);
-	if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
-	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-	if (bgmpio_timing) io_time2 = MPI_Wtime();
-	err = write(fd->fd_sys, buf, (unsigned int)len);
-	if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
-	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
-	fd->fp_sys_posn = offset + err;
-	/* individual file pointer not updated */        
-    }
-    else { /* write from curr. location of ind. file pointer */
-	offset = fd->fp_ind;
-	if (bgmpio_timing) io_time2 = MPI_Wtime();
-	if (fd->fp_sys_posn != fd->fp_ind)
-	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
-	if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
-	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-	if (bgmpio_timing) io_time2 = MPI_Wtime();
-	err = write(fd->fd_sys, buf, (unsigned int)len);
-	if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
-	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
-	fd->fp_ind += err;
-	fd->fp_sys_posn = fd->fp_ind;
-    }
-
-    if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
-
-    /* --BEGIN ERROR HANDLING-- */
-    if (err == -1) {
-	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
-					   myname, __LINE__, MPI_ERR_IO,
-					   "**io",
-					   "**io %s", strerror(errno));
-	return;
-    }
-    /* --END ERROR HANDLING-- */
-
-#ifdef HAVE_STATUS_SET_BYTES
-    MPIR_Status_set_bytes(status, datatype, err);
-#endif
-
-    *error_code = MPI_SUCCESS;
-#ifdef AGGREGATION_PROFILE
-    MPE_Log_event (5037, 0, NULL);
-#endif
-}
-
-
-#define ADIOI_BUFFERED_WRITE \
-{ \
-    if (req_off >= writebuf_off + writebuf_len) { \
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
-	err = write(fd->fd_sys, writebuf, writebuf_len); \
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
-        if (err == -1) err_flag = 1; \
-	writebuf_off = req_off; \
-        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
-	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
-	err = read(fd->fd_sys, writebuf, writebuf_len); \
-        if (err == -1) { \
-            *error_code = MPIO_Err_create_code(MPI_SUCCESS, \
-					       MPIR_ERR_RECOVERABLE, myname, \
-					       __LINE__, MPI_ERR_IO, \
-					       "**ioRMWrdwr", 0); \
-	    return; \
-        } \
-    } \
-    write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
-    ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off));\
-    memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
-    while (write_sz != req_len) { \
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
-	err = write(fd->fd_sys, writebuf, writebuf_len); \
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
-        if (err == -1) err_flag = 1; \
-        req_len -= write_sz; \
-        userbuf_off += write_sz; \
-        writebuf_off += writebuf_len; \
-        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
-	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
-	err = read(fd->fd_sys, writebuf, writebuf_len); \
-        if (err == -1) { \
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS, \
-					       MPIR_ERR_RECOVERABLE, myname, \
-					       __LINE__, MPI_ERR_IO, \
-					       "**ioRMWrdwr", 0); \
-	    return; \
-        } \
-        write_sz = ADIOI_MIN(req_len, writebuf_len); \
-        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
-    } \
-}
-
-
-/* this macro is used when filetype is contig and buftype is not contig.
-   it does not do a read-modify-write and does not lock*/
-#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
-{ \
-    if (req_off >= writebuf_off + writebuf_len) { \
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
-	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
-	err = write(fd->fd_sys, writebuf, writebuf_len); \
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
-        if (err == -1) err_flag = 1; \
-	writebuf_off = req_off; \
-        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
-    } \
-    write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
-    ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off));\
-    memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
-    while (write_sz != req_len) { \
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
-	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
-	err = write(fd->fd_sys, writebuf, writebuf_len); \
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
-        if (err == -1) err_flag = 1; \
-        req_len -= write_sz; \
-        userbuf_off += write_sz; \
-        writebuf_off += writebuf_len; \
-        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
-        write_sz = ADIOI_MIN(req_len, writebuf_len); \
-        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
-    } \
-}
-
-
-
-void ADIOI_BG_WriteStrided(ADIO_File fd, const void *buf, int count,
-                       MPI_Datatype datatype, int file_ptr_type,
-                       ADIO_Offset offset, ADIO_Status *status, int
-                       *error_code)
-{
-/* offset is in units of etype relative to the filetype. */
-
-
-
-    ADIOI_Flatlist_node *flat_buf, *flat_file;
-    ADIO_Offset i_offset, sum, size_in_filetype;
-    int i, j, k, err=-1, st_index=0;
-    int n_etypes_in_filetype;
-    ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
-    ADIO_Offset abs_off_in_filetype=0;
-    MPI_Count filetype_size, etype_size, buftype_size;
-    MPI_Aint filetype_extent, buftype_extent; 
-    int buf_count, buftype_is_contig, filetype_is_contig;
-    ADIO_Offset userbuf_off;
-    ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
-    char *writebuf, *value;
-    unsigned bufsize, writebuf_len, max_bufsize, write_sz;
-    int err_flag=0, info_flag;
-    ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len;
-    static char myname[] = "ADIOI_BG_WRITESTRIDED";
-
-    if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
-    	/* if user has disabled data sieving on reads, use naive
-	 * approach instead.
-	 */
-      /*FPRINTF(stderr, "ADIOI_GEN_WriteStrided_naive(%d):\n", __LINE__);*/
-      ADIOI_GEN_WriteStrided_naive(fd, 
-				    buf,
-				    count,
-				    datatype,
-				    file_ptr_type,
-				    offset,
-				    status,
-				    error_code);
-    	return;
-    }
-    /*FPRINTF(stderr, "%s(%d):\n",myname, __LINE__);*/
-
-    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
-    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
-
-    MPI_Type_size_x(fd->filetype, &filetype_size);
-    if ( ! filetype_size ) {
-#ifdef HAVE_STATUS_SET_BYTES
-	MPIR_Status_set_bytes(status, datatype, 0);
-#endif
-	*error_code = MPI_SUCCESS; 
-	return;
-    }
-
-    MPI_Type_extent(fd->filetype, &filetype_extent);
-    MPI_Type_size_x(datatype, &buftype_size);
-    MPI_Type_extent(datatype, &buftype_extent);
-    etype_size = fd->etype_size;
-
-    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
-    bufsize = buftype_size * count;
-
-/* get max_bufsize from the info object. */
-
-    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
-    ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, 
-                 &info_flag);
-    max_bufsize = atoi(value);
-    ADIOI_Free(value);
-
-    if (!buftype_is_contig && filetype_is_contig) {
-
-/* noncontiguous in memory, contiguous in file. */
-
-	ADIOI_Flatten_datatype(datatype);
-	flat_buf = ADIOI_Flatlist;
-	while (flat_buf->type != datatype) flat_buf = flat_buf->next;
-
-        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : 
-                 fd->disp + etype_size * offset;
-
-        start_off = off;
-	end_offset = off + bufsize - 1;
-        writebuf_off = off;
-        writebuf = (char *) ADIOI_Malloc(max_bufsize);
-        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
-
-/* if atomicity is true, lock the region to be accessed */
-        if (fd->atomicity) 
-            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
-
-        for (j=0; j<count; j++) 
-        {
-            for (i=0; i<flat_buf->count; i++) {
-                userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
-		req_off = off;
-		req_len = flat_buf->blocklens[i];
-		ADIOI_BUFFERED_WRITE_WITHOUT_READ
-                off += flat_buf->blocklens[i];
-            }
-        }
-
-        /* write the buffer out finally */
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); 
-	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
-	err = write(fd->fd_sys, writebuf, writebuf_len); 
-        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
-        if (err == -1) err_flag = 1; 
-
-        if (fd->atomicity) 
-            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
-
-	ADIOI_Free(writebuf); /* malloced in the buffered_write macro */
-
-        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
-	if (err_flag) {
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_IO, "**io",
-					       "**io %s", strerror(errno));
-	}
-	else *error_code = MPI_SUCCESS;
-    }
-
-    else {  /* noncontiguous in file */
-
-/* filetype already flattened in ADIO_Open */
-	flat_file = ADIOI_Flatlist;
-	while (flat_file->type != fd->filetype) flat_file = flat_file->next;
-	disp = fd->disp;
-
-	if (file_ptr_type == ADIO_INDIVIDUAL) {
-	/* Wei-keng reworked type processing to be a bit more efficient */
-            offset       = fd->fp_ind - disp;
-            n_filetypes  = (offset - flat_file->indices[0]) / filetype_extent;
-            offset      -= (ADIO_Offset)n_filetypes * filetype_extent;
-            /* now offset is local to this extent */
-
-            /* find the block where offset is located, skip blocklens[i]==0 */
-            for (i=0; i<flat_file->count; i++) {
-                ADIO_Offset dist;
-                if (flat_file->blocklens[i] == 0) continue;
-                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
-                /* fwr_size is from offset to the end of block i */
-                if (dist == 0) {
-                    i++;
-                    offset   = flat_file->indices[i];
-                    fwr_size = flat_file->blocklens[i];
-                    break;
-                }
-                if (dist > 0) {
-                    fwr_size = dist;
-                    break;
-                }
-            }
-            st_index = i;  /* starting index in flat_file->indices[] */
-            offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
-	}
-	else {
-	    n_etypes_in_filetype = filetype_size/etype_size;
-	    n_filetypes = offset / n_etypes_in_filetype;
-	    etype_in_filetype = offset % n_etypes_in_filetype;
-	    size_in_filetype = etype_in_filetype * etype_size;
- 
-	    sum = 0;
-	    for (i=0; i<flat_file->count; i++) {
-		sum += flat_file->blocklens[i];
-		if (sum > size_in_filetype) {
-		    st_index = i;
-		    fwr_size = sum - size_in_filetype;
-		    abs_off_in_filetype = flat_file->indices[i] +
-			size_in_filetype - (sum - flat_file->blocklens[i]);
-		    break;
-		}
-	    }
-
-	    /* abs. offset in bytes in the file */
-	    offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + 
-		    abs_off_in_filetype;
-	}
-
-        start_off = offset;
-        /* Wei-keng Liao:write request is within single flat_file contig block*/
-	/* this could happen, for example, with subarray types that are
-	 * actually fairly contiguous */
-        if (buftype_is_contig && bufsize <= fwr_size) {
-            ADIO_WriteContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
-                             offset, status, error_code);
-
-	    if (file_ptr_type == ADIO_INDIVIDUAL) {
-                /* update MPI-IO file pointer to point to the first byte 
-		 * that can be accessed in the fileview. */
-                fd->fp_ind = offset + bufsize;
-                if (bufsize == fwr_size) {
-                    do {
-                        st_index++;
-                        if (st_index == flat_file->count) {
-                            st_index = 0;
-                            n_filetypes++;
-                        }
-                    } while (flat_file->blocklens[st_index] == 0);
-                    fd->fp_ind = disp + flat_file->indices[st_index]
-                               + (ADIO_Offset)n_filetypes*filetype_extent;
-                }
-            }
-	    fd->fp_sys_posn = -1;   /* set it to null. */ 
-#ifdef HAVE_STATUS_SET_BYTES
-	    MPIR_Status_set_bytes(status, datatype, bufsize);
-#endif 
-            return;
-        }
-
-       /* Calculate end_offset, the last byte-offset that will be accessed.
-         e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/
-
-	st_fwr_size = fwr_size;
-	st_n_filetypes = n_filetypes;
-	i_offset = 0;
-	j = st_index;
-	off = offset;
-	fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
-	while (i_offset < bufsize) {
-	    i_offset += fwr_size;
-	    end_offset = off + fwr_size - 1;
-
-            j = (j+1) % flat_file->count;
-            n_filetypes += (j == 0) ? 1 : 0;
-            while (flat_file->blocklens[j]==0) {
-                j = (j+1) % flat_file->count;
-                n_filetypes += (j == 0) ? 1 : 0;
-            }
-
-	    off = disp + flat_file->indices[j] + 
-		    n_filetypes*(ADIO_Offset)filetype_extent;
-	    fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
-	}
-
-/* if atomicity is true, lock the region to be accessed */
-        if (fd->atomicity) 
-            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
-
-        /* initial read for the read-modify-write */
-        writebuf_off = offset;
-        writebuf = (char *) ADIOI_Malloc(max_bufsize);
-        writebuf_len = (unsigned)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
-	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); 
-	err = read(fd->fd_sys, writebuf, writebuf_len); 
-        if (err == -1) {
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE,
-					       myname, __LINE__,
-					       MPI_ERR_IO,
-					       "ADIOI_BG_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.", 0);
-	    return;
-        } 
-
-	if (buftype_is_contig && !filetype_is_contig) {
-
-/* contiguous in memory, noncontiguous in file. should be the most
-   common case. */
-
-	    i_offset = 0;
-	    j = st_index;
-	    off = offset;
-	    n_filetypes = st_n_filetypes;
-	    fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
-	    while (i_offset < bufsize) {
-                if (fwr_size) { 
-                    /* TYPE_UB and TYPE_LB can result in 
-                       fwr_size = 0. save system call in such cases */ 
-		    /* lseek(fd->fd_sys, off, SEEK_SET);
-		    err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/
-
-		    req_off = off;
-		    req_len = fwr_size;
-		    userbuf_off = i_offset;
-		    ADIOI_BUFFERED_WRITE
-		}
-		i_offset += fwr_size;
-
-                if (off + fwr_size < disp + flat_file->indices[j] +
-                   flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
-                       off += fwr_size;
-                /* did not reach end of contiguous block in filetype.
-                   no more I/O needed. off is incremented by fwr_size. */
-                else {
-                    j = (j+1) % flat_file->count;
-                    n_filetypes += (j == 0) ? 1 : 0;
-                    while (flat_file->blocklens[j]==0) {
-                        j = (j+1) % flat_file->count;
-                        n_filetypes += (j == 0) ? 1 : 0;
-                    }
-		    off = disp + flat_file->indices[j] + 
-                                    n_filetypes*(ADIO_Offset)filetype_extent;
-		    fwr_size = ADIOI_MIN(flat_file->blocklens[j], 
-				    bufsize-i_offset);
-		}
-	    }
-	}
-	else {
-/* noncontiguous in memory as well as in file */
-
-	    ADIOI_Flatten_datatype(datatype);
-	    flat_buf = ADIOI_Flatlist;
-	    while (flat_buf->type != datatype) flat_buf = flat_buf->next;
-
-	    k = num = buf_count = 0;
-	    i_offset = flat_buf->indices[0];
-	    j = st_index;
-	    off = offset;
-	    n_filetypes = st_n_filetypes;
-	    fwr_size = st_fwr_size;
-	    bwr_size = flat_buf->blocklens[0];
-
-	    while (num < bufsize) {
-		size = ADIOI_MIN(fwr_size, bwr_size);
-		if (size) {
-		    /* lseek(fd->fd_sys, off, SEEK_SET);
-		    err = write(fd->fd_sys, ((char *) buf) + i_offset, size); */
-
-		    req_off = off;
-		    req_len = size;
-		    userbuf_off = i_offset;
-		    ADIOI_BUFFERED_WRITE
-		}
-
-		new_fwr_size = fwr_size;
-		new_bwr_size = bwr_size;
-
-		if (size == fwr_size) {
-/* reached end of contiguous block in file */
- 		    j = (j+1) % flat_file->count;
- 		    n_filetypes += (j == 0) ? 1 : 0;
- 		    while (flat_file->blocklens[j]==0) {
- 			j = (j+1) % flat_file->count;
- 			n_filetypes += (j == 0) ? 1 : 0;
-		    }
-
-		    off = disp + flat_file->indices[j] + 
-                                  n_filetypes*(ADIO_Offset)filetype_extent;
-
-		    new_fwr_size = flat_file->blocklens[j];
-		    if (size != bwr_size) {
-			i_offset += size;
-			new_bwr_size -= size;
-		    }
-		}
-
-		if (size == bwr_size) {
-/* reached end of contiguous block in memory */
-
-		    k = (k + 1)%flat_buf->count;
-		    buf_count++;
-		    i_offset = (ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
-			flat_buf->indices[k]; 
-		    new_bwr_size = flat_buf->blocklens[k];
-		    if (size != fwr_size) {
-			off += size;
-			new_fwr_size -= size;
-		    }
-		}
-		num += size;
-		fwr_size = new_fwr_size;
-                bwr_size = new_bwr_size;
-	    }
-	}
-
-        /* write the buffer out finally */	
-	lseek(fd->fd_sys, writebuf_off, SEEK_SET); 
-	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
-	err = write(fd->fd_sys, writebuf, writebuf_len); 
-
-        if (!(fd->atomicity))
-	    ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
-	else ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
-
-        if (err == -1) err_flag = 1; 
-
-	ADIOI_Free(writebuf); /* malloced in the buffered_write macro */
-
-	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
-	if (err_flag) {
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE, myname,
-					       __LINE__, MPI_ERR_IO, "**io",
-					       "**io %s", strerror(errno));
-	}
-	else *error_code = MPI_SUCCESS;
-    }
-
-    fd->fp_sys_posn = -1;   /* set it to null. */
-
-#ifdef HAVE_STATUS_SET_BYTES
-    MPIR_Status_set_bytes(status, datatype, bufsize);
-/* This is a temporary way of filling in status. The right way is to 
-   keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
-#endif
-
-    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
-}

http://git.mpich.org/mpich.git/commitdiff/5e34974e1da0038bfc0fbd65598d3e871c1541bf

commit 5e34974e1da0038bfc0fbd65598d3e871c1541bf
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Fri Jan 17 14:06:11 2014 -0600

    Allocate two-phase buffer outside write path
    
    There are many memory allocations in the write path.  Allocating the
    two-phase intermediate buffer outside of the write path might on some
    systems make a small difference, especially if there are many collective
    I/O calls, or if the system (like Blue Gene) has a small amount of
    memory.  Modified from Paul Coffman <pkcoff at us.ibm.com>'s original idea.

diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c b/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
index 4602127..9fb9128 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
@@ -455,7 +455,7 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
 
     MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm); 
 
-    if (ntimes) read_buf = (char *) ADIOI_Malloc(coll_bufsize);
+    read_buf = fd->io_buf;
 
     curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); 
     /* its use is explained below. calloc initializes to 0. */
@@ -660,9 +660,10 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
       ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+real_size-for_next_iter) == (ADIO_Offset)(MPIR_Upint)(read_buf+real_size-for_next_iter));
       ADIOI_Assert((for_next_iter+coll_bufsize) == (size_t)(for_next_iter+coll_bufsize));
 	    memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter);
-	    ADIOI_Free(read_buf);
-	    read_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
-	    memcpy(read_buf, tmp_buf, for_next_iter);
+	    ADIOI_Free(fd->io_buf);
+	    fd->io_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
+	    memcpy(fd->io_buf, tmp_buf, for_next_iter);
+	    read_buf = fd->io_buf;
 	    ADIOI_Free(tmp_buf);
 	}
 
@@ -701,7 +702,6 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
         MPE_Log_event(8, 0, "end communication");
 #endif
 
-    if (ntimes) ADIOI_Free(read_buf);
     ADIOI_Free(curr_offlen_ptr);
     ADIOI_Free(count);
     ADIOI_Free(partial_send);
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c b/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
index b104f82..5bae065 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
@@ -479,7 +479,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
     MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX,
 		  fd->comm); 
 
-    if (ntimes) write_buf = (char *) ADIOI_Malloc(coll_bufsize);
+    write_buf = fd->io_buf;
 
     curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); 
     /* its use is explained below. calloc initializes to 0. */
@@ -699,7 +699,6 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
 	MPE_Log_event(8, 0, "end communication");
 #endif
 
-    if (ntimes) ADIOI_Free(write_buf);
     ADIOI_Free(curr_offlen_ptr);
     ADIOI_Free(count);
     ADIOI_Free(partial_recv);
diff --git a/src/mpi/romio/adio/common/ad_close.c b/src/mpi/romio/adio/common/ad_close.c
index ada4d33..62e18f9 100644
--- a/src/mpi/romio/adio/common/ad_close.c
+++ b/src/mpi/romio/adio/common/ad_close.c
@@ -104,5 +104,7 @@ void ADIO_Close(ADIO_File fd, int *error_code)
 
     MPI_Info_free(&(fd->info));
 
+    if (fd->io_buf != NULL) ADIOI_Free(fd->io_buf);
+
     /* memory for fd is freed in MPI_File_close */
 }
diff --git a/src/mpi/romio/adio/common/ad_open.c b/src/mpi/romio/adio/common/ad_open.c
index 8a9d33b..4a965cf 100644
--- a/src/mpi/romio/adio/common/ad_open.c
+++ b/src/mpi/romio/adio/common/ad_open.c
@@ -95,6 +95,10 @@ MPI_File ADIO_Open(MPI_Comm orig_comm,
 	if (*error_code != MPI_SUCCESS)
 	    goto fn_exit;
     }
+    /* Instead of repeatedly allocating this buffer in collective read/write,
+     * allocating up-front might make memory management on small platforms
+     * (e.g. Blue Gene) more efficent */
+    fd->io_buf = ADIOI_Malloc(fd->hints->cb_buffer_size);
 
      /* deferred open: 
      * we can only do this optimization if 'fd->hints->deferred_open' is set
@@ -175,6 +179,7 @@ MPI_File ADIO_Open(MPI_Comm orig_comm,
 	if (fd->hints->cb_config_list) ADIOI_Free(fd->hints->cb_config_list);
 	if (fd->hints) ADIOI_Free(fd->hints);
 	if (fd->info != MPI_INFO_NULL) MPI_Info_free(&(fd->info));
+	if (fd->io_buf) ADIOI_Free(fd->io_buf);
 	ADIOI_Free(fd);
         fd = ADIO_FILE_NULL;
 	if (*error_code == MPI_SUCCESS)
diff --git a/src/mpi/romio/adio/common/ad_read_coll.c b/src/mpi/romio/adio/common/ad_read_coll.c
index 6a2b39a..0eb8d90 100644
--- a/src/mpi/romio/adio/common/ad_read_coll.c
+++ b/src/mpi/romio/adio/common/ad_read_coll.c
@@ -570,7 +570,7 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
 
     MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm); 
 
-    if (ntimes) read_buf = (char *) ADIOI_Malloc(coll_bufsize);
+    read_buf = fd->io_buf;  /* Allocated at open time */
 
     curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); 
     /* its use is explained below. calloc initializes to 0. */
@@ -740,9 +740,10 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
       ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+real_size-for_next_iter) == (ADIO_Offset)(MPIR_Upint)(read_buf+real_size-for_next_iter));
       ADIOI_Assert((for_next_iter+coll_bufsize) == (size_t)(for_next_iter+coll_bufsize));
 	    memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter);
-	    ADIOI_Free(read_buf);
-	    read_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
-	    memcpy(read_buf, tmp_buf, for_next_iter);
+	    ADIOI_Free(fd->io_buf);
+	    fd->io_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
+	    memcpy(fd->io_buf, tmp_buf, for_next_iter);
+	    read_buf = fd->io_buf;
 	    ADIOI_Free(tmp_buf);
 	}
 
@@ -762,7 +763,6 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
 			    others_req, m,
                             buftype_extent, buf_idx); 
 
-    if (ntimes) ADIOI_Free(read_buf);
     ADIOI_Free(curr_offlen_ptr);
     ADIOI_Free(count);
     ADIOI_Free(partial_send);
diff --git a/src/mpi/romio/adio/common/ad_write_coll.c b/src/mpi/romio/adio/common/ad_write_coll.c
index 8d07c19..bf92a75 100644
--- a/src/mpi/romio/adio/common/ad_write_coll.c
+++ b/src/mpi/romio/adio/common/ad_write_coll.c
@@ -366,7 +366,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
     MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX,
 		  fd->comm); 
 
-    if (ntimes) write_buf = (char *) ADIOI_Malloc(coll_bufsize);
+    write_buf = fd->io_buf;
 
     curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); 
     /* its use is explained below. calloc initializes to 0. */
@@ -544,7 +544,6 @@ static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
         if (*error_code != MPI_SUCCESS) return;
     }
 
-    if (ntimes) ADIOI_Free(write_buf);
     ADIOI_Free(curr_offlen_ptr);
     ADIOI_Free(count);
     ADIOI_Free(partial_recv);
diff --git a/src/mpi/romio/adio/include/adio.h b/src/mpi/romio/adio/include/adio.h
index a25855d..b370f29 100644
--- a/src/mpi/romio/adio/include/adio.h
+++ b/src/mpi/romio/adio/include/adio.h
@@ -233,6 +233,7 @@ typedef struct ADIOI_FileD {
     ADIO_Offset *file_realm_st_offs; /* file realm starting offsets */
     MPI_Datatype *file_realm_types;  /* file realm datatypes */
     int my_cb_nodes_index; /* my index into cb_config_list. -1 if N/A */
+    char *io_buf;          /* two-phase buffer allocated out of i/o path */
     /* External32 */
     int is_external32;      /* bool:  0 means native view */
 

http://git.mpich.org/mpich.git/commitdiff/6ca13e5d2b1ceafa649d1b66700208470ccd03a2

commit 6ca13e5d2b1ceafa649d1b66700208470ccd03a2
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Thu Jan 16 12:54:16 2014 -0600

    remove unneeded barrier
    
    For quite some time the barrier here has had the comment 'Why?'.  Since
    no one knows, and there are plenty of other synchronization points in
    this path, remove it.

diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.c b/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.c
index dea444e..4872c22 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_aggrs.c
@@ -945,7 +945,6 @@ void ADIOI_BG_Calc_others_req(ADIO_File fd, int count_my_req_procs,
     if ( sendBufForLens    == (void*)0xFFFFFFFFFFFFFFFF) sendBufForLens    = NULL;
 
     /* Calculate the displacements from the sendBufForOffsets/Lens */
-    MPI_Barrier(fd->comm);/* Why?*/
     for (i=0; i<nprocs; i++)
     {
 	/* Send these offsets to process i.*/

http://git.mpich.org/mpich.git/commitdiff/f3a43a5acf948c84e816ee304156247cda31b341

commit f3a43a5acf948c84e816ee304156247cda31b341
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Wed Jan 8 13:11:42 2014 -0600

    bluegene timing: condense into one set of timers
    
    bluegene timer code had two "levels" of timing.  that seemed kind of
    pointless so lump it all into one level.

diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_read.c b/src/mpi/romio/adio/ad_bg/ad_bg_read.c
index 0fe7d20..503c004 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_read.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_read.c
@@ -43,32 +43,32 @@ void ADIOI_BG_ReadContig(ADIO_File fd, void *buf, int count,
     }
 
     if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
-	if (bgmpio_timing2) io_time2 = MPI_Wtime();
+	if (bgmpio_timing) io_time2 = MPI_Wtime();
 	if (fd->fp_sys_posn != offset)
 	    lseek(fd->fd_sys, offset, SEEK_SET);
-	if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
+	if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
 	if (fd->atomicity)
 	    ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
 	else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
-	if (bgmpio_timing2) io_time2 = MPI_Wtime();
+	if (bgmpio_timing) io_time2 = MPI_Wtime();
 	err = read(fd->fd_sys, buf, (unsigned int)len);
-	if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
+	if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
 	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
 	fd->fp_sys_posn = offset + err;
 	/* individual file pointer not updated */        
     }
     else {  /* read from curr. location of ind. file pointer */
 	offset = fd->fp_ind;
-	if (bgmpio_timing2) io_time2 = MPI_Wtime();
+	if (bgmpio_timing) io_time2 = MPI_Wtime();
 	if (fd->fp_sys_posn != fd->fp_ind)
 	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
-	if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
+	if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
 	if (fd->atomicity)
 	    ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
 	else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
-	if (bgmpio_timing2) io_time2 = MPI_Wtime();
+	if (bgmpio_timing) io_time2 = MPI_Wtime();
 	err = read(fd->fd_sys, buf, (unsigned int)len);
-	if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
+	if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
 	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
 	fd->fp_ind += err;
 	fd->fp_sys_posn = fd->fp_ind;
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
index 78e944b..e1b0e35 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
@@ -54,12 +54,6 @@ double	bgmpio_prof_cr    [BGMPIO_CIO_LAST];
  *   - 1 - Collect/report timing.
  *   - Default is 0.
  *
- * - BGMPIO_TIMING2 - collect additional averages for MPI I/O collective calls.
- *   Possible values:
- *   - 0 - Do not collect/report averages.
- *   - 1 - Collect/report averages.
- *   - Default is 0.
- *
  * - BGMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
  *   for aggregator collective i/o.  Possible values:
  *   - 0 - Use two MPI_Allgather's to collect starting and ending offsets.
@@ -97,9 +91,6 @@ void ad_bg_get_env_vars() {
     bgmpio_timing = 0;
 	x = getenv( "BGMPIO_TIMING"       ); 
 	if (x) bgmpio_timing       = atoi(x);
-    bgmpio_timing2 = 0;
-	x = getenv( "BGMPIO_TIMING2"      ); 
-	if (x) bgmpio_timing2      = atoi(x);
     bgmpio_tunegather = 1;
 	x = getenv( "BGMPIO_TUNEGATHER"   ); 
 	if (x) bgmpio_tunegather   = atoi(x);
@@ -156,25 +147,19 @@ void ad_bg_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs )
 
 	    for (i=0; i<BGMPIO_CIO_LAST; i++) bgmpio_prof_avg[i] /= nr_aggs;
 
-	    if (bgmpio_timing2) {
-		bgmpio_prof_avg[ BGMPIO_CIO_B_POSI_RW  ] =
-		    bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nr_aggs /
-		    bgmpio_prof_max[ BGMPIO_CIO_T_POSI_RW  ];
-		bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_RW  ] =
-		    bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nr_aggs /
-		    bgmpio_prof_max[ BGMPIO_CIO_T_MPIO_RW  ];
-	    } else {
-
-		bgmpio_prof_avg[ BGMPIO_CIO_B_POSI_RW  ] = -1;
-		bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_RW  ] = -1;
-	    }
+	    bgmpio_prof_avg[ BGMPIO_CIO_B_POSI_RW  ] =
+		bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nr_aggs /
+		bgmpio_prof_max[ BGMPIO_CIO_T_POSI_RW  ];
+	    bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_RW  ] =
+		bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nr_aggs /
+		bgmpio_prof_max[ BGMPIO_CIO_T_MPIO_RW  ];
 
 	    bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_CRW ] =
 		bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nr_aggs /
 		bgmpio_prof_max[ BGMPIO_CIO_T_MPIO_CRW ];
 
-	    fprintf(stderr,"TIMING-1 %1s,", (rw ? "W" : "R") );
-	    fprintf(stderr,"SIZE: %12.4f , ", bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nr_aggs);
+	    fprintf(stderr,"TIMING-%1s,", (rw ? "W" : "R") );
+	    fprintf(stderr,"SIZE: %12.4lld , ", (long long int)(bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nr_aggs));
 	    fprintf(stderr,"SEEK-avg: %10.3f , ",
 		    bgmpio_prof_avg[ BGMPIO_CIO_T_SEEK ]     );
 	    fprintf(stderr,"SEEK-max: %10.3f , ",
@@ -191,9 +176,16 @@ void ad_bg_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs )
 		    bgmpio_prof_avg[ BGMPIO_CIO_T_MYREQ ]    );
 	    fprintf(stderr,"OTHERREQ-max: %10.3f , ",
 		    bgmpio_prof_max[ BGMPIO_CIO_T_OTHREQ ]   );
-	    fprintf(stderr,"EXCHANGE-max: %10.3f \n",
+	    fprintf(stderr,"EXCHANGE-max: %10.3f , ",
 		    bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH ]    );
-	    fprintf(stderr,"TIMING-2 %1s,", (rw ? "W" : "R") );
+	    fprintf(stderr, "EXCHANGE-SETUP-max: %10.3f , ",
+		    bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH_SETUP]  );
+	    fprintf(stderr, "EXCHANGE-NET-max: %10.3f , ",
+		    bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH_NET]  );
+	    fprintf(stderr, "EXCHANGE-SORT-max: %10.3f , ",
+		    bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH_SORT]  );
+	    fprintf(stderr, "EXCHANGE-SIEVE-max: %10.3f , ",
+		    bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH_SIEVE]  );
 	    fprintf(stderr,"POSIX-TIME-avg: %10.3f , ",
 		    bgmpio_prof_avg[ BGMPIO_CIO_T_POSI_RW ]  );
 	    fprintf(stderr,"MPIIO-CONTIG-TIME-avg: %10.3f , ",
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
index 35317ac..818727d 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
@@ -41,6 +41,10 @@ enum {
     BGMPIO_CIO_T_MYREQ,	/* time for ADIOI_BG_Calc_my_req(), local */
     BGMPIO_CIO_T_OTHREQ,	/* time for ADIOI_Calc_others_req(), short Alltoall */
     BGMPIO_CIO_T_DEXCH,	/* time for I/O data exchange */
+    BGMPIO_CIO_T_DEXCH_SETUP,	/* time for setup portion of I/O data exchange */
+    BGMPIO_CIO_T_DEXCH_NET,	/* time for network portion of I/O data exchange */
+    BGMPIO_CIO_T_DEXCH_SORT, 	/* time to sort requesst in I/O data exchange */
+    BGMPIO_CIO_T_DEXCH_SIEVE, 	/* time for read portion of RMW in two phase */
     BGMPIO_CIO_T_POSI_RW,
     BGMPIO_CIO_B_POSI_RW,
     BGMPIO_CIO_T_MPIO_RW,	/* time for ADIOI_BG_WriteContig() */
@@ -56,7 +60,6 @@ extern double 	bgmpio_prof_cr    [BGMPIO_CIO_LAST];
 
 /* corresponds to environment variables to select optimizations and timing level */
 extern int 	bgmpio_timing;
-extern int 	bgmpio_timing2;
 extern int      bgmpio_timing_cw_level;
 extern int 	bgmpio_comm;
 extern int 	bgmpio_tunegather;
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c b/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
index 2621c36..b104f82 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
@@ -1266,8 +1266,10 @@ static void ADIOI_W_Exchange_data_alltoallv(
     int *srt_len, sum;
     ADIO_Offset *srt_off;
     static char myname[] = "ADIOI_W_EXCHANGE_DATA";
+    double io_time;
 
 
+    io_time = MPI_Wtime();
   /* exchange recv_size info so that each process knows how much to
      send to whom. */
     MPI_Alltoall(recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fd->comm);
@@ -1316,6 +1318,9 @@ static void ADIOI_W_Exchange_data_alltoallv(
 	ADIOI_Free(send_buf);
     }
 
+    bgmpio_prof_cw[BGMPIO_CIO_T_DEXCH_SETUP] += MPI_Wtime() - io_time;
+
+    io_time = MPI_Wtime();
   /* alltoallv */
     MPI_Alltoallv( 
             all_send_buf, send_size, sdispls, MPI_BYTE,
@@ -1325,6 +1330,8 @@ static void ADIOI_W_Exchange_data_alltoallv(
     ADIOI_Free( all_send_buf );
     ADIOI_Free(sdispls);
 
+    bgmpio_prof_cw[BGMPIO_CIO_T_DEXCH_NET] += MPI_Wtime() - io_time;
+    io_time = MPI_Wtime();
   /* data sieving pre-read */
   /* To avoid a read-modify-write, check if there are holes in the 
      data to be written. For this, merge the (sorted) offset lists
@@ -1356,6 +1363,8 @@ static void ADIOI_W_Exchange_data_alltoallv(
     ADIOI_Free(srt_off);
     ADIOI_Free(srt_len);
 
+    bgmpio_prof_cw[BGMPIO_CIO_T_DEXCH_SORT] += MPI_Wtime() - io_time;
+    io_time = MPI_Wtime();
     if (nprocs_recv) {
         if (*hole) {
             ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
@@ -1371,7 +1380,8 @@ static void ADIOI_W_Exchange_data_alltoallv(
             /* --END ERROR HANDLING-- */
         }
     }
-    
+    bgmpio_prof_cw[BGMPIO_CIO_T_DEXCH_SIEVE] += MPI_Wtime() - io_time;
+
   /* scater all_recv_buf into 4M cb_buffer */
     tmp_len = (int *) ADIOI_Malloc(nprocs*sizeof(int));
     for (i=0; i<nprocs; i++)
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_write.c b/src/mpi/romio/adio/ad_bg/ad_bg_write.c
index 8c5734b..abd4888 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_write.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_write.c
@@ -44,28 +44,28 @@ void ADIOI_BG_WriteContig(ADIO_File fd, const void *buf, int count,
     }
 
     if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
-	if (bgmpio_timing2) io_time2 = MPI_Wtime();
+	if (bgmpio_timing) io_time2 = MPI_Wtime();
 	if (fd->fp_sys_posn != offset)
 	    lseek(fd->fd_sys, offset, SEEK_SET);
-	if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
+	if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
 	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-	if (bgmpio_timing2) io_time2 = MPI_Wtime();
+	if (bgmpio_timing) io_time2 = MPI_Wtime();
 	err = write(fd->fd_sys, buf, (unsigned int)len);
-	if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
+	if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
 	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
 	fd->fp_sys_posn = offset + err;
 	/* individual file pointer not updated */        
     }
     else { /* write from curr. location of ind. file pointer */
 	offset = fd->fp_ind;
-	if (bgmpio_timing2) io_time2 = MPI_Wtime();
+	if (bgmpio_timing) io_time2 = MPI_Wtime();
 	if (fd->fp_sys_posn != fd->fp_ind)
 	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
-	if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
+	if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
 	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-	if (bgmpio_timing2) io_time2 = MPI_Wtime();
+	if (bgmpio_timing) io_time2 = MPI_Wtime();
 	err = write(fd->fd_sys, buf, (unsigned int)len);
-	if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
+	if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
 	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
 	fd->fp_ind += err;
 	fd->fp_sys_posn = fd->fp_ind;
diff --git a/src/mpi/romio/adio/common/ad_read.c b/src/mpi/romio/adio/common/ad_read.c
index c0aa4bd..28fa128 100644
--- a/src/mpi/romio/adio/common/ad_read.c
+++ b/src/mpi/romio/adio/common/ad_read.c
@@ -49,7 +49,7 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
     }
  
 #if ROMIO_BG
-    if (bgmpio_timing2) io_time2 = MPI_Wtime();
+    if (bgmpio_timing) io_time2 = MPI_Wtime();
 #endif
     p=buf;
     while (bytes_xfered < len) {
@@ -81,7 +81,7 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
 	p += err;
     }
 #if ROMIO_BG
-    if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
+    if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
 #endif
     fd->fp_sys_posn = offset + bytes_xfered;
 
diff --git a/src/mpi/romio/adio/common/ad_write.c b/src/mpi/romio/adio/common/ad_write.c
index 875dd1b..dcfa74c 100644
--- a/src/mpi/romio/adio/common/ad_write.c
+++ b/src/mpi/romio/adio/common/ad_write.c
@@ -52,7 +52,7 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
     }
 
 #ifdef ROMIO_BG
-    if (bgmpio_timing2) io_time2 = MPI_Wtime();
+    if (bgmpio_timing) io_time2 = MPI_Wtime();
 #endif
     p = (char *)buf;
     while (bytes_xfered < len) {
@@ -80,7 +80,7 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
     }
 
 #ifdef ROMIO_BG
-    if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
+    if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
 #endif
     fd->fp_sys_posn = offset + bytes_xfered;
 

http://git.mpich.org/mpich.git/commitdiff/5bc8aedcff265252754a4c3ac01e709ad66ee9af

commit 5bc8aedcff265252754a4c3ac01e709ad66ee9af
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Tue Jan 7 15:44:12 2014 -0600

    use pwrite/pread instead of seek+write/read
    
    this "new" system call (part of POSIX-2001) saves us a system call on
    Blue Gene.  Seems to get us back 5 seconds for one workload at small
    (half rack) scales.

diff --git a/src/mpi/romio/adio/common/ad_read.c b/src/mpi/romio/adio/common/ad_read.c
index d6818b5..c0aa4bd 100644
--- a/src/mpi/romio/adio/common/ad_read.c
+++ b/src/mpi/romio/adio/common/ad_read.c
@@ -48,34 +48,6 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
 	offset = fd->fp_ind;
     }
  
-#ifdef ROMIO_BG
-    if (bgmpio_timing2) io_time2 = MPI_Wtime();
-#endif
-
-    if (fd->fp_sys_posn != offset) {
-#ifdef ADIOI_MPE_LOGGING
-        MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
-#endif
-	err_lseek = lseek(fd->fd_sys, offset, SEEK_SET);
-#ifdef ADIOI_MPE_LOGGING
-        MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
-#endif
-#ifdef ROMIO_BG
-	if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
-#endif
-	/* --BEGIN ERROR HANDLING-- */
-	if (err_lseek == -1) {
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE,
-					       myname, __LINE__,
-					       MPI_ERR_IO, "**io",
-					       "**io %s", strerror(errno));
-	    fd->fp_sys_posn = -1;
-	    return;
-	}
-	/* --END ERROR HANDLING-- */
-    }
-
 #if ROMIO_BG
     if (bgmpio_timing2) io_time2 = MPI_Wtime();
 #endif
@@ -85,7 +57,7 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
 	MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
 #endif
 	rd_count = len - bytes_xfered;
-	err = read(fd->fd_sys, p, rd_count);
+	err = pread(fd->fd_sys, p, rd_count, offset+bytes_xfered);
 	/* --BEGIN ERROR HANDLING-- */
 	if (err == -1) {
 	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
diff --git a/src/mpi/romio/adio/common/ad_write.c b/src/mpi/romio/adio/common/ad_write.c
index 8a5beb6..875dd1b 100644
--- a/src/mpi/romio/adio/common/ad_write.c
+++ b/src/mpi/romio/adio/common/ad_write.c
@@ -51,33 +51,6 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
 	offset = fd->fp_ind;
     }
 
-    if (fd->fp_sys_posn != offset) {
-#ifdef ADIOI_MPE_LOGGING
-        MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
-#endif
-#ifdef ROMIO_BG
-	if (bgmpio_timing2) io_time2 = MPI_Wtime();
-#endif
-	err_lseek = lseek(fd->fd_sys, offset, SEEK_SET);
-#ifdef ROMIO_BG
-	if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
-#endif
-#ifdef ADIOI_MPE_LOGGING
-        MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
-#endif
-	/* --BEGIN ERROR HANDLING-- */
-	if (err_lseek == -1) {
-	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-					       MPIR_ERR_RECOVERABLE,
-					       myname, __LINE__,
-					       MPI_ERR_IO, "**io",
-					       "**io %s", strerror(errno));
-	    fd->fp_sys_posn = -1;
-	    return;
-	}
-	/* --END ERROR HANDLING-- */
-    }
-    
 #ifdef ROMIO_BG
     if (bgmpio_timing2) io_time2 = MPI_Wtime();
 #endif
@@ -87,7 +60,7 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
 	MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
 #endif
 	wr_count = len - bytes_xfered;
-	err = write(fd->fd_sys, p, wr_count);
+	err = pwrite(fd->fd_sys, p, wr_count, offset+bytes_xfered);
 	/* --BEGIN ERROR HANDLING-- */
 	if (err == -1) {
 	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
diff --git a/src/mpi/romio/configure.ac b/src/mpi/romio/configure.ac
index 0b1adbb..4bf99dd 100644
--- a/src/mpi/romio/configure.ac
+++ b/src/mpi/romio/configure.ac
@@ -1420,6 +1420,8 @@ if test -n "$mpi_hp"; then
 fi
 #
 AC_CHECK_FUNCS(strerror)
+AC_CHECK_FUNCS(pwrite pread ,, AC_MSG_ERROR([pwrite/pread not detected and no workaround has been implemented]))
+
 if test -z "$srcdir" -o "$srcdir" = "." ; then srcdir="$ROMIO_HOME" ; fi
 AC_SUBST(srcdir)
 

http://git.mpich.org/mpich.git/commitdiff/c97af627dc0611881f28041f5451ee7d5603d1bf

commit c97af627dc0611881f28041f5451ee7d5603d1bf
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Fri Jan 3 15:40:39 2014 -0600

    bg-timing: DO NOT MERGE WITH MASTER: time lockless
    
    bglockless uses the common read/write routines for contig read/writes, so
    bluegene timing infrastructure wasn't actually timing anything.  Since
    this introduces blue gene bits into common code, please do not merge to
    master.  Instead, we should rework all the timing bits so that it no
    longer times "bluegene" but rather all of ROMIO.  Furthermore, the
    locky bits of 'bg:' driver should be yanked anyway, obviating the need
    for bglockless.

diff --git a/src/mpi/romio/adio/common/ad_read.c b/src/mpi/romio/adio/common/ad_read.c
index 13bdb52..d6818b5 100644
--- a/src/mpi/romio/adio/common/ad_read.c
+++ b/src/mpi/romio/adio/common/ad_read.c
@@ -13,6 +13,9 @@
 #ifdef AGGREGATION_PROFILE
 #include "mpe.h"
 #endif
+#ifdef ROMIO_BG
+# include "adio/ad_bg/ad_bg_tuning.h"
+#endif
 
 void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count, 
 			  MPI_Datatype datatype, int file_ptr_type,
@@ -25,6 +28,7 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
     ADIO_Offset len, bytes_xfered=0;
     size_t rd_count;
     static char myname[] = "ADIOI_GEN_READCONTIG";
+    double io_time=0, io_time2=0;
     char *p;
 
 #ifdef AGGREGATION_PROFILE
@@ -33,9 +37,20 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
     MPI_Type_size_x(datatype, &datatype_size);
     len = datatype_size * (ADIO_Offset)count;
 
+#ifdef ROMIO_BG
+    if (bgmpio_timing) {
+	io_time = MPI_Wtime();
+	bgmpio_prof_cr[ BGMPIO_CIO_DATA_SIZE ] += len;
+    }
+#endif
+
     if (file_ptr_type == ADIO_INDIVIDUAL) {
 	offset = fd->fp_ind;
     }
+ 
+#ifdef ROMIO_BG
+    if (bgmpio_timing2) io_time2 = MPI_Wtime();
+#endif
 
     if (fd->fp_sys_posn != offset) {
 #ifdef ADIOI_MPE_LOGGING
@@ -45,6 +60,9 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
 #ifdef ADIOI_MPE_LOGGING
         MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
 #endif
+#ifdef ROMIO_BG
+	if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
+#endif
 	/* --BEGIN ERROR HANDLING-- */
 	if (err_lseek == -1) {
 	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
@@ -58,6 +76,9 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
 	/* --END ERROR HANDLING-- */
     }
 
+#if ROMIO_BG
+    if (bgmpio_timing2) io_time2 = MPI_Wtime();
+#endif
     p=buf;
     while (bytes_xfered < len) {
 #ifdef ADIOI_MPE_LOGGING
@@ -87,7 +108,9 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
 	bytes_xfered += err;
 	p += err;
     }
-
+#if ROMIO_BG
+    if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
+#endif
     fd->fp_sys_posn = offset + bytes_xfered;
 
     if (file_ptr_type == ADIO_INDIVIDUAL) {
@@ -104,4 +127,5 @@ void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count,
 #ifdef AGGREGATION_PROFILE
     MPE_Log_event (5035, 0, NULL);
 #endif
+    if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
 }
diff --git a/src/mpi/romio/adio/common/ad_write.c b/src/mpi/romio/adio/common/ad_write.c
index 05b470a..8a5beb6 100644
--- a/src/mpi/romio/adio/common/ad_write.c
+++ b/src/mpi/romio/adio/common/ad_write.c
@@ -14,6 +14,11 @@
 #include "mpe.h"
 #endif
 
+#ifdef ROMIO_BG
+#include "adio/ad_bg/ad_bg_tuning.h"
+#endif
+
+
 void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
 			   MPI_Datatype datatype, int file_ptr_type,
 			   ADIO_Offset offset, ADIO_Status *status,
@@ -25,6 +30,7 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
     ADIO_Offset len, bytes_xfered=0;
     size_t wr_count;
     static char myname[] = "ADIOI_GEN_WRITECONTIG";
+    double io_time=0, io_time2=0;
     char * p;
 
 #ifdef AGGREGATION_PROFILE
@@ -34,6 +40,13 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
     MPI_Type_size_x(datatype, &datatype_size);
     len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
 
+#ifdef ROMIO_BG
+    if (bgmpio_timing) {
+	io_time = MPI_Wtime();
+	bgmpio_prof_cw[ BGMPIO_CIO_DATA_SIZE ] += len;
+    }
+#endif
+
     if (file_ptr_type == ADIO_INDIVIDUAL) {
 	offset = fd->fp_ind;
     }
@@ -42,7 +55,13 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
 #ifdef ADIOI_MPE_LOGGING
         MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
 #endif
+#ifdef ROMIO_BG
+	if (bgmpio_timing2) io_time2 = MPI_Wtime();
+#endif
 	err_lseek = lseek(fd->fd_sys, offset, SEEK_SET);
+#ifdef ROMIO_BG
+	if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
+#endif
 #ifdef ADIOI_MPE_LOGGING
         MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
 #endif
@@ -59,6 +78,9 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
 	/* --END ERROR HANDLING-- */
     }
     
+#ifdef ROMIO_BG
+    if (bgmpio_timing2) io_time2 = MPI_Wtime();
+#endif
     p = (char *)buf;
     while (bytes_xfered < len) {
 #ifdef ADIOI_MPE_LOGGING
@@ -84,12 +106,19 @@ void ADIOI_GEN_WriteContig(ADIO_File fd, const void *buf, int count,
 	p += err;
     }
 
+#ifdef ROMIO_BG
+    if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
+#endif
     fd->fp_sys_posn = offset + bytes_xfered;
 
     if (file_ptr_type == ADIO_INDIVIDUAL) {
 	fd->fp_ind += bytes_xfered; 
     }
 
+#ifdef ROMIO_BG
+    if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
+#endif
+
 #ifdef HAVE_STATUS_SET_BYTES
     /* bytes_xfered could be larger than int */
     if (err != -1 && status) MPIR_Status_set_bytes(status, datatype, bytes_xfered);

http://git.mpich.org/mpich.git/commitdiff/751176bc3c371056039ab77f7832c24eafa2ef02

commit 751176bc3c371056039ab77f7832c24eafa2ef02
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Fri Nov 15 12:48:13 2013 -0600

    dust off old Blue Gene timing infrastructure
    
    Protected by an 'ifdef', this BGL-era code bitrotted a bit.  clean it up
    and see if it does anything useful today.
    - Removes preprocessor guards: the counters and timers do nothing
      expensive unless environment variables are set
    - remove the idea of a "level"
    - remove barrier from timing collection.
    - bugfix: MPI_Wtime() does not necessarily start at zero, so properly initialize
      timers for collective read/write
    - report only from I/O aggregators.  when reporting "time spent in i/o"
      vs "time spent communicating" it makes more sense to look only at the
      aggregators.  The non-aggregators are going to skew the results
      because they are spending some communication time actually
      communicating, but some of that time blocked, waiting for aggregators
      to finish.

diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c b/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
index 7f68ae7..4602127 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
@@ -117,9 +117,8 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
     int  ii;
     ADIO_Offset *len_list = NULL;
     int *buf_idx = NULL;
-#if BG_PROFILE 
-    BGMPIO_T_CIO_RESET( 0, r )
-#endif
+
+    BGMPIO_T_CIO_RESET( r)
 
 #ifdef HAVE_STATUS_SET_BYTES
     MPI_Count bufsize, size;
@@ -144,9 +143,8 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
     nprocs_for_coll = fd->hints->cb_nodes;
     orig_fp = fd->fp_ind;
 
-#if BG_PROFILE 
-    BGMPIO_T_CIO_SET_GET( 0, r, 0, 1, 0, BGMPIO_CIO_LCOMP, BGMPIO_CIO_LAST )
-#endif
+    BGMPIO_T_CIO_SET_GET( r, 1, 0, BGMPIO_CIO_T_MPIO_CRW, BGMPIO_CIO_LAST)
+    BGMPIO_T_CIO_SET_GET( r, 1, 0, BGMPIO_CIO_T_LCOMP, BGMPIO_CIO_LAST )
 
     /* only check for interleaving if cb_read isn't disabled */
     if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
@@ -160,9 +158,7 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
 			      &offset_list, &len_list, &start_offset,
 			      &end_offset, &contig_access_count); 
     
-#if BG_PROFILE 
-    BGMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGMPIO_CIO_GATHER, BGMPIO_CIO_LCOMP )
-#endif
+    BGMPIO_T_CIO_SET_GET( r, 1, 1, BGMPIO_CIO_T_GATHER, BGMPIO_CIO_T_LCOMP )
 
 #ifdef RDCOLL_DEBUG
     for (i=0; i<contig_access_count; i++) {
@@ -203,9 +199,7 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
                       ADIO_OFFSET, fd->comm);
     }
 
-#if BG_PROFILE 
-    BGMPIO_T_CIO_SET_GET( 0, r, 0, 1, 1, BGMPIO_CIO_PATANA, BGMPIO_CIO_GATHER )
-#endif
+    BGMPIO_T_CIO_SET_GET( r, 1, 1, BGMPIO_CIO_T_PATANA, BGMPIO_CIO_T_GATHER )
 
 	/* are the accesses of different processes interleaved? */
 	for (i=1; i<nprocs; i++)
@@ -247,9 +241,7 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
 	return;
     }
 
-#if BG_PROFILE 
-    BGMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGMPIO_CIO_FD_PART, BGMPIO_CIO_PATANA )
-#endif
+    BGMPIO_T_CIO_SET_GET( r, 1, 1, BGMPIO_CIO_T_FD_PART, BGMPIO_CIO_T_PATANA )
 
     /* We're going to perform aggregation of I/O.  Here we call
      * ADIOI_Calc_file_domains() to determine what processes will handle I/O
@@ -278,9 +270,7 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
 			    fd->hints->min_fdomain_size, &fd_size, 
 			    fd->hints->striping_unit);
 
-#if BG_PROFILE 
-    BGMPIO_T_CIO_SET_GET( 0, r, 0, 1, 1, BGMPIO_CIO_MYREQ, BGMPIO_CIO_FD_PART )
-#endif
+    BGMPIO_T_CIO_SET_GET( r, 1, 1, BGMPIO_CIO_T_MYREQ, BGMPIO_CIO_T_FD_PART )
 
     /* calculate where the portions of the access requests of this process 
      * are located in terms of the file domains.  this could be on the same
@@ -307,9 +297,7 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
 		      &count_my_req_per_proc, &my_req,
 		      &buf_idx);
 
-#if BG_PROFILE 
-    BGMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGMPIO_CIO_OTHREQ, BGMPIO_CIO_MYREQ )
-#endif
+    BGMPIO_T_CIO_SET_GET( r, 1, 1, BGMPIO_CIO_T_OTHREQ, BGMPIO_CIO_T_MYREQ )
 
     /* perform a collective communication in order to distribute the
      * data calculated above.  fills in the following:
@@ -330,9 +318,7 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
 			  nprocs, myrank, &count_others_req_procs, 
 			  &others_req); 
 
-#if BG_PROFILE 
-    BGMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGMPIO_CIO_DEXCH, BGMPIO_CIO_OTHREQ )
-#endif
+    BGMPIO_T_CIO_SET_GET( r, 1, 1, BGMPIO_CIO_T_DEXCH, BGMPIO_CIO_T_OTHREQ )
 
     /* my_req[] and count_my_req_per_proc aren't needed at this point, so 
      * let's free the memory 
@@ -355,12 +341,10 @@ void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
 			len_list, contig_access_count, min_st_offset,
 			fd_size, fd_start, fd_end, buf_idx, error_code);
 
-#if BG_PROFILE 
-    BGMPIO_T_CIO_SET_GET( 0, r, 1, 0, 1, BGMPIO_CIO_LAST, BGMPIO_CIO_T_DEXCH )
-    BGMPIO_T_CIO_SET_GET( 0, r, 0, 0, 1, BGMPIO_CIO_LAST, BGMPIO_CIO_T_MPIO_CRW )
+    BGMPIO_T_CIO_SET_GET( r, 0, 1, BGMPIO_CIO_LAST, BGMPIO_CIO_T_DEXCH )
+    BGMPIO_T_CIO_SET_GET( r, 0, 1, BGMPIO_CIO_LAST, BGMPIO_CIO_T_MPIO_CRW )
 
-    BGMPIO_T_CIO_REPORT( 0, r, fd, myrank )
-#endif
+    BGMPIO_T_CIO_REPORT( 0, fd, myrank, nprocs)
 
     if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
 
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_read.c b/src/mpi/romio/adio/ad_bg/ad_bg_read.c
index 32d0c6d..0fe7d20 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_read.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_read.c
@@ -30,85 +30,51 @@ void ADIOI_BG_ReadContig(ADIO_File fd, void *buf, int count,
 #ifdef AGGREGATION_PROFILE
     MPE_Log_event (5034, 0, NULL);
 #endif
-#if BG_PROFILE
-		/* timing */
-		double io_time, io_time2;
-
-		if (bgmpio_timing) {
-		    io_time = MPI_Wtime();
-		    bgmpio_prof_cr[ BGMPIO_CIO_DATA_SIZE ] += len;
-		}
-#endif
+    /* timing */
+    double io_time, io_time2;
 
     MPI_Type_size_x(datatype, &datatype_size);
     len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
     ADIOI_Assert(len == (unsigned int) len); /* read takes an unsigned int parm */
 
-#if BG_PROFILE
-
-    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
-        	if (bgmpio_timing2) io_time2 = MPI_Wtime();
-	if (fd->fp_sys_posn != offset)
-	    lseek(fd->fd_sys, offset, SEEK_SET);
-        	if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
-	if (fd->atomicity)
-	    ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-	else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
-        	if (bgmpio_timing2) io_time2 = MPI_Wtime();
-	err = read(fd->fd_sys, buf, (unsigned int)len);
-        	if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
-	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
-	fd->fp_sys_posn = offset + err;
-	/* individual file pointer not updated */        
-    }
-    else {  /* read from curr. location of ind. file pointer */
-	offset = fd->fp_ind;
-        	if (bgmpio_timing2) io_time2 = MPI_Wtime();
-	if (fd->fp_sys_posn != fd->fp_ind)
-	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
-        	if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
-	if (fd->atomicity)
-	    ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-	else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
-        	if (bgmpio_timing2) io_time2 = MPI_Wtime();
-	err = read(fd->fd_sys, buf, (unsigned int)len);
-        	if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
-	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
-	fd->fp_ind += err;
-	fd->fp_sys_posn = fd->fp_ind;
+    if (bgmpio_timing) {
+	io_time = MPI_Wtime();
+	bgmpio_prof_cr[ BGMPIO_CIO_DATA_SIZE ] += len;
     }
 
-#else	/* BG_PROFILE */
-
     if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
+	if (bgmpio_timing2) io_time2 = MPI_Wtime();
 	if (fd->fp_sys_posn != offset)
 	    lseek(fd->fd_sys, offset, SEEK_SET);
+	if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
 	if (fd->atomicity)
 	    ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
 	else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
+	if (bgmpio_timing2) io_time2 = MPI_Wtime();
 	err = read(fd->fd_sys, buf, (unsigned int)len);
+	if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
 	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
 	fd->fp_sys_posn = offset + err;
 	/* individual file pointer not updated */        
     }
     else {  /* read from curr. location of ind. file pointer */
 	offset = fd->fp_ind;
+	if (bgmpio_timing2) io_time2 = MPI_Wtime();
 	if (fd->fp_sys_posn != fd->fp_ind)
 	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
+	if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
 	if (fd->atomicity)
 	    ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
 	else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
+	if (bgmpio_timing2) io_time2 = MPI_Wtime();
 	err = read(fd->fd_sys, buf, (unsigned int)len);
+	if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
 	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
 	fd->fp_ind += err;
 	fd->fp_sys_posn = fd->fp_ind;
     }
 
-#endif   /* BG_PROFILE */
-
-#if BG_PROFILE
-    		if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
-#endif
+    if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
 
     /* --BEGIN ERROR HANDLING-- */
     if (err == -1) {
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
index 632bacd..78e944b 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
@@ -29,6 +29,7 @@
 
 int 	bgmpio_timing;
 int 	bgmpio_timing2;
+int     bgmpio_timing_cw_level;
 int 	bgmpio_comm;
 int 	bgmpio_tunegather;
 int 	bgmpio_tuneblocking;
@@ -48,13 +49,13 @@ double	bgmpio_prof_cr    [BGMPIO_CIO_LAST];
  *   - Default is 0.
  *
  * - BGMPIO_TIMING - collect timing breakdown for MPI I/O collective calls.
- *   Must also compile the library with BG_PROFILE defined. Possible values:
+ *   Possible values:
  *   - 0 - Do not collect/report timing.
  *   - 1 - Collect/report timing.
  *   - Default is 0.
  *
  * - BGMPIO_TIMING2 - collect additional averages for MPI I/O collective calls.
- *   Must also compile the library with BG_PROFILE defined. Possible values:
+ *   Possible values:
  *   - 0 - Do not collect/report averages.
  *   - 1 - Collect/report averages.
  *   - Default is 0.
@@ -119,11 +120,27 @@ void ad_bg_get_env_vars() {
 }
 
 /* report timing breakdown for MPI I/O collective call */
-void ad_bg_wr_timing_report( int rw, ADIO_File fd, int myrank, int nprocs )
+void ad_bg_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs )
 {
     int i;
 
     if (bgmpio_timing) {
+	/* Timing across the whole communicator is a little bit interesting,
+	 * but what is *more* interesting is if we single out the aggregators
+	 * themselves.  non-aggregators spend a lot of time in "exchange" not
+	 * exchanging data, but blocked because they are waiting for
+	 * aggregators to finish writing.  If we focus on just the aggregator
+	 * processes we will get a more clear picture about the data exchange
+	 * vs. i/o time breakdown */
+
+	/* if deferred open enabled, we could use the aggregator communicator */
+	MPI_Comm agg_comm;
+	int nr_aggs, agg_rank;
+	MPI_Comm_split(fd->comm, (fd->is_agg ? 1 : MPI_UNDEFINED), 0, &agg_comm);
+	if(agg_comm != MPI_COMM_NULL) {
+	    MPI_Comm_size(agg_comm, &nr_aggs);
+	    MPI_Comm_rank(agg_comm, &agg_rank);
+	}
 
 	double *bgmpio_prof_org = bgmpio_prof_cr;
 	if (rw) bgmpio_prof_org = bgmpio_prof_cw;
@@ -131,46 +148,66 @@ void ad_bg_wr_timing_report( int rw, ADIO_File fd, int myrank, int nprocs )
 	double bgmpio_prof_avg[ BGMPIO_CIO_LAST ];
 	double bgmpio_prof_max[ BGMPIO_CIO_LAST ];
 	
-	MPI_Reduce( bgmpio_prof_org, bgmpio_prof_avg, BGMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, fd->comm );
-	MPI_Reduce( bgmpio_prof_org, bgmpio_prof_max, BGMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, fd->comm );
-
-	if (myrank == 0) {
+	if( agg_comm != MPI_COMM_NULL) {
+	    MPI_Reduce( bgmpio_prof_org, bgmpio_prof_avg, BGMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, agg_comm);
+	    MPI_Reduce( bgmpio_prof_org, bgmpio_prof_max, BGMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, agg_comm);
+	}
+	if (agg_comm != MPI_COMM_NULL && agg_rank == 0) {
 
-	    for (i=0; i<BGMPIO_CIO_LAST; i++) bgmpio_prof_avg[i] /= nprocs;
+	    for (i=0; i<BGMPIO_CIO_LAST; i++) bgmpio_prof_avg[i] /= nr_aggs;
 
 	    if (bgmpio_timing2) {
-		bgmpio_prof_avg[ BGMPIO_CIO_B_POSI_RW  ] = bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nprocs / 
-							     bgmpio_prof_max[ BGMPIO_CIO_T_POSI_RW  ];
-		bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_RW  ] = bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nprocs / 
-							     bgmpio_prof_max[ BGMPIO_CIO_T_MPIO_RW  ];
+		bgmpio_prof_avg[ BGMPIO_CIO_B_POSI_RW  ] =
+		    bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nr_aggs /
+		    bgmpio_prof_max[ BGMPIO_CIO_T_POSI_RW  ];
+		bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_RW  ] =
+		    bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nr_aggs /
+		    bgmpio_prof_max[ BGMPIO_CIO_T_MPIO_RW  ];
 	    } else {
 
-		bgmpio_prof_avg[ BGMPIO_CIO_B_POSI_RW  ] = 0;
-		bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_RW  ] = 0;
+		bgmpio_prof_avg[ BGMPIO_CIO_B_POSI_RW  ] = -1;
+		bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_RW  ] = -1;
 	    }
 
-		bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_CRW ] = bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nprocs / 
-							     bgmpio_prof_max[ BGMPIO_CIO_T_MPIO_CRW ];
-
-	    printf("\tTIMING-1 %1s , ", (rw ? "W" : "R") );
-	    printf(    "SZ: %12.4f , ", bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nprocs );
-	    printf(  "SK-a: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_T_SEEK ]     );
-	    printf(  "SK-m: %10.3f , ", bgmpio_prof_max[ BGMPIO_CIO_T_SEEK ]     );
-	    printf(  "LC-a: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_T_LCOMP ]    );
-	    printf(  "GA-m: %10.3f , ", bgmpio_prof_max[ BGMPIO_CIO_T_GATHER ]   );
-	    printf(  "AN-a: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_T_PATANA ]   );
-	    printf(  "FD-a: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_T_FD_PART ]  );
-	    printf(  "MY-a: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_T_MYREQ ]    );
-	    printf(  "OT-m: %10.3f , ", bgmpio_prof_max[ BGMPIO_CIO_T_OTHREQ ]   );
-	    printf(  "EX-m: %10.3f , ", bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH ]    );
-	    printf("\tTIMING-2 %1s , ", (rw ? "W" : "R") );
-	    printf( "PXT-m: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_T_POSI_RW ]  );
-	    printf( "MPT-m: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_T_MPIO_RW ]  );
-	    printf("MPTC-m: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_T_MPIO_CRW ] );
-	    printf(   "PXB: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_B_POSI_RW ]  );
-	    printf(   "MPB: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_RW ]  );
-	    printf(  "MPBC: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_CRW ] );
+	    bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_CRW ] =
+		bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nr_aggs /
+		bgmpio_prof_max[ BGMPIO_CIO_T_MPIO_CRW ];
+
+	    fprintf(stderr,"TIMING-1 %1s,", (rw ? "W" : "R") );
+	    fprintf(stderr,"SIZE: %12.4f , ", bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nr_aggs);
+	    fprintf(stderr,"SEEK-avg: %10.3f , ",
+		    bgmpio_prof_avg[ BGMPIO_CIO_T_SEEK ]     );
+	    fprintf(stderr,"SEEK-max: %10.3f , ",
+		    bgmpio_prof_max[ BGMPIO_CIO_T_SEEK ]     );
+	    fprintf(stderr,"LOCAL-avg: %10.3f , ",
+		    bgmpio_prof_avg[ BGMPIO_CIO_T_LCOMP ]    );
+	    fprintf(stderr,"GATHER-max: %10.3f , ",
+		    bgmpio_prof_max[ BGMPIO_CIO_T_GATHER ]   );
+	    fprintf(stderr,"PATTERN-avg: %10.3f , ",
+		    bgmpio_prof_avg[ BGMPIO_CIO_T_PATANA ]   );
+	    fprintf(stderr,"FILEDOMAIN-avg: %10.3f , ",
+		    bgmpio_prof_avg[ BGMPIO_CIO_T_FD_PART ]  );
+	    fprintf(stderr,"MYREQ-avg: %10.3f , ",
+		    bgmpio_prof_avg[ BGMPIO_CIO_T_MYREQ ]    );
+	    fprintf(stderr,"OTHERREQ-max: %10.3f , ",
+		    bgmpio_prof_max[ BGMPIO_CIO_T_OTHREQ ]   );
+	    fprintf(stderr,"EXCHANGE-max: %10.3f \n",
+		    bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH ]    );
+	    fprintf(stderr,"TIMING-2 %1s,", (rw ? "W" : "R") );
+	    fprintf(stderr,"POSIX-TIME-avg: %10.3f , ",
+		    bgmpio_prof_avg[ BGMPIO_CIO_T_POSI_RW ]  );
+	    fprintf(stderr,"MPIIO-CONTIG-TIME-avg: %10.3f , ",
+		    bgmpio_prof_avg[ BGMPIO_CIO_T_MPIO_RW ]  );
+	    fprintf(stderr,"MPIIO-STRIDED-TIME-avg: %10.3f , ",
+		    bgmpio_prof_avg[ BGMPIO_CIO_T_MPIO_CRW ] );
+	    fprintf(stderr,"POSIX-BW-avg: %10.3f , ",
+		    bgmpio_prof_avg[ BGMPIO_CIO_B_POSI_RW ]  );
+	    fprintf(stderr,"MPI-BW-avg: %10.3f , ",
+		    bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_RW ]  );
+	    fprintf(stderr,"MPI-BW-collective-avg: %10.3f\n ",
+		    bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_CRW ] );
 	}
+	if (agg_comm != MPI_COMM_NULL) MPI_Comm_free(&agg_comm);
     }
 
 }
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
index 03703a0..35317ac 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
@@ -57,6 +57,7 @@ extern double 	bgmpio_prof_cr    [BGMPIO_CIO_LAST];
 /* corresponds to environment variables to select optimizations and timing level */
 extern int 	bgmpio_timing;
 extern int 	bgmpio_timing2;
+extern int      bgmpio_timing_cw_level;
 extern int 	bgmpio_comm;
 extern int 	bgmpio_tunegather;
 extern int 	bgmpio_tuneblocking;
@@ -86,24 +87,21 @@ void ad_bg_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs );
  *   T := timing; 
  * CIO := collective I/O 
  */
-#define BGMPIO_T_CIO_RESET( LEVEL, RW ) \
-	if (bgmpio_timing_cw_level >= LEVEL) { \
+#define BGMPIO_T_CIO_RESET( RW ) \
+	{ \
 	  int i; \
-	  for ( i = 0; i < BGMPIO_T_LAST; i ++ ) \
+	  for ( i = 0; i < BGMPIO_CIO_LAST; i ++ ) \
 	    bgmpio_prof_c##RW [ i ] = 0; \
 	}
 
-#define BGMPIO_T_CIO_REPORT( LEVEL, RW, FD, MYRANK, NPROCS ) \
-	if (bgmpio_timing_cw_level >= LEVEL) { \
-	  ad_bg_timing_crw_report ( RW, FD, MYRANK, NPROCS ); \
-   	}
-
-#define BGMPIO_T_CIO_SET_GET( LEVEL, RW, DOBAR, ISSET, ISGET, VAR1, VAR2 ) \
-	if (bgmpio_timing_cw_level >= LEVEL) { \
-	  if ( DOBAR ) MPI_Barrier( fd->comm ); \
-	  double temp = MPI_Wtime(); \
-	  if ( ISSET ) bgmpio_prof_c##RW [ VAR1 ] = temp; \
-	  if ( ISGET ) bgmpio_prof_c##RW [ VAR2 ] = temp - bgmpio_prof_c##RW [ VAR2 ] ; \
-	}
+#define BGMPIO_T_CIO_REPORT( RW, FD, MYRANK, NPROCS ) \
+	ad_bg_timing_crw_report ( RW, FD, MYRANK, NPROCS ); \
+
+#define BGMPIO_T_CIO_SET_GET(RW, ISSET, ISGET, VAR1, VAR2 ) \
+         {\
+	 double temp = MPI_Wtime(); \
+	 if ( ISSET ) bgmpio_prof_c##RW [ VAR1 ] = temp; \
+	 if ( ISGET ) bgmpio_prof_c##RW [ VAR2 ] = temp - bgmpio_prof_c##RW [ VAR2 ] ;\
+	 }
 
 #endif  /* AD_BG_TUNING_H_ */
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c b/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
index fd390b3..2621c36 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
@@ -124,9 +124,7 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 
     int *buf_idx = NULL;
     ADIO_Offset *len_list = NULL;
-#if BG_PROFILE 
-    BGMPIO_T_CIO_RESET( 0, w )
-#endif
+    BGMPIO_T_CIO_RESET( w )
 #if 0
     /* From common code - not implemented for bg.*/
     int old_error, tmp_error;
@@ -152,9 +150,8 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
     nprocs_for_coll = fd->hints->cb_nodes;
     orig_fp = fd->fp_ind;
 
-#if BG_PROFILE 
-    BGMPIO_T_CIO_SET_GET( 0, w, 0, 1, 0, BGMPIO_CIO_LCOMP, BGMPIO_CIO_LAST )
-#endif
+    BGMPIO_T_CIO_SET_GET( w, 1, 0, BGMPIO_CIO_T_MPIO_CRW, BGMPIO_CIO_LAST)
+    BGMPIO_T_CIO_SET_GET( w, 1, 0, BGMPIO_CIO_T_LCOMP, BGMPIO_CIO_LAST )
 
 
     /* only check for interleaving if cb_write isn't disabled */
@@ -169,9 +166,7 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 			      &offset_list, &len_list, &start_offset,
 			      &end_offset, &contig_access_count); 
 
-#if BG_PROFILE 
-    BGMPIO_T_CIO_SET_GET( 0, w, 1, 1, 1, BGMPIO_CIO_GATHER, BGMPIO_CIO_LCOMP )
-#endif
+    BGMPIO_T_CIO_SET_GET( w, 1, 1, BGMPIO_CIO_T_GATHER, BGMPIO_CIO_T_LCOMP )
 
 	/* each process communicates its start and end offsets to other 
 	   processes. The result is an array each of start and end offsets stored
@@ -205,9 +200,7 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 		      ADIO_OFFSET, fd->comm);
     }
 
-#if BG_PROFILE 
-    BGMPIO_T_CIO_SET_GET( 0, w, 0, 1, 1, BGMPIO_CIO_PATANA, BGMPIO_CIO_GATHER )
-#endif
+    BGMPIO_T_CIO_SET_GET(w, 1, 1, BGMPIO_CIO_T_PATANA, BGMPIO_CIO_T_GATHER )
 
 	/* are the accesses of different processes interleaved? */
 	for (i=1; i<nprocs; i++)
@@ -251,9 +244,7 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 	return;
     }
 
-#if BG_PROFILE 
-    BGMPIO_T_CIO_SET_GET( 0, w, 1, 1, 1, BGMPIO_CIO_FD_PART, BGMPIO_CIO_PATANA )
-#endif
+    BGMPIO_T_CIO_SET_GET( w, 1, 1, BGMPIO_CIO_T_FD_PART, BGMPIO_CIO_T_PATANA )
 	
 /* Divide the I/O workload among "nprocs_for_coll" processes. This is
    done by (logically) dividing the file into file domains (FDs); each
@@ -270,9 +261,7 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 			    fd->hints->min_fdomain_size, &fd_size,
 			    fd->hints->striping_unit);   
 
-#if BG_PROFILE 
-    BGMPIO_T_CIO_SET_GET( 0, w, 0, 1, 1, BGMPIO_CIO_MYREQ, BGMPIO_CIO_FD_PART )
-#endif
+    BGMPIO_T_CIO_SET_GET( w, 1, 1, BGMPIO_CIO_T_MYREQ, BGMPIO_CIO_T_FD_PART )
 	
 /* calculate what portions of the access requests of this process are
    located in what file domains */
@@ -290,9 +279,7 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 		      &count_my_req_per_proc, &my_req,
 		      &buf_idx); 
 
-#if BG_PROFILE 
-    BGMPIO_T_CIO_SET_GET( 0, w, 1, 1, 1, BGMPIO_CIO_OTHREQ, BGMPIO_CIO_MYREQ )
-#endif
+    BGMPIO_T_CIO_SET_GET( w, 1, 1, BGMPIO_CIO_T_OTHREQ, BGMPIO_CIO_T_MYREQ )
 	
 /* based on everyone's my_req, calculate what requests of other
    processes lie in this process's file domain.
@@ -312,9 +299,7 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 			  nprocs, myrank,
 			  &count_others_req_procs, &others_req); 
     
-#if BG_PROFILE 
-    BGMPIO_T_CIO_SET_GET( 0, w, 1, 1, 1, BGMPIO_CIO_DEXCH, BGMPIO_CIO_OTHREQ )
-#endif
+    BGMPIO_T_CIO_SET_GET( w, 1, 1, BGMPIO_CIO_T_DEXCH, BGMPIO_CIO_T_OTHREQ )
 
     ADIOI_Free(count_my_req_per_proc);
     for (i=0; i < nprocs; i++) {
@@ -331,12 +316,10 @@ void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 			len_list, contig_access_count, min_st_offset,
 			fd_size, fd_start, fd_end, buf_idx, error_code);
 
-#if BG_PROFILE 
-    BGMPIO_T_CIO_SET_GET( 0, w, 1, 0, 1, BGMPIO_CIO_LAST, BGMPIO_CIO_T_DEXCH )
-    BGMPIO_T_CIO_SET_GET( 0, w, 0, 0, 1, BGMPIO_CIO_LAST, BGMPIO_CIO_T_MPIO_CRW )
+    BGMPIO_T_CIO_SET_GET( w, 0, 1, BGMPIO_CIO_LAST, BGMPIO_CIO_T_DEXCH )
+    BGMPIO_T_CIO_SET_GET( w, 0, 1, BGMPIO_CIO_LAST, BGMPIO_CIO_T_MPIO_CRW )
 
-    BGMPIO_T_CIO_REPORT( 0, w, fd, myrank )
-#endif
+    BGMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
 #if 0
     /* From common code - not implemented for bg.
      * 
diff --git a/src/mpi/romio/adio/ad_bg/ad_bg_write.c b/src/mpi/romio/adio/ad_bg/ad_bg_write.c
index 4ffbab7..8c5734b 100644
--- a/src/mpi/romio/adio/ad_bg/ad_bg_write.c
+++ b/src/mpi/romio/adio/ad_bg/ad_bg_write.c
@@ -31,77 +31,47 @@ void ADIOI_BG_WriteContig(ADIO_File fd, const void *buf, int count,
 #ifdef AGGREGATION_PROFILE
     MPE_Log_event (5036, 0, NULL);
 #endif
-#if BG_PROFILE
-		/* timing */
-		double io_time, io_time2;
+    /* timing */
+    double io_time, io_time2;
 
-		if (bgmpio_timing) { 
-		    io_time = MPI_Wtime(); 
-		    bgmpio_prof_cw[ BGMPIO_CIO_DATA_SIZE ] += len;
-		}
-#endif
-			  
     MPI_Type_size_x(datatype, &datatype_size);
     len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
     ADIOI_Assert(len == (unsigned int) len); /* write takes an unsigned int parm */
 
-#if BG_PROFILE
-
-    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
-        	if (bgmpio_timing2) io_time2 = MPI_Wtime();
-	if (fd->fp_sys_posn != offset)
-	    lseek(fd->fd_sys, offset, SEEK_SET);
-        	if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
-	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-        	if (bgmpio_timing2) io_time2 = MPI_Wtime();
-	err = write(fd->fd_sys, buf, (unsigned int)len);
-        	if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
-	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
-	fd->fp_sys_posn = offset + err;
-	/* individual file pointer not updated */        
-    }
-    else { /* write from curr. location of ind. file pointer */
-	offset = fd->fp_ind;
-	        if (bgmpio_timing2) io_time2 = MPI_Wtime();
-	if (fd->fp_sys_posn != fd->fp_ind)
-	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
-        	if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
-	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
-        	if (bgmpio_timing2) io_time2 = MPI_Wtime();
-	err = write(fd->fd_sys, buf, (unsigned int)len);
-        	if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
-	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
-	fd->fp_ind += err;
-	fd->fp_sys_posn = fd->fp_ind;
+    if (bgmpio_timing) {
+	io_time = MPI_Wtime();
+	bgmpio_prof_cw[ BGMPIO_CIO_DATA_SIZE ] += len;
     }
 
-#else	/* BG_PROFILE */
-
     if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
+	if (bgmpio_timing2) io_time2 = MPI_Wtime();
 	if (fd->fp_sys_posn != offset)
 	    lseek(fd->fd_sys, offset, SEEK_SET);
+	if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
 	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
+	if (bgmpio_timing2) io_time2 = MPI_Wtime();
 	err = write(fd->fd_sys, buf, (unsigned int)len);
+	if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
 	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
 	fd->fp_sys_posn = offset + err;
 	/* individual file pointer not updated */        
     }
     else { /* write from curr. location of ind. file pointer */
 	offset = fd->fp_ind;
+	if (bgmpio_timing2) io_time2 = MPI_Wtime();
 	if (fd->fp_sys_posn != fd->fp_ind)
 	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
+	if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
 	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
+	if (bgmpio_timing2) io_time2 = MPI_Wtime();
 	err = write(fd->fd_sys, buf, (unsigned int)len);
+	if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
 	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
 	fd->fp_ind += err;
 	fd->fp_sys_posn = fd->fp_ind;
     }
 
-#endif	/* BG_PROFILE */
-
-#if BG_PROFILE
-		if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
-#endif
+    if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
 
     /* --BEGIN ERROR HANDLING-- */
     if (err == -1) {

-----------------------------------------------------------------------

Summary of changes:
 src/mpi/romio/adio/Makefile.mk                     |    5 +-
 src/mpi/romio/adio/ad_bg/Makefile.mk               |   35 -
 src/mpi/romio/adio/ad_bg/ad_bg.c                   |   51 -
 src/mpi/romio/adio/ad_bg/ad_bg.h                   |   97 --
 src/mpi/romio/adio/ad_bg/ad_bg_aggrs.c             | 1025 -------------
 src/mpi/romio/adio/ad_bg/ad_bg_aggrs.h             |  104 --
 src/mpi/romio/adio/ad_bg/ad_bg_close.c             |   53 -
 src/mpi/romio/adio/ad_bg/ad_bg_fcntl.c             |   58 -
 src/mpi/romio/adio/ad_bg/ad_bg_flush.c             |   90 --
 src/mpi/romio/adio/ad_bg/ad_bg_getsh.c             |   84 --
 src/mpi/romio/adio/ad_bg/ad_bg_hints.c             |  318 ----
 src/mpi/romio/adio/ad_bg/ad_bg_open.c              |  316 ----
 src/mpi/romio/adio/ad_bg/ad_bg_pset.c              |  290 ----
 src/mpi/romio/adio/ad_bg/ad_bg_pset.h              |   75 -
 src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c            | 1148 ---------------
 src/mpi/romio/adio/ad_bg/ad_bg_read.c              |  557 -------
 src/mpi/romio/adio/ad_bg/ad_bg_setsh.c             |   68 -
 src/mpi/romio/adio/ad_bg/ad_bg_tuning.c            |  176 ---
 src/mpi/romio/adio/ad_bg/ad_bg_tuning.h            |  109 --
 src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c            | 1533 -------------------
 src/mpi/romio/adio/ad_bg/ad_bg_write.c             |  609 --------
 src/mpi/romio/adio/ad_bgl/Makefile.mk              |   34 -
 src/mpi/romio/adio/ad_bgl/ad_bgl.c                 |   60 -
 src/mpi/romio/adio/ad_bgl/ad_bgl.h                 |   97 --
 src/mpi/romio/adio/ad_bgl/ad_bgl_aggrs.c           |  966 ------------
 src/mpi/romio/adio/ad_bgl/ad_bgl_aggrs.h           |  108 --
 src/mpi/romio/adio/ad_bgl/ad_bgl_close.c           |   53 -
 src/mpi/romio/adio/ad_bgl/ad_bgl_fcntl.c           |   58 -
 src/mpi/romio/adio/ad_bgl/ad_bgl_flush.c           |   90 --
 src/mpi/romio/adio/ad_bgl/ad_bgl_getsh.c           |   84 --
 src/mpi/romio/adio/ad_bgl/ad_bgl_hints.c           |  302 ----
 src/mpi/romio/adio/ad_bgl/ad_bgl_open.c            |  304 ----
 src/mpi/romio/adio/ad_bgl/ad_bgl_pset.c            |  109 --
 src/mpi/romio/adio/ad_bgl/ad_bgl_pset.h            |   82 -
 src/mpi/romio/adio/ad_bgl/ad_bgl_rdcoll.c          | 1147 ---------------
 src/mpi/romio/adio/ad_bgl/ad_bgl_read.c            |  549 -------
 src/mpi/romio/adio/ad_bgl/ad_bgl_setsh.c           |   68 -
 src/mpi/romio/adio/ad_bgl/ad_bgl_tuning.c          |  163 --
 src/mpi/romio/adio/ad_bgl/ad_bgl_tuning.h          |   95 --
 src/mpi/romio/adio/ad_bgl/ad_bgl_wrcoll.c          | 1535 -------------------
 src/mpi/romio/adio/ad_bgl/ad_bgl_write.c           |  611 --------
 src/mpi/romio/adio/ad_bglockless/Makefile.mk       |   17 -
 src/mpi/romio/adio/ad_bglockless/ad_bglockless.c   |   44 -
 src/mpi/romio/adio/ad_bglockless/ad_bglockless.h   |   14 -
 .../adio/ad_bglockless/ad_bglockless_features.c    |   40 -
 src/mpi/romio/adio/{ad_bg => ad_gpfs}/.gitignore   |    0
 src/mpi/romio/adio/ad_gpfs/Makefile.mk             |   27 +
 src/mpi/romio/adio/ad_gpfs/ad_gpfs.c               |   54 +
 src/mpi/romio/adio/ad_gpfs/ad_gpfs.h               |   73 +
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.c         |  837 +++++++++++
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.h         |   85 ++
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_close.c         |   55 +
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_flush.c         |   71 +
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_open.c          |  153 ++
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c        | 1161 +++++++++++++++
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c        |  265 ++++
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h        |  112 ++
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c        | 1551 ++++++++++++++++++++
 src/mpi/romio/adio/ad_gpfs/bg/Makefile.mk          |   21 +
 src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c        |  531 +++++++
 src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.h        |   30 +
 src/mpi/romio/adio/ad_gpfs/bg/ad_bg_hints.c        |  318 ++++
 src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.c         |  290 ++++
 src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.h         |   76 +
 src/mpi/romio/adio/ad_testfs/ad_testfs_hints.c     |    5 -
 src/mpi/romio/adio/common/Makefile.mk              |    4 +-
 src/mpi/romio/adio/common/ad_close.c               |    2 +
 src/mpi/romio/adio/common/ad_fstype.c              |   86 +-
 src/mpi/romio/adio/common/ad_get_sh_fp.c           |   23 -
 src/mpi/romio/adio/common/ad_open.c                |    5 +
 src/mpi/romio/adio/common/ad_opencoll.c            |   19 +-
 src/mpi/romio/adio/common/ad_read.c                |   53 +-
 src/mpi/romio/adio/common/ad_read_coll.c           |   10 +-
 src/mpi/romio/adio/common/ad_set_sh_fp.c           |   18 -
 src/mpi/romio/adio/common/ad_threaded_io.c         |   32 +
 src/mpi/romio/adio/common/ad_write.c               |   55 +-
 src/mpi/romio/adio/common/ad_write_coll.c          |    3 +-
 src/mpi/romio/adio/common/cb_config_list.c         |    2 +-
 src/mpi/romio/adio/common/p2p_aggregation.c        |  791 ++++++++++
 src/mpi/romio/adio/include/adio.h                  |   11 +-
 src/mpi/romio/adio/include/adioi.h                 |   44 +
 src/mpi/romio/adio/include/adioi_fs_proto.h        |   16 +-
 src/mpi/romio/configure.ac                         |   63 +-
 src/mpi/romio/test/coll_perf.c                     |    6 +-
 84 files changed, 6732 insertions(+), 13657 deletions(-)
 delete mode 100644 src/mpi/romio/adio/ad_bg/Makefile.mk
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg.c
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg.h
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_aggrs.c
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_aggrs.h
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_close.c
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_fcntl.c
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_flush.c
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_getsh.c
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_hints.c
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_open.c
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_pset.c
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_pset.h
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_read.c
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_setsh.c
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_tuning.c
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_tuning.h
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_wrcoll.c
 delete mode 100644 src/mpi/romio/adio/ad_bg/ad_bg_write.c
 delete mode 100644 src/mpi/romio/adio/ad_bgl/Makefile.mk
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl.c
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl.h
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_aggrs.c
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_aggrs.h
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_close.c
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_fcntl.c
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_flush.c
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_getsh.c
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_hints.c
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_open.c
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_pset.c
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_pset.h
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_rdcoll.c
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_read.c
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_setsh.c
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_tuning.c
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_tuning.h
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_wrcoll.c
 delete mode 100644 src/mpi/romio/adio/ad_bgl/ad_bgl_write.c
 delete mode 100644 src/mpi/romio/adio/ad_bglockless/Makefile.mk
 delete mode 100644 src/mpi/romio/adio/ad_bglockless/ad_bglockless.c
 delete mode 100644 src/mpi/romio/adio/ad_bglockless/ad_bglockless.h
 delete mode 100644 src/mpi/romio/adio/ad_bglockless/ad_bglockless_features.c
 rename src/mpi/romio/adio/{ad_bg => ad_gpfs}/.gitignore (100%)
 create mode 100644 src/mpi/romio/adio/ad_gpfs/Makefile.mk
 create mode 100644 src/mpi/romio/adio/ad_gpfs/ad_gpfs.c
 create mode 100644 src/mpi/romio/adio/ad_gpfs/ad_gpfs.h
 create mode 100644 src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.c
 create mode 100644 src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.h
 create mode 100644 src/mpi/romio/adio/ad_gpfs/ad_gpfs_close.c
 create mode 100644 src/mpi/romio/adio/ad_gpfs/ad_gpfs_flush.c
 create mode 100644 src/mpi/romio/adio/ad_gpfs/ad_gpfs_open.c
 create mode 100644 src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
 create mode 100644 src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
 create mode 100644 src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
 create mode 100644 src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
 create mode 100644 src/mpi/romio/adio/ad_gpfs/bg/Makefile.mk
 create mode 100644 src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c
 create mode 100644 src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.h
 create mode 100644 src/mpi/romio/adio/ad_gpfs/bg/ad_bg_hints.c
 create mode 100644 src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.c
 create mode 100644 src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.h
 create mode 100644 src/mpi/romio/adio/common/ad_threaded_io.c
 create mode 100644 src/mpi/romio/adio/common/p2p_aggregation.c


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list