[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.1-276-g71f5e50

Service Account noreply at mpich.org
Fri May 23 14:57:44 CDT 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  71f5e5070c49f5507935016e03953d1eeee73aeb (commit)
       via  f16bfb543764378ad62a05f99c6fd5481996a1bb (commit)
       via  e053df00a4a22febb28438362f477b20cbef18e1 (commit)
       via  71a21c87809b7d7074eb59d00f29334099b41607 (commit)
       via  ae5b26577bfcc2cb3034fccc2daba3138a264682 (commit)
       via  8f6179fa8bdd050a0038ab1034a550a7b7e922a5 (commit)
       via  3c9992ef6a70f6f977e74fe994c9b7e56d3b00c3 (commit)
       via  8421f3a9c477188aae522caa23969c4ea2c68ea0 (commit)
       via  67bf775e9534ef87f1cb8dec536a1f2977379abc (commit)
       via  faad606c90bcd14204021bbd0ab0de01351da779 (commit)
      from  659053cbe6a35e78ad166d72ce927de7bc69f0d4 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/71f5e5070c49f5507935016e03953d1eeee73aeb

commit 71f5e5070c49f5507935016e03953d1eeee73aeb
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Tue May 20 13:59:54 2014 -0500

    Free duplicated communicator in error path
    
    If something goes wrong in open, the error path might in some cases
    (such as inconsistent amode) leave the duplicated communicator hanging
    around.  Thanks, Wei-keng
    
    While I was here, I improved the comment about why we cannot use one of
    the built-in operators for the amode check.
    
    Signed-off-by: Wei-keng Liao <wkliao at eecs.northwestern.edu>

diff --git a/src/mpi/romio/mpi-io/open.c b/src/mpi/romio/mpi-io/open.c
index e8cdea3..2d72dae 100644
--- a/src/mpi/romio/mpi-io/open.c
+++ b/src/mpi/romio/mpi-io/open.c
@@ -48,7 +48,7 @@ int MPI_File_open(MPI_Comm comm, ROMIO_CONST char *filename, int amode,
 {
     int error_code = MPI_SUCCESS, file_system, flag, tmp_amode=0, rank;
     char *tmp;
-    MPI_Comm dupcomm;
+    MPI_Comm dupcomm = MPI_COMM_NULL;
     ADIOI_Fns *fsops;
     static char myname[] = "MPI_FILE_OPEN";
 #ifdef MPI_hpux
@@ -106,7 +106,12 @@ int MPI_File_open(MPI_Comm comm, ROMIO_CONST char *filename, int amode,
     MPIR_MPIOInit(&error_code);
     if (error_code != MPI_SUCCESS) goto fn_fail;
 
-/* check if amode is the same on all processes */
+/* check if amode is the same on all processes: at first glance, one might try
+ * to use a built-in operator like MPI_BAND, but we need every mpi process to
+ * agree the amode was not the same.  Consider process A with
+ * MPI_MODE_CREATE|MPI_MODE_RDWR, and B with MPI_MODE_RDWR:  MPI_BAND yields
+ * MPI_MODE_RDWR.  A determines amodes are different, but B proceeds having not
+ * detected an error */
     MPI_Allreduce(&amode, &tmp_amode, 1, MPI_INT, ADIO_same_amode, dupcomm);
 
     if (tmp_amode == ADIO_AMODE_NOMATCH) {
@@ -149,7 +154,6 @@ int MPI_File_open(MPI_Comm comm, ROMIO_CONST char *filename, int amode,
 
     /* --BEGIN ERROR HANDLING-- */
     if (error_code != MPI_SUCCESS) {
-        MPI_Comm_free(&dupcomm);
 	goto fn_fail;
     }
     /* --END ERROR HANDLING-- */
@@ -194,6 +198,7 @@ fn_exit:
     return error_code;
 fn_fail:
     /* --BEGIN ERROR HANDLING-- */
+    if (dupcomm != MPI_COMM_NULL) MPI_Comm_free(&dupcomm);
     error_code = MPIO_Err_return_file(MPI_FILE_NULL, error_code);
     goto fn_exit;
     /* --END ERROR HANDLING-- */

http://git.mpich.org/mpich.git/commitdiff/f16bfb543764378ad62a05f99c6fd5481996a1bb

commit f16bfb543764378ad62a05f99c6fd5481996a1bb
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Mon May 12 12:43:37 2014 -0500

    Blue Gene: one-hop aggregator placement
    
    Another experimental approach to selecting and placing aggregators on
    Blue Gene /Q.  This one tries to place aggregators in a ring such that
    they are one hop away from each other.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
index 43236e0..c993021 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
@@ -39,6 +39,7 @@ int     gpfsmpio_pthreadio;
 int     gpfsmpio_p2pcontig;
 int	gpfsmpio_balancecontig;
 int     gpfsmpio_devnullio;
+int     gpfsmpio_bridgeringagg;
 
 double	gpfsmpio_prof_cw    [GPFSMPIO_CIO_LAST+1];
 double	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST+1];
@@ -120,6 +121,13 @@ double	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST+1];
  *   - 0 (disabled) or 1 (enabled)
  *   - Default is 0
  *
+ * - GPFSMPIO_BRIDGERINGAGG - Relevant only to BGQ.  Aggregator placement
+ *   optimization whch forms a 5-d ring around the bridge node starting at
+ *   GPFSMPIO_BRIDGERINGAGG hops away.  Experimental performance results
+ *   suggest best value is 1 and only in conjunction with GPFSMPIO_P2PCONTIG
+ *   and GPFSMPIO_BALANCECONTIG.  The number of aggregators selected is still
+ *   GPFSMPIO_NAGG_PSET however the bridge node itself is NOT selected.
+ *
  */
 
 void ad_gpfs_get_env_vars() {
@@ -164,6 +172,10 @@ void ad_gpfs_get_env_vars() {
     gpfsmpio_devnullio = 0;
     x = getenv( "GPFSMPIO_DEVNULLIO" );
     if (x) gpfsmpio_devnullio = atoi(x);
+
+    gpfsmpio_bridgeringagg = 0;
+    x = getenv( "GPFSMPIO_BRIDGERINGAGG" );
+    if (x) gpfsmpio_bridgeringagg = atoi(x);
 }
 
 /* report timing breakdown for MPI I/O collective call */
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
index c5b9c84..16ce7cc 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
@@ -68,6 +68,7 @@ extern int      gpfsmpio_pthreadio;
 extern int      gpfsmpio_p2pcontig;
 extern int  gpfsmpio_balancecontig;
 extern int      gpfsmpio_devnullio;
+extern int      gpfsmpio_bridgeringagg;
 
 /* Default is, well, kind of complicated. Blue Gene /L and /P had "psets": one
  * i/o node and all compute nodes wired to it.  On Blue Gene /Q that
diff --git a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c
index 72fcfdf..6cadb8f 100644
--- a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c
+++ b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c
@@ -16,6 +16,7 @@
 
 // Uncomment this line to turn tracing on for the gpfsmpio_balancecontig aggr selection optimization
 // #define balancecontigtrace 1
+// #define bridgeringaggtrace 1
 
 #include "adio.h"
 #include "adio_cb_config_list.h"
@@ -183,8 +184,132 @@ ADIOI_BG_compute_agg_ranklist_serial_do (const ADIOI_BG_ConfInfo_t *confInfo,
    /* BES: This should be done in the init routines probably. */
     int i, j;
     int aggTotal;
-    int distance, numAggs;
     int *aggList;
+
+    if (gpfsmpio_bridgeringagg > 0) {
+
+      int numAggs = confInfo->aggRatio * confInfo->ioMinSize /*virtualPsetSize*/;
+        /* the number of aggregators is (numAggs per bridgenode) */
+      if(numAggs == 1)
+        aggTotal = 1;
+      else
+        aggTotal = confInfo->numBridgeRanks * numAggs;
+
+      aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));
+      if(aggTotal == 1) { /* special case when we only have one bridge node */
+
+        sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
+        for(i=0; i < confInfo->nProcs; i++)
+        {
+          bridgelist[i].bridge = all_procInfo[i].bridgeRank;
+          bridgelist[i].rank = i;
+          TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
+        }
+
+        /* This list contains rank->bridge info. Now, we need to sort this list. */
+        qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);
+
+        aggList[0] = bridgelist[0].bridge;
+        ADIOI_Free(bridgelist);
+
+      }
+      else { // aggTotal > 1
+
+        ADIOI_BG_ProcInfo_t *allProcInfoAggNodeList = (ADIOI_BG_ProcInfo_t *) ADIOI_Malloc(confInfo->nProcs * sizeof(ADIOI_BG_ProcInfo_t));
+        int allProcInfoAggNodeListSize = 0;
+        int maxManhattanDistanceToBridge = 0;
+
+        // for ppn > 1, assign minumum rank as agg candidate
+        for (i=0;i<confInfo->nProcs;i++) {
+          int addProcToAggNodeList = 1;
+          for (j=0;j<allProcInfoAggNodeListSize;j++) {
+            if ((allProcInfoAggNodeList[j].torusCoords[0] == all_procInfo[i].torusCoords[0]) &&
+              (allProcInfoAggNodeList[j].torusCoords[1] == all_procInfo[i].torusCoords[1]) &&
+              (allProcInfoAggNodeList[j].torusCoords[2] == all_procInfo[i].torusCoords[2]) &&
+              (allProcInfoAggNodeList[j].torusCoords[3] == all_procInfo[i].torusCoords[3]) &&
+              (allProcInfoAggNodeList[j].torusCoords[4] == all_procInfo[i].torusCoords[4]) &&
+              addProcToAggNodeList) {
+              // proc is in the node list, replace if this rank is smaller
+              addProcToAggNodeList = 0;
+
+              if (allProcInfoAggNodeList[j].rank > all_procInfo[i].rank)
+                allProcInfoAggNodeList[j] = all_procInfo[i];
+            }
+          } // for j
+          if (addProcToAggNodeList) {
+            allProcInfoAggNodeList[allProcInfoAggNodeListSize] = all_procInfo[i];
+            if (allProcInfoAggNodeList[allProcInfoAggNodeListSize].manhattanDistanceToBridge > maxManhattanDistanceToBridge)
+              maxManhattanDistanceToBridge = allProcInfoAggNodeList[allProcInfoAggNodeListSize].manhattanDistanceToBridge;
+            allProcInfoAggNodeListSize++;
+          }
+        } // for i
+
+#ifdef bridgeringaggtrace
+      fprintf(stderr,"allProcInfoAggNodeListSize is %d aggTotal is %d\n",allProcInfoAggNodeListSize,aggTotal);
+#endif
+
+      int *aggNodeBridgeList = (int *) ADIOI_Malloc (allProcInfoAggNodeListSize * sizeof(int)); // list of all bridge ranks
+      int *aggNodeBridgeListNum = (int *) ADIOI_Malloc (allProcInfoAggNodeListSize * sizeof(int));
+      for (i=0;i<allProcInfoAggNodeListSize;i++) {
+        aggNodeBridgeList[i] = -1;
+        aggNodeBridgeListNum[i] = 0;
+      }
+
+      int aggNodeBridgeListSize = 0;
+      for (i=0;i<allProcInfoAggNodeListSize;i++) {
+        int foundBridge = 0;
+        for (j=0;(j<aggNodeBridgeListSize && !foundBridge);j++) {
+          if (aggNodeBridgeList[j] == allProcInfoAggNodeList[i].bridgeRank) {
+            foundBridge = 1;
+            aggNodeBridgeListNum[i]++;
+          }
+        }
+        if (!foundBridge) {
+          aggNodeBridgeList[aggNodeBridgeListSize] = allProcInfoAggNodeList[i].bridgeRank;
+          aggNodeBridgeListNum[aggNodeBridgeListSize] = 1;
+          aggNodeBridgeListSize++;
+        }
+      }
+
+      // add aggs based on numAggs per bridge, starting at gpfsmpio_bridgeringagg hops and increasing until numAggs aggs found
+      int currentAggListSize = 0;
+      for (i=0;i<aggNodeBridgeListSize;i++) {
+        int currentBridge = aggNodeBridgeList[i];
+        int currentNumHops = gpfsmpio_bridgeringagg;
+        int numAggsAssignedToThisBridge = 0;
+        while ((numAggsAssignedToThisBridge < numAggs) && (currentNumHops <= maxManhattanDistanceToBridge)) {
+          for (j=0;j<allProcInfoAggNodeListSize;j++) {
+            if (allProcInfoAggNodeList[j].bridgeRank == currentBridge) {
+              if (allProcInfoAggNodeList[j].manhattanDistanceToBridge == currentNumHops) {
+                aggList[currentAggListSize] = allProcInfoAggNodeList[j].rank;
+#ifdef bridgeringaggtrace
+                printf("Assigned agg rank %d at torus coords %u %u %u %u %u to bridge %d at torus coords %u %u %u %u %u at a distance of %d hops\n",allProcInfoAggNodeList[j].rank,allProcInfoAggNodeList[j].torusCoords[0],allProcInfoAggNodeList[j].torusCoords[1],allProcInfoAggNodeList[j].torusCoords[2],allProcInfoAggNodeList[j].torusCoords[3],allProcInfoAggNodeList[j].torusCoords[4], currentBridge, all_procInfo[currentBridge].torusCoords[0], all_procInfo[currentBridge].torusCoords[1], all_procInfo[currentBridge].torusCoords[2], all_procInfo[currentBridge].torusCoords[3], all_procInfo[currentBridge].torusCoords[4],currentNumHops);
+#endif
+                currentAggListSize++;
+                numAggsAssignedToThisBridge++;
+                if (numAggsAssignedToThisBridge >= numAggs)
+                  break;
+              }
+            }
+          }
+          currentNumHops++;
+        } // while
+        ADIOI_Assert(numAggsAssignedToThisBridge == numAggs);
+      } // for
+
+      ADIOI_Free(allProcInfoAggNodeList);
+      ADIOI_Free(aggNodeBridgeList);
+      ADIOI_Free(aggNodeBridgeListNum);
+
+      } // else aggTotal  > 1
+
+       memcpy(tmp_ranklist, aggList, aggTotal*sizeof(int));
+    } // gpfsmpio_bridgeringagg > 0
+
+    else { // gpfsmpio_bridgeringagg unset - default code
+
+    int distance, numAggs;
+
     /* Aggregators will be midpoints between sorted MPI rank lists of who shares a given
      * bridge node */
 
@@ -282,9 +407,11 @@ ADIOI_BG_compute_agg_ranklist_serial_do (const ADIOI_BG_ConfInfo_t *confInfo,
 
 
    ADIOI_Free (bridgelist);
-   ADIOI_Free (aggList);
 
    TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial_do\n");
+   }
+
+   ADIOI_Free (aggList);
    return aggTotal;
 
 }
@@ -371,8 +498,8 @@ ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
 	for (i=0;i<naggs;i++)
 	    bridgelistnum[i] = 0;
 
-	/* Each entry in this list corresponds with the bridgelist and will
-	 * contain the lowest bridge agg rank on that ion. */
+	/* Each entry in this list corresponds with the bridgelist and will contain the lowest bridge
+	 * agg rank on that ion. */
 	int *summarybridgeminionaggrank = (int *) ADIOI_Malloc (naggs * sizeof(int));
 	for (i=0;i<naggs;i++)
 	    summarybridgeminionaggrank[i] = -1;
@@ -408,9 +535,8 @@ ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
 	    bridgelistnum[summaryranklistbridgeindex]++;
 	}
 
-    /* at this point summarybridgeminionaggrank has the agg rank of the bridge
-     * for entries; now need to make each entry the minimum bridge rank for the
-     * entire ion. */
+    /* at this point summarybridgeminionaggrank has the agg rank of the bridge for entries,
+     * need to make each entry the minimum bridge rank for the entire ion. */
     for (i=0;i<numbridges;i++) {
         int aggIonId = ionlist[i];
         int j;
diff --git a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.c b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.c
index 7cd0ba2..e3a66a7 100644
--- a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.c
+++ b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.c
@@ -13,6 +13,8 @@
  */
 
 /* #define TRACE_ON */
+// #define bridgeringaggtrace 1
+
 #include <stdlib.h>
 #include "../ad_gpfs.h"
 #include "ad_bg_pset.h"
@@ -79,6 +81,35 @@ static int intsort(const void *p1, const void *p2)
    return(i1->bridgeCoord - i2->bridgeCoord);
 }
 
+unsigned torusSize[MPIX_TORUS_MAX_DIMS];
+unsigned dimTorus[MPIX_TORUS_MAX_DIMS];
+
+/* This function computes the number of hops between the torus coordinates of the
+ * aggCoords and bridgeCoords parameters.
+*/
+static unsigned procManhattanDistance(unsigned *aggCoords, unsigned *bridgeCoords) {
+
+  unsigned totalDistance = 0;
+  int i;
+  for (i=0;i<MPIX_TORUS_MAX_DIMS;i++) {
+    unsigned dimDistance = abs(aggCoords[i] - bridgeCoords[i]);
+    if (dimDistance > 0) { // could torus make it closer?
+      if (dimTorus[i]) {
+        if (aggCoords[i] == torusSize[i]) { // is wrap-around closer
+          if ((bridgeCoords[i]+1) < dimDistance) // assume will use torus link
+            dimDistance = bridgeCoords[i]+1;
+        }
+        else if (bridgeCoords[i] == torusSize[i]) { // is wrap-around closer
+          if ((aggCoords[i]+1) < dimDistance) // assume will use torus link
+            dimDistance = aggCoords[i]+1;
+        }
+      }
+    }
+    totalDistance += dimDistance;
+  }
+  return totalDistance;
+}
+
 
 void 
 ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf, 
@@ -102,14 +133,36 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
 
    proc->rank = rank;
    proc->coreID = hw.coreID;
-   proc->ionID = MPIX_IO_node_id ();
+
+   if (gpfsmpio_bridgeringagg > 0) {
+     for (i=0;i<MPIX_TORUS_MAX_DIMS;i++) {
+       proc->torusCoords[i] = hw.Coords[i];
+     }
+#ifdef bridgeringaggtrace
+     if (rank == 0)
+       fprintf(stderr,"Block dimensions:\n");
+#endif
+     for (i=0;i<MPIX_TORUS_MAX_DIMS;i++) {
+       torusSize[i] = hw.Size[i];
+       dimTorus[i] = hw.isTorus[i];
+#ifdef bridgeringaggtrace
+       if (rank == 0)
+         fprintf(stderr,"Dimension %d has %d elements wrap-around value is %d\n",i,torusSize[i],dimTorus[i]);
+#endif
+     }
+   }
 
    MPI_Comm_size(comm, &commsize);
 
+   proc->ionID = MPIX_IO_node_id ();
+
    if(size == 1)
    {
       proc->iamBridge = 1;
       proc->bridgeRank = rank;
+      if (gpfsmpio_bridgeringagg > 0) {
+        proc->manhattanDistanceToBridge = 0;
+      }
 
       /* Set up the other parameters */
       proc->myIOSize = size;
@@ -143,8 +196,32 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
       (hw.Coords[1] == pers.Network_Config.cnBridge_B) && 
       (hw.Coords[2] == pers.Network_Config.cnBridge_C) && 
       (hw.Coords[3] == pers.Network_Config.cnBridge_D) && 
-      (hw.Coords[4] == pers.Network_Config.cnBridge_E))
+      (hw.Coords[4] == pers.Network_Config.cnBridge_E)) {
       iambridge = 1;      /* I am bridge */
+      if (gpfsmpio_bridgeringagg > 0) {
+        proc->manhattanDistanceToBridge = 0;
+      }
+    }
+    else {  // calculate manhattan distance to bridge if gpfsmpio_bridgeringagg is set
+      if (gpfsmpio_bridgeringagg > 0) {
+        unsigned aggCoords[MPIX_TORUS_MAX_DIMS],manhattanBridgeCoords[MPIX_TORUS_MAX_DIMS];
+        aggCoords[0] = hw.Coords[0];
+        manhattanBridgeCoords[0] = pers.Network_Config.cnBridge_A;
+        aggCoords[1] = hw.Coords[1];
+        manhattanBridgeCoords[1] = pers.Network_Config.cnBridge_B;
+        aggCoords[2] = hw.Coords[2];
+        manhattanBridgeCoords[2] = pers.Network_Config.cnBridge_C;
+        aggCoords[3] = hw.Coords[3];
+        manhattanBridgeCoords[3] = pers.Network_Config.cnBridge_D;
+        aggCoords[4] = hw.Coords[4];
+        manhattanBridgeCoords[4] = pers.Network_Config.cnBridge_E;
+
+        proc->manhattanDistanceToBridge= procManhattanDistance(aggCoords, manhattanBridgeCoords);
+#ifdef bridgeringaggtrace
+        fprintf(stderr,"agg coords are %u %u %u %u %u bridge coords are %u %u %u %u %u distance is %u\n",aggCoords[0],aggCoords[1],aggCoords[2],aggCoords[3],aggCoords[4],manhattanBridgeCoords[0],manhattanBridgeCoords[1],manhattanBridgeCoords[2],manhattanBridgeCoords[3],manhattanBridgeCoords[4], proc->manhattanDistanceToBridge);
+#endif
+      }
+    }
 
    TRACE_ERR("Bridge coords(%8.8X): %d %d %d %d %d, %d. iambridge %d\n",bridgeCoords, pers.Network_Config.cnBridge_A,pers.Network_Config.cnBridge_B,pers.Network_Config.cnBridge_C,pers.Network_Config.cnBridge_D,pers.Network_Config.cnBridge_E,0, iambridge);
 
diff --git a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.h b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.h
index 51ae4a0..d5f36b1 100644
--- a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.h
+++ b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.h
@@ -17,6 +17,10 @@
 #ifndef AD_BG_PSET_H_
 #define AD_BG_PSET_H_
 
+#ifdef HAVE_MPIX_H
+#include <mpix.h>
+#endif
+
 /* Keeps specific information to each process, will be exchanged among processes */
 typedef struct {
    int ioNodeIndex; /* similar to psetNum on BGL/BGP */
@@ -31,6 +35,8 @@ typedef struct {
       node, i.e. psetsize*/
    int iamBridge; /* am *I* the bridge rank? */
    int __ipad[2];
+   unsigned torusCoords[MPIX_TORUS_MAX_DIMS]; /* torus coordinates of node on which this rank resides */
+   unsigned manhattanDistanceToBridge; /* number of hops between this rank and the bridge node */
 } ADIOI_BG_ProcInfo_t __attribute__((aligned(16)));
 
 /* Keeps general information for the whole communicator, only on process 0 */

http://git.mpich.org/mpich.git/commitdiff/e053df00a4a22febb28438362f477b20cbef18e1

commit e053df00a4a22febb28438362f477b20cbef18e1
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Wed May 7 16:17:34 2014 -0500

    fix to support gpfs hints and deferred open
    
    If deferred open is enabled, the processes with no i/o will not have the
    file open.  if they try to call gpfs_fcntl() on their invalid file
    handle, they will get an error.  only processes doing any i/o need to
    set the hints anyway.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
index 3d11db9..2d890e5 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
@@ -551,17 +551,25 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
     if ((st_loc==-1) && (end_loc==-1)) {
 	ntimes = 0; /* this process does no writing. */
     }
-    if (ntimes != 0 && getenv("ROMIO_GPFS_DECLARE_ACCESS")!=NULL) {
+    if (ntimes > 0) { /* only set the gpfs hint if we have io - ie this rank is
+			 an aggregator -- otherwise will fail for deferred open */
+      if (getenv("ROMIO_GPFS_DECLARE_ACCESS")!=NULL) {
 	gpfs_wr_access_start(fd->fd_sys, st_loc, end_loc - st_loc);
+      }
     }
+
     ADIO_Offset st_loc_ion, end_loc_ion, needs_gpfs_access_cleanup=0;
 #ifdef BGQPLATFORM
-    if (getenv("ROMIO_GPFS_DECLARE_ION_ACCESS")!=NULL) {
+    if (ntimes > 0) { /* only set the gpfs hint if we have io - ie this rank is
+			 an aggregator -- otherwise will fail for deferred open */
+
+      if (getenv("ROMIO_GPFS_DECLARE_ION_ACCESS")!=NULL) {
 	if (gpfs_find_access_for_ion(fd, st_loc, end_loc, fd_start, fd_end,
 		    &st_loc_ion, &end_loc_ion)) {
 	    gpfs_wr_access_start(fd->fd_sys, st_loc_ion, end_loc_ion-st_loc_ion);
 	    needs_gpfs_access_cleanup=1;
 	}
+      }
     }
 #endif
 
@@ -851,10 +859,6 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
 	needs_gpfs_access_cleanup=0;
     }
 
-    if (needs_gpfs_access_cleanup) {
-	gpfs_wr_access_end(fd->fd_sys, end_loc_ion-st_loc_ion, st_loc_ion);
-	needs_gpfs_access_cleanup=0;
-    }
     unsetenv("LIBIOLOG_EXTRA_INFO");
 }
 

http://git.mpich.org/mpich.git/commitdiff/71a21c87809b7d7074eb59d00f29334099b41607

commit 71a21c87809b7d7074eb59d00f29334099b41607
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Tue May 20 11:06:51 2014 -0500

    gpfs_find_access_for_ion is BGQ-specific
    
    For the benefit of PE and other non-Blue Gene GPFS platforms, provide the
    "build an access based on IO nodes" function only on Blue Gene.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
index 37d3fd8..3d11db9 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
@@ -422,6 +422,7 @@ void gpfs_wr_access_end(int fd, ADIO_Offset offset, ADIO_Offset length)
         ADIOI_Assert(rc == 0);
 }
 
+#ifdef BGQPLATFORM
 /* my_start, my_end: this processes file domain.  coudd be -1,-1 for "no i/o"
  * fd_start, fd_end: arrays of length fd->hints->cb_nodes specifying all file domains */
 int gpfs_find_access_for_ion(ADIO_File fd,
@@ -466,7 +467,7 @@ int gpfs_find_access_for_ion(ADIO_File fd,
     ADIOI_Free(rank_to_ionode);
     return 1;
 }
-
+#endif // BGQPLATFORM
 
 
 /* If successful, error_code is set to MPI_SUCCESS.  Otherwise an error
@@ -554,6 +555,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
 	gpfs_wr_access_start(fd->fd_sys, st_loc, end_loc - st_loc);
     }
     ADIO_Offset st_loc_ion, end_loc_ion, needs_gpfs_access_cleanup=0;
+#ifdef BGQPLATFORM
     if (getenv("ROMIO_GPFS_DECLARE_ION_ACCESS")!=NULL) {
 	if (gpfs_find_access_for_ion(fd, st_loc, end_loc, fd_start, fd_end,
 		    &st_loc_ion, &end_loc_ion)) {
@@ -561,6 +563,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
 	    needs_gpfs_access_cleanup=1;
 	}
     }
+#endif
 
     MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX,
 		  fd->comm); 

http://git.mpich.org/mpich.git/commitdiff/ae5b26577bfcc2cb3034fccc2daba3138a264682

commit ae5b26577bfcc2cb3034fccc2daba3138a264682
Author: Paul Coffman <pkcoff at us.ibm.com>
Date:   Mon Apr 14 16:48:39 2014 -0500

    Assert on non-zero gpfs_fcntl
    
    an assertion is pretty heavy-handed here but if we have gone through the
    hoops to request these routines, we should know if they fail.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
index c052081..37d3fd8 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
@@ -397,6 +397,7 @@ void gpfs_wr_access_start(int fd, ADIO_Offset offset, ADIO_Offset length)
         take_locks.access.isWrite = 1;
 
         rc = gpfs_fcntl(fd, &take_locks);
+        ADIOI_Assert(rc == 0);
 }
 
 void gpfs_wr_access_end(int fd, ADIO_Offset offset, ADIO_Offset length)
@@ -418,6 +419,7 @@ void gpfs_wr_access_end(int fd, ADIO_Offset offset, ADIO_Offset length)
         free_locks.free.length = length;
 
         rc = gpfs_fcntl(fd, &free_locks);
+        ADIOI_Assert(rc == 0);
 }
 
 /* my_start, my_end: this processes file domain.  coudd be -1,-1 for "no i/o"

http://git.mpich.org/mpich.git/commitdiff/8f6179fa8bdd050a0038ab1034a550a7b7e922a5

commit 8f6179fa8bdd050a0038ab1034a550a7b7e922a5
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Thu Apr 10 16:01:52 2014 +0000

    make event elements array a bit bigger
    
    typical convention is to use last element of enum "LAST_ITEM" as a
    marker for how big an array of said enum would be.  the timing code
    though uses this last element.  xl compiler caught us overrunning the
    array.

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
index 902c3ac..43236e0 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c
@@ -40,8 +40,8 @@ int     gpfsmpio_p2pcontig;
 int	gpfsmpio_balancecontig;
 int     gpfsmpio_devnullio;
 
-double	gpfsmpio_prof_cw    [GPFSMPIO_CIO_LAST];
-double	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST];
+double	gpfsmpio_prof_cw    [GPFSMPIO_CIO_LAST+1];
+double	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST+1];
 
 /* set internal variables for tuning environment variables */
 /** \page mpiio_vars MPIIO Configuration
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
index 735eba6..c5b9c84 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h
@@ -52,8 +52,9 @@ enum {
     GPFSMPIO_CIO_LAST
 };
 
-extern double 	gpfsmpio_prof_cw    [GPFSMPIO_CIO_LAST];
-extern double 	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST];
+/* +1 because GPFSMPIO_CIO_LAST is actually used to say "zero this counter"" */
+extern double 	gpfsmpio_prof_cw    [GPFSMPIO_CIO_LAST+1];
+extern double 	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST+1];
 
 
 /* corresponds to environment variables to select optimizations and timing level */

http://git.mpich.org/mpich.git/commitdiff/3c9992ef6a70f6f977e74fe994c9b7e56d3b00c3

commit 3c9992ef6a70f6f977e74fe994c9b7e56d3b00c3
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Thu Apr 10 16:00:35 2014 +0000

    Debugging code caused segfault
    
    this debugging code, when --enable-g=all is set, will execute but not
    print anything.  for some problem sizes/scales, the way it accesses the
    128k'th element causes a segfault.

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
index 494ea12..938a133 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c
@@ -1115,7 +1115,8 @@ static void ADIOI_R_Exchange_data_alltoallv(
     DBG_FPRINTF(stderr, "\ttails = %4d, %4d\n", stail, rtail );
     if (nprocs_send) {
     DBG_FPRINTF(stderr, "\tall_send_buf =  [%d]%2d,",0,all_send_buf[0]);
-    for (i=1; i<nprocs; i++) if(all_send_buf[(i-1)*131072]!=all_send_buf[i*131072]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i, all_send_buf  [i*131072] ); }
+    /* someone at some point found it useful to look at the 128th kilobyte of data from each processor, but this segfaults in many situations if "all debugging" enabled */
+    //for (i=1; i<nprocs; i++) if(all_send_buf[(i-1)*131072]!=all_send_buf[i*131072]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i, all_send_buf  [i*131072] ); }
     }
 #endif
     

http://git.mpich.org/mpich.git/commitdiff/8421f3a9c477188aae522caa23969c4ea2c68ea0

commit 8421f3a9c477188aae522caa23969c4ea2c68ea0
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Tue Feb 25 09:18:53 2014 -0600

    an IO-node aware mode for gpfs hints
    
    Send a single access_range hint to the IO node in hopes this approach
    will make the gpfs_fcntl hints do something.

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
index 4bab0c1..c052081 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
@@ -17,6 +17,8 @@
 #include "ad_gpfs.h"
 #include "ad_gpfs_aggrs.h"
 
+#include <mpix.h>
+
 #ifdef AGGREGATION_PROFILE
 #include "mpe.h"
 #endif
@@ -33,6 +35,7 @@
 #include <gpfs_fcntl.h>
 #endif
 
+#include <limits.h>
 /* prototypes of functions used for collective writes only. */
 static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
                          datatype, int nprocs, int myrank, ADIOI_Access
@@ -417,6 +420,50 @@ void gpfs_wr_access_end(int fd, ADIO_Offset offset, ADIO_Offset length)
         rc = gpfs_fcntl(fd, &free_locks);
 }
 
+/* my_start, my_end: this processes file domain.  coudd be -1,-1 for "no i/o"
+ * fd_start, fd_end: arrays of length fd->hints->cb_nodes specifying all file domains */
+int gpfs_find_access_for_ion(ADIO_File fd,
+	ADIO_Offset my_start, ADIO_Offset my_end,
+	ADIO_Offset *fd_start, ADIO_Offset *fd_end,
+	ADIO_Offset *start, ADIO_Offset *end)
+{
+    int my_ionode = MPIX_IO_node_id();
+    int *rank_to_ionode;
+    int i, nprocs, rank;
+    ADIO_Offset group_start=LLONG_MAX, group_end=0;
+
+    MPI_Comm_size(fd->comm, &nprocs);
+    MPI_Comm_rank(fd->comm, &rank);
+
+    rank_to_ionode = ADIOI_Calloc(nprocs, sizeof(int));
+    MPI_Allgather(&my_ionode, 1, MPI_INT,  rank_to_ionode, 1, MPI_INT, fd->comm);
+
+    /* rank_to_ionode now contains a mapping from MPI rank to IO node */
+    /* fd->hints->ranklist[] contains a list of MPI ranks that are aggregators */
+    /* fd_start[] and fd_end[] contain a list of file domains. */
+
+    /* what we really want to do is take all the file domains associated
+     * with a given i/o node and find the begin/end of that range.
+     *
+     * Because gpfs_fcntl hints are expected to be released, we'll pass this
+     * start/end back to the caller, who will both declare and free this range
+     */
+    if (my_start == -1 || my_end == -1) {
+	ADIOI_Free(rank_to_ionode);
+	return 0; /* no work to do */
+    }
+
+    for (i=0; i<fd->hints->cb_nodes; i++ ){
+	if (my_ionode == rank_to_ionode[fd->hints->ranklist[i]] ) {
+	    group_start = ADIOI_MIN(fd_start[i], group_start);
+	    group_end = ADIOI_MAX(fd_end[i], group_end);
+	}
+    }
+    *start = group_start;
+    *end = group_end;
+    ADIOI_Free(rank_to_ionode);
+    return 1;
+}
 
 
 
@@ -798,6 +845,11 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
 	gpfs_wr_access_end(fd->fd_sys, st_loc_ion, end_loc_ion-st_loc_ion);
 	needs_gpfs_access_cleanup=0;
     }
+
+    if (needs_gpfs_access_cleanup) {
+	gpfs_wr_access_end(fd->fd_sys, st_loc_ion, end_loc_ion-st_loc_ion); /* args: fd, offset, length */
+	needs_gpfs_access_cleanup=0;
+    }
     unsetenv("LIBIOLOG_EXTRA_INFO");
 }
 

http://git.mpich.org/mpich.git/commitdiff/67bf775e9534ef87f1cb8dec536a1f2977379abc

commit 67bf775e9534ef87f1cb8dec536a1f2977379abc
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Thu Feb 20 13:47:57 2014 -0600

    romio+gpfs: experiment with per-access hints
    
    these don't seem to help on vesta
    
    next up: sending individual range requests to the io node (once I figure
    out how to do that)

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
index 2ff9842..4bab0c1 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c
@@ -26,6 +26,13 @@
 
 #include <pthread.h>
 
+#ifdef HAVE_GPFS_H
+#include <gpfs.h>
+#endif
+#ifdef HAVE_GPFS_FCNTL_H
+#include <gpfs_fcntl.h>
+#endif
+
 /* prototypes of functions used for collective writes only. */
 static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
                          datatype, int nprocs, int myrank, ADIOI_Access
@@ -368,6 +375,49 @@ void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
 #endif
 }
 
+void gpfs_wr_access_start(int fd, ADIO_Offset offset, ADIO_Offset length)
+{       /* issues a GPFS_ACCESS_RANGE directive (start=offset, length=length, isWrite=1) via gpfs_fcntl */
+        int rc;
+        struct {
+                gpfsFcntlHeader_t header;
+                gpfsAccessRange_t access;
+        } take_locks;
+
+        take_locks.header.totalLength = sizeof(take_locks);
+        take_locks.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
+        take_locks.header.fcntlReserved = 0;
+
+        take_locks.access.structLen = sizeof(take_locks.access);
+        take_locks.access.structType = GPFS_ACCESS_RANGE;
+        take_locks.access.start = offset;
+        take_locks.access.length = length;
+        take_locks.access.isWrite = 1; /* mark the declared range as a write access */
+
+        rc = gpfs_fcntl(fd, &take_locks); /* NOTE(review): rc is stored but never inspected */
+}
+
+void gpfs_wr_access_end(int fd, ADIO_Offset offset, ADIO_Offset length)
+{       /* issues a GPFS_FREE_RANGE directive (start=offset, length=length) via gpfs_fcntl */
+        int rc;
+        struct {
+                gpfsFcntlHeader_t header;
+                gpfsFreeRange_t free;
+        } free_locks;
+
+
+        free_locks.header.totalLength = sizeof(free_locks);
+        free_locks.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
+        free_locks.header.fcntlReserved = 0;
+
+        free_locks.free.structLen = sizeof(free_locks.free);
+        free_locks.free.structType = GPFS_FREE_RANGE;
+        free_locks.free.start = offset; /* second parameter is the offset, third is the length */
+        free_locks.free.length = length;
+
+        rc = gpfs_fcntl(fd, &free_locks); /* NOTE(review): rc is stored but never inspected */
+}
+
+
 
 
 /* If successful, error_code is set to MPI_SUCCESS.  Otherwise an error
@@ -451,6 +501,17 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
     if ((st_loc==-1) && (end_loc==-1)) {
 	ntimes = 0; /* this process does no writing. */
     }
+    if (ntimes != 0 && getenv("ROMIO_GPFS_DECLARE_ACCESS")!=NULL) {
+	gpfs_wr_access_start(fd->fd_sys, st_loc, end_loc - st_loc);
+    }
+    ADIO_Offset st_loc_ion, end_loc_ion, needs_gpfs_access_cleanup=0;
+    if (getenv("ROMIO_GPFS_DECLARE_ION_ACCESS")!=NULL) {
+	if (gpfs_find_access_for_ion(fd, st_loc, end_loc, fd_start, fd_end,
+		    &st_loc_ion, &end_loc_ion)) {
+	    gpfs_wr_access_start(fd->fd_sys, st_loc_ion, end_loc_ion-st_loc_ion);
+	    needs_gpfs_access_cleanup=1;
+	}
+    }
 
     MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX,
 		  fd->comm); 
@@ -729,6 +790,14 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
     ADIOI_Free(curr_to_proc);
     ADIOI_Free(done_to_proc);
 
+    if (ntimes != 0 && getenv("ROMIO_GPFS_DECLARE_ACCESS")!=NULL) {
+	gpfs_wr_access_end(fd->fd_sys, st_loc, end_loc-st_loc);
+    }
+
+    if (needs_gpfs_access_cleanup) {
+	gpfs_wr_access_end(fd->fd_sys, st_loc_ion, end_loc_ion-st_loc_ion);
+	needs_gpfs_access_cleanup=0;
+    }
     unsetenv("LIBIOLOG_EXTRA_INFO");
 }
 

http://git.mpich.org/mpich.git/commitdiff/faad606c90bcd14204021bbd0ab0de01351da779

commit faad606c90bcd14204021bbd0ab0de01351da779
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Mon Jan 27 19:48:54 2014 +0000

    romio+gpfs: experiment with gpfs_fcntl hints
    
    When many processes write to a gpfs file, even with nicely aligned
    blocks, we see immense variation in access times.  Could that variation
    be mitigated if we drop locks on first access?
    
    next experiment (not yet implemented): what if we pre-declare
    access/ranges when file domains computed?

diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.h b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.h
index 4d0b0da..1b6215e 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.h
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.h
@@ -18,6 +18,10 @@
 
 #include "adio.h"
 #include <sys/stat.h>
+
+#ifdef HAVE_GPFS_H
+#include <gpfs.h>
+#endif
 
 
     /* overriding ADIOI_Calc_file_domains() to apply 'aligned file domain partitioning'. */
diff --git a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_open.c b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_open.c
index 99b2c1b..4e236b6 100644
--- a/src/mpi/romio/adio/ad_gpfs/ad_gpfs_open.c
+++ b/src/mpi/romio/adio/ad_gpfs/ad_gpfs_open.c
@@ -19,6 +19,39 @@
 #include <sys/types.h>
 #include <unistd.h>
 
+#ifdef HAVE_GPFS_H
+#include <gpfs.h>
+#endif
+#ifdef HAVE_GPFS_FCNTL_H
+#include <gpfs_fcntl.h>
+#endif
+
+#ifdef HAVE_GPFS_FCNTL_H
+void gpfs_free_all_locks(int fd)
+{   /* issues a GPFS_FREE_RANGE directive with start=0 and length=0 on fd via gpfs_fcntl */
+    int rc;
+    struct {
+	gpfsFcntlHeader_t header;
+	gpfsFreeRange_t release;
+    } release_all;
+
+    release_all.header.totalLength = sizeof(release_all);
+    release_all.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
+    release_all.header.fcntlReserved = 0;
+
+    release_all.release.structLen = sizeof(release_all.release);
+    release_all.release.structType = GPFS_FREE_RANGE;
+    release_all.release.start = 0;  /* presumably start=0 with length=0 means "whole file" -- TODO confirm */
+    release_all.release.length = 0;
+
+    rc = gpfs_fcntl(fd, &release_all);
+    if (rc != 0) {  /* failure is only reported through the debug print; nothing is returned to the caller */
+	DBGV_FPRINTF(stderr,"GPFS fcntl release failed with rc=%d, errno=%d\n",
+		rc,errno);
+    }
+}
+#endif
+
 
 void ADIOI_GPFS_Open(ADIO_File fd, int *error_code)
 {
@@ -101,6 +134,14 @@ void ADIOI_GPFS_Open(ADIO_File fd, int *error_code)
 #ifdef ADIOI_MPE_LOGGING
         MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
 #endif
+
+#ifdef HAVE_GPFS_FCNTL_H
+	/* in parallel workload, might be helpful to immediately release block
+	 * tokens.  Or, system call overhead will outweigh any benefits... */
+	if (getenv("ROMIO_GPFS_FREE_LOCKS")!=NULL)
+	    gpfs_free_all_locks(fd->fd_sys);
+
+#endif
     }
 
   if (fd->fd_sys == -1)  {
diff --git a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.h b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.h
index d6baaed..b154722 100644
--- a/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.h
+++ b/src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.h
@@ -20,6 +20,12 @@
 #include "adio.h"
 #include <sys/stat.h>
 
+#ifdef HAVE_GPFS_H
+#include <gpfs.h>
+#endif
+#if !defined(GPFS_SUPER_MAGIC)
+  #define GPFS_SUPER_MAGIC (0x47504653)
+#endif
 
     /* generate a list of I/O aggregators that utilizes BG-PSET orginization. */
     int ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset);
diff --git a/src/mpi/romio/configure.ac b/src/mpi/romio/configure.ac
index 484556d..df13053 100644
--- a/src/mpi/romio/configure.ac
+++ b/src/mpi/romio/configure.ac
@@ -973,6 +973,11 @@ fi
 AS_IF([test -n "$file_system_gpfs"],
     [SYSDEP_INC=-I${prefix}/include], [SYSDEP_INC=])
 
+AC_CHECK_HEADERS([gpfs.h gpfs_fcntl.h])
+AS_IF([test "$ac_cv_header_gpfs_h" = "yes" -o "$ac_cv_header_gpfs_fcntl_h" = "yes"], [
+  AC_SEARCH_LIBS([gpfs_fcntl], [gpfs], [],
+    [AC_MSG_ERROR([Library containing gpfs_fcntl symbols not found])])
+])
 # Check for presence and characteristics of async. I/O calls if
 # not disabled.
 

-----------------------------------------------------------------------

Summary of changes:
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_aggrs.h  |    4 +
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_open.c   |   41 ++++++++
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_rdcoll.c |    3 +-
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.c |   16 +++-
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_tuning.h |    6 +-
 src/mpi/romio/adio/ad_gpfs/ad_gpfs_wrcoll.c |  130 +++++++++++++++++++++++++
 src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c |  140 +++++++++++++++++++++++++--
 src/mpi/romio/adio/ad_gpfs/bg/ad_bg_aggrs.h |    6 +
 src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.c  |   81 +++++++++++++++-
 src/mpi/romio/adio/ad_gpfs/bg/ad_bg_pset.h  |    6 +
 src/mpi/romio/configure.ac                  |    5 +
 src/mpi/romio/mpi-io/open.c                 |   11 ++-
 12 files changed, 432 insertions(+), 17 deletions(-)


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list