[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.1-218-gbfc3495

Service Account noreply at mpich.org
Sun May 4 21:47:35 CDT 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  bfc34956d4c0b1711013ebbafddfa23bfc0a8988 (commit)
       via  2489d4aa8511f9bcd5332e1664903b142acbe565 (commit)
       via  4e48d6de832879128c3e0408f04365c6e57c6409 (commit)
       via  160cddc3d1929a9d2bb063fe58cc3a60c950ad9f (commit)
       via  1c5c594554343a0b6b4335cb28790aa51ab8a968 (commit)
       via  514214bc5cfdc7382d6d2667e86424fa5aa312e1 (commit)
       via  754b3d3803ee8c4142fe45601ea258f7669f3bbb (commit)
       via  7c13f5537bf75a9aaa32a6b2a155d6e8c0f251d0 (commit)
       via  7a80eb875e239817b163aadfb5b07ca769c27c05 (commit)
       via  12531694f51ca930806613014c1560abcc7a1357 (commit)
      from  62ada3c53c9f307f870f6fcd325346931266cbd6 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/bfc34956d4c0b1711013ebbafddfa23bfc0a8988

commit bfc34956d4c0b1711013ebbafddfa23bfc0a8988
Author: William Gropp <wgropp at illinois.edu>
Date:   Mon Mar 10 08:08:52 2014 -0500

    Remove incomplete dataloop optimizations
    
    The best approach for some of these optimizations is through the new
    system, so this commit removes some of the work in progress to allow
    this patch for dataloop performance to be integrated with MPICH
    until the dataloop code is replaced.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpid/common/datatype/dataloop/dataloop_optimize.c b/src/mpid/common/datatype/dataloop/dataloop_optimize.c
index 5332001..29be813 100644
--- a/src/mpid/common/datatype/dataloop/dataloop_optimize.c
+++ b/src/mpid/common/datatype/dataloop/dataloop_optimize.c
@@ -396,19 +396,8 @@ int PREPEND_PREFIX(Dataloop_optimize)(DLOOP_Dataloop *dlpOld_p, int level )
 #endif
             }
         }
-        /* replace blockindexed of a single element with contig,
-           but with an offset (how?) */
-        /* TODO */
-        if (dlpOld_p->loop_params.bi_t.count == 1 &&
-            dlpOld_p->loop_params.bi_t.offset_array[0] == 0) {
-#ifdef MPICH_DEBUG_DATALOOP
-            if (printIfOptimized || printDataloop) {
-                printf( "replacement Contig is:(NOTDONE)\n" );
-                /* dl_print_blockindexed( level, dlpOld_p ); */
-                }
-#endif
-        }
         break;
+
     case DLOOP_KIND_INDEXED:
         /* if sub-dloop is (non-basic) contig, merge with blockcount */
 #ifdef MPICH_DEBUG_DATALOOP
@@ -423,8 +412,6 @@ int PREPEND_PREFIX(Dataloop_optimize)(DLOOP_Dataloop *dlpOld_p, int level )
             }
         }
 
-        /* replace indexed of constant block count with blockindexed */
-
         /* replace indexed of a single element with contig */
 
         /* If all block counts are multiples of the smallest, and if most

http://git.mpich.org/mpich.git/commitdiff/2489d4aa8511f9bcd5332e1664903b142acbe565

commit 2489d4aa8511f9bcd5332e1664903b142acbe565
Author: William Gropp <wgropp at illinois.edu>
Date:   Sat Mar 8 16:11:14 2014 -0600

    Correct error in description
    
    A probable copy-and-paste error in the description of the block indexed
    structure.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpid/common/datatype/dataloop/dataloop_parts.h b/src/mpid/common/datatype/dataloop/dataloop_parts.h
index b4bd636..56d6bd5 100644
--- a/src/mpid/common/datatype/dataloop/dataloop_parts.h
+++ b/src/mpid/common/datatype/dataloop/dataloop_parts.h
@@ -116,7 +116,6 @@ typedef struct DLOOP_Dataloop_vector {
 + count - Number of blocks
 . blocksize - Number of elements in each block
 . offset_array - Array of offsets (in bytes) to each block
-. total_blocks - count of total blocks in the array (cached value)
 - dataloop - Dataloop of each element
 
   Module:

http://git.mpich.org/mpich.git/commitdiff/4e48d6de832879128c3e0408f04365c6e57c6409

commit 4e48d6de832879128c3e0408f04365c6e57c6409
Author: William Gropp <wgropp at illinois.edu>
Date:   Sat Mar 8 16:10:18 2014 -0600

    Fix location of debugging statement
    
    Statement was placed before one of the values was defined.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpid/common/datatype/dataloop/segment_packunpack.c b/src/mpid/common/datatype/dataloop/segment_packunpack.c
index 45fade4..1712f38 100644
--- a/src/mpid/common/datatype/dataloop/segment_packunpack.c
+++ b/src/mpid/common/datatype/dataloop/segment_packunpack.c
@@ -415,8 +415,8 @@ int PREPEND_PREFIX(Segment_index_m2m)(DLOOP_Offset *blocks_p,
     char *cbufp;
     struct PREPEND_PREFIX(m2m_params) *paramp = v_paramp;
 
-    DBG_SEGMENT(printf( "index m2m: elsize = %d, count = %d\n", (int)el_size, (int)count ));
     DLOOP_Handle_get_size_macro(el_type, el_size);
+    DBG_SEGMENT(printf( "index m2m: elsize = %d, count = %d\n", (int)el_size, (int)count ));
 
     while (blocks_left) {
 	char *src, *dest;

http://git.mpich.org/mpich.git/commitdiff/160cddc3d1929a9d2bb063fe58cc3a60c950ad9f

commit 160cddc3d1929a9d2bb063fe58cc3a60c950ad9f
Author: William Gropp <wgropp at illinois.edu>
Date:   Sat Mar 8 15:10:55 2014 -0600

    Remove incorrect use of status in test
    
    This test was created from another one that used communication - in that
    process, the use of status.MPI_SOURCE needed to be changed and wasn't in
    the original version of this test.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/test/mpi/datatype/structpack2.c b/test/mpi/datatype/structpack2.c
index 16b0151..1dcbd2e 100644
--- a/test/mpi/datatype/structpack2.c
+++ b/test/mpi/datatype/structpack2.c
@@ -23,7 +23,6 @@ int main( int argc, char *argv[])
 	MPI_Aint disps[2];
 	MPI_Datatype bases[2];
 	MPI_Datatype str, con;
-	MPI_Status status;
 	char *buffer;
 	int   bufsize, position, insize;
 
@@ -86,23 +85,23 @@ int main( int argc, char *argv[])
 		printf("%d Sent: %d %c, Got: %d %c\n", rank,
 			s[j].i, s[j].c, s1[j].i, s1[j].c );
 #endif
-		if ( s1[j].i != j + status.MPI_SOURCE ) {
+		if ( s1[j].i != j + rank ) {
 		    errs++;
 		    printf( "Got s[%d].i = %d (%x); expected %d\n", j, s1[j].i,
-			    s1[j].i, j + status.MPI_SOURCE );
+			    s1[j].i, j + rank );
 		}
-		if ( s1[j].c != 'a' + j + status.MPI_SOURCE ) {
+		if ( s1[j].c != 'a' + j + rank ) {
 		    errs++;
 		    /* If the character is not a printing character,
 		       this can generate an file that diff, for example,
 		       believes is a binary file */
 		    if (isprint( (int)(s1[j].c) )) {
 			printf( "Got s[%d].c = %c; expected %c\n", j, s1[j].c,
-				j + status.MPI_SOURCE + 'a');
+				j + rank + 'a');
 		    }
 		    else {
 			printf( "Got s[%d].c = %x; expected %c\n", j, (int)s1[j].c,
-				j + status.MPI_SOURCE + 'a');
+				j + rank + 'a');
 		    }
 		}
 	}

http://git.mpich.org/mpich.git/commitdiff/1c5c594554343a0b6b4335cb28790aa51ab8a968

commit 1c5c594554343a0b6b4335cb28790aa51ab8a968
Author: William Gropp <wgropp at illinois.edu>
Date:   Wed Feb 19 08:25:53 2014 -0600

    Address many of the perf problems in #1788
    
    This adds a step that optimizes the dataloop representation, primarily
    merging CONTIG blocks with the parent dataloop, such as a VECTOR loop.
    It also performs a change of VECTOR of CONTIG with extent > size to
    VECTOR of VECTOR; this reduces the stack operations needed to perform
    the move.
    
    This is a temporary fix for the dataloop performance.  See the DAME
    wiki page (http://wiki.mpich.org/mpich/index.php/DAME) for current work
    on a replacement, higher performance datatype system.
    
    A partial, but not complete, fix for ticket #1788
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpid/common/datatype/dataloop/Makefile.mk b/src/mpid/common/datatype/dataloop/Makefile.mk
index 8bf212a..6cb8799 100644
--- a/src/mpid/common/datatype/dataloop/Makefile.mk
+++ b/src/mpid/common/datatype/dataloop/Makefile.mk
@@ -21,7 +21,8 @@ mpi_core_sources +=                                    \
     src/mpid/common/datatype/dataloop/segment_count.c                \
     src/mpid/common/datatype/dataloop/segment_flatten.c              \
     src/mpid/common/datatype/dataloop/segment_packunpack.c           \
-    src/mpid/common/datatype/dataloop/subarray_support.c
+    src/mpid/common/datatype/dataloop/subarray_support.c             \
+    src/mpid/common/datatype/dataloop/dataloop_optimize.c
 
 # several headers are included by the rest of MPICH
 AM_CPPFLAGS += -I$(top_srcdir)/src/mpid/common/datatype
diff --git a/src/mpid/common/datatype/dataloop/dataloop_create.h b/src/mpid/common/datatype/dataloop/dataloop_create.h
index f054429..0815b0b 100644
--- a/src/mpid/common/datatype/dataloop/dataloop_create.h
+++ b/src/mpid/common/datatype/dataloop/dataloop_create.h
@@ -94,5 +94,16 @@ DLOOP_Count PREPEND_PREFIX(Type_blockindexed_count_contig)(DLOOP_Count count,
                                                            const void *disp_array,
                                                            int dispinbytes,
                                                            DLOOP_Offset old_extent);
-                                                          
+
+int PREPEND_PREFIX(Dataloop_optimize)( DLOOP_Dataloop *dlpOld_p, int level );
+
+int PREPEND_PREFIX(Dataloop_est_complexity)(DLOOP_Dataloop *,
+					    MPI_Aint *, MPI_Aint *);
+int PREPEND_PREFIX(Dataloop_est_struct_complexity)( int,
+						    const int [],
+						    const DLOOP_Type [],
+						    MPI_Aint *,
+						    MPI_Aint * );
+
+void PREPEND_PREFIX(Dataloop_debug_print)( DLOOP_Dataloop *dp );
 #endif
diff --git a/src/mpid/common/datatype/dataloop/dataloop_create_struct.c b/src/mpid/common/datatype/dataloop/dataloop_create_struct.c
index 7d51b7b..70054b6 100644
--- a/src/mpid/common/datatype/dataloop/dataloop_create_struct.c
+++ b/src/mpid/common/datatype/dataloop/dataloop_create_struct.c
@@ -11,6 +11,57 @@
 #error "You must explicitly include a header that sets the PREPEND_PREFIX and includes dataloop_parts.h"
 #endif
 
+
+/*
+=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
+
+categories :
+    - name        : DATATYPE
+      description : Datatype optimization parameters
+
+cvars:
+   - name         : MPIR_CVAR_DATALOOP_OPTIMIZE
+     category     : DATATYPE
+     type         : boolean
+     default      : true
+     class        : none
+     verbosity    : MPI_T_VERBOSITY_USER_BASIC
+     scope        : MPI_T_SCOPE_LOCAL
+     description  : >-
+       By default, the internal representation of an MPI datatype that
+       is used by MPICH to move data is very similar to the original
+       description of the datatype.  If this flag is true, additional
+       optimizations are used to improve the performance of datatypes.
+
+   - name        : MPIR_CVAR_DATALOOP_FLATTEN
+     category    : DATATYPE
+     type        : boolean
+     class       : none
+     default     : true
+     verbosity   : MPI_T_VERBOSITY_USER_BASIC
+     scope       : MPI_T_SCOPE_LOCAL
+     description : >-
+      If true, attempt to "flatten" the internal representation of
+      MPI struct datatypes (created with MPI_Type_create_struct).
+
+   - name        : MPIR_CVAR_DATALOOP_FLATTEN_MULT
+     category    : DATATYPE
+     type        : int
+     class       : none
+     default     : 2
+     verbosity   : MPI_T_VERBOSITY_USER_BASIC
+     scope       : MPI_T_SCOPE_LOCAL
+     description : >-
+       Flattening an MPI struct datatype does not always improve
+       performance.  This parameter is a threshold that is used in
+       comparing the size of the description with the amount of data
+       moved.  Larger values make it more likely that a struct datatype
+       will be flattened.  The default value is adequate for flattening
+       simple structs, and will usually avoid flattening structs
+       containing vectors or block-indexed data.
+
+=== END_MPI_T_CVAR_INFO_BLOCK ===
+*/
 static int DLOOP_Dataloop_create_struct_memory_error(void);
 static int DLOOP_Dataloop_create_unique_type_struct(DLOOP_Count count,
 						    const int *blklens,
@@ -238,19 +289,37 @@ int PREPEND_PREFIX(Dataloop_create_struct)(DLOOP_Count count,
      * if caller asked for homogeneous or all bytes representation,
      * flatten the type and store it as an indexed type so that
      * there are no branches in the dataloop tree.
+     *
+     * Note that this is not always an optimization - for example,
+     * replacing two long block_indexed with one longer indexed (with
+     * the additional blockcount array) is likely to be slower, because
+     * of the additional memory motion required.
      */
-    if ((flag == DLOOP_DATALOOP_HOMOGENEOUS) ||
-	     (flag == DLOOP_DATALOOP_ALL_BYTES))
-    {
-	return DLOOP_Dataloop_create_flattened_struct(count,
-						      blklens,
-						      disps,
-						      oldtypes,
-						      dlp_p,
-						      dlsz_p,
-						      dldepth_p,
-						      flag);
-    }
+    if (MPIR_CVAR_DATALOOP_FLATTEN && (
+	(flag == DLOOP_DATALOOP_HOMOGENEOUS) ||
+	(flag == DLOOP_DATALOOP_ALL_BYTES) ))
+	{
+	    MPI_Aint nElms = 0, nDesc = 0;
+	    PREPEND_PREFIX(Dataloop_est_struct_complexity)( count,
+							    blklens,
+							    oldtypes,
+							    &nElms,
+							    &nDesc );
+
+	    /* Only convert to flattened if the flattened description
+	       is likely to be more efficient.  The magic number of 24 was
+	       determined empirically.  */
+	    if ( nDesc * 24 * MPIR_CVAR_DATALOOP_FLATTEN_MULT > nElms) {
+		return DLOOP_Dataloop_create_flattened_struct(count,
+							      blklens,
+							      disps,
+							      oldtypes,
+							      dlp_p,
+							      dlsz_p,
+							      dldepth_p,
+							      flag);
+	    }
+	}
 
     /* scan through types and gather derived type info */
     for (i=0; i < count; i++)
diff --git a/src/mpid/common/datatype/dataloop/dataloop_optimize.c b/src/mpid/common/datatype/dataloop/dataloop_optimize.c
new file mode 100644
index 0000000..5332001
--- /dev/null
+++ b/src/mpid/common/datatype/dataloop/dataloop_optimize.c
@@ -0,0 +1,681 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+
+/*
+ *  (C) 2013 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "dataloop.h"
+
+/* #define MPICH_DEBUG_DATALOOP */
+#ifdef MPICH_DEBUG_DATALOOP
+static int firstCall = 1;
+static int printDataloop = 0;
+static int printIfOptimized = 0;
+
+/* Print format:
+   (spaces for level).(el_size,el_extent,el_type)(count)....
+*/
+static void dl_print_dataloop( int, int, DLOOP_Dataloop * );
+static void dl_print_contig( int, DLOOP_Dataloop * );
+static void dl_print_vector( int, DLOOP_Dataloop * );
+static void dl_print_blockindexed( int, DLOOP_Dataloop * );
+static void dl_print_struct( int, DLOOP_Dataloop * );
+static void dl_print( int, const char * );
+
+static void dl_print_tab( int l )
+{
+    int i;
+    for (i=2*l; i!=0; i--) printf( "%c", ' ' );
+}
+static void dl_print_base( DLOOP_Dataloop *dp )
+{
+    printf( "(%ld,%ld,%lx)(%ld)", (long)dp->el_size, (long)dp->el_extent,
+            (long)dp->el_type, (long)dp->loop_params.count );
+}
+static void dl_print( int l, const char *s )
+{
+    dl_print_tab(l);
+    printf( "%s", s );
+}
+static void dl_print_contig( int l, DLOOP_Dataloop *dp )
+{
+    dl_print_tab(l);
+    printf( "CONTIG " );
+    dl_print_base( dp );
+    printf( "\n" );
+}
+static void dl_print_vector( int l, DLOOP_Dataloop *dp )
+{
+    int stride = dp->loop_params.v_t.stride;
+    int blocksize = dp->loop_params.v_t.blocksize ;
+    dl_print_tab(l);
+    printf( "VECTOR " );
+    dl_print_base( dp );
+    printf( ":Stride %d Blocksize %d\n", stride, blocksize );
+}
+static void dl_print_blockindexed( int l, DLOOP_Dataloop *dp )
+{
+    int blocksize = dp->loop_params.bi_t.blocksize ;
+    DLOOP_Offset *offarray = dp->loop_params.bi_t.offset_array;
+    int i, n;
+    dl_print_tab(l);
+    printf( "BLOCKINDEXED " );
+    dl_print_base( dp );
+    printf( ":Blocksize %d:", blocksize );
+    n = dp->loop_params.bi_t.count;
+    if (n > 8) n = 8;
+    for (i=0; i<n; i++) {
+        printf( "%lx,", (long)offarray[i] );
+    }
+    if (dp->loop_params.bi_t.count > n) printf( "..." );
+    printf( "\n" );
+}
+static void dl_print_indexed( int l, DLOOP_Dataloop *dp )
+{
+    DLOOP_Count  *blocksizearray = dp->loop_params.i_t.blocksize_array ;
+    DLOOP_Offset *offarray = dp->loop_params.i_t.offset_array;
+    int          i, n;
+    int          minblock, maxblock;
+    dl_print_tab(l);
+    printf( "INDEXED " );
+    dl_print_base( dp );
+    n = dp->loop_params.i_t.count;
+    minblock = maxblock = (n>0) ? blocksizearray[0] : 0;
+    for (i=0; i<n; i++) {
+        if (blocksizearray[i] > maxblock) maxblock = blocksizearray[i];
+        if (blocksizearray[i] < minblock) minblock = blocksizearray[i];
+    }
+    printf( "blocks in [%d,%d]", minblock, maxblock );
+
+    if (n > 8) n = 8;
+    for (i=0; i<n; i++) {
+        printf( "(%lx,%ld)", (long)offarray[i], (long)blocksizearray[i] );
+    }
+    if (dp->loop_params.i_t.count > n) printf( "..." );
+    printf( "\n" );
+}
+
+static void dl_print_struct( int l, DLOOP_Dataloop *dp )
+{
+    DLOOP_Count  *blocksizearray = dp->loop_params.s_t.blocksize_array ;
+    DLOOP_Offset *offarray = dp->loop_params.s_t.offset_array;
+    DLOOP_Dataloop **looparray = dp->loop_params.s_t.dataloop_array;
+    int          i, n;
+    dl_print_tab(l);
+    printf( "STRUCT " );
+    dl_print_base( dp );
+    printf( "\n" );
+    n = dp->loop_params.i_t.count;
+    if (n > 8) n = 8;
+    for (i=0; i<n; i++) {
+        dl_print_tab(l+1);
+        printf( "(%lx,%ld):\n", (long)offarray[i], (long)blocksizearray[i] );
+        dl_print_dataloop( l+1, 0, looparray[i] );
+    }
+    if (dp->loop_params.i_t.count > n) printf( "...\n" );
+}
+static void dl_print_dataloop( int l, int doBase, DLOOP_Dataloop *dp )
+{
+    dl_print_tab( l );
+    if (doBase)
+        dl_print_base( dp );
+    switch (dp->kind & DLOOP_KIND_MASK) {
+    case DLOOP_KIND_CONTIG:
+        dl_print_contig( l, dp );
+        break;
+    case DLOOP_KIND_VECTOR:
+        dl_print_vector( l, dp );
+        break;
+    case DLOOP_KIND_BLOCKINDEXED:
+        dl_print_blockindexed( l, dp );
+        break;
+    case DLOOP_KIND_INDEXED:
+        dl_print_indexed( l, dp );
+        break;
+    case DLOOP_KIND_STRUCT:
+        dl_print_struct( l, dp );
+        break;
+    default:
+        dl_print( l, "Unknown dataloop type " );
+        printf( "\n" );
+        break;
+    }
+}
+#endif
+
+void PREPEND_PREFIX(Dataloop_debug_print)( DLOOP_Dataloop *dp )
+{
+#ifdef MPICH_DEBUG_DATALOOP
+    if (firstCall) {
+        char *s = getenv( "MPICH_DATALOOP_PRINT" );
+        if (s && (strcmp(s,"yes")==0 || strcmp(s,"YES") == 0)) {
+            printDataloop = 1;
+            printIfOptimized = 1;
+        }
+        firstCall = 0;
+    }
+    if (printDataloop) {
+        printf( "In Dataloop_debug_print:\n" );
+        dl_print_dataloop( 1, 0, dp );
+    }
+#endif
+}
+
+/*
+ * Indicates whether a dataloop is a basic and final contig type.
+ * This can be used to determine when a contig type can be removed
+ * in a dataloop.
+ */
+static int dl_contig_isFinal( DLOOP_Dataloop *dp )
+{
+    if ((dp->kind & DLOOP_KIND_MASK) != DLOOP_KIND_CONTIG) return 0;
+    if (dp->el_size == dp->el_extent &&
+        (dp->kind & DLOOP_FINAL_MASK))
+        return 1;
+    return 0;
+}
+
+
+/*
+ * Optimize a dataloop
+ *
+ * Apply the following transformations and return a new dataloop.
+ * 1. Convert all predefined types to UINTS with the best alignment (may be BYTE
+ *    in worst case)
+ * 2. Convert blocks of contiguous into a single block of basic unit (e.g.,
+ *    a vector type with a block count of 27 applied to a contiguous type of
+ *    6 ints will be turned into a block count of (27*6) UINTs)
+ * 3. Convert struct (with different dataloops (from different MPI datatypes)
+ *    into indexed when all types are contig
+ * 4. Convert dataloops with counts of 1 into simpler types (e.g., a vector
+ *    with 1 element is really a contig type)
+ *
+ * Value of these optimizations
+ * A 2012 paper[1] compared performance of Open MPI, MPICH2, and user-written code
+ * for some datatypes, and found MPICH2 often performed worse than other
+ * options.  An investigation showed that some of the issues are due to
+ * a failure to perform optimizations of this type (especially #1 and 2).
+ * It may also be necessary to enhance the dataloop execution engine, but
+ * that will be a separate step.
+ *
+ * [1] T. Schneider and R. Gerstenberger and T. Hoefler, "Micro-Applications
+ *     for Communication Data Access Patterns and MPI Datatypes", EuroMPI 2012
+ *
+ * The level argument is used primarily for debugging output; it keeps track
+ * of how deep a recursive application of this routine has gone.
+ */
+int PREPEND_PREFIX(Dataloop_optimize)(DLOOP_Dataloop *dlpOld_p, int level )
+{
+    int i;
+
+#ifdef MPICH_DEBUG_DATALOOP
+    /* Temp for debugging */
+    /* This is threadsafe in the sense that we don't care */
+    if (firstCall) {
+        char *s = getenv( "MPICH_DATALOOP_PRINT" );
+        if (s && (strcmp(s,"yes")==0 || strcmp(s,"YES") == 0)) {
+            printDataloop = 1;
+            printIfOptimized = 1;
+        }
+        firstCall = 0;
+    }
+    if (printDataloop && level == 0)
+        printf( "About to optimize in commit...\n" );
+#endif
+
+    switch (dlpOld_p->kind & DLOOP_KIND_MASK) {
+    case DLOOP_KIND_CONTIG:
+#ifdef MPICH_DEBUG_DATALOOP
+        if (printDataloop)
+            dl_print_contig( level, dlpOld_p );
+#endif
+        /* replace contig of (non-basic) contig with contig (basic) */
+        if (!(dlpOld_p->kind & DLOOP_FINAL_MASK)) {
+            DLOOP_Dataloop *dlpChild_p = dlpOld_p->loop_params.c_t.dataloop;
+            PREPEND_PREFIX(Dataloop_optimize)( dlpChild_p, level+1 );
+            if ((dlpChild_p->kind & DLOOP_KIND_MASK) == DLOOP_KIND_CONTIG &&
+                dl_contig_isFinal( dlpChild_p )) {
+                if (dlpOld_p->el_size == dlpOld_p->el_extent &&
+                    !MPIU_Prod_overflows_max(
+                             dlpChild_p->loop_params.c_t.count,
+                             dlpOld_p->loop_params.c_t.count,
+                             INT_MAX ) ) {
+
+#ifdef MPICH_DEBUG_DATALOOP
+                    if (printDataloop)
+                        printf( "replacing with contig\n" );
+#endif
+                    dlpOld_p->loop_params.c_t.count *= dlpChild_p->loop_params.c_t.count;
+                    dlpOld_p->el_size   = dlpChild_p->el_size;
+                    dlpOld_p->el_extent = dlpChild_p->el_extent;
+                    dlpOld_p->el_type   = dlpChild_p->el_type;
+                    dlpOld_p->kind     |= DLOOP_FINAL_MASK;
+                    dlpOld_p->loop_params.c_t.dataloop = 0;
+#ifdef MPICH_DEBUG_DATALOOP
+                    if (printIfOptimized || printDataloop) {
+                        printf( "replacement contig is:\n" );
+                        dl_print_contig( level, dlpOld_p );
+                    }
+#endif
+                }
+                else {
+                    /* */
+                    /* printf( "not replacing...\n" ); */
+                    /* If the low level contig is a single byte,
+                       we could make that replacement. Not done. */
+                    /* By doing nothing here, we ensure that the dataloop
+                       is correct if not fully optimized */
+                    ;
+                }
+            }
+        }
+        break;
+
+    case DLOOP_KIND_VECTOR:
+        /* if sub-dloop is (non-basic) contig, merge with blockcount */
+#ifdef MPICH_DEBUG_DATALOOP
+        if (printDataloop)
+            dl_print_vector( level, dlpOld_p );
+#endif
+
+        if (!(dlpOld_p->kind & DLOOP_FINAL_MASK)) {
+            DLOOP_Dataloop *dlpChild_p = dlpOld_p->loop_params.v_t.dataloop;
+            PREPEND_PREFIX(Dataloop_optimize)( dlpChild_p, level+1 );
+
+            if (dl_contig_isFinal( dlpChild_p ) &&
+                    !MPIU_Prod_overflows_max(
+                             dlpChild_p->loop_params.count,
+                             dlpOld_p->loop_params.v_t.blocksize,
+                             INT_MAX ) ) {
+                /* We can replace the contig type by enlarging the blocksize */
+                if (dlpOld_p->el_size == dlpOld_p->el_extent ||
+                    dlpOld_p->loop_params.v_t.blocksize == 1) {
+                    /* Reset the kind to final, free the child type,
+                       set to null */
+                    dlpOld_p->loop_params.v_t.blocksize *=
+                        dlpChild_p->loop_params.count;
+                    dlpOld_p->el_size   = dlpChild_p->el_size;
+                    dlpOld_p->el_type   = dlpChild_p->el_type;
+                    /*dlpOld_p->el_extent = dlpChild_p->el_extent; */
+                    dlpOld_p->kind     |= DLOOP_FINAL_MASK;
+                    dlpOld_p->loop_params.v_t.dataloop = 0;
+#ifdef MPICH_DEBUG_DATALOOP
+                    if (printIfOptimized || printDataloop) {
+                        printf( "replacement Vector is:\n" );
+                        dl_print_vector( level, dlpOld_p );
+                    }
+#endif
+                }
+                else {
+                    /* TODO: If the vector elements do not have
+                       size==extent, and the blocksize is greater than 1,
+                       then it may be better to replace the elements with
+                       a single strided(vector) copy with blocksize elements:
+                       New vector:
+                         stride <- extent
+                         el_size <- size
+                         extent <- ?
+                         count <- blocksize
+                         blocksize <- 1
+                       Old vector become
+                         blocksize <- 1
+                         extent <- ?
+                */
+                    dlpChild_p->loop_params.v_t.stride =
+                        dlpOld_p->el_extent;
+                    dlpChild_p->el_size = 1;
+                    dlpChild_p->el_type = MPI_BYTE;
+                    dlpChild_p->loop_params.v_t.dataloop = 0;
+                    dlpChild_p->loop_params.v_t.count =
+                        dlpOld_p->loop_params.v_t.blocksize;
+                    dlpChild_p->loop_params.v_t.blocksize = dlpOld_p->el_size;
+                    dlpChild_p->kind = DLOOP_KIND_VECTOR |
+                        DLOOP_FINAL_MASK;
+                    dlpOld_p->loop_params.v_t.blocksize = 1;
+#ifdef MPICH_DEBUG_DATALOOP
+                    if (printIfOptimized || printDataloop) {
+                        printf( "Replacing vector of contig with vector of vector\n" );
+                        printf( "replacement Vector is:\n" );
+                        dl_print_vector( level, dlpOld_p );
+                        dl_print_vector( level+1, dlpChild_p );
+                    }
+#endif
+                }
+            }
+        }
+        /* replace vector of a single element with contig */
+        if ((dlpOld_p->kind & DLOOP_FINAL_MASK)) {
+            int blocksize = dlpOld_p->loop_params.v_t.blocksize;
+            int count     = dlpOld_p->loop_params.v_t.count;
+            if (dlpOld_p->el_size * blocksize ==
+                dlpOld_p->loop_params.v_t.stride &&
+                    !MPIU_Prod_overflows_max( count, blocksize, INT_MAX ) ) {
+                dlpOld_p->kind = DLOOP_KIND_CONTIG | DLOOP_FINAL_MASK;
+                dlpOld_p->loop_params.c_t.dataloop = 0;
+                dlpOld_p->loop_params.c_t.count = count * blocksize;
+#ifdef MPICH_DEBUG_DATALOOP
+                if (printIfOptimized || printDataloop) {
+                    printf( "replacement Contig is:\n" );
+                    dl_print_contig( level, dlpOld_p );
+                }
+#endif
+            }
+        }
+        /* replace vector that is contiguous with contiguous */
+        break;
+
+    case DLOOP_KIND_BLOCKINDEXED:
+        /* if subdloop is (non-basic) contig, merge with blockcount */
+#ifdef MPICH_DEBUG_DATALOOP
+        if (printDataloop)
+            dl_print_blockindexed( level, dlpOld_p );
+#endif
+        if (!(dlpOld_p->kind & DLOOP_FINAL_MASK)) {
+            DLOOP_Dataloop *dlpChild_p = dlpOld_p->loop_params.bi_t.dataloop;
+            PREPEND_PREFIX(Dataloop_optimize)( dlpChild_p, level+1 );
+            if (dl_contig_isFinal( dlpChild_p ) &&
+                    !MPIU_Prod_overflows_max(
+                             dlpChild_p->loop_params.count,
+                             dlpOld_p->loop_params.bi_t.blocksize,
+                             INT_MAX ) ) {
+                /* We can replace the contig type by enlarging the blocksize */
+
+                /* Reset the kind to final, free the child type, set to null */
+                dlpOld_p->loop_params.bi_t.blocksize *= dlpChild_p->loop_params.count;
+                dlpOld_p->el_size   = dlpChild_p->el_size;
+                /*dlpOld_p->el_extent = dlpChild_p->el_extent;*/
+                dlpOld_p->el_type   = dlpChild_p->el_type;
+                dlpOld_p->kind     |= DLOOP_FINAL_MASK;
+                dlpOld_p->loop_params.bi_t.dataloop = 0;
+#ifdef MPICH_DEBUG_DATALOOP
+                if (printIfOptimized || printDataloop) {
+                    printf( "replacement BlockIndexed is:\n" );
+                    dl_print_blockindexed( level, dlpOld_p );
+                }
+#endif
+            }
+        }
+        /* replace blockindexed of a single element with contig,
+           but with an offset (how?) */
+        /* TODO */
+        if (dlpOld_p->loop_params.bi_t.count == 1 &&
+            dlpOld_p->loop_params.bi_t.offset_array[0] == 0) {
+#ifdef MPICH_DEBUG_DATALOOP
+            if (printIfOptimized || printDataloop) {
+                printf( "replacement Contig is:(NOTDONE)\n" );
+                /* dl_print_blockindexed( level, dlpOld_p ); */
+                }
+#endif
+        }
+        break;
+    case DLOOP_KIND_INDEXED:
+        /* if sub-dloop is (non-basic) contig, merge with blockcount */
+#ifdef MPICH_DEBUG_DATALOOP
+        if (printDataloop)
+            dl_print_indexed( level, dlpOld_p );
+#endif
+        if (!(dlpOld_p->kind & DLOOP_FINAL_MASK)) {
+            DLOOP_Dataloop *dlpChild_p = dlpOld_p->loop_params.i_t.dataloop;
+            PREPEND_PREFIX(Dataloop_optimize)( dlpChild_p, level+1 );
+            if (dl_contig_isFinal( dlpChild_p ) ) {
+                /* Could include the child type in the blocksize counts */
+            }
+        }
+
+        /* replace indexed of constant block count with blockindexed */
+
+        /* replace indexed of a single element with contig */
+
+        /* If all block counts are multiples of the smallest, and if most
+           blocks are smallest, then the other blocks could be split into
+           separate blocks with appropriate offsets, replacing indexed with
+           blockindexed */
+
+        break;
+
+    case DLOOP_KIND_STRUCT:
+        /* if sub-dloops are all contig, replace with indexed */
+        /* Not done yet - but first step is to recurse and
+           simplify/optimize the component dataloops */
+#ifdef MPICH_DEBUG_DATALOOP
+        if (printDataloop) {
+            dl_print_struct( level, dlpOld_p );
+            printf( "now optimizing...\n" );
+        }
+#endif
+        if (!(dlpOld_p->kind & DLOOP_FINAL_MASK)) {
+            for (i=0; i<dlpOld_p->loop_params.s_t.count; i++) {
+                PREPEND_PREFIX(Dataloop_optimize)(
+                          dlpOld_p->loop_params.s_t.dataloop_array[i],
+                          level+1);
+            }
+        }
+        /* Can the preceding if case ever be false? */
+        /* Heres where we might check the following:
+            Are all child dataloops CONTIG?
+            Are all extents equal to sizes?
+            Are all LBs equal to 0?
+           If these are all true and in addition they are contiguous,
+           replace with a single contig (but be careful of the extent)
+           Otherwise, if these are all true, then replace with INDEXED.
+        */
+        if (!(dlpOld_p->kind & DLOOP_FINAL_MASK)) {
+            int isContig = 1;
+            int allContig = 1;
+            MPI_Aint lastAdd = 0;
+            for (i=0; i<dlpOld_p->loop_params.s_t.count; i++) {
+                DLOOP_Dataloop *dlpChild_p =
+                    dlpOld_p->loop_params.s_t.dataloop_array[i];
+                if ((dlpChild_p->kind & DLOOP_KIND_MASK) != DLOOP_KIND_CONTIG) {
+                    allContig = 0; break;
+                }
+                if (/* dlpChild_p->el_lb != 0 || */  /* No lb in dataloop(?) */
+                    dlpChild_p->el_extent != dlpChild_p->el_size) {
+#ifdef MPICH_DEBUG_DATALOOP
+                    if (printDataloop)
+                        printf( "not natural contig\n" );
+#endif
+                    allContig = 0; break;
+                }
+                if (isContig &&
+                    lastAdd != dlpOld_p->loop_params.s_t.offset_array[i]) {
+#ifdef MPICH_DEBUG_DATALOOP
+                    if (printDataloop)
+                        printf( "Not contiguous bytes: %lx != %lx\n",
+                            (long)lastAdd,
+                            (long)dlpOld_p->loop_params.s_t.offset_array[i] );
+#endif
+                    isContig = 0;
+                }
+                else {
+                    lastAdd += dlpChild_p->el_extent *
+                        dlpChild_p->loop_params.count;
+                }
+            }
+            if (allContig) {
+#ifdef MPICH_DEBUG_DATALOOP
+                if (printDataloop)
+                    printf( "All subtypes are contig - can replace with index\n" );
+#endif
+                if (isContig) {
+#ifdef MPICH_DEBUG_DATALOOP
+                    if (printDataloop)
+                        printf( "All subtypes consequtive - can replace with a single contig\n" );
+#endif
+                ;
+                }
+            }
+        }
+
+        break;
+    default:
+#ifdef MPICH_DEBUG_DATALOOP
+        if (printDataloop)
+            dl_print( level, "Unknown type!" );
+#endif
+        break;
+    }
+
+#ifdef MPICH_DEBUG_DATALOOP
+    if (printDataloop && level == 0)
+        printf( "Done!\n" );
+#endif
+
+    return 0;
+}
+
+
+/*
+ * Make an estimate of the complexity of a datatype.  This can be used
+ * to determine whether flattening the datatype to an indexed type is
+ * likely to be efficient.
+ *
+ * dlp_p - dataloop to examine; child dataloops are visited recursively
+ * nElms - incremented by the data estimate (counts times child size)
+ * nDesc - incremented by the estimated number of descriptor entries
+ *
+ * Both outputs are added to, never reset, so the caller must initialize
+ * them.  Unrecognized kinds contribute nothing.  Always returns 0.
+ */
+int PREPEND_PREFIX(Dataloop_est_complexity)(DLOOP_Dataloop *dlp_p,
+                                            MPI_Aint *nElms,
+                                            MPI_Aint *nDesc )
+{
+    int i;
+    MPI_Aint myElms = 0;
+    MPI_Aint myDesc = 0;
+    MPI_Aint childElms = 0, childDesc = 0;
+    DLOOP_Dataloop *dlpChild_p;
+
+    switch (dlp_p->kind & DLOOP_KIND_MASK) {
+    case DLOOP_KIND_CONTIG:
+        /* Data moved is count*size of the child type */
+        if (!(dlp_p->kind & DLOOP_FINAL_MASK)) {
+            dlpChild_p = dlp_p->loop_params.c_t.dataloop;
+            PREPEND_PREFIX(Dataloop_est_complexity)( dlpChild_p, &childElms,
+                                                     &childDesc );
+        }
+        else {
+            childElms = dlp_p->el_size;
+            childDesc = 0;
+        }
+        myElms += dlp_p->loop_params.c_t.count * childElms;
+        myDesc += childDesc + 1;
+        break;
+
+    case DLOOP_KIND_VECTOR:
+        /* Data moved is count*blocksize*size of the child type */
+        if (!(dlp_p->kind & DLOOP_FINAL_MASK)) {
+            dlpChild_p = dlp_p->loop_params.v_t.dataloop;
+            PREPEND_PREFIX(Dataloop_est_complexity)( dlpChild_p, &childElms,
+                                                     &childDesc );
+        }
+        else {
+            childElms = dlp_p->el_size;
+            childDesc = 0;
+        }
+        myElms += dlp_p->loop_params.v_t.count *
+            dlp_p->loop_params.v_t.blocksize * childElms;
+        myDesc += childDesc + 2;
+        break;
+
+    case DLOOP_KIND_BLOCKINDEXED:
+        if (!(dlp_p->kind & DLOOP_FINAL_MASK)) {
+            dlpChild_p = dlp_p->loop_params.bi_t.dataloop;
+            PREPEND_PREFIX(Dataloop_est_complexity)( dlpChild_p, &childElms,
+                                                     &childDesc );
+        }
+        else {
+            childElms = dlp_p->el_size;
+            childDesc = 0;
+        }
+        myElms += dlp_p->loop_params.bi_t.count *
+            dlp_p->loop_params.bi_t.blocksize * childElms;
+        myDesc += childDesc + dlp_p->loop_params.bi_t.count;
+        break;
+
+    case DLOOP_KIND_INDEXED:
+        if (!(dlp_p->kind & DLOOP_FINAL_MASK)) {
+            dlpChild_p = dlp_p->loop_params.i_t.dataloop;
+            PREPEND_PREFIX(Dataloop_est_complexity)( dlpChild_p, &childElms,
+                                                     &childDesc );
+        }
+        else {
+            childElms = dlp_p->el_size;
+            childDesc = 0;
+        }
+        myElms += dlp_p->loop_params.i_t.total_blocks * childElms;
+        myDesc += childDesc + 2 * dlp_p->loop_params.i_t.count;
+        break;
+
+    case DLOOP_KIND_STRUCT:
+        if (!(dlp_p->kind & DLOOP_FINAL_MASK)) {
+            MPI_Aint celm, cdesc;
+            for (i=0; i<dlp_p->loop_params.s_t.count; i++) {
+                celm = 0; cdesc = 0;
+                PREPEND_PREFIX(Dataloop_est_complexity)(
+                                dlp_p->loop_params.s_t.dataloop_array[i],
+                               &celm, &cdesc );
+                childElms += celm * dlp_p->loop_params.s_t.blocksize_array[i];
+                childDesc += cdesc + 3;
+            }
+        }
+        else {
+            MPI_Aint elsize = dlp_p->el_size; /* not int: keep full width */
+            childElms = 0;
+            childDesc = 0;
+            for (i=0; i<dlp_p->loop_params.s_t.count; i++) {
+                childElms += elsize * dlp_p->loop_params.s_t.blocksize_array[i];
+                childDesc += 3;
+            }
+        }
+        myElms += childElms;
+        myDesc += childDesc;
+        break;
+
+    default:
+        break;
+    }
+
+    /* Add this loop's estimate to the caller's running totals */
+    *nElms += myElms;
+    *nDesc += myDesc;
+
+    return 0;
+}
+
+/*
+ * Predict the cost of a struct type from its components, without
+ * building the struct dataloop.  Outputs are overwritten, not accumulated.
+ */
+int PREPEND_PREFIX(Dataloop_est_struct_complexity)( int count,
+                                                    const int blklens[],
+                                                    const DLOOP_Type oldtypes[],
+                                                    MPI_Aint *nElms,
+                                                    MPI_Aint *nDesc )
+{
+    int idx;
+    int loopFlag = MPID_DATALOOP_ALL_BYTES;
+    MPI_Aint totalElms = 0;
+    MPI_Aint totalDesc = 0;
+
+    for (idx = 0; idx < count; idx++) {
+        /* Types with no dataloop count as one element, one descriptor */
+        MPI_Aint typeElms = 1, typeDesc = 1;
+        DLOOP_Dataloop *loop_p = 0;
+
+        DLOOP_Handle_get_loopptr_macro(oldtypes[idx], loop_p, loopFlag);
+        if (loop_p) {
+            /* The estimator accumulates, so restart from zero */
+            typeElms = 0;
+            typeDesc = 0;
+            PREPEND_PREFIX(Dataloop_est_complexity)( loop_p, &typeElms, &typeDesc );
+        }
+        totalElms += typeElms * blklens[idx];
+        totalDesc += typeDesc;
+    }
+    *nElms = totalElms;
+    *nDesc = totalDesc;
+    return MPI_SUCCESS;
+}
diff --git a/src/mpid/common/datatype/dataloop/segment_packunpack.c b/src/mpid/common/datatype/dataloop/segment_packunpack.c
index 7446b59..45fade4 100644
--- a/src/mpid/common/datatype/dataloop/segment_packunpack.c
+++ b/src/mpid/common/datatype/dataloop/segment_packunpack.c
@@ -15,6 +15,28 @@
 
 /* NOTE: bufp values are unused, ripe for removal */
 
+/* #define MPICH_DEBUG_SEGMENT_MOVE */
+/* TODO: Consider integrating this with the general debug support. */
+/* Note: This does not use the CVAR support for the environment variable
+   because (a) this is temporary code and (b) it is for expert developers
+   only */
+#ifdef MPICH_DEBUG_SEGMENT_MOVE
+/* -1: environment not read yet; 0: tracing off; 1: tracing on */
+static int printSegment = -1;
+/* Turn tracing on only if MPICH_DATALOOP_PRINT is exactly "yes" or "YES" */
+static void setPrint( void ) {
+    char *envValue = getenv( "MPICH_DATALOOP_PRINT" );
+    printSegment =
+        (envValue && (!strcmp(envValue,"yes") || !strcmp(envValue,"YES"))) ? 1 : 0;
+}
+/* Run _a only when tracing is on; the environment is read on first use */
+#define DBG_SEGMENT(_a) do { if (printSegment < 0) setPrint(); \
+        if (printSegment) { _a; } } while( 0 )
+#else
+/* Tracing compiled out: the argument is discarded */
+#define DBG_SEGMENT(_a)
+#endif
+
 int PREPEND_PREFIX(Segment_contig_m2m)(DLOOP_Offset *blocks_p,
 				       DLOOP_Type el_type,
 				       DLOOP_Offset rel_off,
@@ -52,6 +74,7 @@ void PREPEND_PREFIX(Segment_pack)(DLOOP_Segment *segp,
 {
     struct PREPEND_PREFIX(m2m_params) params; /* defined in dataloop_parts.h */
 
+    DBG_SEGMENT(printf( "Segment_pack...\n" ));
     /* experimenting with discarding buf value in the segment, keeping in
      * per-use structure instead. would require moving the parameters around a
      * bit.
@@ -77,6 +100,7 @@ void PREPEND_PREFIX(Segment_unpack)(DLOOP_Segment *segp,
 {
     struct PREPEND_PREFIX(m2m_params) params;
 
+    DBG_SEGMENT(printf( "Segment_unpack...\n" ));
     /* experimenting with discarding buf value in the segment, keeping in
      * per-use structure instead. would require moving the parameters around a
      * bit.
@@ -110,6 +134,8 @@ int PREPEND_PREFIX(Segment_contig_m2m)(DLOOP_Offset *blocks_p,
     DLOOP_Handle_get_size_macro(el_type, el_size);
     size = *blocks_p * el_size;
 
+    DBG_SEGMENT(printf( "element type = %lx\n", (long)el_type ));
+    DBG_SEGMENT(printf( "contig m2m: elsize = %d, size = %d\n", (int)el_size, (int)size ));
 #ifdef MPID_SU_VERBOSE
     dbg_printf("\t[contig unpack: do=" DLOOP_OFFSET_FMT_DEC_SPEC ", dp=%x, bp=%x, sz=" DLOOP_OFFSET_FMT_DEC_SPEC ", blksz=" DLOOP_OFFSET_FMT_DEC_SPEC "]\n",
 	       rel_off,
@@ -165,6 +191,7 @@ int PREPEND_PREFIX(Segment_vector_m2m)(DLOOP_Offset *blocks_p,
     DLOOP_Ensure_Offset_fits_in_pointer((DLOOP_VOID_PTR_CAST_TO_OFFSET (paramp->userbuf)) + rel_off);
     cbufp = (char*) paramp->userbuf + rel_off;
     DLOOP_Handle_get_size_macro(el_type, el_size);
+    DBG_SEGMENT(printf( "vector m2m: elsize = %d, count = %d, stride = %d, blocksize = %d\n", (int)el_size, (int)count, (int)stride, (int)blksz ));
 
     whole_count = (DLOOP_Count)((blksz > 0) ? (*blocks_p / (DLOOP_Offset) blksz) : 0);
     blocks_left = (DLOOP_Count)((blksz > 0) ? (*blocks_p % (DLOOP_Offset) blksz) : 0);
@@ -195,6 +222,9 @@ int PREPEND_PREFIX(Segment_vector_m2m)(DLOOP_Offset *blocks_p,
 	else {
 	    for (i=0; i < whole_count; i++) {
 		DLOOP_Memcpy(cbufp, paramp->streambuf, ((DLOOP_Offset) blksz) * el_size);
+                DBG_SEGMENT(printf("vec: memcpy %p %p %d\n", cbufp,
+                                   paramp->streambuf,
+                                   (int)(blksz * el_size) ));
 		/* Ensure that pointer increment fits in a pointer */
 		/* streambuf is a pointer (not a displacement) since it is being used for a memory copy */
 		DLOOP_Ensure_Offset_fits_in_pointer((DLOOP_VOID_PTR_CAST_TO_OFFSET (paramp->streambuf)) +
@@ -206,6 +236,9 @@ int PREPEND_PREFIX(Segment_vector_m2m)(DLOOP_Offset *blocks_p,
 	    }
 	    if (blocks_left) {
 		DLOOP_Memcpy(cbufp, paramp->streambuf, ((DLOOP_Offset) blocks_left) * el_size);
+                DBG_SEGMENT(printf("vec(left): memcpy %p %p %d\n", cbufp,
+                                   paramp->streambuf,
+                                   (int)(blocks_left * el_size) ));
 		/* Ensure that pointer increment fits in a pointer */
 		/* streambuf is a pointer (not a displacement) since
 		 * it is being used for a memory copy */
@@ -244,6 +277,9 @@ int PREPEND_PREFIX(Segment_vector_m2m)(DLOOP_Offset *blocks_p,
 		/* Ensure that pointer increment fits in a pointer */
 		/* streambuf is a pointer (not a displacement) since
 		 * it is being used for a memory copy */
+                DBG_SEGMENT(printf("vec: memcpy %p %p %d\n",
+                                   paramp->streambuf, cbufp,
+                                   (int)(blksz * el_size) ));
 		DLOOP_Ensure_Offset_fits_in_pointer((DLOOP_VOID_PTR_CAST_TO_OFFSET (paramp->streambuf)) +
 						 (DLOOP_Offset) blksz * el_size);
 		paramp->streambuf += (DLOOP_Offset) blksz * el_size;
@@ -251,6 +287,9 @@ int PREPEND_PREFIX(Segment_vector_m2m)(DLOOP_Offset *blocks_p,
 	    }
 	    if (blocks_left) {
 		DLOOP_Memcpy(paramp->streambuf, cbufp, (DLOOP_Offset) blocks_left * el_size);
+                DBG_SEGMENT(printf("vec(left): memcpy %p %p %d\n",
+                                   paramp->streambuf, cbufp,
+                                   (int)(blocks_left * el_size) ));
 		/* Ensure that pointer increment fits in a pointer */
 		/* streambuf is a pointer (not a displacement) since
 		 * it is being used for a memory copy */
@@ -282,6 +321,23 @@ int PREPEND_PREFIX(Segment_blkidx_m2m)(DLOOP_Offset *blocks_p,
     struct PREPEND_PREFIX(m2m_params) *paramp = v_paramp;
 
     DLOOP_Handle_get_size_macro(el_type, el_size);
+    DBG_SEGMENT(printf( "blkidx m2m: elsize = %d, count = %d, blocklen = %d\n", (int)el_size, (int)count, (int)blocklen ));
+
+    /* If the blocklen * el_size is relatively small, then for
+       performance reasons, it is important to hoist most of these
+       tests out of the loop.  Ignoring some of the issues of handling
+       the available buffer size (blocks_left), this should translate
+       directly into code that looks like this for blocksize == 1
+
+       for (i=0; i<count; i++) {
+            dest[i] = userbuf[offsetarray[i]];
+       }
+
+       where "dest" and "userbuf" are pointers to objects of the correct
+       size.  If blocksize is > 1, then various unrollings are important
+       until blocksize is large enough to make the overhead of memcpy
+       negligible.  Datatypes such as this are used in LAMMPS, for example.
+    */
 
     while (blocks_left) {
 	char *src, *dest;
@@ -326,6 +382,7 @@ int PREPEND_PREFIX(Segment_blkidx_m2m)(DLOOP_Offset *blocks_p,
 	}
 	else {
 	    DLOOP_Memcpy(dest, src, (DLOOP_Offset) blocklen * el_size);
+            DBG_SEGMENT(printf( "blkidx m2m:memcpy(%p,%p,%d)\n",dest,src,(int)(blocklen*el_size)));
 	}
 
 	/* Ensure that pointer increment fits in a pointer */
@@ -358,6 +415,7 @@ int PREPEND_PREFIX(Segment_index_m2m)(DLOOP_Offset *blocks_p,
     char *cbufp;
     struct PREPEND_PREFIX(m2m_params) *paramp = v_paramp;
 
+    DBG_SEGMENT(printf( "index m2m: elsize = %d, count = %d\n", (int)el_size, (int)count ));
     DLOOP_Handle_get_size_macro(el_type, el_size);
 
     while (blocks_left) {
diff --git a/src/mpid/common/datatype/mpid_type_commit.c b/src/mpid/common/datatype/mpid_type_commit.c
index a384e50..b27eaff 100644
--- a/src/mpid/common/datatype/mpid_type_commit.c
+++ b/src/mpid/common/datatype/mpid_type_commit.c
@@ -19,6 +19,7 @@ Output Parameters:
   Return Value:
   0 on success, -1 on failure.
 @*/
+
 int MPID_Type_commit(MPI_Datatype *datatype_p)
 {
     int           mpi_errno=MPI_SUCCESS;
@@ -57,9 +58,21 @@ int MPID_Type_commit(MPI_Datatype *datatype_p)
 	MPIU_DBG_PRINTF(("# contig blocks = %d\n",
 			 (int) datatype_ptr->max_contig_blocks));
 
+	if (MPIR_CVAR_DATALOOP_OPTIMIZE) {
+	    MPID_Dataloop_optimize(datatype_ptr->dataloop, 0 );
+        }
+        else {
+            /* This allows the developer to output the final dataloops
+               in the case where the dataloops are not optimized.
+               It does nothing if that printing is not enabled.
+            */
+            MPID_Dataloop_debug_print( datatype_ptr->dataloop );
+        }
+
 #if 0
         MPIDI_Dataloop_dot_printf(datatype_ptr->dataloop, 0, 1);
 #endif
+
     }
 
     return mpi_errno;

http://git.mpich.org/mpich.git/commitdiff/514214bc5cfdc7382d6d2667e86424fa5aa312e1

commit 514214bc5cfdc7382d6d2667e86424fa5aa312e1
Author: William Gropp <wgropp at illinois.edu>
Date:   Wed Feb 19 08:12:40 2014 -0600

    Add more datatype tests
    
    These tests were inspired by tests in the MPICH-1 test suite
    (structpack2.c) or the Intel test suite (vecblklen.c and hvecblklen.c)
    that failed with early versions of the dataloop optimization code.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/test/mpi/datatype/Makefile.am b/test/mpi/datatype/Makefile.am
index bd1de7b..03c13b5 100644
--- a/test/mpi/datatype/Makefile.am
+++ b/test/mpi/datatype/Makefile.am
@@ -34,6 +34,7 @@ noinst_PROGRAMS =           \
     hindexed-zeros          \
     hindexed_block          \
     hindexed_block_contents \
+    hvecblklen              \
     indexed-misc            \
     large-count             \
     large_type              \
@@ -59,6 +60,7 @@ noinst_PROGRAMS =           \
     struct-ezhov            \
     struct-no-real-types    \
     struct-pack             \
+    structpack2             \
     struct-verydeep         \
     struct-zero-count       \
     subarray                \
@@ -75,6 +77,7 @@ noinst_PROGRAMS =           \
     typename                \
     unpack                  \
     unusual-noncontigs      \
+    vecblklen               \
     zeroblks                \
     zeroparms
 
diff --git a/test/mpi/datatype/hvecblklen.c b/test/mpi/datatype/hvecblklen.c
new file mode 100644
index 0000000..63fc417
--- /dev/null
+++ b/test/mpi/datatype/hvecblklen.c
@@ -0,0 +1,91 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mpitest.h"
+
+/* Inspired by the Intel MPI_Type_hvector_blklen test.
+   Added to include a test of a dataloop optimization that failed.
+   The buffers are signed char so that the -1 fill value compares equal
+   to -1 even where plain char is unsigned. */
+int main( int argc, char *argv[] )
+{
+    MPI_Datatype ot, ot2, newtype;
+    int position, psize, insize, outsize;
+    signed char *inbuf=0, *outbuf=0, *pbuf=0, *p;
+    int  i, j, k;
+    int  errs = 0;
+    int  veccount=16, stride=16;
+
+    MTest_Init( &argc, &argv );
+    /* Create a type with some padding: 59 chars of data, extent 64 */
+    MPI_Type_contiguous( 59, MPI_CHAR, &ot );
+    MPI_Type_create_resized( ot, 0, 64, &ot2 );
+    /*
+      Use a vector type with a block size equal to the stride - thus
+      tiling the target memory with copies of old type.  This is not
+      a contiguous copy since oldtype has a gap at the end.
+    */
+    MPI_Type_hvector( veccount, stride, stride*64, ot2, &newtype );
+    MPI_Type_commit( &newtype );
+
+    insize = veccount * stride * 64;
+    outsize = insize;
+    MPI_Pack_size( 1, newtype, MPI_COMM_WORLD, &psize );
+    inbuf = (signed char *)malloc( insize );
+    outbuf = (signed char *)malloc( outsize );
+    pbuf = (signed char *)malloc( psize );
+    if (!inbuf || !outbuf || !pbuf) {
+        fprintf( stderr, "Unable to allocate test buffers\n" );
+        MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+    for (i=0; i<outsize; i++) {
+        inbuf[i] = i % 64;
+        outbuf[i] = -1;
+    }
+
+    position = 0;
+    MPI_Pack( inbuf, 1, newtype, pbuf, psize, &position, MPI_COMM_WORLD );
+    psize    = position;
+    position = 0;
+    MPI_Unpack( pbuf, psize, &position, outbuf, 1, newtype, MPI_COMM_WORLD );
+
+    /* Check the output: data bytes round-trip, padding stays at -1 */
+    p = outbuf;
+    for (i=0; i<veccount; i++) {
+        for (j=0; j<stride; j++) {
+            for (k=0; k<59; k++) {
+                if (*p != k % 64) {
+                    errs++;
+                    fprintf( stderr, "[%d,%d,%d]expected %d but saw %d\n",
+                             i, j, k, (k%64), *p );
+                }
+                p++;
+            }
+            for (k=59; k<64; k++) {
+                if (*p != -1) {
+                    errs++;
+                    fprintf( stderr, "[%d,%d,%d]expected -1 but saw %d\n",
+                             i, j, k, *p );
+                }
+                p++;
+            }
+        }
+    }
+
+    free( pbuf );
+    free( inbuf );
+    free( outbuf );
+
+    MPI_Type_free( &ot );
+    MPI_Type_free( &ot2 );
+    MPI_Type_free( &newtype );
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}
diff --git a/test/mpi/datatype/structpack2.c b/test/mpi/datatype/structpack2.c
new file mode 100644
index 0000000..16b0151
--- /dev/null
+++ b/test/mpi/datatype/structpack2.c
@@ -0,0 +1,115 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include "mpitest.h"
+#include <stdlib.h>
+#include <stdio.h>
+/* The next is for isprint */
+#include <ctype.h>
+
+/* Pack and unpack an array of (int,char) structs locally and check that
+   every field survives the round trip.  No message is received, so the
+   expected values are derived from this process's own rank. */
+int main( int argc, char *argv[])
+{
+	struct a {	int	i;
+			char	c;
+		} s[10], s1[10];
+	int j;
+	int errs = 0;
+	int rank, tsize;
+	MPI_Aint text;
+	int blens[2];
+	MPI_Aint disps[2];
+	MPI_Datatype bases[2];
+	MPI_Datatype str, con;
+	char *buffer;
+	int   bufsize, position, insize;
+
+	MTest_Init( &argc, &argv );
+
+	MPI_Comm_rank( MPI_COMM_WORLD, &rank );
+
+	for( j = 0; j < 10; j ++ ) {
+		s[j].i = j + rank;
+		s[j].c = j + rank + 'a';
+	}
+
+	blens[0] = blens[1] = 1;
+	disps[0] = 0; disps[1] = sizeof(int);
+	bases[0] = MPI_INT; bases[1] = MPI_CHAR;
+	MPI_Type_struct( 2, blens, disps, bases, &str );
+	MPI_Type_commit( &str );
+	MPI_Type_contiguous( 10, str, &con );
+	MPI_Type_commit( &con );
+	MPI_Type_size( con, &tsize );
+	MPI_Type_extent( con, &text );
+
+#ifdef DEBUG
+	printf("Size of MPI array is %d, extent is %d\n", tsize, (int)text );
+        {
+	void * p1, *p2;
+	p1 = s;
+	p2 = s + 10;  /* One past the end: may be formed but not dereferenced */
+	printf("C array starts at %p and ends at %p for a length of %d\n",
+		(void *)s, (void *)&(s[9].c), (int)((char *)p2-(char *)p1) );
+        }
+#endif
+	MPI_Type_extent( str, &text );
+#ifdef DEBUG
+	MPI_Type_size( str, &tsize );
+	printf("Size of MPI struct is %d, extent is %d\n", tsize, (int)text );
+	printf("Size of C struct is %d\n", (int)sizeof(struct a) );
+#endif
+	if (text != sizeof(struct a)) {
+	    printf( "Extent of struct a (%d) does not match sizeof (%d)\n",
+		    (int)text, (int)sizeof(struct a) );
+	    errs++;
+	}
+
+	MPI_Pack_size(1, con, MPI_COMM_WORLD, &bufsize);
+	buffer = (char *) malloc(bufsize);
+	if (!buffer) MPI_Abort( MPI_COMM_WORLD, 1 );
+
+	position = 0;
+	MPI_Pack(s,1,con,buffer,bufsize,&position,MPI_COMM_WORLD);
+	insize   = position;
+	position = 0;
+	MPI_Unpack(buffer,insize,&position,s1,1,con,MPI_COMM_WORLD );
+
+	for( j = 0; j < 10; j++ ) {
+#ifdef DEBUG
+		printf("%d Sent: %d %c, Got: %d %c\n", rank,
+			s[j].i, s[j].c, s1[j].i, s1[j].c );
+#endif
+		if ( s1[j].i != j + rank ) {
+		    errs++;
+		    printf( "Got s[%d].i = %d (%x); expected %d\n", j, s1[j].i,
+			    s1[j].i, j + rank );
+		}
+		if ( s1[j].c != 'a' + j + rank ) {
+		    errs++;
+		    /* If the character is not a printing character,
+		       this can generate a file that diff, for example,
+		       believes is a binary file */
+		    if (isprint( (unsigned char)(s1[j].c) )) {
+			printf( "Got s[%d].c = %c; expected %c\n", j, s1[j].c,
+				j + rank + 'a');
+		    }
+		    else {
+			printf( "Got s[%d].c = %x; expected %c\n", j, (int)s1[j].c,
+				j + rank + 'a');
+		    }
+		}
+	}
+
+	free( buffer );
+	MPI_Type_free( &str );
+	MPI_Type_free( &con );
+	MTest_Finalize( errs );
+	MPI_Finalize();
+	return 0;
+}
diff --git a/test/mpi/datatype/testlist.in b/test/mpi/datatype/testlist.in
index 6e9b5c2..549c3eb 100644
--- a/test/mpi/datatype/testlist.in
+++ b/test/mpi/datatype/testlist.in
@@ -7,6 +7,7 @@ simple-pack-external 1
 transpose-pack 1
 slice-pack 1
 struct-pack 1
+structpack2 1
 typecommit 1
 typename 1
 typefree 1
@@ -49,6 +50,8 @@ struct-verydeep 1
 get-elements 1
 hindexed_block 1 mpiversion=3.0
 hindexed_block_contents 1 mpiversion=3.0
+vecblklen 1
+hvecblklen 1
 longdouble 1
 dataalign 2
 @largetest at large-count 1 mpiversion=3.0 xfail=ticket1767
diff --git a/test/mpi/datatype/vecblklen.c b/test/mpi/datatype/vecblklen.c
new file mode 100644
index 0000000..1dc4c55
--- /dev/null
+++ b/test/mpi/datatype/vecblklen.c
@@ -0,0 +1,91 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mpitest.h"
+
+/* Inspired by the Intel MPI_Type_vector_blklen test.
+   Added to include a test of a dataloop optimization that failed.
+   The buffers are signed char so that the -1 fill value compares equal
+   to -1 even where plain char is unsigned. */
+int main( int argc, char *argv[] )
+{
+    MPI_Datatype ot, ot2, newtype;
+    int position, psize, insize, outsize;
+    signed char *inbuf=0, *outbuf=0, *pbuf=0, *p;
+    int  i, j, k;
+    int  errs = 0;
+    int  veccount=16, stride=16;
+
+    MTest_Init( &argc, &argv );
+    /* Create a type with some padding: 59 chars of data, extent 64 */
+    MPI_Type_contiguous( 59, MPI_CHAR, &ot );
+    MPI_Type_create_resized( ot, 0, 64, &ot2 );
+    /*
+      Use a vector type with a block size equal to the stride - thus
+      tiling the target memory with copies of old type.  This is not
+      a contiguous copy since oldtype has a gap at the end.
+    */
+    MPI_Type_vector( veccount, stride, stride, ot2, &newtype );
+    MPI_Type_commit( &newtype );
+
+    insize = veccount * stride * 64;
+    outsize = insize;
+    MPI_Pack_size( 1, newtype, MPI_COMM_WORLD, &psize );
+    inbuf = (signed char *)malloc( insize );
+    outbuf = (signed char *)malloc( outsize );
+    pbuf = (signed char *)malloc( psize );
+    if (!inbuf || !outbuf || !pbuf) {
+        fprintf( stderr, "Unable to allocate test buffers\n" );
+        MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+    for (i=0; i<outsize; i++) {
+        inbuf[i] = i % 64;
+        outbuf[i] = -1;
+    }
+
+    position = 0;
+    MPI_Pack( inbuf, 1, newtype, pbuf, psize, &position, MPI_COMM_WORLD );
+    psize    = position;
+    position = 0;
+    MPI_Unpack( pbuf, psize, &position, outbuf, 1, newtype, MPI_COMM_WORLD );
+
+    /* Check the output: data bytes round-trip, padding stays at -1 */
+    p = outbuf;
+    for (i=0; i<veccount; i++) {
+        for (j=0; j<stride; j++) {
+            for (k=0; k<59; k++) {
+                if (*p != k % 64) {
+                    errs++;
+                    fprintf( stderr, "[%d,%d,%d]expected %d but saw %d\n",
+                             i, j, k, (k%64), *p );
+                }
+                p++;
+            }
+            for (k=59; k<64; k++) {
+                if (*p != -1) {
+                    errs++;
+                    fprintf( stderr, "[%d,%d,%d]expected -1 but saw %d\n",
+                             i, j, k, *p );
+                }
+                p++;
+            }
+        }
+    }
+
+    free( pbuf );
+    free( inbuf );
+    free( outbuf );
+
+    MPI_Type_free( &ot );
+    MPI_Type_free( &ot2 );
+    MPI_Type_free( &newtype );
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}

http://git.mpich.org/mpich.git/commitdiff/754b3d3803ee8c4142fe45601ea258f7669f3bbb

commit 754b3d3803ee8c4142fe45601ea258f7669f3bbb
Author: William Gropp <wgropp at illinois.edu>
Date:   Wed Feb 19 07:54:13 2014 -0600

    Add detail to error messages in darray-pack
    
    While debugging dataloop optimizations, I needed a little more information
    about the errors from this test.  No change to output when there are
    no errors.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/test/mpi/datatype/darray-pack.c b/test/mpi/datatype/darray-pack.c
index 4af6d6c..e2104e7 100644
--- a/test/mpi/datatype/darray-pack.c
+++ b/test/mpi/datatype/darray-pack.c
@@ -124,13 +124,13 @@ int darray_2d_c_test1(void)
 
 	    if ((i == rank) && (array[i] != rank)) {
 		errs++;
-		if (verbose) fprintf(stderr, "array[%d] = %d; should be %d\n",
-				     i, array[i], rank);
+		if (verbose) fprintf(stderr, "[2d array rank=%d]:array[%d] = %d; should be %d\n",
+				     rank, i, array[i], rank);
 	    }
 	    else if ((i != rank) && (array[i] != 0)) {
 		errs++;
-		if (verbose) fprintf(stderr, "array[%d] = %d; should be %d\n",
-				     i, array[i], 0);
+		if (verbose) fprintf(stderr, "[2d array rank=%d]:array[%d] = %d; should be %d\n",
+				     rank, i, array[i], 0);
 	    }
 	}
 	MPI_Type_free(&darray);
@@ -205,23 +205,23 @@ int darray_4d_c_test1(void)
 	for (i=0; i < 4*rank; i++) {
 	    if (array[i] != 0) {
 		errs++;
-		if (verbose) fprintf(stderr, "array[%d] = %d; should be %d\n",
-				     i, array[i], 0);
+		if (verbose) fprintf(stderr, "[4d array rank=%d]:array[%d] = %d; should be %d\n",
+				     rank, i, array[i], 0);
 	    }
 	}
 
 	for (i=4*rank; i < 4*rank + 4; i++) {
 	    if (array[i] != i) {
 		errs++;
-		if (verbose) fprintf(stderr, "array[%d] = %d; should be %d\n",
-				     i, array[i], i);
+		if (verbose) fprintf(stderr, "[4d array rank=%d]:array[%d] = %d; should be %d\n",
+				     rank, i, array[i], i);
 	    }
 	}
 	for (i=4*rank+4; i < 72; i++) {
 	    if (array[i] != 0) {
 		errs++;
-		if (verbose) fprintf(stderr, "array[%d] = %d; should be %d\n",
-				     i, array[i], 0);
+		if (verbose) fprintf(stderr, "[4d array rank=%d]:array[%d] = %d; should be %d\n",
+				     rank, i, array[i], 0);
 	    }
 	}
 

http://git.mpich.org/mpich.git/commitdiff/7c13f5537bf75a9aaa32a6b2a155d6e8c0f251d0

commit 7c13f5537bf75a9aaa32a6b2a155d6e8c0f251d0
Author: William Gropp <wgropp at illinois.edu>
Date:   Wed Feb 19 08:36:37 2014 -0600

    Fix incorrect use of status and buffer mgmt
    
    Mostly minor changes in copyright message, but also retained correction
    to erroneous use of the status returned by a send request, and correctly
    freeing only allocated buffers.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/test/mpi/datatype/large_type_sendrec.c b/test/mpi/datatype/large_type_sendrec.c
index a697ef5..056c4a7 100644
--- a/test/mpi/datatype/large_type_sendrec.c
+++ b/test/mpi/datatype/large_type_sendrec.c
@@ -1,6 +1,5 @@
 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
 /*
- *
  *  (C) 2013 by Argonne National Laboratory.
  *      See COPYRIGHT in top-level directory.
  */
@@ -152,7 +151,8 @@ int main(int argc, char * argv[])
             MPI_ASSERT(MPI_Get_elements_x( &(statuses[1]), MPI_CHAR, &(ocount[1]) ));
         } else if (rank==0) {
             MPI_ASSERT(MPI_Wait( &(requests[0]), &(statuses[0]) ));
-            MPI_ASSERT(MPI_Get_elements_x( &(statuses[0]), MPI_CHAR, &(ocount[0]) ));
+	    /* No valid fields in status from a send request (MPI-3 p53,
+	       line 1-5) */
         }
     }
 
@@ -168,8 +168,8 @@ int main(int argc, char * argv[])
 	}
     }
 
-    free(rbuf);
-    free(sbuf);
+    if (rbuf) free(rbuf);
+    if (sbuf) free(sbuf);
 
     MPI_ASSERT(MPI_Type_free(&bigtype));
 

http://git.mpich.org/mpich.git/commitdiff/7a80eb875e239817b163aadfb5b07ca769c27c05

commit 7a80eb875e239817b163aadfb5b07ca769c27c05
Author: William Gropp <wgropp at illinois.edu>
Date:   Wed Feb 19 07:45:05 2014 -0600

    Minor typo fix for datatype/dataloop code
    
    While adding dataloop optimizations, a number of places where there
    were errors in text or in nomenclature were found and fixed.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpid/common/datatype/dataloop/dataloop.c b/src/mpid/common/datatype/dataloop/dataloop.c
index 532c51d..2cca5b9 100644
--- a/src/mpid/common/datatype/dataloop/dataloop.c
+++ b/src/mpid/common/datatype/dataloop/dataloop.c
@@ -54,7 +54,7 @@
 /*@
   Dataloop_free - deallocate the resources used to store a dataloop
 
-Input Parameters:
+Input/output Parameters:
 . dataloop - pointer to dataloop structure
 @*/
 void PREPEND_PREFIX(Dataloop_free)(DLOOP_Dataloop **dataloop)
@@ -483,7 +483,7 @@ void PREPEND_PREFIX(Dataloop_alloc_and_copy)(int kind,
 
 /*@
   Dataloop_struct_alloc - allocate the resources used to store a dataloop and
-                          copy in old dataloop as appropriate.  this version
+                          copy in old dataloop as appropriate.  This version
                           is specifically for use when a struct dataloop is
                           being created; the space to hold old dataloops in
                           this case must be described back to the
diff --git a/src/mpid/common/datatype/dataloop/dataloop_create_contig.c b/src/mpid/common/datatype/dataloop/dataloop_create_contig.c
index 290bf2a..2f3421c 100644
--- a/src/mpid/common/datatype/dataloop/dataloop_create_contig.c
+++ b/src/mpid/common/datatype/dataloop/dataloop_create_contig.c
@@ -11,14 +11,17 @@
    Dataloop_contiguous - create the dataloop representation for a
    contiguous datatype
 
-   Arguments:
+   Input Parameters:
 +  int icount,
-.  MPI_Datatype oldtype,
-.  DLOOP_Dataloop **dlp_p,
-.  int *dlsz_p,
-.  int *dldepth_p,
+.  DLOOP_Type oldtype
 -  int flag
 
+   Output Parameters:
++  DLOOP_Dataloop **dlp_p,
+.  DLOOP_Size *dlsz_p,
+-  int *dldepth_p,
+
+
 .N Errors
 .N Returns 0 on success, -1 on failure.
 @*/
diff --git a/src/mpid/common/datatype/mpid_type_debug.c b/src/mpid/common/datatype/mpid_type_debug.c
index 2dc5ceb..2489dbd 100644
--- a/src/mpid/common/datatype/mpid_type_debug.c
+++ b/src/mpid/common/datatype/mpid_type_debug.c
@@ -429,7 +429,12 @@ char *MPIDU_Datatype_combiner_to_string(int combiner)
     return NULL;
 }
 
-/* --BEGIN ERROR HANDLING-- */
+/* --BEGIN DEBUG-- */
+/*
+ * You must configure MPICH2 with the logging option enabled (--enable-g=log)
+ * for these routines to print - in which case, they use the same options
+ * as the logging code, including print to file and control by class (DATATYPE)
+ */
 void MPIDU_Datatype_debug(MPI_Datatype type,
 			  int array_ct)
 {
@@ -660,4 +665,4 @@ void MPIDI_Datatype_contents_printf(MPI_Datatype type,
 	    __mpidi_datatype_free_and_return;
     }
 }
-/* --END ERROR HANDLING-- */
+/* --END DEBUG-- */
diff --git a/src/mpid/common/datatype/mpid_type_dup.c b/src/mpid/common/datatype/mpid_type_dup.c
index 46ad53e..2fdae83 100644
--- a/src/mpid/common/datatype/mpid_type_dup.c
+++ b/src/mpid/common/datatype/mpid_type_dup.c
@@ -24,7 +24,7 @@ Output Parameters:
 . newtype - handle of newly created copy of datatype
 
   Return Value:
-  0 on success, -1 on failure.
+  0 on success, MPI error code on failure.
 @*/
 int MPID_Type_dup(MPI_Datatype oldtype,
 		  MPI_Datatype *newtype)
@@ -66,9 +66,12 @@ int MPID_Type_dup(MPI_Datatype oldtype,
 	new_dtp->has_sticky_lb = old_dtp->has_sticky_lb;
 	new_dtp->is_permanent  = old_dtp->is_permanent;
 	new_dtp->is_committed  = old_dtp->is_committed;
-	new_dtp->attributes    = NULL; /* ??? */
-	new_dtp->cache_id      = -1; /* ??? */
-	new_dtp->name[0]       = 0; /* ??? */
+
+	new_dtp->attributes    = NULL; /* Attributes are copied in the
+					top-level MPI_Type_dup routine */
+	new_dtp->cache_id      = -1;   /* ??? */
+	new_dtp->name[0]       = 0;    /* The Object name is not copied on
+					  a dup */
 	new_dtp->n_elements    = old_dtp->n_elements;
 	new_dtp->element_size  = old_dtp->element_size;
 	new_dtp->eltype        = old_dtp->eltype;

http://git.mpich.org/mpich.git/commitdiff/12531694f51ca930806613014c1560abcc7a1357

commit 12531694f51ca930806613014c1560abcc7a1357
Author: William Gropp <wgropp at illinois.edu>
Date:   Mon Feb 3 13:13:02 2014 -0600

    Mark indexperf as failing perf test
    
    With these optimizations, most of the datatype performance tests should
    pass.  However, indexperf still fails, since these optimizations do not
    address the pattern in indexperf.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/test/mpi/perf/testlist b/test/mpi/perf/testlist
index fb206ed..4d665bc 100644
--- a/test/mpi/perf/testlist
+++ b/test/mpi/perf/testlist
@@ -1,10 +1,10 @@
-transp-datatype 2  xfail=ticket1788
+transp-datatype 2
 sendrecvl 2
-twovec 1  xfail=ticket1788
-dtpack 1  xfail=ticket1789
-nestvec 1  xfail=ticket1788
-nestvec2 1  xfail=ticket1788
-indexperf 1  xfail=ticket1788
+twovec 1
+dtpack 1
+nestvec 1
+nestvec2 1
+indexperf 1 xfail=ticket1788
 non_zero_root 4
 timer 1
 # The commcreatep test looks at how communicator creation scales with group

-----------------------------------------------------------------------

Summary of changes:
 src/mpid/common/datatype/dataloop/Makefile.mk      |    3 +-
 src/mpid/common/datatype/dataloop/dataloop.c       |    4 +-
 .../common/datatype/dataloop/dataloop_create.h     |   13 +-
 .../datatype/dataloop/dataloop_create_contig.c     |   13 +-
 .../datatype/dataloop/dataloop_create_struct.c     |   93 +++-
 .../common/datatype/dataloop/dataloop_optimize.c   |  668 ++++++++++++++++++++
 src/mpid/common/datatype/dataloop/dataloop_parts.h |    1 -
 .../common/datatype/dataloop/segment_packunpack.c  |   58 ++
 src/mpid/common/datatype/mpid_type_commit.c        |   13 +
 src/mpid/common/datatype/mpid_type_debug.c         |    9 +-
 src/mpid/common/datatype/mpid_type_dup.c           |   11 +-
 test/mpi/datatype/Makefile.am                      |    3 +
 test/mpi/datatype/darray-pack.c                    |   20 +-
 test/mpi/datatype/hvecblklen.c                     |   91 +++
 test/mpi/datatype/large_type_sendrec.c             |    8 +-
 test/mpi/datatype/{dataalign.c => structpack2.c}   |   83 ++--
 test/mpi/datatype/testlist.in                      |    3 +
 test/mpi/datatype/vecblklen.c                      |   91 +++
 test/mpi/perf/testlist                             |   12 +-
 19 files changed, 1112 insertions(+), 85 deletions(-)
 create mode 100644 src/mpid/common/datatype/dataloop/dataloop_optimize.c
 create mode 100644 test/mpi/datatype/hvecblklen.c
 copy test/mpi/datatype/{dataalign.c => structpack2.c} (51%)
 create mode 100644 test/mpi/datatype/vecblklen.c


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list