[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.1rc2-166-g38ef581

mysql vizuser noreply at mpich.org
Thu Jan 16 21:39:48 CST 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  38ef5818a883568e7dcc80f0e2aa0cfc972469be (commit)
      from  4e1b470dd1b47b514c65da9add3613fdd301c90c (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/38ef5818a883568e7dcc80f0e2aa0cfc972469be

commit 38ef5818a883568e7dcc80f0e2aa0cfc972469be
Author: Rob Latham <robl at mcs.anl.gov>
Date:   Mon Jan 13 15:43:51 2014 -0600

    a partial round of datatype optimizations
    
    Some datatype performance tests in the MPICH test suite fail:
    (perf/twovec,  perf/nestvec, perf/nestvec2, perf/indexperf,
    perf/transp-datatype).
    
    This changeset introduces a few optimizations that operate on the
    dataloop representation to make it more performant.  perf/indexperf
    should still fail under these changes.
    
    Original-author: Bill Gropp <wgropp at illinois.edu>
    
    See #1788, for which this resolves some but not all performance issues.
    
    Signed-off-by: Rob Latham <robl at mcs.anl.gov>

diff --git a/src/mpid/common/datatype/dataloop/Makefile.mk b/src/mpid/common/datatype/dataloop/Makefile.mk
index ec67180..e2f6cf1 100644
--- a/src/mpid/common/datatype/dataloop/Makefile.mk
+++ b/src/mpid/common/datatype/dataloop/Makefile.mk
@@ -21,7 +21,8 @@ lib_lib@MPILIBNAME@_la_SOURCES +=                                    \
     src/mpid/common/datatype/dataloop/segment_count.c                \
     src/mpid/common/datatype/dataloop/segment_flatten.c              \
     src/mpid/common/datatype/dataloop/segment_packunpack.c           \
-    src/mpid/common/datatype/dataloop/subarray_support.c
+    src/mpid/common/datatype/dataloop/subarray_support.c             \
+    src/mpid/common/datatype/dataloop/dataloop_optimize.c
 
 # several headers are included by the rest of MPICH
 AM_CPPFLAGS += -I$(top_srcdir)/src/mpid/common/datatype
diff --git a/src/mpid/common/datatype/dataloop/dataloop.c b/src/mpid/common/datatype/dataloop/dataloop.c
index 532c51d..2cca5b9 100644
--- a/src/mpid/common/datatype/dataloop/dataloop.c
+++ b/src/mpid/common/datatype/dataloop/dataloop.c
@@ -54,7 +54,7 @@
 /*@
   Dataloop_free - deallocate the resources used to store a dataloop
 
-Input Parameters:
+Input/output Parameters:
 . dataloop - pointer to dataloop structure
 @*/
 void PREPEND_PREFIX(Dataloop_free)(DLOOP_Dataloop **dataloop)
@@ -483,7 +483,7 @@ void PREPEND_PREFIX(Dataloop_alloc_and_copy)(int kind,
 
 /*@
   Dataloop_struct_alloc - allocate the resources used to store a dataloop and
-                          copy in old dataloop as appropriate.  this version
+                          copy in old dataloop as appropriate.  This version
                           is specifically for use when a struct dataloop is
                           being created; the space to hold old dataloops in
                           this case must be described back to the
diff --git a/src/mpid/common/datatype/dataloop/dataloop_create.h b/src/mpid/common/datatype/dataloop/dataloop_create.h
index f054429..803d7d4 100644
--- a/src/mpid/common/datatype/dataloop/dataloop_create.h
+++ b/src/mpid/common/datatype/dataloop/dataloop_create.h
@@ -94,5 +94,14 @@ DLOOP_Count PREPEND_PREFIX(Type_blockindexed_count_contig)(DLOOP_Count count,
                                                            const void *disp_array,
                                                            int dispinbytes,
                                                            DLOOP_Offset old_extent);
-                                                          
+
+int PREPEND_PREFIX(Dataloop_optimize)( DLOOP_Dataloop *dlpOld_p );
+
+int PREPEND_PREFIX(Dataloop_est_complexity)(DLOOP_Dataloop *,
+					    MPI_Aint *, MPI_Aint *);
+int PREPEND_PREFIX(Dataloop_est_struct_complexity)( int,
+						    const int [],
+						    const DLOOP_Type [],
+						    MPI_Aint *,
+						    MPI_Aint * );
 #endif
diff --git a/src/mpid/common/datatype/dataloop/dataloop_create_contig.c b/src/mpid/common/datatype/dataloop/dataloop_create_contig.c
index 290bf2a..2f3421c 100644
--- a/src/mpid/common/datatype/dataloop/dataloop_create_contig.c
+++ b/src/mpid/common/datatype/dataloop/dataloop_create_contig.c
@@ -11,14 +11,17 @@
    Dataloop_contiguous - create the dataloop representation for a
    contiguous datatype
 
-   Arguments:
+   Input Parameters:
 +  int icount,
-.  MPI_Datatype oldtype,
-.  DLOOP_Dataloop **dlp_p,
-.  int *dlsz_p,
-.  int *dldepth_p,
+.  DLOOP_Type oldtype
 -  int flag
 
+   Output Parameters:
++  DLOOP_Dataloop **dlp_p,
+.  DLOOP_Size *dlsz_p,
+-  int *dldepth_p,
+
+
 .N Errors
 .N Returns 0 on success, -1 on failure.
 @*/
diff --git a/src/mpid/common/datatype/dataloop/dataloop_create_struct.c b/src/mpid/common/datatype/dataloop/dataloop_create_struct.c
index 7d51b7b..47bbffc 100644
--- a/src/mpid/common/datatype/dataloop/dataloop_create_struct.c
+++ b/src/mpid/common/datatype/dataloop/dataloop_create_struct.c
@@ -11,6 +11,57 @@
 #error "You must explicitly include a header that sets the PREPEND_PREFIX and includes dataloop_parts.h"
 #endif
 
+
+/*
+=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
+
+categories :
+    - name        : DATATYPE
+      description : Datatype optimization parameters
+
+cvars:
+   - name         : MPIR_CVAR_DATALOOP_OPTIMIZE
+     category     : DATATYPE
+     type         : boolean
+     default      : true
+     class        : none
+     verbosity    : MPI_T_VERBOSITY_USER_BASIC
+     scope        : MPI_T_SCOPE_ALL_EQ
+     description  : >-
+       By default, the internal representation of an MPI datatype that
+       is used by MPICH to move data is very similar to the original
+       description of the datatype.  If this flag is true, additional
+       optimizations are used to improve the performance of datatypes.
+
+   - name        : MPIR_CVAR_DATALOOP_FLATTEN
+     category    : DATATYPE
+     type        : boolean
+     class       : none
+     default     : true
+     verbosity   : MPI_T_VERBOSITY_USER_BASIC
+     scope       : MPI_T_SCOPE_ALL_EQ
+     description : >-
+      If true, attempt to "flatten" the internal representation of
+      MPI struct datatypes (created with MPI_Type_create_struct).
+
+   - name        : MPIR_CVAR_DATALOOP_FLATTEN_MULT
+     category    : DATATYPE
+     type        : int
+     class       : none
+     default     : 2
+     verbosity   : MPI_T_VERBOSITY_USER_BASIC
+     scope       : MPI_T_SCOPE_ALL_EQ
+     description : >-
+       Flattening an MPI struct datatype does not always improve
+       performance.  This parameter is a threshold that is used in
+       comparing the size of the description with the amount of data
+       moved.  Larger values make it more likely that a struct datatype
+       will be flattened.  The default value is adequate for flattening
+       simple structs, and will usually avoid flattening structs
+       containing vectors or block-indexed data.
+
+=== END_MPI_T_CVAR_INFO_BLOCK ===
+*/
 static int DLOOP_Dataloop_create_struct_memory_error(void);
 static int DLOOP_Dataloop_create_unique_type_struct(DLOOP_Count count,
 						    const int *blklens,
@@ -238,19 +289,37 @@ int PREPEND_PREFIX(Dataloop_create_struct)(DLOOP_Count count,
      * if caller asked for homogeneous or all bytes representation,
      * flatten the type and store it as an indexed type so that
      * there are no branches in the dataloop tree.
+     *
+     * Note that this is not always an optimization - for example,
+     * replacing two long block_indexed with one longer indexed (with
+     * the additional blockcount array) is likely to be slower, because
+     * of the additional memory motion required.
      */
-    if ((flag == DLOOP_DATALOOP_HOMOGENEOUS) ||
-	     (flag == DLOOP_DATALOOP_ALL_BYTES))
-    {
-	return DLOOP_Dataloop_create_flattened_struct(count,
-						      blklens,
-						      disps,
-						      oldtypes,
-						      dlp_p,
-						      dlsz_p,
-						      dldepth_p,
-						      flag);
-    }
+    if (MPIR_CVAR_DATALOOP_FLATTEN && (
+	(flag == DLOOP_DATALOOP_HOMOGENEOUS) ||
+	(flag == DLOOP_DATALOOP_ALL_BYTES) ))
+	{
+	    MPI_Aint nElms = 0, nDesc = 0;
+	    PREPEND_PREFIX(Dataloop_est_struct_complexity)( count,
+							    blklens,
+							    oldtypes,
+							    &nElms,
+							    &nDesc );
+
+	    /* Only convert to flattened if the flattened description
+	       is likely to be more efficient.  We estimate this by comparing
+	       the weighted descriptor count (nDesc) with the data moved (nElms). */
+	    if ( nDesc * 24 * MPIR_CVAR_DATALOOP_FLATTEN_MULT > nElms) {
+		return DLOOP_Dataloop_create_flattened_struct(count,
+							      blklens,
+							      disps,
+							      oldtypes,
+							      dlp_p,
+							      dlsz_p,
+							      dldepth_p,
+							      flag);
+	    }
+	}
 
     /* scan through types and gather derived type info */
     for (i=0; i < count; i++)
diff --git a/src/mpid/common/datatype/dataloop/dataloop_optimize.c b/src/mpid/common/datatype/dataloop/dataloop_optimize.c
new file mode 100644
index 0000000..54cd14c
--- /dev/null
+++ b/src/mpid/common/datatype/dataloop/dataloop_optimize.c
@@ -0,0 +1,514 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "dataloop.h"
+
+#define MPICH_DEBUG_DATALOOP
+#ifdef MPICH_DEBUG_DATALOOP
+static int level = 0;
+static int printDataloop = 0;
+static int printIfOptimized = 0;
+
+/* Print format:
+   (spaces for level).(el_size,el_extent,el_type)(count)....
+*/
+static void dl_print_dataloop( int, DLOOP_Dataloop * );
+static void dl_print_contig( int, DLOOP_Dataloop * );
+static void dl_print_vector( int, DLOOP_Dataloop * );
+static void dl_print_blockindexed( int, DLOOP_Dataloop * );
+static void dl_print_struct( int, DLOOP_Dataloop * );
+static void dl_print( int, const char * );
+
+static void dl_print_tab( int l )
+{
+    int i;
+    for (i=2*l; i!=0; i--) printf( "%c", ' ' );
+}
+static void dl_print_base( DLOOP_Dataloop *dp )
+{
+    printf( "(%ld,%ld,%lx)(%ld)", (long)dp->el_size, (long)dp->el_extent,
+	    (long)dp->el_type, (long)dp->loop_params.count );
+}
+static void dl_print( int l, const char *s )
+{
+    dl_print_tab(l);
+    printf( "%s", s );
+}
+static void dl_print_contig( int l, DLOOP_Dataloop *dp )
+{
+    dl_print_tab(l);
+    printf( "CONTIG " );
+    dl_print_base( dp );
+    printf( "\n" );
+}
+static void dl_print_vector( int l, DLOOP_Dataloop *dp )
+{
+    int stride = dp->loop_params.v_t.stride;
+    int blocksize = dp->loop_params.v_t.blocksize ;
+    dl_print_tab(l);
+    printf( "VECTOR " );
+    dl_print_base( dp );
+    printf( ":Stride %d Blocksize %d\n", stride, blocksize );
+}
+static void dl_print_blockindexed( int l, DLOOP_Dataloop *dp )
+{
+    int blocksize = dp->loop_params.bi_t.blocksize ;
+    DLOOP_Offset *offarray = dp->loop_params.bi_t.offset_array;
+    int i, n;
+    dl_print_tab(l);
+    printf( "BLOCKINDEXED " );
+    dl_print_base( dp );
+    printf( ":Blocksize %d:", blocksize );
+    n = dp->loop_params.bi_t.count;
+    if (n > 8) n = 8;
+    for (i=0; i<n; i++) {
+	printf( "%lx,", (long)offarray[i] );
+    }
+    if (dp->loop_params.bi_t.count > n) printf( "..." );
+    printf( "\n" );
+}
+static void dl_print_indexed( int l, DLOOP_Dataloop *dp )
+{
+    DLOOP_Count  *blocksizearray = dp->loop_params.i_t.blocksize_array ;
+    DLOOP_Offset *offarray = dp->loop_params.i_t.offset_array;
+    int          i, n;
+    int          minblock, maxblock;
+    dl_print_tab(l);
+    printf( "INDEXED " );
+    dl_print_base( dp );
+    n = dp->loop_params.i_t.count;
+    minblock = maxblock = (n>0) ? blocksizearray[0] : 0;
+    for (i=0; i<n; i++) {
+	if (blocksizearray[i] > maxblock) maxblock = blocksizearray[i];
+	if (blocksizearray[i] < minblock) minblock = blocksizearray[i];
+    }
+    printf( "blocks in [%d,%d]", minblock, maxblock );
+
+    if (n > 8) n = 8;
+    for (i=0; i<n; i++) {
+	printf( "(%lx,%ld)", (long)offarray[i], (long)blocksizearray[i] );
+    }
+    if (dp->loop_params.i_t.count > n) printf( "..." );
+    printf( "\n" );
+}
+
+static void dl_print_struct( int l, DLOOP_Dataloop *dp )
+{
+    DLOOP_Count  *blocksizearray = dp->loop_params.s_t.blocksize_array ;
+    DLOOP_Offset *offarray = dp->loop_params.s_t.offset_array;
+    DLOOP_Dataloop **looparray = dp->loop_params.s_t.dataloop_array;
+    int          i, n;
+    dl_print_tab(l);
+    printf( "STRUCT " );
+    dl_print_base( dp );
+    printf( "\n" );
+    n = dp->loop_params.i_t.count;
+    if (n > 8) n = 8;
+    for (i=0; i<n; i++) {
+	dl_print_tab(l+1);
+	printf( "(%lx,%ld)", (long)offarray[i], (long)blocksizearray[i] );
+	dl_print_dataloop( l+1, looparray[i] );
+	printf( "\n" );
+    }
+    if (dp->loop_params.i_t.count > n) printf( "...\n" );
+}
+static void dl_print_dataloop( int l, DLOOP_Dataloop *dp )
+{
+    dl_print_tab( l );
+    dl_print_base( dp );
+    switch (dp->kind & DLOOP_KIND_MASK) {
+    case DLOOP_KIND_CONTIG:
+	dl_print_contig( l, dp );
+	break;
+    case DLOOP_KIND_VECTOR:
+	dl_print_vector( l, dp );
+	break;
+    case DLOOP_KIND_BLOCKINDEXED:
+	dl_print_blockindexed( l, dp );
+	break;
+    case DLOOP_KIND_INDEXED:
+	dl_print_indexed( l, dp );
+	break;
+    case DLOOP_KIND_STRUCT:
+	dl_print_struct( l, dp );
+	break;
+    default:
+	dl_print( l, "Unknown dataloop type " );
+	printf( "\n" );
+	break;
+    }
+}
+#endif
+
+/*
+ * Indicates whether a dataloop is a basic and final contig type.
+ * This can be used to determine when a contig type can be removed
+ * in a dataloop.
+ */
+static int dl_contig_isFinal( DLOOP_Dataloop *dp )
+{
+    if ((dp->kind & DLOOP_KIND_MASK) != DLOOP_KIND_CONTIG) return 0;
+    if (dp->el_size == dp->el_extent &&
+	(dp->kind & DLOOP_FINAL_MASK))
+	return 1;
+    return 0;
+}
+
+/*
+ * Optimize a dataloop
+ *
+ * Apply the following transformations and return a new dataloop.
+ * 1. Convert all predefined types to UINTS with the best alignment (may be BYTE
+ *    in worst case)
+ * 2. Convert blocks of contiguous into a single block of basic unit (e.g.,
+ *    a vector type with a block count of 27 applied to a contiguous type of
+ *    6 ints will be turned into a block count of (27*6) UINTs)
+ * 3. Convert struct (with different dataloops from different MPI datatypes)
+ *    into indexed when all types are contig
+ * 4. Convert dataloops with counts of 1 into simpler types (e.g., a vector
+ *    with 1 element is really a contig type)
+ *
+ * Value of these optimizations
+ * A 2012 paper compared performance of Open MPI, MPICH2, and user-written code
+ * for some datatypes, and found MPICH2 often performed worse than other
+ * options.  An investigation showed that some of the issues are due to
+ * a failure to perform optimizations of these types (especially #1 and #2).
+ * It may also be necessary to enhance the dataloop execution engine, but
+ * that will be a separate step.
+ */
+int PREPEND_PREFIX(Dataloop_optimize)(DLOOP_Dataloop *dlpOld_p )
+{
+    int i;
+
+#ifdef MPICH_DEBUG_DATALOOP
+    /* Temp for debugging */
+    static int firstCall = 1;
+    /* This is threadsafe in the sense that we don't care */
+    if (firstCall) {
+	if (getenv("MPICH_DATALOOP_PRINT")) {
+	    printDataloop = 1;
+	    printIfOptimized = 1;
+	}
+	firstCall = 0;
+    }
+#endif
+
+    switch (dlpOld_p->kind & DLOOP_KIND_MASK) {
+    case DLOOP_KIND_CONTIG:
+#ifdef MPICH_DEBUG_DATALOOP
+	if (printDataloop)
+	    dl_print_contig( level, dlpOld_p );
+#endif
+	/* replace contig of (non-basic) contig with contig (basic) */
+	if (!(dlpOld_p->kind & DLOOP_FINAL_MASK)) {
+	    DLOOP_Dataloop *dlpChild_p = dlpOld_p->loop_params.c_t.dataloop;
+	    level++;
+	    PREPEND_PREFIX(Dataloop_optimize)( dlpChild_p );
+	    level--;
+	    if (dl_contig_isFinal( dlpChild_p ) ) {
+		dlpOld_p->loop_params.c_t.count *= dlpChild_p->loop_params.c_t.count;
+		dlpOld_p->el_size   = dlpChild_p->el_size;
+		dlpOld_p->el_extent = dlpChild_p->el_extent;
+		dlpOld_p->kind     |= DLOOP_FINAL_MASK;
+		dlpOld_p->loop_params.c_t.dataloop = 0;
+#ifdef MPICH_DEBUG_DATALOOP
+		if (printIfOptimized || printDataloop) {
+		    printf( "replacement contig is:\n" );
+		    dl_print_contig( level, dlpOld_p );
+		}
+#endif
+	    }
+	}
+	break;
+
+    case DLOOP_KIND_VECTOR:
+	/* if sub-dloop is (non-basic) contig, merge with blockcount */
+#ifdef MPICH_DEBUG_DATALOOP
+	if (printDataloop)
+	    dl_print_vector( level, dlpOld_p );
+#endif
+
+	if (!(dlpOld_p->kind & DLOOP_FINAL_MASK)) {
+	    DLOOP_Dataloop *dlpChild_p = dlpOld_p->loop_params.v_t.dataloop;
+	    level++;
+	    PREPEND_PREFIX(Dataloop_optimize)( dlpChild_p );
+	    level--;
+
+	    if (dl_contig_isFinal( dlpChild_p ) ) {
+		/* We can replace the contig type by enlarging the blocksize */
+
+		/* Reset the kind to final and null the child pointer (no free here) */
+		dlpOld_p->loop_params.v_t.blocksize *= dlpChild_p->loop_params.count;
+		dlpOld_p->el_size   = dlpChild_p->el_size;
+		dlpOld_p->el_extent = dlpChild_p->el_extent;
+		dlpOld_p->kind     |= DLOOP_FINAL_MASK;
+		dlpOld_p->loop_params.v_t.dataloop = 0;
+#ifdef MPICH_DEBUG_DATALOOP
+		if (printIfOptimized || printDataloop) {
+		    printf( "replacement Vector is:\n" );
+		    dl_print_vector( level, dlpOld_p );
+		}
+#endif
+	    }
+	}
+	/* replace vector of a single element with contig */
+	if ((dlpOld_p->kind & DLOOP_FINAL_MASK)) {
+	    int blocksize = dlpOld_p->loop_params.v_t.blocksize;
+	    int count     = dlpOld_p->loop_params.v_t.count;
+	    if (dlpOld_p->el_size * blocksize ==
+		dlpOld_p->loop_params.v_t.stride ) {
+		dlpOld_p->kind = DLOOP_KIND_CONTIG | DLOOP_FINAL_MASK;
+		dlpOld_p->loop_params.c_t.dataloop = 0;
+		dlpOld_p->loop_params.c_t.count = count * blocksize;
+#ifdef MPICH_DEBUG_DATALOOP
+		if (printIfOptimized || printDataloop) {
+		    printf( "replacement Contig is:\n" );
+		    dl_print_contig( level, dlpOld_p );
+		}
+#endif
+	    }
+	}
+	/* replace vector that is contiguous with contiguous */
+	break;
+
+    case DLOOP_KIND_BLOCKINDEXED:
+	/* if subdloop is (non-basic) contig, merge with blockcount */
+#ifdef MPICH_DEBUG_DATALOOP
+	if (printDataloop)
+	    dl_print_blockindexed( level, dlpOld_p );
+#endif
+	if (!(dlpOld_p->kind & DLOOP_FINAL_MASK)) {
+	    DLOOP_Dataloop *dlpChild_p = dlpOld_p->loop_params.bi_t.dataloop;
+	    level++;
+	    PREPEND_PREFIX(Dataloop_optimize)( dlpChild_p );
+	    level--;
+	    if (dl_contig_isFinal( dlpChild_p ) ) {
+		/* We can replace the contig type by enlarging the blocksize */
+
+		/* Reset the kind to final and null the child pointer (no free here) */
+		dlpOld_p->loop_params.bi_t.blocksize *= dlpChild_p->loop_params.count;
+		dlpOld_p->el_size   = dlpChild_p->el_size;
+		dlpOld_p->el_extent = dlpChild_p->el_extent;
+		dlpOld_p->kind     |= DLOOP_FINAL_MASK;
+		dlpOld_p->loop_params.bi_t.dataloop = 0;
+#ifdef MPICH_DEBUG_DATALOOP
+		if (printIfOptimized || printDataloop) {
+		    printf( "replacement BlockIndexed is:\n" );
+		    dl_print_blockindexed( level, dlpOld_p );
+		}
+#endif
+	    }
+	}
+	/* replace blockindexed of a single element with contig */
+	break;
+    case DLOOP_KIND_INDEXED:
+	/* if sub-dloop is (non-basic) contig, merge with blockcount */
+#ifdef MPICH_DEBUG_DATALOOP
+	if (printDataloop)
+	    dl_print_indexed( level, dlpOld_p );
+#endif
+	if (!(dlpOld_p->kind & DLOOP_FINAL_MASK)) {
+	    DLOOP_Dataloop *dlpChild_p = dlpOld_p->loop_params.i_t.dataloop;
+	    level++;
+	    PREPEND_PREFIX(Dataloop_optimize)( dlpChild_p );
+	    level--;
+	    if (dl_contig_isFinal( dlpChild_p ) ) {
+		/* Could include the child type in the blocksize counts */
+	    }
+	}
+
+	/* replace indexed of constant block count with blockindexed */
+
+	/* replace indexed of a single element with contig */
+
+	/* If all block counts are multiples of the smallest, and if most
+	   blocks are smallest, then the other blocks could be split into
+	   separate blocks with appropriate offsets, replacing indexed with
+	   blockindexed */
+
+	break;
+
+    case DLOOP_KIND_STRUCT:
+	/* if sub-dloops are all contig, replace with indexed */
+#ifdef MPICH_DEBUG_DATALOOP
+	if (printDataloop)
+	    dl_print_struct( level, dlpOld_p );
+#endif
+	if (!(dlpOld_p->kind & DLOOP_FINAL_MASK)) {
+	    for (i=0; i<dlpOld_p->loop_params.s_t.count; i++) {
+		level ++;
+		PREPEND_PREFIX(Dataloop_optimize)(
+			  dlpOld_p->loop_params.s_t.dataloop_array[i] );
+		level --;
+	    }
+	}
+	break;
+    default:
+#ifdef MPICH_DEBUG_DATALOOP
+	if (printDataloop)
+	    dl_print( level, "Unknown type!" );
+#endif
+	break;
+    }
+
+    return 0;
+}
+
+
+/*
+ * Make an estimate of the complexity of a datatype.  This can be used
+ * to determine whether flattening the datatype to an indexed type is
+ * likely to be efficient.
+ */
+int PREPEND_PREFIX(Dataloop_est_complexity)(DLOOP_Dataloop *dlp_p,
+					    MPI_Aint *nElms,
+					    MPI_Aint *nDesc )
+{
+    int i;
+    MPI_Aint myElms = 0;
+    MPI_Aint myDesc = 0;
+    MPI_Aint childElms = 0, childDesc = 0;
+    DLOOP_Dataloop *dlpChild_p;
+
+    switch (dlp_p->kind & DLOOP_KIND_MASK) {
+    case DLOOP_KIND_CONTIG:
+        /* Data moved is count*size of the child type */
+
+	if (!(dlp_p->kind & DLOOP_FINAL_MASK)) {
+	    dlpChild_p = dlp_p->loop_params.c_t.dataloop;
+	    PREPEND_PREFIX(Dataloop_est_complexity)( dlpChild_p, &childElms,
+						     &childDesc );
+	}
+	else {
+	    childElms = dlp_p->el_size;
+	    childDesc = 0;
+	}
+	myElms += dlp_p->loop_params.c_t.count * childElms;
+	myDesc += childDesc + 1;
+
+	break;
+
+    case DLOOP_KIND_VECTOR:
+        /* Data moved is count*size of the child type */
+
+	if (!(dlp_p->kind & DLOOP_FINAL_MASK)) {
+	    dlpChild_p = dlp_p->loop_params.v_t.dataloop;
+	    PREPEND_PREFIX(Dataloop_est_complexity)( dlpChild_p, &childElms,
+						     &childDesc );
+	}
+	else {
+	    childElms = dlp_p->el_size;
+	    childDesc = 0;
+	}
+	myElms += dlp_p->loop_params.v_t.count *
+	    dlp_p->loop_params.v_t.blocksize * childElms;
+	myDesc += childDesc + 2;
+
+	break;
+
+    case DLOOP_KIND_BLOCKINDEXED:
+	if (!(dlp_p->kind & DLOOP_FINAL_MASK)) {
+	    dlpChild_p = dlp_p->loop_params.bi_t.dataloop;
+	    PREPEND_PREFIX(Dataloop_est_complexity)( dlpChild_p, &childElms,
+						     &childDesc );
+	}
+	else {
+	    childElms = dlp_p->el_size;
+	    childDesc = 0;
+	}
+	myElms += dlp_p->loop_params.bi_t.count *
+	    dlp_p->loop_params.bi_t.blocksize * childElms;
+	myDesc += childDesc + dlp_p->loop_params.bi_t.count;
+	break;
+
+    case DLOOP_KIND_INDEXED:
+
+	if (!(dlp_p->kind & DLOOP_FINAL_MASK)) {
+	    dlpChild_p = dlp_p->loop_params.i_t.dataloop;
+	    PREPEND_PREFIX(Dataloop_est_complexity)( dlpChild_p, &childElms,
+						     &childDesc );
+	}
+	else {
+	    childElms = dlp_p->el_size;
+	    childDesc = 0;
+	}
+	myElms += dlp_p->loop_params.i_t.total_blocks * childElms;
+	myDesc += childDesc + 2 * dlp_p->loop_params.i_t.count;
+
+	break;
+
+    case DLOOP_KIND_STRUCT:
+	if (!(dlp_p->kind & DLOOP_FINAL_MASK)) {
+	    MPI_Aint celm, cdesc;
+	    for (i=0; i<dlp_p->loop_params.s_t.count; i++) {
+		celm = 0; cdesc = 0;
+		PREPEND_PREFIX(Dataloop_est_complexity)(
+		 	       dlp_p->loop_params.s_t.dataloop_array[i],
+			       &celm, &cdesc );
+		childElms += celm * dlp_p->loop_params.s_t.blocksize_array[i];
+		childDesc += cdesc + 3;
+	    }
+	}
+	else {
+	    int elsize = dlp_p->el_size;
+	    childElms = 0;
+	    childDesc = 0;
+	    for (i=0; i<dlp_p->loop_params.s_t.count; i++) {
+		childElms += elsize * dlp_p->loop_params.s_t.blocksize_array[i];
+		childDesc += 3;
+	    }
+	}
+
+	myElms += childElms;
+	myDesc += childDesc;
+	break;
+
+    default:
+	break;
+    }
+
+    /* Return the final values */
+    *nElms += myElms;
+    *nDesc += myDesc;
+
+    return 0;
+}
+
+/*
+ * Estimate the complexity of a struct Dataloop before it is constructed.
+ */
+int PREPEND_PREFIX(Dataloop_est_struct_complexity)( int count,
+						    const int blklens[],
+						    const DLOOP_Type oldtypes[],
+						    MPI_Aint *nElms,
+						    MPI_Aint *nDesc )
+{
+    MPI_Aint myElms = 0, myDesc = 0;
+    int i;
+    int flag = MPID_DATALOOP_ALL_BYTES;
+
+    for (i=0; i<count; i++) {
+	DLOOP_Dataloop *dlp_p = 0;
+	MPI_Aint celms = 0, cdesc = 0;
+
+	DLOOP_Handle_get_loopptr_macro(oldtypes[i],dlp_p,flag);
+	if (dlp_p) {
+	    PREPEND_PREFIX(Dataloop_est_complexity)( dlp_p,
+						     &celms, &cdesc );
+	}
+	else {
+	    celms = 1;
+	    cdesc = 1;
+	}
+	myElms += celms * blklens[i];
+	myDesc += cdesc;
+    }
+    *nElms = myElms;
+    *nDesc = myDesc;
+
+    return MPI_SUCCESS;
+}
diff --git a/src/mpid/common/datatype/dataloop/segment_packunpack.c b/src/mpid/common/datatype/dataloop/segment_packunpack.c
index 7446b59..5de8053 100644
--- a/src/mpid/common/datatype/dataloop/segment_packunpack.c
+++ b/src/mpid/common/datatype/dataloop/segment_packunpack.c
@@ -283,6 +283,22 @@ int PREPEND_PREFIX(Segment_blkidx_m2m)(DLOOP_Offset *blocks_p,
 
     DLOOP_Handle_get_size_macro(el_type, el_size);
 
+    /* If the blocklen * el_size is relatively small, then for
+       performance reasons, it's important to hoist most of these
+       tests out of the loop.  Ignoring some of the issues of handling
+       the available buffer size (blocks_left), this should translate
+       directly into code that looks like this for blocksize == 1
+
+       for (i=0; i<count; i++) {
+            dest[i] = userbuf[offsetarray[i]];
+       }
+
+       where "dest" and "userbuf" are pointers to objects of the correct
+       size.  If blocksize is > 1, then various unrollings are important
+       until blocksize is large enough to make the overhead of memcpy
+       negligible.  Datatypes such as this are used in LAMMPS, for example.
+    */
+
     while (blocks_left) {
 	char *src, *dest;
 
diff --git a/src/mpid/common/datatype/mpid_type_commit.c b/src/mpid/common/datatype/mpid_type_commit.c
index a384e50..7a9091b 100644
--- a/src/mpid/common/datatype/mpid_type_commit.c
+++ b/src/mpid/common/datatype/mpid_type_commit.c
@@ -19,6 +19,7 @@ Output Parameters:
   Return Value:
   0 on success, -1 on failure.
 @*/
+
 int MPID_Type_commit(MPI_Datatype *datatype_p)
 {
     int           mpi_errno=MPI_SUCCESS;
@@ -57,9 +58,13 @@ int MPID_Type_commit(MPI_Datatype *datatype_p)
 	MPIU_DBG_PRINTF(("# contig blocks = %d\n",
 			 (int) datatype_ptr->max_contig_blocks));
 
+	if (MPIR_CVAR_DATALOOP_OPTIMIZE) {
+	    MPID_Dataloop_optimize(datatype_ptr->dataloop );
+        }
 #if 0
         MPIDI_Dataloop_dot_printf(datatype_ptr->dataloop, 0, 1);
 #endif
+
     }
 
     return mpi_errno;
diff --git a/src/mpid/common/datatype/mpid_type_debug.c b/src/mpid/common/datatype/mpid_type_debug.c
index 2dc5ceb..2489dbd 100644
--- a/src/mpid/common/datatype/mpid_type_debug.c
+++ b/src/mpid/common/datatype/mpid_type_debug.c
@@ -429,7 +429,12 @@ char *MPIDU_Datatype_combiner_to_string(int combiner)
     return NULL;
 }
 
-/* --BEGIN ERROR HANDLING-- */
+/* --BEGIN DEBUG-- */
+/*
+ * You must configure MPICH2 with the logging option enabled (--enable-g=log)
+ * for these routines to print - in which case, they use the same options
+ * as the logging code, including print to file and control by class (DATATYPE)
+ */
 void MPIDU_Datatype_debug(MPI_Datatype type,
 			  int array_ct)
 {
@@ -660,4 +665,4 @@ void MPIDI_Datatype_contents_printf(MPI_Datatype type,
 	    __mpidi_datatype_free_and_return;
     }
 }
-/* --END ERROR HANDLING-- */
+/* --END DEBUG-- */
diff --git a/src/mpid/common/datatype/mpid_type_dup.c b/src/mpid/common/datatype/mpid_type_dup.c
index 46ad53e..2fdae83 100644
--- a/src/mpid/common/datatype/mpid_type_dup.c
+++ b/src/mpid/common/datatype/mpid_type_dup.c
@@ -24,7 +24,7 @@ Output Parameters:
 . newtype - handle of newly created copy of datatype
 
   Return Value:
-  0 on success, -1 on failure.
+  0 on success, MPI error code on failure.
 @*/
 int MPID_Type_dup(MPI_Datatype oldtype,
 		  MPI_Datatype *newtype)
@@ -66,9 +66,12 @@ int MPID_Type_dup(MPI_Datatype oldtype,
 	new_dtp->has_sticky_lb = old_dtp->has_sticky_lb;
 	new_dtp->is_permanent  = old_dtp->is_permanent;
 	new_dtp->is_committed  = old_dtp->is_committed;
-	new_dtp->attributes    = NULL; /* ??? */
-	new_dtp->cache_id      = -1; /* ??? */
-	new_dtp->name[0]       = 0; /* ??? */
+
+	new_dtp->attributes    = NULL; /* Attributes are copied in the
+					top-level MPI_Type_dup routine */
+	new_dtp->cache_id      = -1;   /* ??? */
+	new_dtp->name[0]       = 0;    /* The Object name is not copied on
+					  a dup */
 	new_dtp->n_elements    = old_dtp->n_elements;
 	new_dtp->element_size  = old_dtp->element_size;
 	new_dtp->eltype        = old_dtp->eltype;
diff --git a/test/mpi/perf/testlist b/test/mpi/perf/testlist
index fb206ed..4d665bc 100644
--- a/test/mpi/perf/testlist
+++ b/test/mpi/perf/testlist
@@ -1,10 +1,10 @@
-transp-datatype 2  xfail=ticket1788
+transp-datatype 2
 sendrecvl 2
-twovec 1  xfail=ticket1788
-dtpack 1  xfail=ticket1789
-nestvec 1  xfail=ticket1788
-nestvec2 1  xfail=ticket1788
-indexperf 1  xfail=ticket1788
+twovec 1
+dtpack 1
+nestvec 1
+nestvec2 1
+indexperf 1 xfail=ticket1788
 non_zero_root 4
 timer 1
 # The commcreatep test looks at how communicator creation scales with group

-----------------------------------------------------------------------

Summary of changes:
 src/mpid/common/datatype/dataloop/Makefile.mk      |    3 +-
 src/mpid/common/datatype/dataloop/dataloop.c       |    4 +-
 .../common/datatype/dataloop/dataloop_create.h     |   11 +-
 .../datatype/dataloop/dataloop_create_contig.c     |   13 +-
 .../datatype/dataloop/dataloop_create_struct.c     |   93 +++-
 .../common/datatype/dataloop/dataloop_optimize.c   |  514 ++++++++++++++++++++
 .../common/datatype/dataloop/segment_packunpack.c  |   16 +
 src/mpid/common/datatype/mpid_type_commit.c        |    5 +
 src/mpid/common/datatype/mpid_type_debug.c         |    9 +-
 src/mpid/common/datatype/mpid_type_dup.c           |   11 +-
 test/mpi/perf/testlist                             |   12 +-
 11 files changed, 658 insertions(+), 33 deletions(-)
 create mode 100644 src/mpid/common/datatype/dataloop/dataloop_optimize.c


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list