[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.1.2-119-g9c3e547

Service Account noreply at mpich.org
Wed Aug 27 13:47:28 CDT 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  9c3e5475e4e3006e1db1f112805bdb48e1130075 (commit)
      from  ffa9c6d675c8463f903aa0b186f883f7c5cd218f (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/9c3e5475e4e3006e1db1f112805bdb48e1130075

commit 9c3e5475e4e3006e1db1f112805bdb48e1130075
Author: Pavan Balaji <balaji at anl.gov>
Date:   Tue Aug 26 16:26:50 2014 -0500

    Initial HCOLL integration.
    
    Most of the hcoll code is in a separate directory, expect for a few
    changes in mainline mpich.
    
    1. The comm structure stores some hcoll specific data structures.
    
    2. The nemesis and sock progress engines need to poke the hcoll progress.
    
    3. CH3 added comm creation hooks into hcoll.
    
    Signed-off-by: Devendar Bureddy <devendar at mellanox.com>
    Signed-off-by: Antonio J. Pena <apenya at mcs.anl.gov>

diff --git a/src/include/mpiimpl.h b/src/include/mpiimpl.h
index 64043e4..f3ede9e 100644
--- a/src/include/mpiimpl.h
+++ b/src/include/mpiimpl.h
@@ -237,6 +237,10 @@ static MPIU_DBG_INLINE_KEYWORD void MPIUI_Memcpy(void * dst, const void * src, s
 /* Routines for memory management */
 #include "mpimem.h"
 
+#if defined HAVE_LIBHCOLL
+#include "../mpid/common/hcoll/hcollpre.h"
+#endif
+
 /*
  * Use MPIU_SYSCALL to wrap system calls; this provides a convenient point
  * for timing the calls and keeping track of the use of system calls.
@@ -1250,6 +1254,11 @@ typedef struct MPID_Comm {
 #ifdef MPID_HAS_HETERO
     int is_hetero;
 #endif
+
+#if defined HAVE_LIBHCOLL
+    hcoll_comm_priv_t hcoll_priv;
+#endif /* HAVE_LIBHCOLL */
+
   /* Other, device-specific information */
 #ifdef MPID_DEV_COMM_DECL
     MPID_DEV_COMM_DECL
diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_progress.c b/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
index 7232361..13c2fa5 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
@@ -464,6 +464,13 @@ int MPIDI_CH3I_Progress (MPID_Progress_state *progress_state, int is_blocking)
             MPIDI_CH3_Progress_signal_completion();
         }
 
+#if defined HAVE_LIBHCOLL
+        if (MPIR_CVAR_CH3_ENABLE_HCOLL) {
+            mpi_errno = hcoll_do_progress();
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        }
+#endif /* HAVE_LIBHCOLL */
+
         /* in the case of progress_wait, bail out if anything completed (CC-1) */
         if (is_blocking) {
             int completion_count = OPA_load_int(&MPIDI_CH3I_progress_completion_count);
diff --git a/src/mpid/ch3/channels/sock/src/ch3_progress.c b/src/mpid/ch3/channels/sock/src/ch3_progress.c
index fec55f7..bfe2c21 100644
--- a/src/mpid/ch3/channels/sock/src/ch3_progress.c
+++ b/src/mpid/ch3/channels/sock/src/ch3_progress.c
@@ -88,6 +88,13 @@ static int MPIDI_CH3i_Progress_test(void)
     mpi_errno = MPIDU_Sched_progress(&made_progress);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
+#if defined HAVE_LIBHCOLL
+    if (MPIR_CVAR_CH3_ENABLE_HCOLL) {
+        mpi_errno = hcoll_do_progress();
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
+#endif /* HAVE_LIBHCOLL */
+
     mpi_errno = MPIDU_Sock_wait(MPIDI_CH3I_sock_set, 0, &event);
 
     if (mpi_errno == MPI_SUCCESS)
@@ -184,6 +191,18 @@ static int MPIDI_CH3i_Progress_wait(MPID_Progress_state * progress_state)
             break;
         }
 
+#if defined HAVE_LIBHCOLL
+        if (MPIR_CVAR_CH3_ENABLE_HCOLL) {
+            mpi_errno = hcoll_do_progress();
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+            /* if hcoll completed any pending requests, break.  Else,
+             * we are expecting at least one more socket event */
+            if (progress_state->ch.completion_count != MPIDI_CH3I_progress_completion_count)
+                break;
+        }
+#endif /* HAVE_LIBHCOLL */
+
 #       ifdef MPICH_IS_THREADED
 
 	/* The logic for this case is just complicated enough that
diff --git a/src/mpid/ch3/src/ch3u_comm.c b/src/mpid/ch3/src/ch3u_comm.c
index 0ceb952..fc3278a 100644
--- a/src/mpid/ch3/src/ch3u_comm.c
+++ b/src/mpid/ch3/src/ch3u_comm.c
@@ -6,6 +6,26 @@
 
 #include "mpidimpl.h"
 #include "mpl_utlist.h"
+#if defined HAVE_LIBHCOLL
+#include "../../common/hcoll/hcoll.h"
+#endif
+
+/*
+=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
+
+cvars:
+    - name        : MPIR_CVAR_CH3_ENABLE_HCOLL
+      category    : CH3
+      type        : boolean
+      default     : false
+      class       : none
+      verbosity   : MPI_T_VERBOSITY_USER_BASIC
+      scope       : MPI_T_SCOPE_ALL_EQ
+      description : >-
+        If true, enable HCOLL collectives.
+
+=== END_MPI_T_CVAR_INFO_BLOCK ===
+*/
 
 static int register_hook_finalize(void *param);
 static int comm_created(MPID_Comm *comm, void *param);
@@ -44,6 +64,16 @@ int MPIDI_CH3I_Comm_init(void)
     /* register hooks for keeping track of communicators */
     mpi_errno = MPIDI_CH3U_Comm_register_create_hook(comm_created, NULL);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+#if defined HAVE_LIBHCOLL
+    if (MPIR_CVAR_CH3_ENABLE_HCOLL) {
+        mpi_errno = MPIDI_CH3U_Comm_register_create_hook(hcoll_comm_create, NULL);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = MPIDI_CH3U_Comm_register_destroy_hook(hcoll_comm_destroy, NULL);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
+#endif
+
     mpi_errno = MPIDI_CH3U_Comm_register_destroy_hook(comm_destroyed, NULL);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
     
diff --git a/src/mpid/common/Makefile.mk b/src/mpid/common/Makefile.mk
index bed34a5..e33e7ae 100644
--- a/src/mpid/common/Makefile.mk
+++ b/src/mpid/common/Makefile.mk
@@ -2,6 +2,7 @@
 ## vim: set ft=automake :
 ##
 ## (C) 2011 by Argonne National Laboratory.
+## (C) 2014 by Mellanox Technologies, Inc.
 ##     See COPYRIGHT in top-level directory.
 ##
 
@@ -9,4 +10,4 @@ include $(top_srcdir)/src/mpid/common/datatype/Makefile.mk
 include $(top_srcdir)/src/mpid/common/sched/Makefile.mk
 include $(top_srcdir)/src/mpid/common/sock/Makefile.mk
 include $(top_srcdir)/src/mpid/common/thread/Makefile.mk
-
+include $(top_srcdir)/src/mpid/common/hcoll/Makefile.mk
diff --git a/src/mpid/common/hcoll/Makefile.mk b/src/mpid/common/hcoll/Makefile.mk
new file mode 100644
index 0000000..405292a
--- /dev/null
+++ b/src/mpid/common/hcoll/Makefile.mk
@@ -0,0 +1,19 @@
+## -*- Mode: Makefile; -*-
+## vim: set ft=automake :
+##
+## (C) 2014 Mellanox Technologies, Inc.
+##     See COPYRIGHT in top-level directory.
+##
+
+if BUILD_HCOLL
+
+mpi_core_sources +=  \
+	src/mpid/common/hcoll/hcoll_init.c  \
+	src/mpid/common/hcoll/hcoll_ops.c  \
+	src/mpid/common/hcoll/hcoll_rte.c
+
+noinst_HEADERS += \
+	src/mpid/common/hcoll/hcoll.h \
+	src/mpid/common/hcoll/hcoll_dtypes.h
+
+endif BUILD_HCOLL
diff --git a/src/mpid/common/hcoll/errnames.txt b/src/mpid/common/hcoll/errnames.txt
new file mode 100644
index 0000000..3eee82f
--- /dev/null
+++ b/src/mpid/common/hcoll/errnames.txt
@@ -0,0 +1,6 @@
+#
+# HCOLL errors
+#
+**hcoll_wrong_arg:Error in hcolrte api: wrong null argument
+**hcoll_wrong_arg %p %d:Error in hcolrte api: wrong null argument (ec_h.handle = %p, ec_h.rank = %d)
+**null_buff_ptr:Error in hcolrte api: buffer pointer is NULL for non DTE_ZERO INLINE data representation
diff --git a/src/mpid/common/hcoll/hcoll.h b/src/mpid/common/hcoll/hcoll.h
new file mode 100644
index 0000000..2c09320
--- /dev/null
+++ b/src/mpid/common/hcoll/hcoll.h
@@ -0,0 +1,31 @@
+#ifndef _HCOLL_H_
+#define _HCOLL_H_
+
+#include "mpidimpl.h"
+#include "hcoll/api/hcoll_api.h"
+#include "hcoll/api/hcoll_constants.h"
+
+extern int world_comm_destroying;
+
+int hcoll_comm_create(MPID_Comm * comm, void *param);
+int hcoll_comm_destroy(MPID_Comm * comm, void *param);
+
+int hcoll_Barrier(MPID_Comm * comm_ptr, int *err);
+int hcoll_Bcast(void *buffer, int count, MPI_Datatype datatype, int root,
+                MPID_Comm * comm_ptr, int *err);
+int hcoll_Allgather(const void *sbuf, int scount, MPI_Datatype sdtype,
+                    void *rbuf, int rcount, MPI_Datatype rdtype, MPID_Comm * comm_ptr, int *err);
+int hcoll_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+                    MPI_Op op, MPID_Comm * comm_ptr, int *err);
+
+int hcoll_Ibarrier_req(MPID_Comm * comm_ptr, MPID_Request ** request);
+int hcoll_Ibcast_req(void *buffer, int count, MPI_Datatype datatype, int root,
+                     MPID_Comm * comm_ptr, MPID_Request ** request);
+int hcoll_Iallgather_req(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                         int recvcount, MPI_Datatype recvtype, MPID_Comm * comm_ptr,
+                         MPID_Request ** request);
+int hcoll_Iallreduce_req(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+                         MPI_Op op, MPID_Comm * comm_ptr, MPID_Request ** request);
+int hcoll_do_progress(void);
+
+#endif
diff --git a/src/mpid/common/hcoll/hcoll_dtypes.h b/src/mpid/common/hcoll/hcoll_dtypes.h
new file mode 100644
index 0000000..65ef6e3
--- /dev/null
+++ b/src/mpid/common/hcoll/hcoll_dtypes.h
@@ -0,0 +1,67 @@
+#ifndef _HCOLL_DTYPES_H_
+#define _HCOLL_DTYPES_H_
+#include "hcoll/api/hcoll_dte.h"
+
+static dte_data_representation_t mpi_dtype_2_dte_dtype(MPI_Datatype datatype)
+{
+    switch (datatype) {
+    case MPI_SIGNED_CHAR:
+        return DTE_BYTE;
+    case MPI_SHORT:
+        return DTE_INT16;
+    case MPI_INT:
+        return DTE_INT32;
+    case MPI_LONG:
+    case MPI_LONG_LONG:
+        return DTE_INT64;
+        /* return DTE_INT128; */
+    case MPI_BYTE:
+    case MPI_UNSIGNED_CHAR:
+        return DTE_UBYTE;
+    case MPI_UNSIGNED_SHORT:
+        return DTE_UINT16;
+    case MPI_UNSIGNED:
+        return DTE_UINT32;
+    case MPI_UNSIGNED_LONG:
+    case MPI_UNSIGNED_LONG_LONG:
+        return DTE_UINT64;
+        /* return DTE_UINT128; */
+    case MPI_FLOAT:
+        return DTE_FLOAT32;
+    case MPI_DOUBLE:
+        return DTE_FLOAT64;
+    case MPI_LONG_DOUBLE:
+        return DTE_FLOAT128;
+    default:
+        return DTE_ZERO;
+    }
+}
+
+static hcoll_dte_op_t *mpi_op_2_dte_op(MPI_Op op)
+{
+    switch (op) {
+    case MPI_MAX:
+        return &hcoll_dte_op_max;
+    case MPI_MIN:
+        return &hcoll_dte_op_min;
+    case MPI_SUM:
+        return &hcoll_dte_op_sum;
+    case MPI_PROD:
+        return &hcoll_dte_op_prod;
+    case MPI_LAND:
+        return &hcoll_dte_op_land;
+    case MPI_BAND:
+        return &hcoll_dte_op_band;
+    case MPI_LOR:
+        return &hcoll_dte_op_lor;
+    case MPI_BOR:
+        return &hcoll_dte_op_bor;
+    case MPI_LXOR:
+        return &hcoll_dte_op_lxor;
+    case MPI_BXOR:
+        return &hcoll_dte_op_bxor;
+    default:
+        return &hcoll_dte_op_null;
+    }
+}
+#endif
diff --git a/src/mpid/common/hcoll/hcoll_init.c b/src/mpid/common/hcoll/hcoll_init.c
new file mode 100644
index 0000000..2bae31d
--- /dev/null
+++ b/src/mpid/common/hcoll/hcoll_init.c
@@ -0,0 +1,212 @@
+#include "hcoll.h"
+
+static int hcoll_initialized = 0;
+int hcoll_enable = 1;
+int hcoll_enable_barrier = 1;
+int hcoll_enable_bcast = 1;
+int hcoll_enable_allgather = 1;
+int hcoll_enable_allreduce = 1;
+int hcoll_enable_ibarrier = 1;
+int hcoll_enable_ibcast = 1;
+int hcoll_enable_iallgather = 1;
+int hcoll_enable_iallreduce = 1;
+int hcoll_comm_attr_keyval = MPI_KEYVAL_INVALID;
+int world_comm_destroying = 0;
+
+#undef FUNCNAME
+#define FUNCNAME hcoll_destroy
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+
+int hcoll_destroy(void *param ATTRIBUTE((unused)))
+{
+    if (1 == hcoll_initialized) {
+        hcoll_finalize();
+        if (MPI_KEYVAL_INVALID != hcoll_comm_attr_keyval) {
+            MPIR_Comm_free_keyval_impl(hcoll_comm_attr_keyval);
+            hcoll_comm_attr_keyval = MPI_KEYVAL_INVALID;
+        }
+    }
+    hcoll_initialized = 0;
+    return 0;
+}
+
+static int hcoll_comm_attr_del_fn(MPI_Comm comm, int keyval, void *attr_val, void *extra_data)
+{
+    int mpi_errno;
+    if (MPI_COMM_WORLD == comm) {
+        world_comm_destroying = 1;
+    }
+    mpi_errno = hcoll_group_destroy_notify(attr_val);
+    return mpi_errno;
+}
+
+#define CHECK_ENABLE_ENV_VARS(nameEnv, name) \
+    do { \
+        envar = getenv("HCOLL_ENABLE_" #nameEnv); \
+        if (NULL != envar) { \
+            hcoll_enable_##name = atoi(envar); \
+            MPIU_DBG_MSG_D(CH3_OTHER, VERBOSE, "HCOLL_ENABLE_" #nameEnv " = %d\n", hcoll_enable_##name); \
+        } \
+    } while (0)
+
+#undef FUNCNAME
+#define FUNCNAME hcoll_initialize
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int hcoll_initialize(void)
+{
+    int mpi_errno;
+    char *envar;
+    mpi_errno = MPI_SUCCESS;
+    envar = getenv("HCOLL_ENABLE");
+    if (NULL != envar) {
+        hcoll_enable = atoi(envar);
+    }
+    if (0 == hcoll_enable) {
+        goto fn_exit;
+    }
+    hcoll_rte_fns_setup();
+    /*set INT_MAX/2 as tag_base here by the moment.
+     * Need to think more about it.
+     * The tag space should be positive, from > ~30 to MPI_TAG_UB value.
+     * It looks reasonable to set MPI_TAG_UB as base but it doesn't work for ofacm tags
+     * (probably due to wrong conversions from uint to int). That's why I set INT_MAX/2 instead of MPI_TAG_UB.
+     * BUT: it won't work for collectives whose sequence number reaches INT_MAX/2 number. In that case tags become negative.
+     * Moreover, it even won't work than 0 < (INT_MAX/2 - sequence number) < 30 because it can interleave with internal mpich coll tags.
+     */
+    hcoll_set_runtime_tag_offset(INT_MAX / 2, MPI_TAG_UB);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
+    mpi_errno = hcoll_init();
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
+
+    hcoll_initialized = 1;
+    MPIR_Add_finalize(hcoll_destroy, 0, 0);
+
+    mpi_errno =
+        MPIR_Comm_create_keyval_impl(MPI_NULL_COPY_FN, hcoll_comm_attr_del_fn,
+                                     &hcoll_comm_attr_keyval, NULL);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
+
+    CHECK_ENABLE_ENV_VARS(BARRIER, barrier);
+    CHECK_ENABLE_ENV_VARS(BCAST, bcast);
+    CHECK_ENABLE_ENV_VARS(ALLGATHER, allgather);
+    CHECK_ENABLE_ENV_VARS(ALLREDUCE, allreduce);
+    CHECK_ENABLE_ENV_VARS(IBARRIER, ibarrier);
+    CHECK_ENABLE_ENV_VARS(IBCAST, ibcast);
+    CHECK_ENABLE_ENV_VARS(IALLGATHER, iallgather);
+    CHECK_ENABLE_ENV_VARS(IALLREDUCE, iallreduce);
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#define INSTALL_COLL_WRAPPER(check_name, name) \
+    if (hcoll_enable_##check_name && (NULL != hcoll_collectives.coll_##check_name)) { \
+        comm_ptr->coll_fns->name      = hcoll_##name; \
+        MPIU_DBG_MSG(CH3_OTHER,VERBOSE, #name " wrapper installed"); \
+    }
+
+#undef FUNCNAME
+#define FUNCNAME hcoll_comm_create
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int hcoll_comm_create(MPID_Comm * comm_ptr, void *param)
+{
+    int mpi_errno;
+    int num_ranks;
+    int context_destroyed;
+    mpi_errno = MPI_SUCCESS;
+    if (0 == hcoll_initialized) {
+        mpi_errno = hcoll_initialize();
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+    }
+    if (0 == hcoll_enable) {
+        goto fn_exit;
+    }
+    num_ranks = comm_ptr->local_size;
+    if ((MPID_INTRACOMM != comm_ptr->comm_kind) || (2 > num_ranks)) {
+        comm_ptr->hcoll_priv.is_hcoll_init = 0;
+        goto fn_exit;
+    }
+    comm_ptr->hcoll_priv.hcoll_context = hcoll_create_context((rte_grp_handle_t) comm_ptr);
+    if (NULL == comm_ptr->hcoll_priv.hcoll_context) {
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "Couldn't create hcoll context.");
+        goto fn_fail;
+    }
+    mpi_errno =
+        MPIR_Comm_set_attr_impl(comm_ptr, hcoll_comm_attr_keyval,
+                                (void *) (comm_ptr->hcoll_priv.hcoll_context), MPIR_ATTR_PTR);
+    if (mpi_errno) {
+        hcoll_destroy_context(comm_ptr->hcoll_priv.hcoll_context,
+                              (rte_grp_handle_t) comm_ptr, &context_destroyed);
+        MPIU_Assert(context_destroyed);
+        comm_ptr->hcoll_priv.is_hcoll_init = 0;
+        MPIU_ERR_POP(mpi_errno);
+    }
+    comm_ptr->hcoll_priv.hcoll_origin_coll_fns = comm_ptr->coll_fns;
+    comm_ptr->coll_fns = (MPID_Collops *) MPIU_Malloc(sizeof(MPID_Collops));
+    memset(comm_ptr->coll_fns, 0, sizeof(MPID_Collops));
+    if (comm_ptr->hcoll_priv.hcoll_origin_coll_fns != 0) {
+        memcpy(comm_ptr->coll_fns, comm_ptr->hcoll_priv.hcoll_origin_coll_fns,
+               sizeof(MPID_Collops));
+    }
+    INSTALL_COLL_WRAPPER(barrier, Barrier);
+    INSTALL_COLL_WRAPPER(bcast, Bcast);
+    INSTALL_COLL_WRAPPER(allreduce, Allreduce);
+    INSTALL_COLL_WRAPPER(allgather, Allgather);
+    INSTALL_COLL_WRAPPER(ibarrier, Ibarrier_req);
+    INSTALL_COLL_WRAPPER(ibcast, Ibcast_req);
+    INSTALL_COLL_WRAPPER(iallreduce, Iallreduce_req);
+    INSTALL_COLL_WRAPPER(iallgather, Iallgather_req);
+
+    comm_ptr->hcoll_priv.is_hcoll_init = 1;
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME hcoll_comm_destroy
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int hcoll_comm_destroy(MPID_Comm * comm_ptr, void *param)
+{
+    int mpi_errno;
+    int context_destroyed;
+    if (0 == hcoll_enable) {
+        goto fn_exit;
+    }
+    mpi_errno = MPI_SUCCESS;
+    context_destroyed = 0;
+    if ((NULL != comm_ptr) && (0 != comm_ptr->hcoll_priv.is_hcoll_init)) {
+        if (NULL != comm_ptr->coll_fns) {
+            MPIU_Free(comm_ptr->coll_fns);
+        }
+        comm_ptr->coll_fns = comm_ptr->hcoll_priv.hcoll_origin_coll_fns;
+        hcoll_destroy_context(comm_ptr->hcoll_priv.hcoll_context,
+                              (rte_grp_handle_t) comm_ptr, &context_destroyed);
+        MPIU_Assert(context_destroyed);
+        comm_ptr->hcoll_priv.is_hcoll_init = 0;
+    }
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+int hcoll_do_progress(void)
+{
+    if (1 == hcoll_initialized) {
+        hcoll_progress_fn();
+    }
+
+    return MPI_SUCCESS;
+}
diff --git a/src/mpid/common/hcoll/hcoll_ops.c b/src/mpid/common/hcoll/hcoll_ops.c
new file mode 100644
index 0000000..0719ff6
--- /dev/null
+++ b/src/mpid/common/hcoll/hcoll_ops.c
@@ -0,0 +1,361 @@
+#include "hcoll.h"
+#include "hcoll_dtypes.h"
+
+#undef FUNCNAME
+#define FUNCNAME hcoll_Barrier
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int hcoll_Barrier(MPID_Comm * comm_ptr, int *err)
+{
+    int rc;
+    MPI_Comm comm = comm_ptr->handle;
+    MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING HCOL BARRIER.");
+    rc = hcoll_collectives.coll_barrier(comm_ptr->hcoll_priv.hcoll_context);
+    if (HCOLL_SUCCESS != rc) {
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING FALLBACK BARRIER.");
+        void *ptr = comm_ptr->coll_fns->Barrier;
+        comm_ptr->coll_fns->Barrier =
+            (NULL != comm_ptr->hcoll_priv.hcoll_origin_coll_fns) ?
+            comm_ptr->hcoll_priv.hcoll_origin_coll_fns->Barrier : NULL;
+        rc = MPI_Barrier(comm);
+        comm_ptr->coll_fns->Barrier = ptr;
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING FALLBACK BARRIER - done.");
+    }
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME hcoll_Bcast
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int hcoll_Bcast(void *buffer, int count, MPI_Datatype datatype, int root,
+                MPID_Comm * comm_ptr, int *err)
+{
+    dte_data_representation_t dtype;
+    int rc;
+    MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING HCOLL BCAST.");
+    dtype = mpi_dtype_2_dte_dtype(datatype);
+    int is_homogeneous = 1, use_fallback = 0;
+    MPI_Comm comm = comm_ptr->handle;
+#ifdef MPID_HAS_HETERO
+    if (comm_ptr->is_hetero)
+        is_homogeneous = 0;
+#endif
+    if (HCOL_DTE_IS_COMPLEX(dtype) || HCOL_DTE_IS_ZERO(dtype) || (0 == is_homogeneous)) {
+        /*If we are here then datatype is not simple predefined datatype */
+        /*In future we need to add more complex mapping to the dte_data_representation_t */
+        /* Now use fallback */
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "unsupported data layout, calling fallback bcast.");
+        use_fallback = 1;
+    }
+    else {
+        rc = hcoll_collectives.coll_bcast(buffer, count, dtype, root,
+                                          comm_ptr->hcoll_priv.hcoll_context);
+        if (HCOLL_SUCCESS != rc) {
+            use_fallback = 1;
+        }
+    }
+    if (1 == use_fallback) {
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING FALLBACK BCAST - done.");
+        void *ptr = comm_ptr->coll_fns->Bcast;
+        comm_ptr->coll_fns->Bcast =
+            (NULL != comm_ptr->hcoll_priv.hcoll_origin_coll_fns) ?
+            comm_ptr->hcoll_priv.hcoll_origin_coll_fns->Bcast : NULL;
+        rc = MPI_Bcast(buffer, count, datatype, root, comm);
+        comm_ptr->coll_fns->Bcast = ptr;
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING FALLBACK BCAST - done.");
+    }
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME hcoll_Allreduce
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int hcoll_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+                    MPI_Op op, MPID_Comm * comm_ptr, int *err)
+{
+    dte_data_representation_t Dtype;
+    hcoll_dte_op_t *Op;
+    int rc;
+    int is_homogeneous = 1, use_fallback = 0;
+    MPI_Comm comm = comm_ptr->handle;
+#ifdef MPID_HAS_HETERO
+    if (comm_ptr->is_hetero)
+        is_homogeneous = 0;
+#endif
+
+    MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING HCOL ALLREDUCE.");
+    Dtype = mpi_dtype_2_dte_dtype(datatype);
+    Op = mpi_op_2_dte_op(op);
+    if (MPI_IN_PLACE == sendbuf) {
+        sendbuf = HCOLL_IN_PLACE;
+    }
+    if (HCOL_DTE_IS_COMPLEX(Dtype) || HCOL_DTE_IS_ZERO(Dtype) || (0 == is_homogeneous) ||
+        (HCOL_DTE_OP_NULL == Op->id)) {
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "unsupported data layout, calling fallback allreduce.");
+        use_fallback = 1;
+    }
+    else {
+        rc = hcoll_collectives.coll_allreduce(sendbuf, recvbuf, count, Dtype, Op,
+                                              comm_ptr->hcoll_priv.hcoll_context);
+        if (HCOLL_SUCCESS != rc) {
+            use_fallback = 1;
+        }
+    }
+    if (1 == use_fallback) {
+        if (HCOLL_IN_PLACE == sendbuf) {
+            sendbuf = MPI_IN_PLACE;
+        }
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING FALLBACK ALLREDUCE.");
+        void *ptr = comm_ptr->coll_fns->Allreduce;
+        comm_ptr->coll_fns->Allreduce =
+            (NULL != comm_ptr->hcoll_priv.hcoll_origin_coll_fns) ?
+            comm_ptr->hcoll_priv.hcoll_origin_coll_fns->Allreduce : NULL;
+        rc = MPI_Allreduce(sendbuf, recvbuf, count, datatype, op, comm);
+        comm_ptr->coll_fns->Allreduce = ptr;
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING FALLBACK ALLREDUCE done.");
+    }
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME hcoll_Allgather
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int hcoll_Allgather(const void *sbuf, int scount, MPI_Datatype sdtype,
+                    void *rbuf, int rcount, MPI_Datatype rdtype, MPID_Comm * comm_ptr, int *err)
+{
+    int is_homogeneous = 1, use_fallback = 0;
+    MPI_Comm comm = comm_ptr->handle;
+    dte_data_representation_t stype;
+    dte_data_representation_t rtype;
+    int rc;
+    is_homogeneous = 1;
+#ifdef MPID_HAS_HETERO
+    if (comm_ptr->is_hetero)
+        is_homogeneous = 0;
+#endif
+
+    MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING HCOLL ALLGATHER.");
+    stype = mpi_dtype_2_dte_dtype(sdtype);
+    rtype = mpi_dtype_2_dte_dtype(rdtype);
+    if (MPI_IN_PLACE == sbuf) {
+        sbuf = HCOLL_IN_PLACE;
+    }
+    if (HCOL_DTE_IS_COMPLEX(stype) || HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype) ||
+        HCOL_DTE_IS_COMPLEX(rtype) || is_homogeneous == 0) {
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "unsupported data layout; calling fallback allgather.");
+        use_fallback = 1;
+    }
+    else {
+        rc = hcoll_collectives.coll_allgather(sbuf, scount, stype, rbuf, rcount, rtype,
+                                              comm_ptr->hcoll_priv.hcoll_context);
+        if (HCOLL_SUCCESS != rc) {
+            use_fallback = 1;
+        }
+    }
+    if (1 == use_fallback) {
+        if (HCOLL_IN_PLACE == sbuf) {
+            sbuf = MPI_IN_PLACE;
+        }
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING FALLBACK ALLGATHER.");
+        void *ptr = comm_ptr->coll_fns->Allgather;
+        comm_ptr->coll_fns->Allgather =
+            (NULL != comm_ptr->hcoll_priv.hcoll_origin_coll_fns) ?
+            comm_ptr->hcoll_priv.hcoll_origin_coll_fns->Allgather : NULL;
+        rc = MPI_Allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
+        comm_ptr->coll_fns->Allgather = ptr;
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING FALLBACK ALLGATHER - done.");
+    }
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME hcoll_Ibarrier_req
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int hcoll_Ibarrier_req(MPID_Comm * comm_ptr, MPID_Request ** request)
+{
+    int rc;
+    void **rt_handle;
+    MPI_Comm comm;
+    MPI_Request req;
+    comm = comm_ptr->handle;
+    MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING HCOL IBARRIER.");
+    rt_handle = (void **) request;
+    rc = hcoll_collectives.coll_ibarrier(comm_ptr->hcoll_priv.hcoll_context, rt_handle);
+    if (HCOLL_SUCCESS != rc) {
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING FALLBACK IBARRIER.");
+        void *ptr = comm_ptr->coll_fns->Ibarrier_req;
+        comm_ptr->coll_fns->Ibarrier_req =
+            (comm_ptr->hcoll_priv.hcoll_origin_coll_fns !=
+             NULL) ? comm_ptr->hcoll_priv.hcoll_origin_coll_fns->Ibarrier_req : NULL;
+        rc = MPI_Ibarrier(comm, &req);
+        MPID_Request_get_ptr(req, *request);
+        comm_ptr->coll_fns->Ibarrier_req = ptr;
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING FALLBACK IBARRIER - done.");
+    }
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME hcoll_Ibcast_req
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int hcoll_Ibcast_req(void *buffer, int count, MPI_Datatype datatype, int root,
+                     MPID_Comm * comm_ptr, MPID_Request ** request)
+{
+    int rc;
+    void **rt_handle;
+    dte_data_representation_t dtype;
+    MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING HCOLL IBCAST.");
+    dtype = mpi_dtype_2_dte_dtype(datatype);
+    int is_homogeneous = 1, use_fallback = 0;
+    MPI_Comm comm = comm_ptr->handle;
+    MPI_Request req;
+    rt_handle = (void **) request;
+#ifdef MPID_HAS_HETERO
+    if (comm_ptr->is_hetero)
+        is_homogeneous = 0;
+#endif
+    if (HCOL_DTE_IS_COMPLEX(dtype) || HCOL_DTE_IS_ZERO(dtype) || (0 == is_homogeneous)) {
+        /*If we are here then datatype is not simple predefined datatype */
+        /*In future we need to add more complex mapping to the dte_data_representation_t */
+        /* Now use fallback */
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "unsupported data layout, calling fallback ibcast.");
+        use_fallback = 1;
+    }
+    else {
+        rc = hcoll_collectives.coll_ibcast(buffer, count, dtype, root, rt_handle,
+                                           comm_ptr->hcoll_priv.hcoll_context);
+        if (HCOLL_SUCCESS != rc) {
+            use_fallback = 1;
+        }
+    }
+    if (1 == use_fallback) {
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING FALLBACK IBCAST - done.");
+        void *ptr = comm_ptr->coll_fns->Ibcast_req;
+        comm_ptr->coll_fns->Ibcast_req =
+            (comm_ptr->hcoll_priv.hcoll_origin_coll_fns !=
+             NULL) ? comm_ptr->hcoll_priv.hcoll_origin_coll_fns->Ibcast_req : NULL;
+        rc = MPI_Ibcast(buffer, count, datatype, root, comm, &req);
+        MPID_Request_get_ptr(req, *request);
+        comm_ptr->coll_fns->Ibcast_req = ptr;
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING FALLBACK IBCAST - done.");
+    }
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME hcoll_Iallgather_req
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int hcoll_Iallgather_req(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                         int recvcount, MPI_Datatype recvtype, MPID_Comm * comm_ptr,
+                         MPID_Request ** request)
+{
+    int is_homogeneous = 1, use_fallback = 0;
+    MPI_Comm comm = comm_ptr->handle;
+    dte_data_representation_t stype;
+    dte_data_representation_t rtype;
+    int rc;
+    void **rt_handle;
+    MPI_Request req;
+    rt_handle = (void **) request;
+
+    is_homogeneous = 1;
+#ifdef MPID_HAS_HETERO
+    if (comm_ptr->is_hetero)
+        is_homogeneous = 0;
+#endif
+
+    MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING HCOLL IALLGATHER.");
+    stype = mpi_dtype_2_dte_dtype(sendtype);
+    rtype = mpi_dtype_2_dte_dtype(recvtype);
+    if (MPI_IN_PLACE == sendbuf) {
+        sendbuf = HCOLL_IN_PLACE;
+    }
+    if (HCOL_DTE_IS_COMPLEX(stype) || HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype) ||
+        HCOL_DTE_IS_COMPLEX(rtype) || is_homogeneous == 0) {
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "unsupported data layout; calling fallback iallgather.");
+        use_fallback = 1;
+    }
+    else {
+        rc = hcoll_collectives.coll_iallgather(sendbuf, sendcount, stype, recvbuf, recvcount, rtype,
+                                               comm_ptr->hcoll_priv.hcoll_context, rt_handle);
+        if (HCOLL_SUCCESS != rc) {
+            use_fallback = 1;
+        }
+    }
+    if (1 == use_fallback) {
+        if (HCOLL_IN_PLACE == sendbuf) {
+            sendbuf = MPI_IN_PLACE;
+        }
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING FALLBACK IALLGATHER.");
+        void *ptr = comm_ptr->coll_fns->Iallgather_req;
+        comm_ptr->coll_fns->Iallgather_req =
+            (comm_ptr->hcoll_priv.hcoll_origin_coll_fns !=
+             NULL) ? comm_ptr->hcoll_priv.hcoll_origin_coll_fns->Iallgather_req : NULL;
+        rc = MPI_Iallgather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm, &req);
+        MPID_Request_get_ptr(req, *request);
+        comm_ptr->coll_fns->Iallgather_req = ptr;
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING FALLBACK IALLGATHER - done.");
+    }
+    return rc;
+}
+
+#undef FUNCNAME
+#define FUNCNAME hcoll_Iallreduce_req
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+int hcoll_Iallreduce_req(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+                         MPI_Op op, MPID_Comm * comm_ptr, MPID_Request ** request)
+{
+    dte_data_representation_t Dtype;
+    hcoll_dte_op_t *Op;
+    int rc;
+    void **rt_handle;
+    MPI_Request req;
+    int is_homogeneous = 1, use_fallback = 0;
+    MPI_Comm comm = comm_ptr->handle;
+    rt_handle = (void **) request;
+#ifdef MPID_HAS_HETERO
+    if (comm_ptr->is_hetero)
+        is_homogeneous = 0;
+#endif
+
+    MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING HCOL IALLREDUCE.");
+    Dtype = mpi_dtype_2_dte_dtype(datatype);
+    Op = mpi_op_2_dte_op(op);
+    if (MPI_IN_PLACE == sendbuf) {
+        sendbuf = HCOLL_IN_PLACE;
+    }
+    if (HCOL_DTE_IS_COMPLEX(Dtype) || HCOL_DTE_IS_ZERO(Dtype) || (0 == is_homogeneous) ||
+        (HCOL_DTE_OP_NULL == Op->id)) {
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "unsupported data layout, calling fallback iallreduce.");
+        use_fallback = 1;
+    }
+    else {
+        rc = hcoll_collectives.coll_iallreduce(sendbuf, recvbuf, count, Dtype, Op,
+                                               comm_ptr->hcoll_priv.hcoll_context, rt_handle);
+        if (HCOLL_SUCCESS != rc) {
+            use_fallback = 1;
+        }
+    }
+    if (1 == use_fallback) {
+        if (HCOLL_IN_PLACE == sendbuf) {
+            sendbuf = MPI_IN_PLACE;
+        }
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING FALLBACK IALLREDUCE.");
+        void *ptr = comm_ptr->coll_fns->Iallreduce_req;
+        comm_ptr->coll_fns->Iallreduce_req =
+            (comm_ptr->hcoll_priv.hcoll_origin_coll_fns !=
+             NULL) ? comm_ptr->hcoll_priv.hcoll_origin_coll_fns->Iallreduce_req : NULL;
+        rc = MPI_Iallreduce(sendbuf, recvbuf, count, datatype, op, comm, &req);
+        MPID_Request_get_ptr(req, *request);
+        comm_ptr->coll_fns->Iallreduce_req = ptr;
+        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "RUNNING FALLBACK IALLREDUCE done.");
+    }
+    return rc;
+}
diff --git a/src/mpid/common/hcoll/hcoll_rte.c b/src/mpid/common/hcoll/hcoll_rte.c
new file mode 100644
index 0000000..b6a3ea7
--- /dev/null
+++ b/src/mpid/common/hcoll/hcoll_rte.c
@@ -0,0 +1,434 @@
+#include "hcoll.h"
+#include "hcoll/api/hcoll_dte.h"
+#include <assert.h>
+
+static int recv_nb(dte_data_representation_t data,
+                   uint32_t count,
+                   void *buffer,
+                   rte_ec_handle_t, rte_grp_handle_t, uint32_t tag, rte_request_handle_t * req);
+
+static int send_nb(dte_data_representation_t data,
+                   uint32_t count,
+                   void *buffer,
+                   rte_ec_handle_t ec_h,
+                   rte_grp_handle_t grp_h, uint32_t tag, rte_request_handle_t * req);
+
+static int test(rte_request_handle_t * request, int *completed);
+
+static int ec_handle_compare(rte_ec_handle_t handle_1,
+                             rte_grp_handle_t
+                             group_handle_1,
+                             rte_ec_handle_t handle_2, rte_grp_handle_t group_handle_2);
+
+static int get_ec_handles(int num_ec,
+                          int *ec_indexes, rte_grp_handle_t, rte_ec_handle_t * ec_handles);
+
+static int get_my_ec(rte_grp_handle_t, rte_ec_handle_t * ec_handle);
+
+static int group_size(rte_grp_handle_t group);
+static int my_rank(rte_grp_handle_t grp_h);
+static int ec_on_local_node(rte_ec_handle_t ec, rte_grp_handle_t group);
+static rte_grp_handle_t get_world_group_handle(void);
+static uint32_t jobid(void);
+
+static void *get_coll_handle(void);
+static int coll_handle_test(void *handle);
+static void coll_handle_free(void *handle);
+static void coll_handle_complete(void *handle);
+static int group_id(rte_grp_handle_t group);
+
+static int world_rank(rte_grp_handle_t grp_h, rte_ec_handle_t ec);
+
+#undef FUNCNAME
+#define FUNCNAME progress
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static void progress(void)
+{
+    int ret;
+
+    if (0 == world_comm_destroying) {
+        MPID_Progress_test();
+    }
+    else {
+        /* FIXME: The hcoll library needs to be updated to return
+         * error codes.  The progress function pointer right now
+         * expects that the function returns void. */
+        ret = hcoll_do_progress();
+        assert(ret == MPI_SUCCESS);
+    }
+}
+
+#undef FUNCNAME
+#define FUNCNAME init_module_fns
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static void init_module_fns(void)
+{
+    hcoll_rte_functions.send_fn = send_nb;
+    hcoll_rte_functions.recv_fn = recv_nb;
+    hcoll_rte_functions.ec_cmp_fn = ec_handle_compare;
+    hcoll_rte_functions.get_ec_handles_fn = get_ec_handles;
+    hcoll_rte_functions.rte_group_size_fn = group_size;
+    hcoll_rte_functions.test_fn = test;
+    hcoll_rte_functions.rte_my_rank_fn = my_rank;
+    hcoll_rte_functions.rte_ec_on_local_node_fn = ec_on_local_node;
+    hcoll_rte_functions.rte_world_group_fn = get_world_group_handle;
+    hcoll_rte_functions.rte_jobid_fn = jobid;
+    hcoll_rte_functions.rte_progress_fn = progress;
+    hcoll_rte_functions.rte_get_coll_handle_fn = get_coll_handle;
+    hcoll_rte_functions.rte_coll_handle_test_fn = coll_handle_test;
+    hcoll_rte_functions.rte_coll_handle_free_fn = coll_handle_free;
+    hcoll_rte_functions.rte_coll_handle_complete_fn = coll_handle_complete;
+    hcoll_rte_functions.rte_group_id_fn = group_id;
+    hcoll_rte_functions.rte_world_rank_fn = world_rank;
+}
+
+#undef FUNCNAME
+#define FUNCNAME hcoll_rte_fns_setup
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+void hcoll_rte_fns_setup(void)
+{
+    init_module_fns();
+}
+
+/* This function converts dte_general_representation data into regular iovec array which is
+  used in rml
+  */
+static inline int count_total_dte_repeat_entries(struct dte_data_representation_t *data)
+{
+    unsigned int i;
+
+    struct dte_generalized_iovec_t *dte_iovec = data->rep.general_rep->data_representation.data;
+    int total_entries_number = 0;
+    for (i = 0; i < dte_iovec->repeat_count; i++) {
+        total_entries_number += dte_iovec->repeat[i].n_elements;
+    }
+    return total_entries_number;
+}
+
+#undef FUNCNAME
+#define FUNCNAME recv_nb
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static int recv_nb(struct dte_data_representation_t data,
+                   uint32_t count,
+                   void *buffer,
+                   rte_ec_handle_t ec_h,
+                   rte_grp_handle_t grp_h, uint32_t tag, rte_request_handle_t * req)
+{
+    int mpi_errno;
+    MPI_Datatype dtype;
+    MPID_Request *request;
+    MPID_Comm *comm;
+    int context_offset;
+    size_t size;
+    mpi_errno = MPI_SUCCESS;
+    context_offset = MPID_CONTEXT_INTRA_COLL;
+    comm = (MPID_Comm *) grp_h;
+    if (!ec_h.handle) {
+        MPIU_ERR_SETANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**hcoll_wrong_arg",
+                             "**hcoll_wrong_arg %p %d", ec_h.handle, ec_h.rank);
+    }
+
+    if (HCOL_DTE_IS_INLINE(data)) {
+        if (!buffer && !HCOL_DTE_IS_ZERO(data)) {
+            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**null_buff_ptr");
+        }
+        size = (size_t) data.rep.in_line_rep.data_handle.in_line.packed_size * count / 8;
+        dtype = MPI_CHAR;
+        mpi_errno = MPID_Irecv(buffer, size, dtype, ec_h.rank, tag, comm, context_offset, &request);
+        req->data = (void *) request;
+        req->status = HCOLRTE_REQUEST_ACTIVE;
+    }
+    else {
+        int total_entries_number;
+        int i;
+        unsigned int j;
+        void *buf;
+        uint64_t len;
+        int repeat_count;
+        struct dte_struct_t *repeat;
+        if (NULL != buffer) {
+            /* We have a full data description & buffer pointer simultaneously.
+             * It is ambiguous. Throw a warning since the user might have made a
+             * mistake with data reps */
+            MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "Warning: buffer_pointer != NULL for NON-inline data "
+                         "representation: buffer_pointer is ignored");
+        }
+        total_entries_number = count_total_dte_repeat_entries(&data);
+        repeat = data.rep.general_rep->data_representation.data->repeat;
+        repeat_count = data.rep.general_rep->data_representation.data->repeat_count;
+        for (i = 0; i < repeat_count; i++) {
+            for (j = 0; j < repeat[i].n_elements; j++) {
+                char *repeat_unit = (char *) &repeat[i];
+                buf = (void *) (repeat_unit + repeat[i].elements[j].base_offset);
+                len = repeat[i].elements[j].packed_size;
+                recv_nb(DTE_BYTE, len, buf, ec_h, grp_h, tag, req);
+            }
+        }
+    }
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    return HCOLL_ERROR;
+}
+
+#undef FUNCNAME
+#define FUNCNAME send_nb
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static int send_nb(dte_data_representation_t data,
+                   uint32_t count,
+                   void *buffer,
+                   rte_ec_handle_t ec_h,
+                   rte_grp_handle_t grp_h, uint32_t tag, rte_request_handle_t * req)
+{
+    int mpi_errno;
+    MPI_Datatype dtype;
+    MPID_Request *request;
+    MPID_Comm *comm;
+    int context_offset;
+    size_t size;
+    mpi_errno = MPI_SUCCESS;
+    context_offset = MPID_CONTEXT_INTRA_COLL;
+    comm = (MPID_Comm *) grp_h;
+    if (!ec_h.handle) {
+        MPIU_ERR_SETANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**hcoll_wrong_arg",
+                             "**hcoll_wrong_arg %p %d", ec_h.handle, ec_h.rank);
+    }
+
+    if (HCOL_DTE_IS_INLINE(data)) {
+        if (!buffer && !HCOL_DTE_IS_ZERO(data)) {
+            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**null_buff_ptr");
+        }
+        size = (size_t) data.rep.in_line_rep.data_handle.in_line.packed_size * count / 8;
+        dtype = MPI_CHAR;
+        mpi_errno = MPID_Isend(buffer, size, dtype, ec_h.rank, tag, comm, context_offset, &request);
+        req->data = (void *) request;
+        req->status = HCOLRTE_REQUEST_ACTIVE;
+    }
+    else {
+        int total_entries_number;
+        int i;
+        unsigned int j;
+        void *buf;
+        uint64_t len;
+        int repeat_count;
+        struct dte_struct_t *repeat;
+        if (NULL != buffer) {
+            /* We have a full data description & buffer pointer simultaneously.
+             * It is ambiguous. Throw a warning since the user might have made a
+             * mistake with data reps */
+            MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "Warning: buffer_pointer != NULL for NON-inline data "
+                         "representation: buffer_pointer is ignored");
+        }
+        total_entries_number = count_total_dte_repeat_entries(&data);
+        repeat = data.rep.general_rep->data_representation.data->repeat;
+        repeat_count = data.rep.general_rep->data_representation.data->repeat_count;
+        for (i = 0; i < repeat_count; i++) {
+            for (j = 0; j < repeat[i].n_elements; j++) {
+                char *repeat_unit = (char *) &repeat[i];
+                buf = (void *) (repeat_unit + repeat[i].elements[j].base_offset);
+                len = repeat[i].elements[j].packed_size;
+                send_nb(DTE_BYTE, len, buf, ec_h, grp_h, tag, req);
+            }
+        }
+    }
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    return HCOLL_ERROR;
+}
+
+#undef FUNCNAME
+#define FUNCNAME test
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static int test(rte_request_handle_t * request, int *completed)
+{
+    MPID_Request *req;
+    req = (MPID_Request *) request->data;
+    if (HCOLRTE_REQUEST_ACTIVE != request->status) {
+        *completed = true;
+        return HCOLL_SUCCESS;
+    }
+
+    *completed = (int) MPID_Request_is_complete(req);
+    if (*completed) {
+        MPID_Request_release(req);
+        request->status = HCOLRTE_REQUEST_DONE;
+    }
+
+    return HCOLL_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME ec_handle_compare
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static int ec_handle_compare(rte_ec_handle_t handle_1,
+                             rte_grp_handle_t
+                             group_handle_1,
+                             rte_ec_handle_t handle_2, rte_grp_handle_t group_handle_2)
+{
+    return handle_1.handle == handle_2.handle;
+}
+
+#undef FUNCNAME
+#define FUNCNAME get_ec_handles
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static int get_ec_handles(int num_ec,
+                          int *ec_indexes, rte_grp_handle_t grp_h, rte_ec_handle_t * ec_handles)
+{
+    int i;
+    MPID_Comm *comm;
+    comm = (MPID_Comm *) grp_h;
+    for (i = 0; i < num_ec; i++) {
+        ec_handles[i].rank = ec_indexes[i];
+        ec_handles[i].handle = (void *) (comm->vcr[ec_indexes[i]]);
+    }
+    return HCOLL_SUCCESS;
+}
+
+#undef FUNCNAME
+#define FUNCNAME get_my_ec
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static int get_my_ec(rte_grp_handle_t grp_h, rte_ec_handle_t * ec_handle)
+{
+    MPID_Comm *comm;
+    comm = (MPID_Comm *) grp_h;
+    int my_rank = MPIR_Comm_rank(comm);
+    ec_handle->handle = (void *) (comm->vcr[my_rank]);
+    ec_handle->rank = my_rank;
+    return HCOLL_SUCCESS;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME group_size
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static int group_size(rte_grp_handle_t grp_h)
+{
+    return MPIR_Comm_size((MPID_Comm *) grp_h);
+}
+
+#undef FUNCNAME
+#define FUNCNAME my_rank
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static int my_rank(rte_grp_handle_t grp_h)
+{
+    return MPIR_Comm_rank((MPID_Comm *) grp_h);
+}
+
+#undef FUNCNAME
+#define FUNCNAME ec_on_local_node
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static int ec_on_local_node(rte_ec_handle_t ec, rte_grp_handle_t group)
+{
+    MPID_Comm *comm;
+    MPID_Node_id_t nodeid, my_nodeid;
+    int my_rank;
+    comm = (MPID_Comm *) group;
+    MPID_Get_node_id(comm, ec.rank, &nodeid);
+    my_rank = MPIR_Comm_rank(comm);
+    MPID_Get_node_id(comm, my_rank, &my_nodeid);
+    return (nodeid == my_nodeid);
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME get_world_group_handle
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static rte_grp_handle_t get_world_group_handle(void)
+{
+    return (rte_grp_handle_t) (MPIR_Process.comm_world);
+}
+
+#undef FUNCNAME
+#define FUNCNAME jobid
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static uint32_t jobid(void)
+{
+    /* not used currently */
+    return 0;
+}
+
+#undef FUNCNAME
+#define FUNCNAME group_id
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static int group_id(rte_grp_handle_t group)
+{
+    MPID_Comm *comm;
+    comm = (MPID_Comm *) group;
+    return comm->context_id;
+}
+
+#undef FUNCNAME
+#define FUNCNAME get_coll_handle
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static void *get_coll_handle(void)
+{
+    MPID_Request *req;
+    req = MPID_Request_create();
+    req->kind = MPID_COLL_REQUEST;
+    return (void *) req;
+}
+
+#undef FUNCNAME
+#define FUNCNAME coll_handle_test
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static int coll_handle_test(void *handle)
+{
+    int completed;
+    MPID_Request *req;
+    req = (MPID_Request *) handle;
+    completed = (int) MPID_Request_is_complete(req);
+    return completed;
+}
+
+#undef FUNCNAME
+#define FUNCNAME coll_handle_free
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static void coll_handle_free(void *handle)
+{
+    MPID_Request *req;
+    if (NULL != handle) {
+        req = (MPID_Request *) handle;
+        MPID_Request_release(req);
+    }
+}
+
+#undef FUNCNAME
+#define FUNCNAME coll_handle_complete
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static void coll_handle_complete(void *handle)
+{
+    MPID_Request *req;
+    if (NULL != handle) {
+        req = (MPID_Request *) handle;
+        MPID_Request_set_completed(req);
+    }
+}
+
+#undef FUNCNAME
+#define FUNCNAME world_rank
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static int world_rank(rte_grp_handle_t grp_h, rte_ec_handle_t ec)
+{
+    return ((MPID_VCR) ec.handle)->pg_rank;
+}
diff --git a/src/mpid/common/hcoll/hcollpre.h b/src/mpid/common/hcoll/hcollpre.h
new file mode 100644
index 0000000..03ec11d
--- /dev/null
+++ b/src/mpid/common/hcoll/hcollpre.h
@@ -0,0 +1,10 @@
+#ifndef _HCOLLPRE_H_
+#define _HCOLLPRE_H_
+
+typedef struct {
+    int is_hcoll_init;
+    struct MPID_Collops *hcoll_origin_coll_fns;
+    void *hcoll_context;
+} hcoll_comm_priv_t;
+
+#endif
diff --git a/src/mpid/common/hcoll/subconfigure.m4 b/src/mpid/common/hcoll/subconfigure.m4
new file mode 100644
index 0000000..a55e5dd
--- /dev/null
+++ b/src/mpid/common/hcoll/subconfigure.m4
@@ -0,0 +1,13 @@
+[#] start of __file__
+
+AC_DEFUN([PAC_SUBCFG_PREREQ_]PAC_SUBCFG_AUTO_SUFFIX,[
+	PAC_SET_HEADER_LIB_PATH(hcoll)
+	PAC_CHECK_HEADER_LIB([hcoll/api/hcoll_api.h],[hcoll],[hcoll_init],[have_hcoll=yes],[have_hcoll=no])
+	AM_CONDITIONAL([BUILD_HCOLL],[test "$have_hcoll" = "yes"])
+])dnl end PREREQ
+
+AC_DEFUN([PAC_SUBCFG_BODY_]PAC_SUBCFG_AUTO_SUFFIX,[
+# nothing to do
+])dnl end _BODY
+
+[#] end of __file__

-----------------------------------------------------------------------

Summary of changes:
 src/include/mpiimpl.h                            |    9 +
 src/mpid/ch3/channels/nemesis/src/ch3_progress.c |    7 +
 src/mpid/ch3/channels/sock/src/ch3_progress.c    |   19 +
 src/mpid/ch3/src/ch3u_comm.c                     |   30 ++
 src/mpid/common/Makefile.mk                      |    3 +-
 src/mpid/common/hcoll/Makefile.mk                |   19 +
 src/mpid/common/hcoll/errnames.txt               |    6 +
 src/mpid/common/hcoll/hcoll.h                    |   31 ++
 src/mpid/common/hcoll/hcoll_dtypes.h             |   67 ++++
 src/mpid/common/hcoll/hcoll_init.c               |  212 +++++++++++
 src/mpid/common/hcoll/hcoll_ops.c                |  361 ++++++++++++++++++
 src/mpid/common/hcoll/hcoll_rte.c                |  434 ++++++++++++++++++++++
 src/mpid/common/hcoll/hcollpre.h                 |   10 +
 src/mpid/common/hcoll/subconfigure.m4            |   13 +
 14 files changed, 1220 insertions(+), 1 deletions(-)
 create mode 100644 src/mpid/common/hcoll/Makefile.mk
 create mode 100644 src/mpid/common/hcoll/errnames.txt
 create mode 100644 src/mpid/common/hcoll/hcoll.h
 create mode 100644 src/mpid/common/hcoll/hcoll_dtypes.h
 create mode 100644 src/mpid/common/hcoll/hcoll_init.c
 create mode 100644 src/mpid/common/hcoll/hcoll_ops.c
 create mode 100644 src/mpid/common/hcoll/hcoll_rte.c
 create mode 100644 src/mpid/common/hcoll/hcollpre.h
 create mode 100644 src/mpid/common/hcoll/subconfigure.m4


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list