[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.1-16-g38a5e61

Service Account noreply at mpich.org
Tue Feb 25 18:56:50 CST 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master, has been updated
       via  38a5e617acb34d94963e144aba2cb543a97cf608 (commit)
       via  724e7536d429ff264ef59928f2ffbc9c67f37d42 (commit)
      from  185b0c528ca8a8db48b2b5d703ab84774a2badc5 (commit)

The revisions listed above that are new to this repository have
not appeared in any other notification email, so we list them
in full below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/38a5e617acb34d94963e144aba2cb543a97cf608

commit 38a5e617acb34d94963e144aba2cb543a97cf608
Author: Pavan Balaji <balaji at mcs.anl.gov>
Date:   Mon Feb 24 16:50:18 2014 -0600

    Fix incorrect comments.
    
    The comment in the test program was referring to ARMCI, since it was
    carried over from the ARMCI-MPI test suite.
    
    Signed-off-by: Junchao Zhang <jczhang at mcs.anl.gov>

diff --git a/test/mpi/spawn/pgroup_intercomm_test.c b/test/mpi/spawn/pgroup_intercomm_test.c
index 1b30a7d..c3c199e 100644
--- a/test/mpi/spawn/pgroup_intercomm_test.c
+++ b/test/mpi/spawn/pgroup_intercomm_test.c
@@ -27,7 +27,7 @@ const int verbose = 0;
 void pgroup_create(int grp_size, int *pid_list, MPI_Comm *group_out);
 void pgroup_free(MPI_Comm *group);
 
-/** Free an ARMCI group
+/** Free the group
   */
 void pgroup_free(MPI_Comm *group) {
   /* Note: It's ok to compare predefined handles */
@@ -38,7 +38,7 @@ void pgroup_free(MPI_Comm *group) {
 }
 
 
-/* Create an ARMCI processor group containing the processes in pid_list.
+/* Create a processor group containing the processes in pid_list.
  *
  * NOTE: pid_list list must be identical and sorted on all processes
  */

http://git.mpich.org/mpich.git/commitdiff/724e7536d429ff264ef59928f2ffbc9c67f37d42

commit 724e7536d429ff264ef59928f2ffbc9c67f37d42
Author: Pavan Balaji <balaji at mcs.anl.gov>
Date:   Mon Feb 24 16:48:01 2014 -0600

    Remove armci-mpi.
    
    ARMCI-MPI is released separately.  There's no reason to package it in
    MPICH again.
    
    Fixes #2037.
    
    Signed-off-by: Junchao Zhang <jczhang at mcs.anl.gov>

diff --git a/autogen.sh b/autogen.sh
index 48b3e1d..6b4c5e1 100755
--- a/autogen.sh
+++ b/autogen.sh
@@ -92,7 +92,6 @@ confdb_dirs="${confdb_dirs} src/mpl/confdb"
 confdb_dirs="${confdb_dirs} src/pm/hydra/confdb"
 confdb_dirs="${confdb_dirs} src/pm/hydra/mpl/confdb"
 confdb_dirs="${confdb_dirs} test/mpi/confdb"
-confdb_dirs="${confdb_dirs} src/armci/m4"
 
 # hydra's copy of mpl
 sync_external src/mpl src/pm/hydra/mpl
@@ -159,7 +158,7 @@ export do_build_configure
 MAKE=${MAKE-make}
 
 # external packages that require autogen.sh to be run for each of them
-externals="src/pm/hydra src/mpi/romio src/armci src/pm/mpd src/openpa"
+externals="src/pm/hydra src/mpi/romio src/pm/mpd src/openpa"
 # amdirs are the directories that make use of autoreconf
 amdirs=". src/mpl src/util/logging/rlog"
 
diff --git a/maint/cvardirs b/maint/cvardirs
index 04a2a67..69cc1ee 100644
--- a/maint/cvardirs
+++ b/maint/cvardirs
@@ -1 +1 @@
-src/mpi src/mpi_t src/nameserv src/util src/binding src/include src/mpid src/pmi src/armci src/mutex
+src/mpi src/mpi_t src/nameserv src/util src/binding src/include src/mpid src/pmi src/mutex
diff --git a/maint/errmsgdirs b/maint/errmsgdirs
index 04a2a67..69cc1ee 100644
--- a/maint/errmsgdirs
+++ b/maint/errmsgdirs
@@ -1 +1 @@
-src/mpi src/mpi_t src/nameserv src/util src/binding src/include src/mpid src/pmi src/armci src/mutex
+src/mpi src/mpi_t src/nameserv src/util src/binding src/include src/mpid src/pmi src/mutex
diff --git a/src/armci/COPYRIGHT b/src/armci/COPYRIGHT
deleted file mode 100644
index 0d56e79..0000000
--- a/src/armci/COPYRIGHT
+++ /dev/null
@@ -1,49 +0,0 @@
-
-The following is a notice of limited availability of the code, and disclaimer
-which must be included in the prologue of the code and in all source listings
-of the code.
-
-Copyright (c) 2010   Mathematics and Computer Science, Argonne National Laboratory
-Copyright (c) 2010   Argonne Leadership Computing Facility, Argonne National Laboratory
-Copyright (c) 2010   Futures Laboratory, Oak Ridge National Laboratory
-
-Permission is hereby granted to use, reproduce, prepare derivative works, and
-to redistribute to others.
-
-
-   	       		 LICENSE
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-- Redistributions of source code must retain the above copyright
-  notice, this list of conditions and the following disclaimer.
-
-- Redistributions in binary form must reproduce the above copyright
-  notice, this list of conditions and the following disclaimer listed
-  in this license in the documentation and/or other materials
-  provided with the distribution.
-
-- Neither the name of the copyright holders nor the names of its
-  contributors may be used to endorse or promote products derived from
-  this software without specific prior written permission.
-
-The copyright holders provide no reassurances that the source code
-provided does not infringe any patent, copyright, or any other
-intellectual property rights of third parties.  The copyright holders
-disclaim any liability to any recipient for claims brought against
-recipient by any third party for infringement of that parties
-intellectual property rights.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/src/armci/Makefile.am b/src/armci/Makefile.am
deleted file mode 100644
index 4aafce5..0000000
--- a/src/armci/Makefile.am
+++ /dev/null
@@ -1,55 +0,0 @@
-#
-# Copyright (C) 2010. See COPYRIGHT in top-level directory.
-#
-
-ACLOCAL_AMFLAGS = -I m4
-AM_CPPFLAGS = -I$(top_srcdir)/src
-
-lib_LTLIBRARIES = libarmci.la
-
-# Needed to connect with the GA build system
-noinst_LTLIBRARIES = libarmcii.la
-
-libarmci_la_SOURCES = $(top_srcdir)/src/buffer.c        \
-                      $(top_srcdir)/src/debug.c         \
-                      $(top_srcdir)/src/groups.c        \
-                      $(top_srcdir)/src/internals.c     \
-                      $(top_srcdir)/src/malloc.c        \
-                      $(top_srcdir)/src/gmr.c           \
-                      $(top_srcdir)/src/message.c       \
-                      $(top_srcdir)/src/message_gop.c   \
-                      $(top_srcdir)/src/mutex.c         \
-                      $(top_srcdir)/src/mutex_hdl_queue.c \
-                      $(top_srcdir)/src/onesided.c      \
-                      $(top_srcdir)/src/onesided_nb.c   \
-                      $(top_srcdir)/src/rmw.c           \
-                      $(top_srcdir)/src/strided.c       \
-                      $(top_srcdir)/src/topology.c      \
-                      $(top_srcdir)/src/util.c          \
-                      $(top_srcdir)/src/value_ops.c     \
-                      $(top_srcdir)/src/vector.c        \
-                      $(top_srcdir)/src/init_finalize.c \
-                      $(top_srcdir)/src/conflict_tree.c \
-                      $(top_srcdir)/src/parmci.c
-
-libarmci_la_LDFLAGS = $(libarmci_abi_versionflags)
-
-libarmcii_la_SOURCES = $(libarmci_la_SOURCES)
-libarmcii_la_LDFLAGS = $(libarmci_abi_version)
-
-include_HEADERS = $(top_srcdir)/src/armci.h $(top_srcdir)/src/message.h \
-        $(top_srcdir)/src/armcix.h
-
-bin_PROGRAMS =
-check_PROGRAMS = 
-TESTS = 
-XFAIL_TESTS = 
-
-MPIEXEC = mpiexec -n 2
-TESTS_ENVIRONMENT = $(MPIEXEC)
-
-include benchmarks/Makefile.mk
-include tests/Makefile.mk
-
-.PHONY: checkprogs
-checkprogs: $(check_PROGRAMS)
diff --git a/src/armci/README b/src/armci/README
deleted file mode 100644
index ce7fd0c..0000000
--- a/src/armci/README
+++ /dev/null
@@ -1,152 +0,0 @@
-                    ARMCI on MPI-RMA Implementation Notes
-                       James Dinan <dinan at mcs.anl.gov>
-
-===============================================================================
-Introduction
-===============================================================================
-
-This project provides a full, high performance, portable implementation of the
-ARMCI runtime system using MPI's remote memory access (RMA) functionality.
-
-===============================================================================
-Installing ARMCI-MPI
-===============================================================================
-
-ARMCI-MPI uses autoconf and must be configured before compiling:
-
- $ ./configure
-
-Many configure options are provided, run "configure --help" for details.  After
-configuring the source tree, the code can be built and installed by running:
-
- $ make && make install
-
-ARMCI-MPI can be used with GA 5.0 and later by substituting this directory for
-the "armci" directory in the GA distribution.  The quality of MPI-RMA
-implementations varies.  As of August, 2011 the following MPI implementations are
-known to work correctly with ARMCI-MPI:
-
- + MVAPICH2 1.6
- + MPICH
- + Cray MPI on Cray XE6
- + IBM MPI on BG/P (set ARMCI_STRIDED_METHOD=IOV and ARMCI_IOV_METHOD=BATCHED)
- + OpenMPI 1.5.4 (set ARMCI_STRIDED_METHOD=IOV and ARMCI_IOV_METHOD=BATCHED)
-
-The following MPI implementations are known to fail with ARMCI-MPI:
-
- - MVAPICH2 prior to 1.6
-
-===============================================================================
-The ARMCI-MPI Test Suite
-===============================================================================
-
-ARMCI-MPI includes a set of testing and benchmark programs located under tests/
-and benchmarks/.  These programs can be compiled and run via:
-
-$ make check MPIEXEC="mpiexec -n 4"
-
-The MPIEXEC variable is optional and is used to override the default MPI launch
-command.  If you want only to build the test suite, the following target can be
-used:
-
-$ make checkprogs
-
-===============================================================================
-ARMCI-MPI Errata
-===============================================================================
-
-Direct access to local buffers:
-
- * Because of MPI's semantics, you are not allowed to access shared memory
-   directly, it must be through put/get.  Alternatively you can use the 
-   new ARMCI_Access_begin/end() functions.
-   
-Progress semantics:
-
- * On some MPI implementations and networks you may need to enable implicit
-   progress.  In many cases this is done through an environment variable.  For
-   MPICH: set MPICH_ASYNC_PROGRESS; for MVAPICH2 recompile with
-   --enable-async-progress and set MPICH_ASYNC_PROGRESS; set DCMF_INTERRUPTS=1
-   for MPICH-BG; etc.
-
-===============================================================================
-Environment Variables:
-===============================================================================
-
-Boolean environment variables are enabled when set to a value beginning with
-'t', 'T', 'y', 'Y', or '1'; any other value is interpreted as false.
-
- -------------------
-: Debugging Options :
- -------------------
-
-ARMCI_VERBOSE (boolean)
-
-  Enable extra status output from ARMCI-MPI.
-
-ARMCI_DEBUG_ALLOC (boolean)
-
-  Turn on extra shared allocation debugging.
-
-ARMCI_FLUSH_BARRIERS (boolean)
-
-  Enable/disable extra communication flushing in ARMCI_Barrier.  Extra flushes
-  are present to help make unsafe DLA safer.
-
- ---------------------
-: Performance Options :
- ---------------------
-
-ARMCI_CACHE_RANK_TRANSLATION (boolean)
-
-  Create a table to more quickly translate between absolute and group ranks.
-
- --------------------------
-: Noncollective Groups     :
- --------------------------
-
-ARMCI_NONCOLLECTIVE_GROUPS (boolean)
-
-  Enable noncollective ARMCI group formation; group creation is collective on
-  the output group rather than the parent group.
-
- --------------------------
-: Shared Buffer Protection :
- --------------------------
-
-ARMCI_SHR_BUF_METHOD = { COPY (default), NOGUARD }
-
-  ARMCI policy for managing shared origin buffers in communication operations:
-  lock the buffer (unsafe, but fast), copy the buffer (safe), or don't guard
-  the buffer - assume that the system is cache coherent and MPI supports
-  unlocked load/store.
-
- --------------------
-: I/O Vector Options :
- --------------------
-
-ARMCI_IOV_METHOD = { AUTO (default), CONSRV, BATCHED, DIRECT }
-
-  Select the IO vector communication strategy: automatic; a "conservative"
-  implementation that does lock/unlock around each operation; an implementation
-  that issues batches of operations within a single lock/unlock epoch; and a
-  direct implementation that generates datatypes for the origin and target and
-  issues a single operation using them.
-
-ARMCI_IOV_CHECKS (boolean)
-
-  Enable (expensive) IOV safety/debugging checks (not recommended for
-  performance runs).
-
-ARMCI_IOV_BATCHED_LIMIT = { 0 (default), 1, ... }
-
-  Set the maximum number of one-sided operations per epoch for the BATCHED IOV
-  method.  Zero (default) is unlimited.
-  
- -----------------
-: Strided Options :
- -----------------
-
-ARMCI_STRIDED_METHOD = { DIRECT (default), IOV }
-
-  Select the method for processing strided operations.
diff --git a/src/armci/VERSION b/src/armci/VERSION
deleted file mode 100644
index 0eb786c..0000000
--- a/src/armci/VERSION
+++ /dev/null
@@ -1,20 +0,0 @@
-# Version
-# ARMCI-MPI Revision number: 459
-ARMCI_VERSION=0.1
-
-# For libtool ABI versioning rules see:
-# http://www.nondot.org/sabre/Mirrored/libtool-2.1a/libtool_6.html#SEC36
-
-#     1. If the library source code has changed at all since the last
-#     update, then increment revision (`c:r:a' becomes `c:r+1:a').
-#
-#     2. If any interfaces have been added, removed, or changed since
-#     the last update, increment current, and set revision to 0.
-#
-#     3. If any interfaces have been added since the last public
-#     release, then increment age.
-#
-#     4. If any interfaces have been removed since the last public
-#     release, then set age to 0.
-
-libarmci_abi_version=1:0:0
diff --git a/src/armci/autogen.sh b/src/armci/autogen.sh
deleted file mode 100755
index 88fad80..0000000
--- a/src/armci/autogen.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#! /bin/sh
-
-if [ -n "$MPICH_AUTOTOOLS_DIR" ] ; then
-    autoreconf=${MPICH_AUTOTOOLS_DIR}/autoreconf
-else
-    autoreconf=${AUTORECONF:-autoreconf}
-fi
-
-$autoreconf ${autoreconf_args:-"-vif"}
diff --git a/src/armci/benchmarks/Makefile.mk b/src/armci/benchmarks/Makefile.mk
deleted file mode 100644
index 4ff84ec..0000000
--- a/src/armci/benchmarks/Makefile.mk
+++ /dev/null
@@ -1,22 +0,0 @@
-#
-# Copyright (C) 2010. See COPYRIGHT in top-level directory.
-#
-
-check_PROGRAMS += benchmarks/ping-pong          \
-                  benchmarks/ring-flood         \
-                  benchmarks/contiguous-bench   \
-                  benchmarks/strided-bench      \
-                  benchmarks/bench_groups       \
-                  # end
-
-TESTS          += benchmarks/ping-pong          \
-                  benchmarks/ring-flood         \
-                  benchmarks/contiguous-bench   \
-                  benchmarks/strided-bench      \
-                  # end
-
-benchmarks_ping_pong_LDADD = libarmci.la
-benchmarks_ring_flood_LDADD = libarmci.la
-benchmarks_contiguous_bench_LDADD = libarmci.la -lm
-benchmarks_strided_bench_LDADD = libarmci.la -lm
-benchmarks_bench_groups_LDADD = libarmci.la -lm
diff --git a/src/armci/benchmarks/bench_groups.c b/src/armci/benchmarks/bench_groups.c
deleted file mode 100644
index 265d4a1..0000000
--- a/src/armci/benchmarks/bench_groups.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <armci.h>
-#include <armci_internals.h>
-
-#define PART_SIZE 1
-
-int main(int argc, char **argv) {
-  int                      me, nproc;
-  int                      i, *procs;
-  ARMCI_Group              g_world, g_odd, g_even;
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &me);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  procs = malloc(sizeof(int) * ( nproc/2 + (nproc % 2 ? 1 : 0 )));
-
-  if (me == 0) printf("ARMCI Group test starting on %d procs\n", nproc);
-
-  ARMCI_Group_get_world(&g_world);
-  
-  if (me == 0) printf(" + Creating odd group\n");
-
-  for (i = 1; i < nproc; i += 2) {
-    procs[i/2] = i;
-  }
-
-  ARMCI_Group_create_child(i/2, procs, &g_odd, &g_world);
-
-  if (me == 0) printf(" + Creating even group\n");
-
-  for (i = 0; i < nproc; i += 2) {
-    procs[i/2] = i;
-  }
-
-  ARMCI_Group_create_child(i/2, procs, &g_even, &g_world);
-
-  /***********************************************************************/
-  {
-    int    grp_me, grp_nproc;
-    double t_abs_to_grp, t_grp_to_abs;
-    const int iter = 1000000;
-
-    if (me == 0) {
-      ARMCI_Group_rank(&g_even, &grp_me);
-      ARMCI_Group_size(&g_even, &grp_nproc);
-
-      t_abs_to_grp = MPI_Wtime();
-
-      for (i = 0; i < iter; i++)
-        ARMCII_Translate_absolute_to_group(&g_even, (grp_me+1) % grp_nproc);
-
-      t_abs_to_grp = MPI_Wtime() - t_abs_to_grp;
-
-      t_grp_to_abs = MPI_Wtime();
-
-      for (i = 0; i < iter; i++)
-        ARMCI_Absolute_id(&g_even, (grp_me+1) % grp_nproc);
-
-      t_grp_to_abs = MPI_Wtime() - t_grp_to_abs;
-
-      printf("t_abs_to_grp = %f us, t_grp_to_abs = %f us\n", t_abs_to_grp/iter * 1.0e6, t_grp_to_abs/iter * 1.0e6);
-    }
-
-    ARMCI_Barrier();
-  }
-  /***********************************************************************/
-
-  if (me == 0) printf(" + Freeing groups\n");
-
-  if (me % 2 > 0)
-    ARMCI_Group_free(&g_odd);
-  else
-    ARMCI_Group_free(&g_even);
-
-  free(procs);
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/benchmarks/contiguous-bench.c b/src/armci/benchmarks/contiguous-bench.c
deleted file mode 100644
index f1926d1..0000000
--- a/src/armci/benchmarks/contiguous-bench.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <string.h>
-
-#include <mpi.h>
-#include <armci.h>
-#ifdef MODE_SET
-#include <armcix.h>
-#endif
-
-#define MAX_DATA_SIZE   (1024*128*16)
-#define NUM_ITERATIONS  ((data_size <= 2048) ? 4096 : ((data_size <= 16384) ? 1024 : 512))
-#define NUM_WARMUP_ITER 1 
-
-int main(int argc, char ** argv) {
-  int    rank, nproc, test_iter, target_rank, data_size;
-#ifdef MULTIPLE
-  inr    thread_level;
-#endif
-  int   *buf;
-  void **base_ptrs;
-#ifdef MODE_SET
-  ARMCI_Group grp_world;
-#endif
-
-#ifdef MULTIPLE
-  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &thread_level);
-#else
-  MPI_Init(&argc, &argv);
-#endif
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (rank == 0) printf("Starting one-sided contiguous performance test with %d processes\n", nproc);
-
-  buf = ARMCI_Malloc_local(MAX_DATA_SIZE);
-  base_ptrs = malloc(sizeof(void*)*nproc);
-  ARMCI_Malloc(base_ptrs, MAX_DATA_SIZE);
-
-  memset(buf, rank, MAX_DATA_SIZE);
-
-#ifdef MODE_SET
-  ARMCI_Group_get_default(&grp_world);
-
-  if (getenv("ARMCIX_MODE_SET"))
-    ARMCIX_Mode_set(ARMCIX_MODE_CONFLICT_FREE | ARMCIX_MODE_NO_LOAD_STORE, base_ptrs[rank], &grp_world);
-  else if (rank == 0)
-    printf("Warning: ARMCIX_MODE_SET not enabled\n");
-#endif
-
-  if (rank == 0)
-    printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", "Trg. Rank", "Xfer Size",
-        "Get (usec)", "Put (usec)", "Acc (usec)",
-        "Get (MiB/s)", "Put (MiB/s)", "Acc (MiB/s)");
-
-  for (target_rank = 1; rank == 0 && target_rank < nproc; target_rank++) {
-    for (data_size = sizeof(double); data_size <= MAX_DATA_SIZE; data_size *= 2) {
-      double t_get, t_put, t_acc;
-
-      for (test_iter = 0; test_iter < NUM_ITERATIONS + NUM_WARMUP_ITER; test_iter++) {
-        if (test_iter == NUM_WARMUP_ITER)
-          t_get = MPI_Wtime();
-
-        ARMCI_Get(base_ptrs[target_rank], buf, data_size, target_rank);
-      }
-      t_get = (MPI_Wtime() - t_get)/NUM_ITERATIONS;
-
-      for (test_iter = 0; test_iter < NUM_ITERATIONS + NUM_WARMUP_ITER; test_iter++) {
-        if (test_iter == NUM_WARMUP_ITER)
-          t_put = MPI_Wtime();
-
-        ARMCI_Put(buf, base_ptrs[target_rank], data_size, target_rank);
-      }
-      ARMCI_Fence(target_rank);
-      t_put = (MPI_Wtime() - t_put)/NUM_ITERATIONS;
-
-      for (test_iter = 0; test_iter < NUM_ITERATIONS + NUM_WARMUP_ITER; test_iter++) {
-        double scale = 1.0;
-
-        if (test_iter == NUM_WARMUP_ITER)
-          t_acc = MPI_Wtime();
-#ifdef NO_ACC
-        int stride = 0;
-        ARMCI_AccS(ARMCI_ACC_DBL, &scale, buf, &stride, base_ptrs[target_rank], &stride, &data_size, 0, target_rank);
-#else
-        ARMCI_Acc(ARMCI_ACC_DBL, &scale, buf, base_ptrs[target_rank], data_size, target_rank);
-#endif
-      }
-      ARMCI_Fence(target_rank);
-      t_acc = (MPI_Wtime() - t_acc)/NUM_ITERATIONS;
-
-      printf("%12d %12d %12.3f %12.3f %12.3f %12.3f %12.3f %12.3f\n", target_rank, data_size,
-          t_get*1.0e6, t_put*1.0e6, t_acc*1.0e6, data_size/(1024.0*1024.0)/t_get, data_size/(1024.0*1024.0)/t_put, data_size/(1024.0*1024.0)/t_acc);
-    }
-  }
-
-  ARMCI_Barrier();
-
-  ARMCI_Free(base_ptrs[rank]);
-  ARMCI_Free_local(buf);
-  free(base_ptrs);
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/benchmarks/ping-pong.c b/src/armci/benchmarks/ping-pong.c
deleted file mode 100644
index d1071ba..0000000
--- a/src/armci/benchmarks/ping-pong.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <stdint.h>
-
-#include <mpi.h>
-#include <armci.h>
-
-#define MAX_SIZE   262144
-#define NUM_ROUNDS 1000
-
-int main(int argc, char **argv) {
-  int        me, nproc, zero, target;
-  int        msg_length, round, i;
-  double     t_start, t_stop;
-  uint8_t  *snd_buf;  // Send buffer (byte array)
-  uint8_t **rcv_buf;  // Receive buffer (byte array)
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &me);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (nproc < 2)
-    ARMCI_Error("This benchmark should be run on at least two processes", 1);
-
-  if (me == 0)
-    printf("ARMCI ping-pong latency test, performing %d rounds at each xfer size.\n", NUM_ROUNDS);
-
-  rcv_buf = malloc(nproc*sizeof(void*));
-
-  ARMCI_Malloc((void*)rcv_buf, MAX_SIZE);
-  snd_buf = ARMCI_Malloc_local(MAX_SIZE);
-
-  zero = 0;
-
-  for (i = 0; i < MAX_SIZE; i++) {
-    snd_buf[i] = 1;
-  }
-
-  for (target = 1; target < nproc; target++) {
-    if (me == 0) printf("\n========== Process pair: %d and %d ==========\n\n", 0, target);
-
-    for (msg_length = 1; msg_length <= MAX_SIZE; msg_length *= 2) {
-      ARMCI_Barrier();
-      t_start = MPI_Wtime();
-
-      if (me == 0 || me == target) {
-
-        // Perform NUM_ROUNDS ping-pongs
-        for (round = 0; round < NUM_ROUNDS*2; round++) {
-          int my_target = me == 0 ? target : 0;
-
-          // I am the sender
-          if (round % 2 == me) {
-            if ((round % 2 == 0 && me == 0) || (round % 2 != 0 && me != 0)) {
-              // Clear start and end markers for next round
-#ifdef DIRECT_ACCESS
-              ((uint8_t*)rcv_buf[me])[0] = 0;
-              ((uint8_t*)rcv_buf[me])[msg_length-1] = 0;
-#else
-              ARMCI_Put(&zero, &(((uint8_t*)rcv_buf[me])[0]),            1, me);
-              ARMCI_Put(&zero, &(((uint8_t*)rcv_buf[me])[msg_length-1]), 1, me);
-#endif
-
-              ARMCI_Put(snd_buf, rcv_buf[my_target], msg_length, my_target);
-              ARMCI_Fence(my_target); // This is optional, we don't need notification
-            }
-
-            // I am the receiver
-            else {
-#ifdef DIRECT_ACCESS
-              while (((volatile uint8_t*)rcv_buf[me])[0] == 0) ;
-              while (((volatile uint8_t*)rcv_buf[me])[msg_length-1] == 0) ;
-#else
-              uint8_t val;
-
-              do {
-                ARMCI_Get(&(((uint8_t*)rcv_buf[me])[0]), &val, 1, me);
-              } while (val == 0);
-
-              do {
-                ARMCI_Get(&(((uint8_t*)rcv_buf[me])[msg_length-1]), &val, 1, me);
-              } while (val == 0);
-#endif
-            }
-          }
-        }
-      }
-
-      ARMCI_Barrier(); // FIXME: Time here increases with nproc :(
-      t_stop = MPI_Wtime();
-
-      if (me == 0)
-        printf("%8d bytes \t %12.8f us\n", msg_length, (t_stop-t_start)/NUM_ROUNDS*1.0e6);
-    }
-
-    ARMCI_Barrier();
-  }
-
-  ARMCI_Free(rcv_buf[me]);
-  free(rcv_buf);
-  ARMCI_Free_local(snd_buf);
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/benchmarks/ring-flood.c b/src/armci/benchmarks/ring-flood.c
deleted file mode 100644
index ea6523d..0000000
--- a/src/armci/benchmarks/ring-flood.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <stdint.h>
-
-#include <mpi.h>
-#include <armci.h>
-
-#define MAX_XFER_SIZE 8192
-#define NUM_XFERS     1024
-
-int main(int argc, char **argv) {
-  int          me, nproc;
-  int          msg_length, i;
-  double       t_start, t_stop;
-  armci_hdl_t *handles;  // Non-blocking handles (NUM_XFERS)
-  uint8_t    *snd_buf;  // Send buffer    (MAX_XFER_SIZE)
-  uint8_t   **rcv_buf;  // Receive buffer (MAX_XFER_SIZE * NUM_XFERS)
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &me);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (nproc < 2)
-    ARMCI_Error("This benchmark should be run on at least two processes", 1);
-
-  if (me == 0)
-    printf("ARMCI flood bandwidth test, performing %d non-blocking xfers at each size.\n\n", NUM_XFERS);
-
-  handles = malloc(NUM_XFERS*sizeof(armci_hdl_t));
-
-  rcv_buf = malloc(nproc*sizeof(void*));
-  ARMCI_Malloc((void*)rcv_buf, MAX_XFER_SIZE*NUM_XFERS);
-
-  snd_buf = ARMCI_Malloc_local(MAX_XFER_SIZE);
-
-  for (i = 0; i < MAX_XFER_SIZE; i++) {
-    snd_buf[i] = (uint8_t) me;
-  }
-
-  for (msg_length = 1; msg_length <= MAX_XFER_SIZE; msg_length *= 2) {
-    int xfer;
-
-    for (xfer = 0; xfer < NUM_XFERS; xfer++)
-      ARMCI_INIT_HANDLE(&handles[xfer]);
-
-    ARMCI_Barrier();
-    t_start = MPI_Wtime();
-
-    // Initiate puts, perform NUM_XFERS NB puts to my right neighbor
-    for (xfer = 0; xfer < NUM_XFERS; xfer++) {
-       ARMCI_NbPut(snd_buf, ((uint8_t*)rcv_buf[(me+1)%nproc])+msg_length*xfer,
-           msg_length, (me+1)%nproc, &handles[xfer]);
-    }
-
-    // Wait for completion
-    for (xfer = 0; xfer < NUM_XFERS; xfer++)
-      ARMCI_Wait(&handles[xfer]);
-
-    ARMCI_Barrier();
-    t_stop = MPI_Wtime();
-
-    if (me == 0)
-      printf("%8d bytes \t %12.8f sec \t %12.8f GB/s\n", 
-          msg_length*NUM_XFERS, (t_stop-t_start),
-          (msg_length*NUM_XFERS)/(t_stop-t_start)/1.0e9);
-  }
-
-  ARMCI_Free(rcv_buf[me]);
-  free(rcv_buf);
-  free(handles);
-  ARMCI_Free_local(snd_buf);
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/benchmarks/strided-bench.c b/src/armci/benchmarks/strided-bench.c
deleted file mode 100644
index 7e64262..0000000
--- a/src/armci/benchmarks/strided-bench.c
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <string.h>
-
-#include <mpi.h>
-#include <armci.h>
-#ifdef MODE_SET
-#include <armcix.h>
-#endif
-
-#define MAX_XDIM        1024
-#define MAX_YDIM        1024
-
-#define MAX_DATA_SIZE   (MAX_XDIM*MAX_YDIM*sizeof(double))
-#define NUM_ITERATIONS  ((xdim*ydim <= 1024) ? 64 : 16)
-#define NUM_WARMUP_ITER 1 
-
-int main(int argc, char ** argv) {
-  int    rank, nproc;
-#ifdef MULTIPLE
-  int    thread_level;
-#endif
-  int    target_rank, xdim, ydim, test_iter;
-  int    stride[1], count[2], levels;
-  double scale;
-  int   *buf;
-  void **base_ptrs;
-#ifdef MODE_SET
-  ARMCI_Group grp_world;
-#endif
-
-#ifdef MULTIPLE
-  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &thread_level);
-#else
-  MPI_Init(&argc, &argv);
-#endif
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (rank == 0) printf("Starting one-sided strided performance test with %d processes\n", nproc);
-
-  buf = ARMCI_Malloc_local(MAX_DATA_SIZE);
-  base_ptrs = malloc(sizeof(void*)*nproc);
-  ARMCI_Malloc(base_ptrs, MAX_DATA_SIZE);
-
-  memset(buf, rank+1, MAX_DATA_SIZE);
-
-#ifdef MODE_SET
-  ARMCI_Group_get_default(&grp_world);
-
-  if (getenv("ARMCIX_MODE_SET"))
-    ARMCIX_Mode_set(ARMCIX_MODE_CONFLICT_FREE | ARMCIX_MODE_NO_LOAD_STORE, base_ptrs[rank], &grp_world);
-  else if (rank == 0)
-    printf("Warning: ARMCIX_MODE_SET not enabled\n");
-#endif
-
-  if (rank == 0)
-    printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", "Trg. Rank", "Xdim Ydim",
-        "Get (usec)", "Put (usec)", "Acc (usec)",
-        "Get (MiB/s)", "Put (MiB/s)", "Acc (MiB/s)");
-
-  stride[0] = MAX_XDIM*sizeof(double);
-  levels    = 1;
-  scale     = 1.0;
-
-  for (target_rank = 1; rank == 0 && target_rank < nproc; target_rank++) {
-
-    for (xdim = 1; xdim <= MAX_XDIM; xdim *= 2) {
-      count[0] = xdim*sizeof(double);
-
-      for (ydim = 1; ydim <= MAX_YDIM; ydim *= 2) {
-        const int data_size = xdim*ydim*sizeof(double);
-        double    t_get, t_put, t_acc;
-
-        count[1] = ydim;
-
-        for (test_iter = 0; test_iter < NUM_ITERATIONS + NUM_WARMUP_ITER; test_iter++) {
-          if (test_iter == NUM_WARMUP_ITER)
-            t_put = MPI_Wtime();
-
-          ARMCI_PutS(buf, stride, base_ptrs[target_rank], stride, count, levels, target_rank);
-        }
-        ARMCI_Fence(target_rank);
-        t_put = (MPI_Wtime() - t_put)/NUM_ITERATIONS;
-
-        for (test_iter = 0; test_iter < NUM_ITERATIONS + NUM_WARMUP_ITER; test_iter++) {
-          if (test_iter == NUM_WARMUP_ITER)
-            t_acc = MPI_Wtime();
-
-          ARMCI_AccS(ARMCI_ACC_DBL, (void*) &scale, buf, stride, base_ptrs[target_rank], stride, count, levels, target_rank);
-        }
-        ARMCI_Fence(target_rank);
-        t_acc = (MPI_Wtime() - t_acc)/NUM_ITERATIONS;
-
-        for (test_iter = 0; test_iter < NUM_ITERATIONS + NUM_WARMUP_ITER; test_iter++) {
-          if (test_iter == NUM_WARMUP_ITER)
-            t_get = MPI_Wtime();
-
-          ARMCI_GetS(base_ptrs[target_rank], stride, buf, stride, count, levels, target_rank);
-        }
-        t_get = (MPI_Wtime() - t_get)/NUM_ITERATIONS;
-
-        printf("%12d %6d%6d %12.3f %12.3f %12.3f %12.3f %12.3f %12.3f\n", target_rank, xdim, ydim,
-            t_get*1.0e6, t_put*1.0e6, t_acc*1.0e6, data_size/(1024.0*1024.0)/t_get, data_size/(1024.0*1024.0)/t_put, data_size/(1024.0*1024.0)/t_acc);
-      }
-    }
-  }
-
-  ARMCI_Barrier();
-
-  ARMCI_Free(base_ptrs[rank]);
-  ARMCI_Free_local(buf);
-  free(base_ptrs);
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/configure.ac b/src/armci/configure.ac
deleted file mode 100644
index c146c45..0000000
--- a/src/armci/configure.ac
+++ /dev/null
@@ -1,196 +0,0 @@
-dnl
-dnl Copyright (C) 2010. See COPYRIGHT in top-level directory.
-dnl
-
-AC_PREREQ(2.62)
-
-AC_INIT([armci],[0])
-AC_CONFIG_AUX_DIR(m4)
-AC_CONFIG_MACRO_DIR(m4)
-AM_INIT_AUTOMAKE([-Wall -Werror -Wno-portability-recursive foreign 1.12.3 color-tests parallel-tests subdir-objects])
-
-LT_PREREQ([2.2.6])
-
-## Bug in libtool adds -O2 and -g by default
-if test ! -z "$MPICC" ; then
-   CC=$MPICC
-   export CC
-fi
-PAC_PUSH_FLAG(CFLAGS)
-AC_PROG_CC(mpicc)
-AM_PROG_CC_C_O
-
-AM_PROG_AR
-
-LT_INIT()
-PAC_POP_FLAG(CFLAGS)
-
-## Non-verbose make
-m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
-
-## Version checks
-if test -s "$srcdir/VERSION" ; then
-    . $srcdir/VERSION
-    export ARMCI_VERSION
-else
-    AC_MSG_ERROR([Version information not found. Configuration aborted.])
-fi
-
-# ABI version
-AC_SUBST(libarmci_abi_version)
-
-# Release version
-# Produce a numeric version assuming the following format:
-# Version: [MAJ].[MIN].[REV][EXT][EXT_NUMBER]
-# Example: 1.0.7rc1 has
-#          MAJ = 1
-#          MIN = 0
-#          REV = 7
-#          EXT = rc
-#          EXT_NUMBER = 1
-#
-# Converting to numeric version will convert EXT to a format number:
-#          ALPHA (a) = 0
-#          BETA (b)  = 1
-#          RC (rc)   = 2
-#          PATCH (p) = 3
-# Regular releases are treated as patch 0
-#
-# Numeric version will have 1 digit for MAJ, 2 digits for MIN,
-# 2 digits for REV, 1 digit for EXT and 2 digits for EXT_NUMBER.
-changequote(<<,>>)
-V1=`expr $ARMCI_VERSION : '\([0-9]*\)\.[0-9]*\.*[0-9]*[a-zA-Z]*[0-9]*'`
-V2=`expr $ARMCI_VERSION : '[0-9]*\.\([0-9]*\)\.*[0-9]*[a-zA-Z]*[0-9]*'`
-V3=`expr $ARMCI_VERSION : '[0-9]*\.[0-9]*\.*\([0-9]*\)[a-zA-Z]*[0-9]*'`
-V4=`expr $ARMCI_VERSION : '[0-9]*\.[0-9]*\.*[0-9]*\([a-zA-Z]*\)[0-9]*'`
-V5=`expr $ARMCI_VERSION : '[0-9]*\.[0-9]*\.*[0-9]*[a-zA-Z]*\([0-9]*\)'`
-changequote([,])
-
-if test "$V2" -le 9 ; then V2=0$V2 ; fi
-if test "$V3" = "" ; then V3=0; fi
-if test "$V3" -le 9 ; then V3=0$V3 ; fi
-if test "$V4" = "a" ; then
-    V4=0
-elif test "$V4" = "b" ; then
-    V4=1
-elif test "$V4" = "rc" ; then
-    V4=2
-elif test "$V4" = "" ; then
-    V4=3
-    V5=0
-elif test "$V4" = "p" ; then
-    V4=3
-fi
-if test "$V5" -le 9 ; then V5=0$V5 ; fi
-
-ARMCI_NUMVERSION=`expr $V1$V2$V3$V4$V5 + 0`
-AC_SUBST(ARMCI_NUMVERSION)
-AC_SUBST(ARMCI_VERSION)
-
-AC_CONFIG_HEADER(src/armciconf.h)
-AH_TOP([/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/*
- *  (C) 2010 by Argonne National Laboratory.
- *      See COPYRIGHT in top-level directory.
- */
-#ifndef _ARMCICONF_H_
-#define _ARMCICONF_H_
-])
-AH_BOTTOM([#endif /* _ARMCICONF_H_ */])
-
-PAC_ARG_STRICT
-PAC_CC_FUNCTION_NAME_SYMBOL
-
-## Error checking functionality
-#AC_ARG_ENABLE(error-checking,
-	#AC_HELP_STRING([--enable-error-checking],[Enable error checking functionality]),
-	#enable_error_checking=$enableval,
-	#enable_error_checking=yes)
-#if test "$enable_error_checking" = "yes" ; then
-   #AC_DEFINE(ERROR_CHECKING,1,[Define if error checking is enabled])
-#fi
-
-## Check if __VA_ARGS__ is defined by the compiler
-PAC_C_MACRO_VA_ARGS
-
-## const and restrict
-AC_C_CONST
-AC_C_RESTRICT
-
-## Check for C99
-AC_PROG_CC_C99
-if test "$ac_cv_prog_cc_c99" = "no" ; then
-   AC_ERROR([C99 not supported by the compiler])
-fi
-
-AC_CHECK_HEADERS([execinfo.h stdint.h inttypes.h])
-AC_TYPE_UINT8_T
-
-## Debugging support
-AC_ARG_ENABLE(g, AC_HELP_STRING([--enable-g],[Enable Debugging]),
-                 [ debug=$enableval ],
-                 [ debug=no ])
-AC_MSG_CHECKING(debugging support)
-AC_MSG_RESULT($debug)
-if test "$debug" = "yes"; then
-   CFLAGS="$CFLAGS -g -O0"
-fi
-
-## Safety checks
-AC_ARG_ENABLE(safety-checks, AC_HELP_STRING([--disable-safety-checks],[Disable safety checks for better performance]),
-                 [ safety_enabled=$enableval ],
-                 [ safety_enabled=yes ])
-AC_MSG_CHECKING(whether safety checks are enabled)
-AC_MSG_RESULT($safety_enabled)
-if test "$safety_enabled" = "no"; then
-   AC_DEFINE(NO_SEATBELTS,1,[Defined when safety checks are disabled])
-fi
-
-## ARMCI Groups
-AC_ARG_ENABLE(armci-group, AC_HELP_STRING([--enable-armci-group],[Enable ARMCI subset-collective group formation]),
-                 [ armci_group_enabled=$enableval ],
-                 [ armci_group_enabled=no ])
-AC_MSG_CHECKING(whether ARMCI subset-collective group formation is enabled)
-AC_MSG_RESULT($armci_group_enabled)
-if test "$armci_group_enabled" = "yes"; then
-   AC_DEFINE(ARMCI_GROUP,1,[Defined when ARMCI subset-collective group formation is enabled])
-fi
-
-# Check for support for weak symbols.
-AC_ARG_ENABLE(weak-symbols, AC_HELP_STRING([--enable-weak-symbols],
-                 [Use weak symbols to implement PARMCI routines (default)]),,
-                 enable_weak_symbols=yes)
-if test $enable_weak_symbols = yes ; then
-    # Turn off weak symbols if they aren't available
-    PAC_PROG_C_WEAK_SYMBOLS(,enable_weak_symbols=no)
-fi
-if test $enable_weak_symbols = "yes" ; then
-    AC_DEFINE(USE_WEAK_SYMBOLS,1,[Define if weak symbols should be used])
-    # Check for the ability to support multiple weak symbols
-    if test "$pac_cv_prog_c_weak_symbols" = "pragma weak" ; then
-       PAC_PROG_C_MULTIPLE_WEAK_SYMBOLS(AC_DEFINE(HAVE_MULTIPLE_PRAGMA_WEAK,1,[Define if multiple weak symbols may be defined]))
-    fi
-fi
-
-## Enable creation of libtool-style versioning or no versioning
-AC_ARG_ENABLE(versioning,
-        [AC_HELP_STRING([--enable-versioning],[Enable library versioning])],,
-        [enable_versioning=yes])
-
-if test "$enable_versioning" = "yes" ; then
-   libarmci_abi_versionflags="-version-info \$(libarmci_abi_version)"
-else
-   libarmci_abi_versionflags="-avoid-version"
-fi
-export libarmci_abi_versionflags
-AC_SUBST(libarmci_abi_versionflags)
-
-
-## Documentation
-AC_PATH_PROG([DOXYGEN],[doxygen],,$PATH)
-AC_SUBST(DOXYGEN)
-
-AC_SUBST(top_srcdir)
-
-## Final output
-AC_OUTPUT(Makefile)
diff --git a/src/armci/src/armci.h b/src/armci/src/armci.h
deleted file mode 100644
index 2a4dd4d..0000000
--- a/src/armci/src/armci.h
+++ /dev/null
@@ -1,282 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#ifndef _ARMCI_H_
-#define _ARMCI_H_
-
-#include <mpi.h>
-
-enum  ARMCI_Acc_e { ARMCI_ACC_INT /*     int */, ARMCI_ACC_LNG /*           long */,
-                    ARMCI_ACC_FLT /*   float */, ARMCI_ACC_DBL /*         double */,
-                    ARMCI_ACC_CPL /* complex */, ARMCI_ACC_DCP /* double complex */ };
-
-typedef long armci_size_t;
-
-int   ARMCI_Init(void);
-int   ARMCI_Init_args(int *argc, char ***argv);
-int   ARMCI_Initialized(void);
-   
-int   ARMCI_Finalize(void);
-void  ARMCI_Cleanup(void);
-
-void  ARMCI_Error(char *msg, int code);
-
-int   ARMCI_Malloc(void **base_ptrs, armci_size_t size);
-int   ARMCI_Free(void *ptr);
-
-void *ARMCI_Malloc_local(armci_size_t size);
-int   ARMCI_Free_local(void *ptr);
-
-void  ARMCI_Barrier(void);
-void  ARMCI_Fence(int proc);
-void  ARMCI_AllFence(void);
-
-void  ARMCI_Access_begin(void *ptr); /* NEW API */
-void  ARMCI_Access_end(void *ptr);   /* NEW API */
-
-void  ARMCI_Copy(void *src, void *dst, int size);
-
-int   ARMCI_Get(void *src, void *dst, int size, int target);
-int   ARMCI_Put(void *src, void *dst, int size, int target);
-int   ARMCI_Acc(int datatype, void *scale, void *src, void *dst, int bytes, int proc);
-
-int   ARMCI_PutS(void *src_ptr, int src_stride_ar[/*stride_levels*/],
-                 void *dst_ptr, int dst_stride_ar[/*stride_levels*/], 
-                 int count[/*stride_levels+1*/], int stride_levels, int proc);
-int   ARMCI_GetS(void *src_ptr, int src_stride_ar[/*stride_levels*/],
-                 void *dst_ptr, int dst_stride_ar[/*stride_levels*/], 
-                 int count[/*stride_levels+1*/], int stride_levels, int proc);
-int   ARMCI_AccS(int datatype, void *scale,
-                 void *src_ptr, int src_stride_ar[/*stride_levels*/],
-                 void *dst_ptr, int dst_stride_ar[/*stride_levels*/],
-                 int count[/*stride_levels+1*/], int stride_levels, int proc);
-
-int   ARMCI_Put_flag(void *src, void* dst, int size, int *flag, int value, int proc);
-int   ARMCI_PutS_flag(void *src_ptr, int src_stride_ar[/*stride_levels*/],
-                 void *dst_ptr, int dst_stride_ar[/*stride_levels*/], 
-                 int count[/*stride_levels+1*/], int stride_levels, 
-                 int *flag, int value, int proc);
-
-
-/** Non-blocking ops.  MPI-2 forces remote completion on everything so these all
-  * currently behave the same as the blocking ops.
-  */
-
-typedef int armci_hdl_t;
-
-void  ARMCI_INIT_HANDLE(armci_hdl_t *hdl);
-void  ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t* handle);
-void  ARMCI_UNSET_AGGREGATE_HANDLE(armci_hdl_t* handle);
-
-int   ARMCI_NbPut(void *src, void *dst, int bytes, int proc, armci_hdl_t *hdl);
-int   ARMCI_NbGet(void *src, void *dst, int bytes, int proc, armci_hdl_t *hdl);
-int   ARMCI_NbAcc(int datatype, void *scale, void *src, void *dst, int bytes, int proc, armci_hdl_t *hdl);
-
-int   ARMCI_Wait(armci_hdl_t* hdl);
-int   ARMCI_Test(armci_hdl_t* hdl);
-int   ARMCI_WaitAll(void);
-
-int   ARMCI_NbPutS(void *src_ptr, int src_stride_ar[/*stride_levels*/],
-                   void *dst_ptr, int dst_stride_ar[/*stride_levels*/], 
-                   int count[/*stride_levels+1*/], int stride_levels, int proc, armci_hdl_t *hdl);
-int   ARMCI_NbGetS(void *src_ptr, int src_stride_ar[/*stride_levels*/],
-                   void *dst_ptr, int dst_stride_ar[/*stride_levels*/], 
-                   int count[/*stride_levels+1*/], int stride_levels, int proc, armci_hdl_t *hdl);
-int   ARMCI_NbAccS(int datatype, void *scale,
-                   void *src_ptr, int src_stride_ar[/*stride_levels*/],
-                   void *dst_ptr, int dst_stride_ar[/*stride_levels*/],
-                   int count[/*stride_levels+1*/], int stride_levels, int proc, armci_hdl_t *hdl);
-
-void armci_write_strided(void *ptr, int stride_levels, int stride_arr[], int count[], char *buf);
-void armci_read_strided(void *ptr, int stride_levels, int stride_arr[], int count[], char *buf);
-
-
-/** Generalized I/O Vector operations.
-  */
-
-typedef struct {
-  void **src_ptr_array;  // Source starting addresses of each data segment.
-  void **dst_ptr_array;  // Destination starting addresses of each data segment.
-  int    bytes;          // The length of each segment in bytes.
-  int    ptr_array_len;  // Number of data segments.
-} armci_giov_t;
-
-int ARMCI_PutV(armci_giov_t *iov, int iov_len, int proc);
-int ARMCI_GetV(armci_giov_t *iov, int iov_len, int proc);
-int ARMCI_AccV(int datatype, void *scale, armci_giov_t *iov, int iov_len, int proc);
-
-int ARMCI_NbPutV(armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle);
-int ARMCI_NbGetV(armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle);
-int ARMCI_NbAccV(int datatype, void *scale, armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle);
-
-
-/** Scalar/value operations.
-  */
-
-int ARMCI_PutValueInt(int src, void *dst, int proc);
-int ARMCI_PutValueLong(long src, void *dst, int proc);
-int ARMCI_PutValueFloat(float src, void *dst, int proc);
-int ARMCI_PutValueDouble(double src, void *dst, int proc);
-
-int ARMCI_NbPutValueInt(int src, void *dst, int proc, armci_hdl_t *hdl);
-int ARMCI_NbPutValueLong(long src, void *dst, int proc, armci_hdl_t *hdl);
-int ARMCI_NbPutValueFloat(float src, void *dst, int proc, armci_hdl_t *hdl);
-int ARMCI_NbPutValueDouble(double src, void *dst, int proc, armci_hdl_t *hdl);
-
-int    ARMCI_GetValueInt(void *src, int proc);
-long   ARMCI_GetValueLong(void *src, int proc);
-float  ARMCI_GetValueFloat(void *src, int proc);     
-double ARMCI_GetValueDouble(void *src, int proc);     
-
-
-/** Mutexes
-  */
-
-int   ARMCI_Create_mutexes(int count);
-int   ARMCI_Destroy_mutexes(void);
-void  ARMCI_Lock(int mutex, int proc);
-void  ARMCI_Unlock(int mutex, int proc);
-
-/** ARMCI Read-Modify-Write API
-  */
-
-enum ARMCI_Rmw_e { ARMCI_FETCH_AND_ADD, ARMCI_FETCH_AND_ADD_LONG, 
-                   ARMCI_SWAP, ARMCI_SWAP_LONG };
-
-int ARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc);
-
-/** ARMCI Groups API
-  */
-
-typedef struct {
-  MPI_Comm  comm;
-  MPI_Comm  noncoll_pgroup_comm;
-  int      *grp_to_abs;
-  int      *abs_to_grp;
-  int       rank;
-  int       size;
-} ARMCI_Group;
-
-void ARMCI_Group_create(int grp_size, int *pid_list, ARMCI_Group *group_out);
-void ARMCI_Group_create_child(int grp_size, int *pid_list, ARMCI_Group *group_out, ARMCI_Group *group_parent);
-void ARMCI_Group_free(ARMCI_Group *group);
-
-int  ARMCI_Group_rank(ARMCI_Group *group, int *rank);
-void ARMCI_Group_size(ARMCI_Group *group, int *size);
-
-void ARMCI_Group_set_default(ARMCI_Group *group);
-void ARMCI_Group_get_default(ARMCI_Group *group_out);
-void ARMCI_Group_get_world(ARMCI_Group *group_out);
-
-int ARMCI_Absolute_id(ARMCI_Group *group,int group_rank);
-
-int ARMCI_Malloc_group(void **ptr_arr, armci_size_t bytes, ARMCI_Group *group);
-int ARMCI_Free_group(void *ptr, ARMCI_Group *group);
-
-/** ARMCI Message API is in another file:
-  */
-
-#include <message.h>
-
-/** Topology API
-  */
-
-enum armci_domain_e { ARMCI_DOMAIN_SMP };
-
-typedef int armci_domain_t;
-
-int armci_domain_nprocs(armci_domain_t domain, int id);
-int armci_domain_id(armci_domain_t domain, int glob_proc_id);
-int armci_domain_glob_proc_id(armci_domain_t domain, int id, int loc_proc_id);
-int armci_domain_my_id(armci_domain_t domain);
-int armci_domain_count(armci_domain_t domain);
-int armci_domain_same_id(armci_domain_t domain, int proc);
-
-int ARMCI_Same_node(int proc);
-
-/** Odds and ends
-  */
-
-int  ARMCI_Uses_shm(void);
-void ARMCI_Set_shm_limit(unsigned long shmemlimit);
-int  ARMCI_Uses_shm_grp(ARMCI_Group *group);
-
-/** PARMCI -- Profiling Interface
-  */
-
-int     PARMCI_Init(void);
-int     PARMCI_Init_args(int *argc, char ***argv);
-int     PARMCI_Initialized(void);
-int     PARMCI_Finalize(void);
-
-int     PARMCI_Malloc(void **base_ptrs, armci_size_t size);
-int     PARMCI_Free(void *ptr);
-void   *PARMCI_Malloc_local(armci_size_t size);
-int     PARMCI_Free_local(void *ptr);
-
-void    PARMCI_Barrier(void);
-void    PARMCI_Fence(int proc);
-void    PARMCI_AllFence(void);
-void    PARMCI_Access_begin(void *ptr);
-void    PARMCI_Access_end(void *ptr);
-
-int     PARMCI_Get(void *src, void *dst, int size, int target);
-int     PARMCI_Put(void *src, void *dst, int size, int target);
-int     PARMCI_Acc(int datatype, void *scale, void *src, void *dst, int bytes, int proc);
-
-int     PARMCI_PutS(void *src_ptr, int src_stride_ar[], void *dst_ptr, int dst_stride_ar[], 
-                 int count[], int stride_levels, int proc);
-int     PARMCI_GetS(void *src_ptr, int src_stride_ar[], void *dst_ptr, int dst_stride_ar[], 
-                 int count[], int stride_levels, int proc);
-int     PARMCI_AccS(int datatype, void *scale, void *src_ptr, int src_stride_ar[],
-                 void *dst_ptr, int dst_stride_ar[], int count[], int stride_levels, int proc);
-int     PARMCI_Put_flag(void *src, void* dst, int size, int *flag, int value, int proc);
-int     PARMCI_PutS_flag(void *src_ptr, int src_stride_ar[], void *dst_ptr, int dst_stride_ar[], 
-                 int count[], int stride_levels, int *flag, int value, int proc);
-
-int     PARMCI_PutV(armci_giov_t *iov, int iov_len, int proc);
-int     PARMCI_GetV(armci_giov_t *iov, int iov_len, int proc);
-int     PARMCI_AccV(int datatype, void *scale, armci_giov_t *iov, int iov_len, int proc);
-
-int     PARMCI_Wait(armci_hdl_t* hdl);
-int     PARMCI_Test(armci_hdl_t* hdl);
-int     PARMCI_WaitAll(void);
-
-int     PARMCI_NbPut(void *src, void *dst, int bytes, int proc, armci_hdl_t *hdl);
-int     PARMCI_NbGet(void *src, void *dst, int bytes, int proc, armci_hdl_t *hdl);
-int     PARMCI_NbAcc(int datatype, void *scale, void *src, void *dst, int bytes, int proc, armci_hdl_t *hdl);
-int     PARMCI_NbPutS(void *src_ptr, int src_stride_ar[], void *dst_ptr, int dst_stride_ar[], 
-                   int count[], int stride_levels, int proc, armci_hdl_t *hdl);
-int     PARMCI_NbGetS(void *src_ptr, int src_stride_ar[], void *dst_ptr, int dst_stride_ar[], 
-                   int count[], int stride_levels, int proc, armci_hdl_t *hdl);
-int     PARMCI_NbAccS(int datatype, void *scale, void *src_ptr, int src_stride_ar[],
-                   void *dst_ptr, int dst_stride_ar[], int count[], int stride_levels, int proc, armci_hdl_t *hdl);
-int     PARMCI_NbPutV(armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle);
-int     PARMCI_NbGetV(armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle);
-int     PARMCI_NbAccV(int datatype, void *scale, armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle);
-
-int     PARMCI_PutValueInt(int src, void *dst, int proc);
-int     PARMCI_PutValueLong(long src, void *dst, int proc);
-int     PARMCI_PutValueFloat(float src, void *dst, int proc);
-int     PARMCI_PutValueDouble(double src, void *dst, int proc);
-int     PARMCI_NbPutValueInt(int src, void *dst, int proc, armci_hdl_t *hdl);
-int     PARMCI_NbPutValueLong(long src, void *dst, int proc, armci_hdl_t *hdl);
-int     PARMCI_NbPutValueFloat(float src, void *dst, int proc, armci_hdl_t *hdl);
-int     PARMCI_NbPutValueDouble(double src, void *dst, int proc, armci_hdl_t *hdl);
-
-int     PARMCI_GetValueInt(void *src, int proc);
-long    PARMCI_GetValueLong(void *src, int proc);
-float   PARMCI_GetValueFloat(void *src, int proc);
-double  PARMCI_GetValueDouble(void *src, int proc);
-
-int     PARMCI_Create_mutexes(int count);
-int     PARMCI_Destroy_mutexes(void);
-void    PARMCI_Lock(int mutex, int proc);
-void    PARMCI_Unlock(int mutex, int proc);
-int     PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc);
-
-void    parmci_msg_barrier(void);
-void    parmci_msg_group_barrier(ARMCI_Group *group);
-
-#endif /* _ARMCI_H_ */
diff --git a/src/armci/src/armci_internals.h b/src/armci/src/armci_internals.h
deleted file mode 100644
index fb0382f..0000000
--- a/src/armci/src/armci_internals.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#ifndef HAVE_ARMCI_INTERNALS_H
-#define HAVE_ARMCI_INTERNALS_H
-
-#include <armci.h>
-#include <armciconf.h>
-
-#if   HAVE_STDINT_H
-#  include <stdint.h>
-#elif HAVE_INTTYPES_H
-#  include <inttypes.h>
-#endif
-
-/* Likely/Unlikely macros borrowed from MPICH:
- */
-
-/* These likely/unlikely macros provide static branch prediction hints to the
- * compiler, if such hints are available.  Simply wrap the relevant expression in
- * the macro, like this:
- *
- * if (unlikely(ptr == NULL)) {
- *     // ... some unlikely code path ...
- * }
- *
- * They should be used sparingly, especially in upper-level code.  It's easy to
- * incorrectly estimate branching likelihood, while the compiler can often do a
- * decent job if left to its own devices.
- *
- * These macros are not namespaced because the namespacing is cumbersome.
- */
-/* safety guard for now, add a configure check in the future */
-#if defined(__GNUC__) && (__GNUC__ >= 3)
-#  define unlikely(x_) __builtin_expect(!!(x_),0)
-#  define likely(x_)   __builtin_expect(!!(x_),1)
-#else
-#  define unlikely(x_) (x_)
-#  define likely(x_)   (x_)
-#endif
-
-
-/* Disable safety checks if the user asks for it */
-
-#ifdef NO_SEATBELTS
-#define NO_CHECK_OVERLAP /* Disable checks for overlapping IOV operations */
-//#define NO_USE_CTREE     /* Use the slower O(N) check instead of the conflict tree */
-#define NO_CHECK_BUFFERS /* Disable checking for shared origin buffers    */
-
-#else
-#endif
-
-/* Internal types */
-
-enum ARMCII_Op_e { ARMCII_OP_PUT, ARMCII_OP_GET, ARMCII_OP_ACC };
-
-enum ARMCII_Strided_methods_e { ARMCII_STRIDED_IOV, ARMCII_STRIDED_DIRECT };
-
-enum ARMCII_Iov_methods_e { ARMCII_IOV_AUTO, ARMCII_IOV_CONSRV,
-                            ARMCII_IOV_BATCHED, ARMCII_IOV_DIRECT };
-
-enum ARMCII_Shr_buf_methods_e { ARMCII_SHR_BUF_COPY, ARMCII_SHR_BUF_NOGUARD };
-
-extern char ARMCII_Strided_methods_str[][10];
-extern char ARMCII_Iov_methods_str[][10];
-extern char ARMCII_Shr_buf_methods_str[][10];
-
-typedef struct {
-  int           init_count;             /* Number of times ARMCI_Init has been called                           */
-  int           debug_alloc;            /* Do extra debugging on memory allocation                              */
-  int           debug_flush_barriers;   /* Flush all windows on a barrier                                       */
-  int           iov_checks;             /* Disable IOV same allocation and overlapping checks                   */
-  int           iov_batched_limit;      /* Max number of ops per epoch for BATCHED IOV method                   */
-  int           noncollective_groups;   /* Use noncollective group creation algorithm                           */
-  int           cache_rank_translation; /* Enable caching of translation between absolute and group ranks       */
-  int           verbose;                /* ARMCI should produce extra status output                             */
-
-  enum ARMCII_Strided_methods_e strided_method; /* Strided transfer method              */
-  enum ARMCII_Iov_methods_e     iov_method;     /* IOV transfer method                  */
-  enum ARMCII_Shr_buf_methods_e shr_buf_method; /* Shared buffer management method      */
-} global_state_t;
-
-
-/* Global data */
-
-extern ARMCI_Group    ARMCI_GROUP_WORLD;
-extern ARMCI_Group    ARMCI_GROUP_DEFAULT;
-extern MPI_Op         MPI_ABSMIN_OP;
-extern MPI_Op         MPI_ABSMAX_OP;
-extern MPI_Op         MPI_SELMIN_OP;
-extern MPI_Op         MPI_SELMAX_OP;
-extern global_state_t ARMCII_GLOBAL_STATE;
-  
-
-/* Utility functions */
-
-void  ARMCII_Bzero(void *buf, armci_size_t size);
-int   ARMCII_Log2(unsigned int val);
-char *ARMCII_Getenv(char *varname);
-int   ARMCII_Getenv_bool(char *varname, int default_value);
-int   ARMCII_Getenv_int(char *varname, int default_value);
-
-/* Synchronization */
-
-void ARMCII_Flush_local(void);
-
-/* GOP Operators */
-
-void ARMCII_Absmin_op(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype);
-void ARMCII_Absmax_op(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype);
-void ARMCII_Absv_op(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype);
-void ARMCII_Msg_sel_min_op(void *data_in, void *data_inout, int *len, MPI_Datatype *datatype);
-void ARMCII_Msg_sel_max_op(void *data_in, void *data_inout, int *len, MPI_Datatype *datatype);
-
-
-/* Group helper routines */
-
-int  ARMCII_Translate_absolute_to_group(ARMCI_Group *group, int world_rank);
-void ARMCII_Group_init_from_comm(ARMCI_Group *group);
-
-
-/* I/O Vector data management and implementation */
-
-/** ARMCI IOV Iterator 
-  */
-typedef struct {
-  /* Strided Representation */
-  void *src;
-  void *dst;
-  int   stride_levels;
-
-  int  *base_ptr;
-  int  *src_stride_ar;
-  int  *dst_stride_ar;
-  int  *count;
-
-  /* Iterator State */
-  int   was_contiguous;
-  int  *idx;
-} armcii_iov_iter_t;
-
-void ARMCII_Acc_type_translate(int armci_datatype, MPI_Datatype *type, int *type_size);
-
-int  ARMCII_Iov_check_overlap(void **ptrs, int count, int size);
-int  ARMCII_Iov_check_same_allocation(void **ptrs, int count, int proc);
-
-void ARMCII_Strided_to_iov(armci_giov_t *iov,
-               void *src_ptr, int src_stride_ar[/*stride_levels*/],
-               void *dst_ptr, int dst_stride_ar[/*stride_levels*/], 
-               int count[/*stride_levels+1*/], int stride_levels);
-
-int ARMCII_Iov_op_dispatch(enum ARMCII_Op_e op, void **src, void **dst, int count, int size,
-    int datatype, int overlapping, int same_alloc, int proc);
-
-int ARMCII_Iov_op_safe(enum ARMCII_Op_e op, void **src, void **dst, int count, int elem_count,
-    MPI_Datatype type, int proc);
-int ARMCII_Iov_op_batched(enum ARMCII_Op_e op, void **src, void **dst, int count, int elem_count,
-    MPI_Datatype type, int proc);
-int ARMCII_Iov_op_datatype(enum ARMCII_Op_e op, void **src, void **dst, int count, int elem_count,
-    MPI_Datatype type, int proc);
-
-armcii_iov_iter_t *ARMCII_Strided_to_iov_iter(
-               void *src_ptr, int src_stride_ar[/*stride_levels*/],
-               void *dst_ptr, int dst_stride_ar[/*stride_levels*/], 
-               int count[/*stride_levels+1*/], int stride_levels);
-void ARMCII_Iov_iter_free(armcii_iov_iter_t *it);
-int  ARMCII_Iov_iter_has_next(armcii_iov_iter_t *it);
-int  ARMCII_Iov_iter_next(armcii_iov_iter_t *it, void **src, void **dst);
-
-
-/* Shared to private buffer management routines */
-
-int  ARMCII_Buf_prepare_read_vec(void **orig_bufs, void ***new_bufs_ptr, int count, int size);
-void ARMCII_Buf_finish_read_vec(void **orig_bufs, void **new_bufs, int count, int size);
-int  ARMCII_Buf_prepare_acc_vec(void **orig_bufs, void ***new_bufs_ptr, int count, int size,
-                            int datatype, void *scale);
-void ARMCII_Buf_finish_acc_vec(void **orig_bufs, void **new_bufs, int count, int size);
-int  ARMCII_Buf_prepare_write_vec(void **orig_bufs, void ***new_bufs_ptr, int count, int size);
-void ARMCII_Buf_finish_write_vec(void **orig_bufs, void **new_bufs, int count, int size);
-
-int  ARMCII_Buf_acc_is_scaled(int datatype, void *scale);
-void ARMCII_Buf_acc_scale(void *buf_in, void *buf_out, int size, int datatype, void *scale);
-
-#endif /* HAVE_ARMCI_INTERNALS_H */
diff --git a/src/armci/src/armcix.h b/src/armci/src/armcix.h
deleted file mode 100644
index f860fc9..0000000
--- a/src/armci/src/armcix.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#ifndef _ARMCIX_H_
-#define _ARMCIX_H_
-
-#include <armci.h>
-#include <armciconf.h>
-
-#if   HAVE_STDINT_H
-#  include <stdint.h>
-#elif HAVE_INTTYPES_H
-#  include <inttypes.h>
-#endif
-
-/** Access mode extensions: set the access mode for an ARMCI allocation,
-  * enabling runtime layer optimizations.
-  */
-
-enum armcix_access_mode_e {
-  ARMCIX_MODE_ALL           = 0x1,  /* All access types permitted          */
-  ARMCIX_MODE_CONFLICT_FREE = 0x2,  /* Operations do not conflict          */
-  ARMCIX_MODE_NO_LOAD_STORE = 0x4   /* Load/store operations not permitted */
-};
-
-int ARMCIX_Mode_set(int mode, void *ptr, ARMCI_Group *group);
-int ARMCIX_Mode_get(void *ptr);
-
-/** Processor group extensions.
-  */
-
-int ARMCIX_Group_split(ARMCI_Group *parent, int color, int key, ARMCI_Group *new_group);
-int ARMCIX_Group_dup(ARMCI_Group *parent, ARMCI_Group *new_group);
-
-/** Mutex handles: These improve on basic ARMCI mutexes by allowing you to
-  * create multiple batches of mutexes.  This is needed to allow libraries access to
-  * mutexes.
-  */
-
-struct armcix_mutex_hdl_s {
-  int         my_count;
-  int         max_count;
-  ARMCI_Group grp;
-  MPI_Win    *windows;
-  uint8_t   **bases;
-};
-
-typedef struct armcix_mutex_hdl_s * armcix_mutex_hdl_t;
-
-armcix_mutex_hdl_t ARMCIX_Create_mutexes_hdl(int count, ARMCI_Group *pgroup);
-int  ARMCIX_Destroy_mutexes_hdl(armcix_mutex_hdl_t hdl);
-void ARMCIX_Lock_hdl(armcix_mutex_hdl_t hdl, int mutex, int proc);
-int  ARMCIX_Trylock_hdl(armcix_mutex_hdl_t hdl, int mutex, int proc);
-void ARMCIX_Unlock_hdl(armcix_mutex_hdl_t hdl, int mutex, int proc);
-
-#endif /* _ARMCIX_H_ */
diff --git a/src/armci/src/buffer.c b/src/armci/src/buffer.c
deleted file mode 100644
index d34be31..0000000
--- a/src/armci/src/buffer.c
+++ /dev/null
@@ -1,432 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <armci.h>
-#include <armci_internals.h>
-#include <gmr.h>
-#include <debug.h>
-
-
-/** Prepare a set of buffers for use with a put operation.  The returned set of
-  * buffers is guaranteed to be in private space.  Copies will be made if needed,
-  * the result should be completed by ARMCII_Buf_finish_read_vec.
-  *
-  * If the shared buffer method is ARMCII_SHR_BUF_NOGUARD, no lookup is done and
-  * orig_bufs itself is returned through new_bufs_ptr.
-  *
-  * @param[in]  orig_bufs    Original set of buffers.
-  * @param[out] new_bufs_ptr Pointer to the set of private buffers.
-  * @param[in]  count        Number of entries in the buffer list.
-  * @param[in]  size         The size of the buffers (all are of the same size).
-  * @return                  Number of buffers that were moved.
-  */
-int ARMCII_Buf_prepare_read_vec(void **orig_bufs, void ***new_bufs_ptr, int count, int size) {
-  int num_moved = 0;
-
-  if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) {
-    void **new_bufs = malloc(count*sizeof(void*));
-    int i;
-
-    // malloc(0) is allowed to return NULL, so only a non-empty list is checked
-    ARMCII_Assert(new_bufs != NULL || count == 0);
-
-    // Clear every slot before it is filled in below
-    for (i = 0; i < count; i++)
-      new_bufs[i] = NULL;
-
-    for (i = 0; i < count; i++) {
-      // Check if the source buffer is within a shared region.  If so, copy it
-      // into a private buffer.
-      gmr_t *mreg = gmr_lookup(orig_bufs[i], ARMCI_GROUP_WORLD.rank);
-
-      if (mreg != NULL) {
-        MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]);
-        ARMCII_Assert(new_bufs[i] != NULL);
-
-        // Direct local access to the shared region is bracketed by the DLA lock
-        gmr_dla_lock(mreg);
-        ARMCI_Copy(orig_bufs[i], new_bufs[i], size);
-        gmr_dla_unlock(mreg);
-
-        num_moved++;
-      } else {
-        // Not found in any region: use the caller's buffer as is
-        new_bufs[i] = orig_bufs[i];
-      }
-    }
-
-    *new_bufs_ptr = new_bufs;
-  }
-  else {
-    *new_bufs_ptr = orig_bufs;
-  }
-
-  return num_moved;
-}
-
-
-/** Release the temporary buffers created by ARMCII_Buf_prepare_read_vec.
-  *
-  * Every entry of new_bufs that differs from the matching entry of orig_bufs
-  * is released with MPI_Free_mem, then the new_bufs array itself is freed.
-  * Nothing is done when the shared buffer method is ARMCII_SHR_BUF_NOGUARD.
-  *
-  * @param[in] orig_bufs Original set of buffers.
-  * @param[in] new_bufs  Set of private buffers.
-  * @param[in] count     Number of entries in the buffer list.
-  * @param[in] size      The size of the buffers (not used by this routine).
-  */
-void ARMCII_Buf_finish_read_vec(void **orig_bufs, void **new_bufs, int count, int size) {
-  int idx;
-
-  // Without guarding, prepare handed back orig_bufs itself: nothing to release
-  if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_NOGUARD)
-    return;
-
-  for (idx = 0; idx < count; idx++) {
-    // Entries that alias the caller's buffer were never allocated here
-    if (new_bufs[idx] == orig_bufs[idx])
-      continue;
-
-    MPI_Free_mem(new_bufs[idx]);
-  }
-
-  free(new_bufs);
-}
-
-
-/** Prepare a set of buffers for use with an accumulate operation.  The
-  * returned set of buffers is guaranteed to be in private space and scaled.
-  * Copies will be made if needed, the result should be completed by
-  * ARMCII_Buf_finish_acc_vec.
-  *
-  * @param[in]  orig_bufs    Original set of buffers.
-  * @param[out] new_bufs_ptr Pointer to the set of private buffers.
-  * @param[in]  count        Number of entries in the buffer list.
-  * @param[in]  size         The size of the buffers (all are of the same size).
-  * @param[in]  datatype     The type of the buffer.
-  * @param[in]  scale        Scaling constant to apply to each buffer.
-  * @return                  Number of buffers that were moved.
-  */
-int ARMCII_Buf_prepare_acc_vec(void **orig_bufs, void ***new_bufs_ptr, int count, int size,
-                            int datatype, void *scale) {
-
-  void **new_bufs;
-  int i, scaled, num_moved = 0;
-
-  new_bufs = malloc(count*sizeof(void*));
-  ARMCII_Assert(new_bufs != NULL);
-
-  scaled = ARMCII_Buf_acc_is_scaled(datatype, scale);
-
-  for (i = 0; i < count; i++) {
-    gmr_t *mreg = NULL;
-
-    // Check if the source buffer is within a shared region.
-    if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
-      mreg = gmr_lookup(orig_bufs[i], ARMCI_GROUP_WORLD.rank);
-
-    if (scaled) {
-      MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]);
-      ARMCII_Assert(new_bufs[i] != NULL);
-
-      // Lock if needed so we can directly access the buffer
-      if (mreg != NULL)
-        gmr_dla_lock(mreg);
-
-      ARMCII_Buf_acc_scale(orig_bufs[i], new_bufs[i], size, datatype, scale);
-
-      if (mreg != NULL)
-        gmr_dla_unlock(mreg);
-    } else {
-      new_bufs[i] = orig_bufs[i];
-    }
-
-    if (mreg != NULL) {
-      // If the buffer wasn't copied, we should copy it into a private buffer
-      if (new_bufs[i] == orig_bufs[i]) {
-        MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]);
-        ARMCII_Assert(new_bufs[i] != NULL);
-
-        gmr_dla_lock(mreg);
-        ARMCI_Copy(orig_bufs[i], new_bufs[i], size);
-        gmr_dla_unlock(mreg);
-      }
-    }
-
-    // A buffer counts as moved only when a private copy replaced the original
-    if (new_bufs[i] != orig_bufs[i])
-      num_moved++;
-  }
-
-  *new_bufs_ptr = new_bufs;
-
-  return num_moved;
-}
-
-
-/** Release the temporary buffers created by ARMCII_Buf_prepare_acc_vec.
-  *
-  * Every entry of new_bufs that differs from the matching entry of orig_bufs
-  * is released with MPI_Free_mem, then the new_bufs array itself is freed.
-  *
-  * @param[in] orig_bufs Original set of buffers.
-  * @param[in] new_bufs  Set of private buffers.
-  * @param[in] count     Number of entries in the buffer list.
-  * @param[in] size      The size of the buffers (not used by this routine).
-  */
-void ARMCII_Buf_finish_acc_vec(void **orig_bufs, void **new_bufs, int count, int size) {
-  int idx = 0;
-
-  while (idx < count) {
-    void *tmp = new_bufs[idx];
-
-    // Only buffers that do not alias the caller's buffer are released
-    if (tmp != orig_bufs[idx])
-      MPI_Free_mem(tmp);
-
-    idx++;
-  }
-
-  free(new_bufs);
-}
-
-
-/** Prepare a set of buffers for use with a get operation.  The returned set of
-  * buffers is guaranteed to be in private space.  Temporary buffers are
-  * allocated if needed, the result should be completed by
-  * ARMCII_Buf_finish_write_vec.
-  *
-  * If the shared buffer method is ARMCII_SHR_BUF_NOGUARD, no lookup is done and
-  * orig_bufs itself is returned through new_bufs_ptr.
-  *
-  * @param[in]  orig_bufs    Original set of buffers.
-  * @param[out] new_bufs_ptr Pointer to the set of private buffers.
-  * @param[in]  count        Number of entries in the buffer list.
-  * @param[in]  size         The size of the buffers (all are of the same size).
-  * @return                  Number of buffers that were moved.
-  */
-int ARMCII_Buf_prepare_write_vec(void **orig_bufs, void ***new_bufs_ptr, int count, int size) {
-  int num_moved = 0;
-
-  if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) {
-    void **new_bufs = malloc(count*sizeof(void*));
-    int i;
-
-    // malloc(0) is allowed to return NULL, so only a non-empty list is checked
-    ARMCII_Assert(new_bufs != NULL || count == 0);
-
-    // Clear every slot before it is filled in below
-    for (i = 0; i < count; i++)
-      new_bufs[i] = NULL;
-
-    for (i = 0; i < count; i++) {
-      // Check if the destination buffer is within a shared region.  If so, create
-      // a temporary private buffer to hold the result.
-      gmr_t *mreg = gmr_lookup(orig_bufs[i], ARMCI_GROUP_WORLD.rank);
-
-      if (mreg != NULL) {
-        MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]);
-        ARMCII_Assert(new_bufs[i] != NULL);
-        num_moved++;
-      } else {
-        new_bufs[i] = orig_bufs[i];
-      }
-    }
-
-    *new_bufs_ptr = new_bufs;
-  } else {
-    *new_bufs_ptr = orig_bufs;
-  }
-
-  return num_moved;
-}
-
-
-/** Complete the buffers created by ARMCII_Buf_prepare_write_vec.
-  *
-  * For every entry of new_bufs that differs from the matching entry of
-  * orig_bufs, the temporary contents are copied back into the original buffer
-  * and the temporary is released.  The new_bufs array itself is then freed.
-  * Nothing is done when the shared buffer method is ARMCII_SHR_BUF_NOGUARD.
-  *
-  * @param[in]  orig_bufs Original set of buffers (receive the results).
-  * @param[in]  new_bufs  Set of private buffers.
-  * @param[in]  count     Number of entries in the buffer list.
-  * @param[in]  size      The size of the buffers (all are of the same size).
-  */
-void ARMCII_Buf_finish_write_vec(void **orig_bufs, void **new_bufs, int count, int size) {
-  int idx;
-
-  // Without guarding, prepare handed back orig_bufs itself: nothing to do
-  if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_NOGUARD)
-    return;
-
-  for (idx = 0; idx < count; idx++) {
-    gmr_t *region;
-
-    if (new_bufs[idx] == orig_bufs[idx])
-      continue;
-
-    // A temporary exists only for destinations found in a shared region
-    region = gmr_lookup(orig_bufs[idx], ARMCI_GROUP_WORLD.rank);
-    ARMCII_Assert(region != NULL);
-
-    // Copy the result back while holding the region's DLA lock
-    gmr_dla_lock(region);
-    ARMCI_Copy(new_bufs[idx], orig_bufs[idx], size);
-    gmr_dla_unlock(region);
-
-    MPI_Free_mem(new_bufs[idx]);
-  }
-
-  free(new_bufs);
-}
-
-
-/** Tell whether an accumulate with the given scale actually needs scaling.
-  *
-  * The scale is the identity when it equals 1 for the integer and real types,
-  * or (1.0, 0.0) for the two complex types.
-  *
-  * @param[in] datatype Type of the data involved in the operation
-  * @param[in] scale    Pointer to a value of type datatype
-  * @return             Zero if scale is the identity, nonzero otherwise
-  */
-int ARMCII_Buf_acc_is_scaled(int datatype, void *scale) {
-  int is_identity = 0;
-
-  switch (datatype) {
-    case ARMCI_ACC_INT:
-      is_identity = (*((int*)scale) == 1);
-      break;
-
-    case ARMCI_ACC_LNG:
-      is_identity = (*((long*)scale) == 1);
-      break;
-
-    case ARMCI_ACC_FLT:
-      is_identity = (*((float*)scale) == 1.0);
-      break;
-
-    case ARMCI_ACC_DBL:
-      is_identity = (*((double*)scale) == 1.0);
-      break;
-
-    case ARMCI_ACC_CPL:
-      // Complex values are stored as a (real, imaginary) pair
-      is_identity = (((float*)scale)[0] == 1.0 && ((float*)scale)[1] == 0.0);
-      break;
-
-    case ARMCI_ACC_DCP:
-      is_identity = (((double*)scale)[0] == 1.0 && ((double*)scale)[1] == 0.0);
-      break;
-
-    default:
-      ARMCII_Error("unknown data type (%d)", datatype);
-  }
-
-  return !is_identity;
-}
-
-
-/** Scale the contents of one buffer by a constant, storing the result in a
-  * second buffer.
-  *
-  * The element count is size divided by the size MPI reports for the element
-  * type.  The complex types (ARMCI_ACC_CPL, ARMCI_ACC_DCP) are processed as
-  * consecutive (real, imaginary) pairs of float/double and multiplied using
-  * complex arithmetic.
-  *
-  * @param[in]  buf_in    Buffer holding the unscaled data.
-  * @param[out] buf_out   Buffer that receives the scaled data.
-  * @param[in]  size      Size of the buffers in bytes; asserted to be a multiple
-  *                       of the element size.
-  * @param[in]  datatype  The type of the buffer (one of the ARMCI_ACC_* values).
-  * @param[in]  scale     Pointer to the scaling constant, of type datatype.
-  */
-void ARMCII_Buf_acc_scale(void *buf_in, void *buf_out, int size, int datatype, void *scale) {
-  int   j, nelem;
-  int   type_size = -1;
-
-  switch (datatype) {
-    case ARMCI_ACC_INT:
-      MPI_Type_size(MPI_INT, &type_size);
-      nelem= size/type_size;
-
-      {
-        int *src_i = (int*) buf_in;
-        int *scl_i = (int*) buf_out;
-        const int s = *((int*) scale);
-
-        for (j = 0; j < nelem; j++)
-          scl_i[j] = src_i[j]*s;
-      }
-      break;
-
-    case ARMCI_ACC_LNG:
-      MPI_Type_size(MPI_LONG, &type_size);
-      nelem= size/type_size;
-
-      {
-        long *src_l = (long*) buf_in;
-        long *scl_l = (long*) buf_out;
-        const long s = *((long*) scale);
-
-        for (j = 0; j < nelem; j++)
-          scl_l[j] = src_l[j]*s;
-      }
-      break;
-
-    case ARMCI_ACC_FLT:
-      MPI_Type_size(MPI_FLOAT, &type_size);
-      nelem= size/type_size;
-
-      {
-        float *src_f = (float*) buf_in;
-        float *scl_f = (float*) buf_out;
-        const float s = *((float*) scale);
-
-        for (j = 0; j < nelem; j++)
-          scl_f[j] = src_f[j]*s;
-      }
-      break;
-
-    case ARMCI_ACC_DBL:
-      MPI_Type_size(MPI_DOUBLE, &type_size);
-      nelem= size/type_size;
-
-      {
-        double *src_d = (double*) buf_in;
-        double *scl_d = (double*) buf_out;
-        const double s = *((double*) scale);
-
-        for (j = 0; j < nelem; j++)
-          scl_d[j] = src_d[j]*s;
-      }
-      break;
-
-    case ARMCI_ACC_CPL:
-      MPI_Type_size(MPI_FLOAT, &type_size);
-      nelem= size/type_size;
-
-      {
-        float *src_fc = (float*) buf_in;
-        float *scl_fc = (float*) buf_out;
-        const float s_r = ((float*)scale)[0];
-        const float s_c = ((float*)scale)[1];
-
-        for (j = 0; j < nelem; j += 2) {
-          // Complex multiplication: (a + bi)*(c + di)
-          // Both source components are read before either result is stored
-          const float src_fc_j   = src_fc[j];
-          const float src_fc_j_1 = src_fc[j+1];
-
-          scl_fc[j]   = src_fc_j*s_r   - src_fc_j_1*s_c;
-          scl_fc[j+1] = src_fc_j_1*s_r + src_fc_j*s_c;
-        }
-      }
-      break;
-
-    case ARMCI_ACC_DCP:
-      MPI_Type_size(MPI_DOUBLE, &type_size);
-      nelem= size/type_size;
-
-      {
-        double *src_dc = (double*) buf_in;
-        double *scl_dc = (double*) buf_out;
-        const double s_r = ((double*)scale)[0];
-        const double s_c = ((double*)scale)[1];
-
-        for (j = 0; j < nelem; j += 2) {
-          // Complex multiplication: (a + bi)*(c + di)
-          // Both source components are read before either result is stored
-          const double src_dc_j   = src_dc[j];
-          const double src_dc_j_1 = src_dc[j+1];
-
-          scl_dc[j]   = src_dc_j*s_r   - src_dc_j_1*s_c;
-          scl_dc[j+1] = src_dc_j_1*s_r + src_dc_j*s_c;
-        }
-      }
-      break;
-
-    default:
-      ARMCII_Error("unknown data type (%d)", datatype);
-  }
-
-  ARMCII_Assert_msg(size % type_size == 0, 
-      "Transfer size is not a multiple of the datatype size");
-}
diff --git a/src/armci/src/conflict_tree.c b/src/armci/src/conflict_tree.c
deleted file mode 100644
index 2814860..0000000
--- a/src/armci/src/conflict_tree.c
+++ /dev/null
@@ -1,310 +0,0 @@
-/** Copyright (C) 2010. See COPYRIGHT in top-level directory.
-  *
-  * Conflict Tree -- James Dinan <dinan at mcs.anl.gov>
-  *
-  * Conflict trees are used by ARMCI-MPI to detect conflicting accesses due to
-  * overlapping memory regions.
-  *
-  * This implementation uses an AVL tree which is a self-balancing binary tree.
-  * In contrast with interval and segment trees which can store multiple
-  * overlapping regions and support stabbing queries, the conflict tree does
-  * not allow any overlap among elements in the tree.  If an overlapping insert
-  * is performed, it will fail.  Thus, objects in the tree are totally ordered
-  * and a standard binary tree (in this case, the AVL tree) is sufficient for
-  * detecting conflicts.
-  */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <debug.h>
-#include <conflict_tree.h>
-
-#define MAX(A,B) (((A) > (B)) ? (A) : (B))
-
-static ctree_t ctree_balance(ctree_t node);
-static void ctree_destroy_rec(ctree_t root);
-static inline int ctree_node_height(ctree_t node);
-static inline void ctree_rotate_left(ctree_t node);
-static inline void ctree_rotate_right(ctree_t node);
-
-const ctree_t CTREE_EMPTY = NULL;
-
-
-/** Find a node whose range overlaps the given address range.
-  *
-  * Both range bounds are treated as inclusive.  The search walks down from the
-  * root, going left when lo is below the current node's lo and right otherwise.
-  *
-  * @param[in] root Root of the ctree.
-  * @param[in] lo   Low end of the range.
-  * @param[in] hi   High end of the range.
-  * @return         Pointer to the ctree node or NULL if not found.
-  */
-ctree_t ctree_locate(ctree_t root, uint8_t *lo, uint8_t *hi) {
-  ctree_t node;
-
-  for (node = root; node != NULL; ) {
-    const int lo_inside = (lo >= node->lo && lo <= node->hi);
-    const int hi_inside = (hi >= node->lo && hi <= node->hi);
-    const int encloses  = (lo <  node->lo && hi >  node->hi);
-
-    if (lo_inside || hi_inside || encloses)
-      return node;
-
-    // No overlap: the whole range lies on one side of this node
-    node = (lo < node->lo) ? node->left : node->right;
-  }
-
-  return NULL;
-}
-
-
-/** Insert an address range into the conflict detection tree.
-  *
-  * The tree is searched from the root; every node visited is checked for
-  * overlap with [lo, hi] (bounds inclusive).  When an empty child slot is
-  * reached the new node is linked in and the tree is rebalanced from its parent
-  * upwards, which may change the root.
-  *
-  * @param[inout] root The root of the tree
-  * @param[in]    lo   Lower bound of the address range
-  * @param[in]    hi   Upper bound of the address range
-  * @return            Zero on success, nonzero when a conflict is detected.
-  *                    When a conflict exists, the range is not added.
-  */
-int ctree_insert(ctree_t *root, uint8_t *lo, uint8_t *hi) {
-  ctree_t cur;
-  ctree_t new_node = malloc(sizeof *new_node);
-
-  // Catch allocation failure here rather than dereferencing NULL below
-  ARMCII_Assert(new_node != NULL);
-
-  new_node->lo     = lo;
-  new_node->hi     = hi;
-  new_node->height = 1;
-  new_node->parent = NULL;
-  new_node->left   = NULL;
-  new_node->right  = NULL;
-
-  cur = *root;
-
-  // CASE: Empty tree
-  if (cur == NULL) {
-    *root = new_node;
-    return 0;
-  }
-
-  for (;;) {
-    ctree_t *link;
-
-    // Check for conflicts as we go
-    if (   (lo >= cur->lo && lo <= cur->hi)
-        || (hi >= cur->lo && hi <= cur->hi)
-        || (lo <  cur->lo && hi >  cur->hi)) {
-      ARMCII_Dbg_print(DEBUG_CAT_CTREE, "Conflict inserting [%p, %p] with [%p, %p]\n", lo, hi, cur->lo, cur->hi);
-      free(new_node);
-      return 1;
-    }
-
-    // No overlap: descend left when the range starts below this node,
-    // right (lo > cur->hi) otherwise
-    link = (lo < cur->lo) ? &cur->left : &cur->right;
-
-    if (*link == NULL) {
-      new_node->parent = cur;
-      *link = new_node;
-      *root = ctree_balance(cur);
-      return 0;
-    }
-
-    cur = *link;
-  }
-}
-
-
-/** Rotate the given pivot node to the left.
-  *
-  * The pivot must be the right child of its parent (asserted).  After the
-  * rotation the former parent is the pivot's left child, the pivot's former
-  * left subtree is the former parent's right subtree, and the heights of both
-  * nodes are recomputed.  The grandparent's child link, if there is a
-  * grandparent, is redirected to the pivot.
-  *
-  * @param[in] node This is the pivot node, it will be the new subtree root
-  *                 after the rotation is performed.
-  */
-static inline void ctree_rotate_left(ctree_t node) {
-  ctree_t old_root = node->parent;
-
-  //ARMCII_Dbg_print(DEBUG_CAT_CTREE, "[%10p, %10p] l=%d r=%d\n", node->lo, node->hi, 
-  //    ctree_node_height(node->left), ctree_node_height(node->right));
-
-  ARMCII_Assert(old_root->right == node);
-
-  // Set the parent pointer: the pivot takes over the old root's parent
-  node->parent     = old_root->parent;
-
-  // Set the parent's child pointer (skipped when the old root had no parent)
-  if (node->parent != NULL) {
-    if (node->parent->left == old_root)
-      node->parent->left = node;
-    else
-      node->parent->right = node;
-  }
-
-  // Set child pointers and their parents: the pivot's left subtree moves
-  // under the old root, then the old root becomes the pivot's left child
-  old_root->right         = node->left;
-  if (old_root->right != NULL)
-    old_root->right->parent = old_root;
-
-  node->left              = old_root;
-  node->left->parent      = node;
-
-  old_root->height = MAX(ctree_node_height(old_root->left), ctree_node_height(old_root->right)) + 1;
-  node->height     = MAX(ctree_node_height(node->left), ctree_node_height(node->right)) + 1;
-}
-
-
-/** Rotate the given pivot node to the right.
-  *
-  * The pivot must be the left child of its parent (asserted).  After the
-  * rotation the former parent is the pivot's right child, the pivot's former
-  * right subtree is the former parent's left subtree, and the heights of both
-  * nodes are recomputed.  The grandparent's child link, if there is a
-  * grandparent, is redirected to the pivot.
-  *
-  * @param[in] node This is the pivot node, it will be the new subtree root
-  *                 after the rotation is performed.
-  */
-static inline void ctree_rotate_right(ctree_t node) {
-  ctree_t old_root = node->parent;
-
-  //ARMCII_Dbg_print(DEBUG_CAT_CTREE, "[%10p, %10p] l=%d r=%d\n", node->lo, node->hi, 
-  //    ctree_node_height(node->left), ctree_node_height(node->right));
-
-  ARMCII_Assert(old_root->left == node);
-
-  // Set the parent pointer: the pivot takes over the old root's parent
-  node->parent     = old_root->parent;
-
-  // Set the parent's child pointer (skipped when the old root had no parent)
-  if (node->parent != NULL) {
-    if (node->parent->left == old_root)
-      node->parent->left = node;
-    else
-      node->parent->right = node;
-  }
-
-  // Set child pointers and their parents: the pivot's right subtree moves
-  // under the old root, then the old root becomes the pivot's right child
-  old_root->left  = node->right;
-  if (old_root->left != NULL)
-    old_root->left->parent = old_root;
-
-  node->right         = old_root;
-  node->right->parent = node;
-
-  old_root->height = MAX(ctree_node_height(old_root->left), ctree_node_height(old_root->right)) + 1;
-  node->height     = MAX(ctree_node_height(node->left), ctree_node_height(node->right)) + 1;
-}
-
-
-/** Return the stored height of a node; an empty (NULL) subtree has height 0.
-  */
-static inline int ctree_node_height(ctree_t node) {
-  return (node == NULL) ? 0 : node->height;
-}
-
-
-/** Rebalance the tree starting with the current node and proceeding
-  * upwards towards the root.
-  *
-  * At each node on the way up the height is recomputed from its children.  If
-  * the child heights differ by exactly 2, a single or double rotation is
-  * applied; any larger difference is reported through ARMCII_Error.
-  *
-  * @param[in] node Node at which to start; ancestors are reached through the
-  *                 parent pointers.
-  * @return         The last node visited, i.e. the one with no parent, or the
-  *                 argument itself if it is NULL.
-  */
-static ctree_t ctree_balance(ctree_t node) {
-  ctree_t root = node;
-
-  while (node != NULL) {
-    int height_l = ctree_node_height(node->left);
-    int height_r = ctree_node_height(node->right);
-
-    node->height = MAX(ctree_node_height(node->left), ctree_node_height(node->right)) + 1;
-
-    // Rebalance to preserve the property that right and left heights
-    // differ by at most 1.
-    if (abs(height_l - height_r) >= 2) {
-
-      // CASE: Right is heavy
-      if (height_l - height_r == -2) {
-        int height_r_l = ctree_node_height(node->right->left);
-        int height_r_r = ctree_node_height(node->right->right);
-
-        // CASE: Right, right -- one left rotation around the right child
-        if (height_r_l - height_r_r <= 0) {
-          node = node->right;
-          ctree_rotate_left(node);
-        }
-        // CASE: Right, left -- rotate the grandchild up first, then rotate left
-        else {
-          ctree_rotate_right(node->right->left);
-          node = node->right;
-          ctree_rotate_left(node);
-        }
-
-      // CASE: Left is heavy
-      } else if (height_l - height_r == 2) {
-        int height_l_l = ctree_node_height(node->left->left);
-        int height_l_r = ctree_node_height(node->left->right);
-
-        // CASE: Left, left -- one right rotation around the left child
-        if (height_l_l - height_l_r >= 0) {
-          node = node->left;
-          ctree_rotate_right(node);
-        }
-        // CASE: Left, right -- rotate the grandchild up first, then rotate right
-        else {
-          ctree_rotate_left(node->left->right);
-          node = node->left;
-          ctree_rotate_right(node);
-        }
-
-      } else {
-        ARMCII_Error("CTree invariant violated, height difference of %d is too large", height_l - height_r);
-      }
-    }
-
-    root = node;
-    node = node->parent;
-  }
-
-  return root;
-}
-
-
-/** Free every node of the subtree rooted at root, children before parent.
-  * A NULL subtree is ignored.
-  */
-static void ctree_destroy_rec(ctree_t root) {
-  if (root != NULL) {
-    ctree_t left_child  = root->left;
-    ctree_t right_child = root->right;
-
-    ctree_destroy_rec(left_child);
-    ctree_destroy_rec(right_child);
-
-    free(root);
-  }
-}
-
-
-/** Free all nodes of a conflict tree and reset the caller's root to NULL.
-  *
-  * @param[inout] root Address of the tree's root pointer.
-  */
-void ctree_destroy(ctree_t *root) {
-  ctree_t top = *root;
-
-  ctree_destroy_rec(top);
-  *root = NULL;
-}
-
-
-/** Print out the contents of the conflict detection tree.
-  *
-  * Nodes are printed in order (left subtree, node, right subtree), one per
-  * line.  Each line is indented by one tab per level of node height above 1,
-  * capped at 30 tabs.
-  *
-  * @param[in] root Root of the (sub)tree to print; NULL prints nothing.
-  */
-void ctree_print(ctree_t root) {
-  if (root != NULL) {
-    int  ntabs;
-    char s[32] = "";  /* zero-filled, so the indent stays NUL terminated */
-
-    ctree_print(root->left);
-
-    for (ntabs = 0; ntabs < 32-2 && ntabs < root->height-1; ntabs++)
-      s[ntabs] = '\t';
-
-    /* %p requires void* arguments, so the uint8_t* bounds are cast */
-    printf("%10p:%s[%p, %p] p=%p h=%d\n", (void*)root, s, (void*)root->lo, (void*)root->hi, (void*)(root->parent), root->height);
-
-    ctree_print(root->right);
-  }
-}
diff --git a/src/armci/src/conflict_tree.h b/src/armci/src/conflict_tree.h
deleted file mode 100644
index 33aa0c4..0000000
--- a/src/armci/src/conflict_tree.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#ifndef _CONFLICT_TREE_H
-#define _CONFLICT_TREE_H
-
-#include <armci_internals.h>
-
-struct ctree_node_s {   /* One node of the conflict tree */
-  uint8_t *lo;          /* Low end of the address range (compared inclusively) */
-  uint8_t *hi;          /* High end of the address range (compared inclusively) */
-
-  int height;           /* Subtree height; a newly inserted node has height 1 */
-
-  struct ctree_node_s *parent;  /* NULL when the node has no parent */
-  struct ctree_node_s *left;    /* Subtree of ranges that start below lo */
-  struct ctree_node_s *right;   /* Subtree of the remaining ranges */
-};
-
-typedef struct ctree_node_s * ctree_t;
-
-extern const ctree_t CTREE_EMPTY;
-
-int     ctree_insert(ctree_t *root, uint8_t *lo, uint8_t *hi);
-ctree_t ctree_locate(ctree_t root, uint8_t *lo, uint8_t *hi);
-void    ctree_destroy(ctree_t *root);
-void    ctree_print(ctree_t root);
-
-
-#endif /* _CONFLICT_TREE_H */
diff --git a/src/armci/src/debug.c b/src/armci/src/debug.c
deleted file mode 100644
index 8f6c690..0000000
--- a/src/armci/src/debug.c
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <mpi.h>
-
-#include <armci_internals.h>
-#include <debug.h>
-
-/* Set the default debugging message classes to enable.
- */
-unsigned DEBUG_CATS_ENABLED = 
-    DEBUG_CAT_NONE;
-    // DEBUG_CAT_ALL;
-    // DEBUG_CAT_ALLOC;
-    // DEBUG_CAT_ALLOC | DEBUG_CAT_MEM_REGION;
-    // DEBUG_CAT_MUTEX;
-    // DEBUG_CAT_GROUPS;
-
-
-/** Print an assertion failure message and abort the program.
-  *
-  * The message is written to stderr, prefixed with the caller's rank in
-  * MPI_COMM_WORLD.  When HAVE_EXECINFO_H is set a backtrace is printed as
-  * well.  All streams are flushed and the routine spins for one second of
-  * MPI_Wtime before calling MPI_Abort on MPI_COMM_WORLD.
-  *
-  * @param[in] expr Text of the expression that failed.
-  * @param[in] msg  Optional extra message, or NULL.
-  * @param[in] file Source file of the assertion.
-  * @param[in] line Source line of the assertion.
-  * @param[in] func Name of the function containing the assertion.
-  */
-void ARMCII_Assert_fail(const char *expr, const char *msg, const char *file, int line, const char *func) {
-  int rank;
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
-  if (msg == NULL)
-    fprintf(stderr, "[%d] ARMCI assert fail in %s() [%s:%d]: \"%s\"\n", rank, func, file, line, expr);
-  else
-    fprintf(stderr, "[%d] ARMCI assert fail in %s() [%s:%d]: \"%s\"\n"
-                    "[%d] Message: \"%s\"\n", rank, func, file, line, expr, rank, msg);
-
-#if HAVE_EXECINFO_H
-  {
-#include <execinfo.h>
-
-    const int SIZE = 100;
-    int    j, nframes;
-    void  *frames[SIZE];
-    char **symbols;
-
-    nframes = backtrace(frames, SIZE);
-    symbols = backtrace_symbols(frames, nframes);
-
-    if (symbols == NULL) {
-      // No symbol table available: report why and skip the listing instead
-      // of indexing through a NULL pointer
-      perror("Backtrace failure");
-    } else {
-      fprintf(stderr, "[%d] Backtrace:\n", rank);
-      for (j = 0; j < nframes; j++)
-        fprintf(stderr, "[%d]  %2d - %s\n", rank, nframes-j-1, symbols[j]);
-
-      free(symbols);
-    }
-  }
-#endif
-
-  fflush(NULL);
-  {
-    // Busy-wait for one second before aborting
-    double stall = MPI_Wtime();
-    while (MPI_Wtime() - stall < 1) ;
-  }
-  MPI_Abort(MPI_COMM_WORLD, -1);
-}
-
-
-/** Print a debugging message.
-  *
-  * The message is formatted into a fixed 500 byte buffer as
-  * "[rank] func: " followed by the printf-style text, and written to stderr.
-  * Output that does not fit is truncated.
-  *
-  * @param[in] func   Name of the calling function.
-  * @param[in] format printf-style format string, followed by its arguments.
-  */
-void ARMCII_Dbg_print_impl(const char *func, const char *format, ...) {
-  va_list etc;
-  int  disp;
-  char string[500];
-
-  disp = snprintf(string, sizeof(string), "[%d] %s: ", ARMCI_GROUP_WORLD.rank, func);
-
-  // snprintf reports an error as a negative value: start from an empty prefix
-  if (disp < 0) {
-    string[0] = '\0';
-    disp = 0;
-  }
-
-  // snprintf returns the untruncated length, so only append when the prefix
-  // really left room; otherwise the remaining size would go negative
-  if ((size_t) disp < sizeof(string)) {
-    va_start(etc, format);
-    vsnprintf(string+disp, sizeof(string)-disp, format, etc);
-    va_end(etc);
-  }
-
-  fprintf(stderr, "%s", string);
-}
-
-
-/** Print an ARMCI warning message.
-  *
-  * The message is formatted into a fixed 500 byte buffer as
-  * "[rank] ARMCI Warning: " followed by the printf-style text, written to
-  * stderr, and all streams are flushed.  Output that does not fit is truncated.
-  *
-  * @param[in] fmt printf-style format string, followed by its arguments.
-  */
-void ARMCII_Warning(const char *fmt, ...) {
-  va_list etc;
-  int  disp;
-  char string[500];
-
-  disp = snprintf(string, sizeof(string), "[%d] ARMCI Warning: ", ARMCI_GROUP_WORLD.rank);
-
-  // snprintf reports an error as a negative value: start from an empty prefix
-  if (disp < 0) {
-    string[0] = '\0';
-    disp = 0;
-  }
-
-  // snprintf returns the untruncated length, so only append when the prefix
-  // really left room; otherwise the remaining size would go negative
-  if ((size_t) disp < sizeof(string)) {
-    va_start(etc, fmt);
-    vsnprintf(string+disp, sizeof(string)-disp, fmt, etc);
-    va_end(etc);
-  }
-
-  fprintf(stderr, "%s", string);
-  fflush(NULL);
-}
diff --git a/src/armci/src/debug.h b/src/armci/src/debug.h
deleted file mode 100644
index 43408fa..0000000
--- a/src/armci/src/debug.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#ifndef _DEBUG_H_
-#define _DEBUG_H_
-
-#include <stdarg.h>
-#include <armciconf.h>
-#include <armci_internals.h>
-
-enum debug_cats_e {
-  DEBUG_CAT_ALL        =  -1,
-  DEBUG_CAT_NONE       =   0,
-  DEBUG_CAT_MEM_REGION = 0x1,  // 2^0
-  DEBUG_CAT_ALLOC      = 0x2,  // 2^1
-  DEBUG_CAT_MUTEX      = 0x4,  // 2^2
-  DEBUG_CAT_GROUPS     = 0x8,  // 2^3
-  DEBUG_CAT_CTREE      = 0x10, // 2^4, ...
-  DEBUG_CAT_IOV        = 0x20
-};
-
-/* A logical OR of the debug message categories that are enabled.
- */
-extern  unsigned DEBUG_CATS_ENABLED;
-
-
-#ifdef NO_SEATBELTS
-#define ARMCII_Assert(X) ((void)0)
-#define ARMCII_Assert_msg(X,MSG) ((void)0)
-#else
-void    ARMCII_Assert_fail(const char *expr, const char *msg, const char *file, int line, const char *func);
-#define ARMCII_Assert(EXPR)          do { if (unlikely(!(EXPR))) ARMCII_Assert_fail(#EXPR, NULL, __FILE__, __LINE__, __func__); } while(0)
-#define ARMCII_Assert_msg(EXPR, MSG) do { if (unlikely(!(EXPR))) ARMCII_Assert_fail(#EXPR, MSG,  __FILE__, __LINE__, __func__); } while(0)
-#endif /* NO_SEATBELTS    */
-
-
-#ifdef NO_SEATBELTS
-#define DEBUG_CAT_ENABLED(X) 0
-#define ARMCII_Dbg_print(CAT,...) ((void)0)
-#else
-#define DEBUG_CAT_ENABLED(X) (DEBUG_CATS_ENABLED & (X))
-void    ARMCII_Dbg_print_impl(const char *func, const char *format, ...);
-#define ARMCII_Dbg_print(CAT,...) do { if (DEBUG_CAT_ENABLED(CAT)) ARMCII_Dbg_print_impl(__func__,__VA_ARGS__); } while (0)
-#endif /* NO_SEATBELTS */
-
-
-#define ARMCII_Error(...) ARMCII_Error_impl(__FILE__,__LINE__,__func__,__VA_ARGS__)
-void    ARMCII_Error_impl(const char *file, const int line, const char *func, const char *msg, ...);
-void    ARMCII_Warning(const char *fmt, ...);
-
-#endif /* _DEBUG_H_ */
diff --git a/src/armci/src/gmr.c b/src/armci/src/gmr.c
deleted file mode 100644
index 6608f87..0000000
--- a/src/armci/src/gmr.c
+++ /dev/null
@@ -1,578 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <mpi.h>
-
-#include <armci.h>
-#include <armcix.h>
-#include <armci_internals.h>
-#include <debug.h>
-#include <gmr.h>
-
-
-/** Linked list of shared memory regions.
-  */
-gmr_t *gmr_list = NULL;
-
-
-/** Create a distributed shared memory region. Collective on ARMCI group.
-  *
-  * @param[in]  local_size Size of the local slice of the memory region.
-  * @param[out] base_ptrs  Array of base pointers for each process in group.
-  * @param[in]  group      Group on which to perform allocation.
-  * @return                Pointer to the memory region object (NULL if all sizes are 0).
-  */
-gmr_t *gmr_create(gmr_size_t local_size, void **base_ptrs, ARMCI_Group *group) {
-  int           i;
-  gmr_size_t    aggregate_size;
-  int           alloc_me, alloc_nproc;
-  int           world_me, world_nproc;
-  MPI_Group     world_group, alloc_group;
-  gmr_t        *mreg;
-  gmr_slice_t  *alloc_slices, gmr_slice;
-
-  ARMCII_Assert(local_size >= 0);
-  ARMCII_Assert(group != NULL);
-
-  MPI_Comm_rank(group->comm, &alloc_me);
-  MPI_Comm_size(group->comm, &alloc_nproc);
-  MPI_Comm_rank(ARMCI_GROUP_WORLD.comm, &world_me);
-  MPI_Comm_size(ARMCI_GROUP_WORLD.comm, &world_nproc);
-
-  mreg = malloc(sizeof(gmr_t));
-  ARMCII_Assert(mreg != NULL);
-
-  mreg->slices = malloc(sizeof(gmr_slice_t)*world_nproc);
-  ARMCII_Assert(mreg->slices != NULL);
-  alloc_slices = malloc(sizeof(gmr_slice_t)*alloc_nproc);
-  ARMCII_Assert(alloc_slices != NULL);
-
-  mreg->group          = *group; /* NOTE: I think it is invalid in GA/ARMCI to
-                                    free a group before its allocations.  If
-                                    this is not the case, then assignment here
-                                    is incorrect and this should really
-                                    duplicate the group (communicator). */
-
-  mreg->nslices        = world_nproc;
-  mreg->access_mode    = ARMCIX_MODE_ALL;
-  mreg->lock_state     = GMR_LOCK_UNLOCKED;
-  mreg->dla_lock_count = 0;
-  mreg->prev           = NULL;
-  mreg->next           = NULL;
-
-  /* Allocate my slice of the GMR */
-  alloc_slices[alloc_me].size = local_size;
-
-  if (local_size == 0) {
-    alloc_slices[alloc_me].base = NULL;
-  } else {
-    MPI_Alloc_mem(local_size, MPI_INFO_NULL, &(alloc_slices[alloc_me].base));
-    ARMCII_Assert(alloc_slices[alloc_me].base != NULL);
-  }
-
-  /* Debugging: Zero out shared memory if enabled */
-  if (ARMCII_GLOBAL_STATE.debug_alloc && local_size > 0) {
-    ARMCII_Assert(alloc_slices[alloc_me].base != NULL);
-    ARMCII_Bzero(alloc_slices[alloc_me].base, local_size);
-  }
-
-  /* Allgather of <base, size> to build up slices vector */
-  gmr_slice = alloc_slices[alloc_me];
-  MPI_Allgather(  &gmr_slice, sizeof(gmr_slice_t), MPI_BYTE,
-                 alloc_slices, sizeof(gmr_slice_t), MPI_BYTE, group->comm);
-
-  /* Check for a global size 0 allocation */
-  for (i = aggregate_size = 0; i < alloc_nproc; i++) {
-    aggregate_size += alloc_slices[i].size;
-  }
-
-  /* Everyone asked for 0 bytes, return a NULL vector */
-  if (aggregate_size == 0) {
-    free(alloc_slices);
-    free(mreg->slices);
-    free(mreg);
-
-    for (i = 0; i < alloc_nproc; i++)
-      base_ptrs[i] = NULL;
-
-    return NULL;
-  }
-
-  MPI_Win_create(alloc_slices[alloc_me].base, (MPI_Aint) local_size, 1, MPI_INFO_NULL, group->comm, &mreg->window);
-
-  /* Populate the base pointers array */
-  for (i = 0; i < alloc_nproc; i++)
-    base_ptrs[i] = alloc_slices[i].base;
-
-  /* We have to do lookup on global ranks, so shovel the contents of
-     alloc_slices into the mreg->slices array which is indexed by global rank. */
-  memset(mreg->slices, 0, sizeof(gmr_slice_t)*world_nproc);
-
-  MPI_Comm_group(ARMCI_GROUP_WORLD.comm, &world_group);
-  MPI_Comm_group(group->comm, &alloc_group);
-
-  for (i = 0; i < alloc_nproc; i++) {
-    int world_rank;
-    MPI_Group_translate_ranks(alloc_group, 1, &i, world_group, &world_rank);
-    mreg->slices[world_rank] = alloc_slices[i];
-  }
-
-  free(alloc_slices);
-  MPI_Group_free(&world_group);
-  MPI_Group_free(&alloc_group);
-
-  /* Create the RMW mutex: Keeps RMW operations atomic wrt each other */
-  mreg->rmw_mutex = ARMCIX_Create_mutexes_hdl(1, group);
-
-  /* Append the new region onto the region list */
-  if (gmr_list == NULL) {
-    gmr_list = mreg;
-
-  } else {
-    gmr_t *parent = gmr_list;
-
-    while (parent->next != NULL)
-      parent = parent->next;
-
-    parent->next = mreg;
-    mreg->prev   = parent;
-  }
-
-  return mreg;
-}
-
-
-/** Destroy/free a shared memory region.  Collective on the given group.
-  *
-  * @param[in] mreg  Memory region to free; may be NULL (it is then looked up).
-  * @param[in] group Group on which to perform the free.
-  */
-void gmr_destroy(gmr_t *mreg, ARMCI_Group *group) {
-  int   search_proc_in, search_proc_out, search_proc_out_grp;
-  void *search_base;
-  int   alloc_me, alloc_nproc;
-  int   world_me, world_nproc;
-
-  MPI_Comm_rank(group->comm, &alloc_me);
-  MPI_Comm_size(group->comm, &alloc_nproc);
-  MPI_Comm_rank(ARMCI_GROUP_WORLD.comm, &world_me);
-  MPI_Comm_size(ARMCI_GROUP_WORLD.comm, &world_nproc);
-
-  /* All-to-all exchange of a <base address, proc> pair.  This is so that we
-   * can support passing NULL into ARMCI_Free() which is permitted when a
-   * process allocates 0 bytes.  Unfortunately, in this case we still need to
-   * identify the mem region and free it.
-   */
-
-  if (mreg == NULL)
-    search_proc_in = -1;
-  else {
-    search_proc_in = world_me;
-    search_base    = mreg->slices[world_me].base;
-  }
-
-  /* Collectively decide on who will provide the base address */
-  MPI_Allreduce(&search_proc_in, &search_proc_out, 1, MPI_INT, MPI_MAX, group->comm);
-
-  /* Everyone passed NULL.  Nothing to free. */
-  if (search_proc_out < 0)
-    return;
-
-  /* Translate world rank to group rank */
-  search_proc_out_grp = ARMCII_Translate_absolute_to_group(group, search_proc_out);
-
-  /* Broadcast the base address */
-  MPI_Bcast(&search_base, sizeof(void*), MPI_BYTE, search_proc_out_grp, group->comm);
-
-  /* If we were passed NULL, look up the mem region using the <base, proc> pair */
-  if (mreg == NULL)
-    mreg = gmr_lookup(search_base, search_proc_out);
-
-  /* If it's still not found, the user may have passed the wrong group */
-  ARMCII_Assert_msg(mreg != NULL, "Could not locate the desired allocation");
-
-  switch (mreg->lock_state) {
-    case GMR_LOCK_UNLOCKED:
-      break;
-    case GMR_LOCK_DLA:
-      ARMCII_Warning("Releasing direct local access before freeing shared allocation\n");
-      gmr_dla_unlock(mreg);
-      break;
-    default:
-      ARMCII_Error("Unable to free locked memory region (%d)\n", mreg->lock_state);
-  }
-
-  /* Remove from the list of mem regions */
-  if (mreg->prev == NULL) {
-    ARMCII_Assert(gmr_list == mreg);
-    gmr_list = mreg->next;
-
-    if (mreg->next != NULL)
-      mreg->next->prev = NULL;
-
-  } else {
-    mreg->prev->next = mreg->next;
-    if (mreg->next != NULL)
-      mreg->next->prev = mreg->prev;
-  }
-
-  /* Destroy the window and free all buffers */
-  MPI_Win_free(&mreg->window);
-
-  if (mreg->slices[world_me].base != NULL)
-    MPI_Free_mem(mreg->slices[world_me].base);
-
-  free(mreg->slices);
-  ARMCIX_Destroy_mutexes_hdl(mreg->rmw_mutex);
-
-  free(mreg);
-}
-
-
-/** Destroy all memory regions (called by finalize).
-  *
-  * @return Number of mem regions destroyed.
-  */
-int gmr_destroy_all(void) {
-  int count = 0;
-
-  while (gmr_list != NULL) {
-    gmr_destroy(gmr_list, &gmr_list->group);
-    count++;
-  }
-
-  return count;
-}
-
-/** Lookup a shared memory region using an address and process id.
-  *
-  * @param[in] ptr  Pointer within range of the segment (e.g. base pointer).
-  * @param[in] proc Process on which the data lives.
-  * @return         Pointer to the mem region object.
-  */
-gmr_t *gmr_lookup(void *ptr, int proc) {
-  gmr_t *mreg;
-
-  mreg = gmr_list;
-
-  while (mreg != NULL) {
-    ARMCII_Assert(proc < mreg->nslices);
-
-    if (proc < mreg->nslices) {
-      const uint8_t   *base = mreg->slices[proc].base;
-      const gmr_size_t size = mreg->slices[proc].size;
-
-      if ((uint8_t*) ptr >= base && (uint8_t*) ptr < base + size)
-        break;
-    }
-
-    mreg = mreg->next;
-  }
-
-  return mreg;
-}
-
-
-/** One-sided put operation.  Source buffer must be private.
-  *
-  * @param[in] mreg   Memory region
-  * @param[in] src    Source address (local)
-  * @param[in] dst    Destination address (remote)
-  * @param[in] size   Number of bytes to transfer
-  * @param[in] proc   Absolute process id of target process
-  * @return           0 on success, non-zero on failure
-  */
-int gmr_put(gmr_t *mreg, void *src, void *dst, int size, int proc) {
-  ARMCII_Assert_msg(src != NULL, "Invalid local address");
-  return gmr_put_typed(mreg, src, size, MPI_BYTE, dst, size, MPI_BYTE, proc);
-}
-
-
-/** One-sided put operation with type arguments.  Source buffer must be private.
-  *
-  * @param[in] mreg      Memory region
-  * @param[in] src       Address of source data
-  * @param[in] src_count Number of elements of the given type at the source
-  * @param[in] src_type  MPI datatype of the source elements
-  * @param[in] dst       Address of destination buffer (remote); MPI_BOTTOM
-  *                      selects displacement 0 in the window
-  * @param[in] dst_count Number of elements of the given type at the destination
-  * @param[in] dst_type  MPI datatype of the destination elements
-  * @param[in] proc      Absolute process id of target process
-  * @return              Always 0; bad ranges and lock states trip assertions
-  */
-int gmr_put_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type,
-    void *dst, int dst_count, MPI_Datatype dst_type, int proc) {
-
-  int        grp_proc;
-  gmr_size_t disp;
-  MPI_Aint lb, extent;
-
-  grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc);
-  ARMCII_Assert(grp_proc >= 0);
-
-  // Calculate displacement from beginning of the window
-  if (dst == MPI_BOTTOM) 
-    disp = 0;
-  else
-    disp = (gmr_size_t) ((uint8_t*)dst - (uint8_t*)mreg->slices[proc].base);
-
-  // Perform checks
-  MPI_Type_get_true_extent(dst_type, &lb, &extent);
-  ARMCII_Assert(mreg->lock_state != GMR_LOCK_UNLOCKED);
-  ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address");
-  ARMCII_Assert_msg(disp + dst_count*extent <= mreg->slices[proc].size, "Transfer is out of range");
-
-  MPI_Put(src, src_count, src_type, grp_proc, (MPI_Aint) disp, dst_count, dst_type, mreg->window);
-
-  return 0;
-}
-
-
-/** One-sided get operation.  Destination buffer must be private.
-  *
-  * @param[in] mreg   Memory region
-  * @param[in] src    Source address (remote)
-  * @param[in] dst    Destination address (local)
-  * @param[in] size   Number of bytes to transfer
-  * @param[in] proc   Absolute process id of target process
-  * @return           0 on success, non-zero on failure
-  */
-int gmr_get(gmr_t *mreg, void *src, void *dst, int size, int proc) {
-  ARMCII_Assert_msg(dst != NULL, "Invalid local address");
-  return gmr_get_typed(mreg, src, size, MPI_BYTE, dst, size, MPI_BYTE, proc);
-}
-
-
-/** One-sided get operation with type arguments.  Destination buffer must be private.
-  *
-  * @param[in] mreg      Memory region
-  * @param[in] src       Address of source data (remote); MPI_BOTTOM selects
-  *                      displacement 0 in the window
-  * @param[in] src_count Number of elements of the given type at the source
-  * @param[in] src_type  MPI datatype of the source elements
-  * @param[in] dst       Address of destination buffer
-  * @param[in] dst_count Number of elements of the given type at the destination
-  * @param[in] dst_type  MPI datatype of the destination elements
-  * @param[in] proc      Absolute process id of target process
-  * @return              Always 0; bad ranges and lock states trip assertions
-  */
-int gmr_get_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type,
-    void *dst, int dst_count, MPI_Datatype dst_type, int proc) {
-
-  int        grp_proc;
-  gmr_size_t disp;
-  MPI_Aint lb, extent;
-
-  grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc);
-  ARMCII_Assert(grp_proc >= 0);
-
-  // Calculate displacement from beginning of the window
-  if (src == MPI_BOTTOM) 
-    disp = 0;
-  else
-    disp = (gmr_size_t) ((uint8_t*)src - (uint8_t*)mreg->slices[proc].base);
-
-  // Perform checks
-  MPI_Type_get_true_extent(src_type, &lb, &extent);
-  ARMCII_Assert(mreg->lock_state != GMR_LOCK_UNLOCKED);
-  ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address");
-  ARMCII_Assert_msg(disp + src_count*extent <= mreg->slices[proc].size, "Transfer is out of range");
-
-  MPI_Get(dst, dst_count, dst_type, grp_proc, (MPI_Aint) disp, src_count, src_type, mreg->window);
-
-  return 0;
-}
-
-
-/** One-sided accumulate operation.  Source buffer must be private.
-  *
-  * @param[in] mreg     Memory region
-  * @param[in] src      Source address (local)
-  * @param[in] dst      Destination address (remote)
-  * @param[in] type     MPI type of the given buffers
-  * @param[in] count    Number of elements of the given type to transfer
-  * @param[in] proc     Absolute process id of the target
-  * @return             0 on success, non-zero on failure
-  */
-int gmr_accumulate(gmr_t *mreg, void *src, void *dst, int count, MPI_Datatype type, int proc) {
-  ARMCII_Assert_msg(src != NULL, "Invalid local address");
-  return gmr_accumulate_typed(mreg, src, count, type, dst, count, type, proc);
-}
-
-
-/** One-sided accumulate (MPI_SUM) with typed arguments.  Source buffer must be private.
-  *
-  * @param[in] mreg      Memory region
-  * @param[in] src       Address of source data
-  * @param[in] src_count Number of elements of the given type at the source
-  * @param[in] src_type  MPI datatype of the source elements
-  * @param[in] dst       Address of destination buffer (remote); MPI_BOTTOM
-  *                      selects displacement 0 in the window
-  * @param[in] dst_count Number of elements of the given type at the destination
-  * @param[in] dst_type  MPI datatype of the destination elements
-  * @param[in] proc      Absolute process id of target process
-  * @return              Always 0; bad ranges and lock states trip assertions
-  */
-int gmr_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type,
-    void *dst, int dst_count, MPI_Datatype dst_type, int proc) {
-
-  int        grp_proc;
-  gmr_size_t disp;
-  MPI_Aint lb, extent;
-
-  grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc);
-  ARMCII_Assert(grp_proc >= 0);
-
-  // Calculate displacement from beginning of the window
-  if (dst == MPI_BOTTOM) 
-    disp = 0;
-  else
-    disp = (gmr_size_t) ((uint8_t*)dst - (uint8_t*)mreg->slices[proc].base);
-
-  // Perform checks
-  MPI_Type_get_true_extent(dst_type, &lb, &extent);
-  ARMCII_Assert(mreg->lock_state != GMR_LOCK_UNLOCKED);
-  ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address");
-  ARMCII_Assert_msg(disp + dst_count*extent <= mreg->slices[proc].size, "Transfer is out of range");
-
-  MPI_Accumulate(src, src_count, src_type, grp_proc, (MPI_Aint) disp, dst_count, dst_type, MPI_SUM, mreg->window);
-
-  return 0;
-}
-
-/** Lock a memory region so that one-sided operations can be performed.
-  *
-  * @param[in] mreg     Memory region
-  * @param[in] proc     Absolute process id of the target
-  *
-  * The lock mode is derived from mreg->access_mode; nothing is returned.
-  */
-void gmr_lock(gmr_t *mreg, int proc) {
-  int grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc);
-  int grp_me   = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank);
-  int lock_assert, lock_mode;
-
-  ARMCII_Assert(grp_proc >= 0 && grp_me >= 0);
-  ARMCII_Assert(mreg->lock_state == GMR_LOCK_UNLOCKED || mreg->lock_state == GMR_LOCK_DLA);
-
-  /* Check for active DLA and suspend if needed */
-  if (mreg->lock_state == GMR_LOCK_DLA) {
-    ARMCII_Assert(grp_me == mreg->lock_target);
-    MPI_Win_unlock(mreg->lock_target, mreg->window);
-    mreg->lock_state = GMR_LOCK_DLA_SUSP;
-  }
-
-  if (   mreg->access_mode & ARMCIX_MODE_CONFLICT_FREE 
-      && mreg->access_mode & ARMCIX_MODE_NO_LOAD_STORE )
-  {
-    /* Only non-conflicting RMA accesses allowed.
-       Shared and exclusive locks. */
-    lock_assert = MPI_MODE_NOCHECK;
-    lock_mode   = MPI_LOCK_SHARED;
-  } else if (mreg->access_mode & ARMCIX_MODE_CONFLICT_FREE) {
-    /* Non-conflicting RMA and local accesses allowed.
-       Shared and exclusive locks. */
-    lock_assert = 0;
-    lock_mode   = MPI_LOCK_SHARED;
-  } else {
-    /* Conflicting RMA and local accesses allowed.
-       Exclusive locks. */
-    lock_assert = 0;
-    lock_mode   = MPI_LOCK_EXCLUSIVE;
-  }
-
-  MPI_Win_lock(lock_mode, grp_proc, lock_assert, mreg->window);
-
-  if (lock_mode == MPI_LOCK_EXCLUSIVE)
-    mreg->lock_state = GMR_LOCK_EXCLUSIVE;
-  else
-    mreg->lock_state = GMR_LOCK_SHARED;
-
-  mreg->lock_target = grp_proc;
-}
-
-
-/** Unlock a memory region.
-  *
-  * @param[in] mreg     Memory region
-  * @param[in] proc     Absolute process id of the target
-  * @note               Returns nothing; goes back to DLA state if dla_lock_count > 0.
-  */
-void gmr_unlock(gmr_t *mreg, int proc) {
-  int grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc);
-  int grp_me   = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank);
-
-  ARMCII_Assert(grp_proc >= 0 && grp_me >= 0);
-  ARMCII_Assert(mreg->lock_state == GMR_LOCK_EXCLUSIVE || mreg->lock_state == GMR_LOCK_SHARED);
-  ARMCII_Assert(mreg->lock_target == grp_proc);
-
-  /* Check if DLA is suspended and needs to be resumed */
-  if (mreg->dla_lock_count > 0) {
-
-    if (mreg->lock_state != GMR_LOCK_EXCLUSIVE || mreg->lock_target != grp_me) {
-      MPI_Win_unlock(grp_proc, mreg->window);
-      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, grp_me, 0, mreg->window); // FIXME: NOCHECK here?
-    }
-
-    mreg->lock_state = GMR_LOCK_DLA;
-    mreg->lock_target= grp_me;
-  }
-  else {
-    MPI_Win_unlock(grp_proc, mreg->window);
-    mreg->lock_state = GMR_LOCK_UNLOCKED;
-  }
-}
-
-
-/** Lock a memory region so that load/store operations can be performed.
-  *
-  * @param[in] mreg     Memory region
-  *
-  * Exclusive window lock targeting the calling process; nests via dla_lock_count.
-  */
-void gmr_dla_lock(gmr_t *mreg) {
-  int grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank);
-
-  ARMCII_Assert(grp_proc >= 0);
-  ARMCII_Assert(mreg->lock_state == GMR_LOCK_UNLOCKED || mreg->lock_state == GMR_LOCK_DLA);
-  ARMCII_Assert_msg((mreg->access_mode & ARMCIX_MODE_NO_LOAD_STORE) == 0,
-      "Direct local access is not allowed in the current access mode");
-
-  if (mreg->dla_lock_count == 0) {
-    ARMCII_Assert(mreg->lock_state == GMR_LOCK_UNLOCKED);
-
-    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, grp_proc, 0, mreg->window);
-
-    mreg->lock_state = GMR_LOCK_DLA;
-    mreg->lock_target= grp_proc;
-  }
-
-  ARMCII_Assert(mreg->lock_state == GMR_LOCK_DLA);
-  mreg->dla_lock_count++;
-}
-
-
-/** Unlock a memory region that was locked for direct local access.
-  *
-  * @param[in] mreg     Memory region
-  */
-void gmr_dla_unlock(gmr_t *mreg) {
-  int grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank);
-
-  ARMCII_Assert(grp_proc >= 0);
-  ARMCII_Assert(mreg->lock_state == GMR_LOCK_DLA);
-  ARMCII_Assert_msg((mreg->access_mode & ARMCIX_MODE_NO_LOAD_STORE) == 0,
-      "Direct local access is not allowed in the current access mode");
-
-  mreg->dla_lock_count--;
-
-  if (mreg->dla_lock_count == 0) {
-    MPI_Win_unlock(grp_proc, mreg->window);
-    mreg->lock_state = GMR_LOCK_UNLOCKED;
-  }
-}
diff --git a/src/armci/src/gmr.h b/src/armci/src/gmr.h
deleted file mode 100644
index aeda2e3..0000000
--- a/src/armci/src/gmr.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#ifndef HAVE_GMR_H
-#define HAVE_GMR_H
-
-#include <mpi.h>
-
-#include <armci.h>
-#include <armcix.h>
-
-typedef armci_size_t gmr_size_t;
-
-enum gmr_lock_states_e { 
-  GMR_LOCK_UNLOCKED,    /* Mem region is unlocked */
-  GMR_LOCK_EXCLUSIVE,   /* Mem region is locked for exclusive access */
-  GMR_LOCK_SHARED,      /* Mem region is locked for shared (non-conflicting) access */
-  GMR_LOCK_DLA,         /* Mem region is locked for Direct Local Access */
-  GMR_LOCK_DLA_SUSP     /* Mem region is unlocked and DLA is suspended */
-};
-
-typedef struct {
-  void       *base;
-  gmr_size_t  size;
-} gmr_slice_t;
-
-typedef struct gmr_s {
-  MPI_Win                 window;         /* MPI Window for this GMR                                        */
-  ARMCI_Group             group;          /* Copy of the ARMCI group on which this GMR was allocated        */
-
-  int                     access_mode;    /* Current access mode                                            */
-  enum gmr_lock_states_e  lock_state;     /* State of the lock                                              */
-  int                     lock_target;    /* Group (window) rank of the current target (if locked)          */
-  int                     dla_lock_count; /* Access count on the DLA lock.  Can unlock when this reaches 0. */
-  armcix_mutex_hdl_t      rmw_mutex;      /* Mutex used for Read-Modify-Write operations                    */
-
-  struct gmr_s           *prev;           /* Linked list pointers for GMR list                              */
-  struct gmr_s           *next;
-  gmr_slice_t            *slices;         /* Array of GMR slices for this allocation                        */
-  int                     nslices;
-} gmr_t;
-
-extern gmr_t *gmr_list;
-
-gmr_t *gmr_create(gmr_size_t local_size, void **base_ptrs, ARMCI_Group *group);
-void   gmr_destroy(gmr_t *mreg, ARMCI_Group *group);
-int    gmr_destroy_all(void);
-gmr_t *gmr_lookup(void *ptr, int proc);
-
-int gmr_get(gmr_t *mreg, void *src, void *dst, int size, int target);
-int gmr_put(gmr_t *mreg, void *src, void *dst, int size, int target);
-int gmr_accumulate(gmr_t *mreg, void *src, void *dst, int count, MPI_Datatype type, int proc);
-
-int gmr_get_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type,
-    void *dst, int dst_count, MPI_Datatype dst_type, int proc);
-int gmr_put_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type,
-    void *dst, int dst_count, MPI_Datatype dst_type, int proc);
-int gmr_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type,
-    void *dst, int dst_count, MPI_Datatype dst_type, int proc);
-
-void gmr_lock(gmr_t *mreg, int proc);
-void gmr_unlock(gmr_t *mreg, int proc);
-
-void gmr_dla_lock(gmr_t *mreg);
-void gmr_dla_unlock(gmr_t *mreg);
-
-#endif /* HAVE_GMR_H */
diff --git a/src/armci/src/groups.c b/src/armci/src/groups.c
deleted file mode 100644
index 3137b32..0000000
--- a/src/armci/src/groups.c
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <armci.h>
-#include <armcix.h>
-#include <armci_internals.h>
-#include <debug.h>
-
-
-/** The ARMCI world group.  This is accessed from outside via
-  * ARMCI_Group_get_world.
-  */
-ARMCI_Group ARMCI_GROUP_WORLD   = {0};
-ARMCI_Group ARMCI_GROUP_DEFAULT = {0};
-
-
-/** Initialize an ARMCI group's remaining fields using the communicator field.
-  */
-void ARMCII_Group_init_from_comm(ARMCI_Group *group) {
-  if (group->comm != MPI_COMM_NULL) {
-    MPI_Comm_size(group->comm, &group->size);
-    MPI_Comm_rank(group->comm, &group->rank);
-
-  } else {
-    group->rank = -1;
-    group->size =  0;
-  }
-
-  /* If noncollective groups are in use, create a separate communicator that
-    can be used for noncollective group creation with this group as the parent.
-    This ensures that calls to MPI_Intercomm_create can't clash with any user
-    communication. */
-
-  if (ARMCII_GLOBAL_STATE.noncollective_groups && group->comm != MPI_COMM_NULL)
-    MPI_Comm_dup(group->comm, &group->noncoll_pgroup_comm);
-  else
-    group->noncoll_pgroup_comm = MPI_COMM_NULL;
-
-  /* Check if translation caching is enabled */
-  if (ARMCII_GLOBAL_STATE.cache_rank_translation) {
-    if (group->comm != MPI_COMM_NULL) { /* NOTE(review): no else branch -- abs_to_grp/grp_to_abs stay unassigned for MPI_COMM_NULL */
-      int      *ranks, i;
-      MPI_Group world_group, sub_group;
-
-      group->abs_to_grp = malloc(sizeof(int)*ARMCI_GROUP_WORLD.size);
-      group->grp_to_abs = malloc(sizeof(int)*group->size);
-      ranks = malloc(sizeof(int)*ARMCI_GROUP_WORLD.size);
-
-      ARMCII_Assert(group->abs_to_grp != NULL && group->grp_to_abs != NULL && ranks != NULL);
-
-      for (i = 0; i < ARMCI_GROUP_WORLD.size; i++)
-        ranks[i] = i;
-
-      MPI_Comm_group(ARMCI_GROUP_WORLD.comm, &world_group);
-      MPI_Comm_group(group->comm, &sub_group);
-
-      MPI_Group_translate_ranks(sub_group, group->size, ranks, world_group, group->grp_to_abs);
-      MPI_Group_translate_ranks(world_group, ARMCI_GROUP_WORLD.size, ranks, sub_group, group->abs_to_grp);
-
-      MPI_Group_free(&world_group);
-      MPI_Group_free(&sub_group);
-
-      free(ranks);
-    }
-  }
-  
-  /* Translation caching is disabled */
-  else {
-    group->abs_to_grp = NULL;
-    group->grp_to_abs = NULL;
-  }
-}
-
-
-/** Create an ARMCI group that contains a subset of the nodes in the current
-  * default group.  Collective across the default group.
-  *
-  * @param[in]  grp_size  Number of entries in pid_list.
-  * @param[in]  pid_list  List of process ids that will be in the new group.
-  * @param[out] group_out The new ARMCI group.
-  *                       The parent is always ARMCI_GROUP_DEFAULT.
-  */
-void ARMCI_Group_create(int grp_size, int *pid_list, ARMCI_Group *group_out) {
-  ARMCI_Group_create_child(grp_size, pid_list, group_out, &ARMCI_GROUP_DEFAULT);
-}
-
-
-/** Create an ARMCI group that contains a subset of the nodes in the parent
-  * group via MPI_Comm_create on the parent communicator (TODO confirm: collective across parent).
-  *
-  * @param[in]  grp_size         Number of entries in pid_list.
-  * @param[in]  pid_list         List of process ids that will be in the new group.
-  * @param[out] armci_grp_out    The new ARMCI group, only valid on group members.
-  * @param[in]  armci_grp_parent The parent of the new ARMCI group.
-  */
-static inline void ARMCI_Group_create_comm_collective(int grp_size, int *pid_list, ARMCI_Group *armci_grp_out,
-    ARMCI_Group *armci_grp_parent) {
-
-  MPI_Group mpi_grp_parent;
-  MPI_Group mpi_grp_child;
-
-  MPI_Comm_group(armci_grp_parent->comm, &mpi_grp_parent);
-  MPI_Group_incl(mpi_grp_parent, grp_size, pid_list, &mpi_grp_child);
-
-  MPI_Comm_create(armci_grp_parent->comm, mpi_grp_child, &armci_grp_out->comm);
- 
-  MPI_Group_free(&mpi_grp_parent);
-  MPI_Group_free(&mpi_grp_child);
-}
-
-
-/** Create an ARMCI group that contains a subset of the nodes in the parent
-  * group. Collective across output group.
-  *
-  * @param[in]  grp_size         Number of entries in pid_list.
-  * @param[in]  pid_list         Sorted list of process ids that will be in the new group.
-  * @param[out] armci_grp_out    The new ARMCI group, only valid on group members.
-  * @param[in]  armci_grp_parent The parent of the new ARMCI group.
-  */
-static inline void ARMCI_Group_create_comm_noncollective(int grp_size, int *pid_list, ARMCI_Group *armci_grp_out,
-    ARMCI_Group *armci_grp_parent) {
-
-  const int INTERCOMM_TAG = 42;
-  int       i, grp_me, me, nproc, merge_size;
-  MPI_Comm  pgroup, inter_pgroup;
-
-  me    = armci_grp_parent->rank;
-  nproc = armci_grp_parent->size;
-
-  /* CHECK: If I'm not a member, return COMM_NULL */
-  grp_me = -1;
-  for (i = 0; i < grp_size; i++) {
-    if (pid_list[i] == me) {
-      grp_me = i;
-      break;
-    }
-  }
-
-  if (grp_me < 0) {
-    armci_grp_out->comm = MPI_COMM_NULL;
-    return;
-  }
-
-  /* CASE: Group size 1 */
-  else if (grp_size == 1 && pid_list[0] == me) {
-    MPI_Comm_dup(MPI_COMM_SELF, &armci_grp_out->comm);
-    return;
-  }
-
-  pgroup = MPI_COMM_SELF;
-
-  /* Recursively merge adjacent groups until only one group remains.  */
-  for (merge_size = 1; merge_size < grp_size; merge_size *= 2) {
-    int      gid        = grp_me / merge_size;
-    MPI_Comm pgroup_old = pgroup;
-
-    if (gid % 2 == 0) {
-      /* Check if right partner doesn't exist */
-      if ((gid+1)*merge_size >= grp_size)
-        continue;
-
-      MPI_Intercomm_create(pgroup, 0, armci_grp_parent->noncoll_pgroup_comm, pid_list[(gid+1)*merge_size], INTERCOMM_TAG, &inter_pgroup);
-      MPI_Intercomm_merge(inter_pgroup, 0 /* LOW */, &pgroup);
-    } else {
-      MPI_Intercomm_create(pgroup, 0, armci_grp_parent->noncoll_pgroup_comm, pid_list[(gid-1)*merge_size], INTERCOMM_TAG, &inter_pgroup);
-      MPI_Intercomm_merge(inter_pgroup, 1 /* HIGH */, &pgroup);
-    }
-
-    MPI_Comm_free(&inter_pgroup);
-    if (pgroup_old != MPI_COMM_SELF) MPI_Comm_free(&pgroup_old);
-  }
-
-  armci_grp_out->comm = pgroup;
-}
-
-
-/** Create an ARMCI group that contains a subset of the nodes in the parent
-  * group. Collective.
-  *
-  * @param[in]  grp_size         Number of entries in pid_list.
-  * @param[in]  pid_list         Sorted list of process ids that will be in the new group.
-  * @param[out] armci_grp_out    The new ARMCI group, only valid on group members.
-  * @param[in]  armci_grp_parent The parent of the new ARMCI group.
-  */
-void ARMCI_Group_create_child(int grp_size, int *pid_list, ARMCI_Group *armci_grp_out,
-    ARMCI_Group *armci_grp_parent) {
-
-  if (ARMCII_GLOBAL_STATE.noncollective_groups)
-    ARMCI_Group_create_comm_noncollective(grp_size, pid_list, armci_grp_out, armci_grp_parent);
-  else
-    ARMCI_Group_create_comm_collective(grp_size, pid_list, armci_grp_out, armci_grp_parent);
-
-  ARMCII_Group_init_from_comm(armci_grp_out);
-}
-
-
-/** Free an ARMCI group.  Collective across group.
-  *
-  * @param[in] group The group to be freed
-  */
-void ARMCI_Group_free(ARMCI_Group *group) {
-  if (group->comm != MPI_COMM_NULL) {
-    MPI_Comm_free(&group->comm);
-
-    if (ARMCII_GLOBAL_STATE.noncollective_groups)
-      MPI_Comm_free(&group->noncoll_pgroup_comm);
-  }
-
-  /* If the group has translation caches, free them */
-  if (group->abs_to_grp != NULL)
-    free(group->abs_to_grp);
-  if (group->grp_to_abs != NULL)
-    free(group->grp_to_abs);
-
-  group->rank = -1;
-  group->size = 0;
-}
-
-
-/** Query the calling process' rank in a given group.
-  *
-  * @param[in]  group Group to query on.
-  * @param[out] rank  Location to store the rank.
-  * @return           Zero on success, error code otherwise.
-  */
-int  ARMCI_Group_rank(ARMCI_Group *group, int *rank) {
-  *rank = group->rank;
-
-  if (*rank >= 0)
-    return 0;
-  else
-    return 1;
-}
-
-
-/** Query the size of the given group.
-  *
-  * @param[in]  group Group to query.
-  * @param[out] size  Variable to store the size in.
-  */
-void ARMCI_Group_size(ARMCI_Group *group, int *size) {
-  *size = group->size;
-}
-
-
-/** Set the default group.
-  *
-  * @param[in] group The new default group
-  */
-void ARMCI_Group_set_default(ARMCI_Group *group) {
-  ARMCI_GROUP_DEFAULT = *group;
-}
-
-
-/** Get the default group.
-  *
-  * @param[out] group_out Pointer to the default group.
-  */
-void ARMCI_Group_get_default(ARMCI_Group *group_out) {
-  *group_out = ARMCI_GROUP_DEFAULT;
-}
-
-
-/** Fetch the world group.
-  *
-  * @param[out] group_out Output group.
-  */
-void ARMCI_Group_get_world(ARMCI_Group *group_out) {
-  *group_out = ARMCI_GROUP_WORLD;
-}
-
-
-/** Translate a group process rank to the corresponding process rank in the
-  * ARMCI world group.
-  * @param[in] group      Group to translate from.
-  * @param[in] group_rank Rank of the process in group.
-  * @return               World rank, or -1 if the translation is MPI_UNDEFINED.
-  */
-int ARMCI_Absolute_id(ARMCI_Group *group, int group_rank) {
-  int       world_rank;
-  MPI_Group world_group, sub_group;
-
-  ARMCII_Assert(group_rank >= 0 && group_rank < group->size);
-
-  /* Check if group is the world group */
-  if (group->comm == ARMCI_GROUP_WORLD.comm)
-    world_rank = group_rank;
-
-  /* Check for translation cache */
-  else if (group->grp_to_abs != NULL)
-    world_rank = group->grp_to_abs[group_rank];
-
-  else {
-    /* Translate the rank */
-    MPI_Comm_group(ARMCI_GROUP_WORLD.comm, &world_group);
-    MPI_Comm_group(group->comm, &sub_group);
-
-    MPI_Group_translate_ranks(sub_group, 1, &group_rank, world_group, &world_rank);
-
-    MPI_Group_free(&world_group);
-    MPI_Group_free(&sub_group);
-  }
-
-  /* Check if translation failed */
-  if (world_rank == MPI_UNDEFINED)
-    return -1;
-  else
-    return world_rank;
-}
-
-
-/** Split a parent group into multiple child groups.  This is similar to
-  * MPI_Comm_split.  Collective across the parent group.
-  *
-  * @param[in]  parent The parent group.
-  * @param[in]  color  The id number of the new group.  All processes that
-  *                    give the same color are placed in the same new group.
-  * @param[in]  key    Relative ordering of processes in the new group.
-  * @param[out] new_group Pointer to a handle where group info will be stored.
-  * @return            0 on success, otherwise the MPI_Comm_split error code.
-  */
-int ARMCIX_Group_split(ARMCI_Group *parent, int color, int key, ARMCI_Group *new_group) {
-  int err;
-
-  err = MPI_Comm_split(parent->comm, color, key, &new_group->comm);
-
-  if (err != MPI_SUCCESS)
-    return err;
-
-  ARMCII_Group_init_from_comm(new_group);
-
-  return 0;
-}
-
-
-/** Duplicate an ARMCI group.  Collective across the parent group.
-  *
-  * The new group's communicator is an MPI_Comm_dup of the parent's; the
-  * remaining fields are filled in by ARMCII_Group_init_from_comm.
-  *
-  * @param[in]  parent    The group to duplicate.
-  * @param[out] new_group Pointer to a handle where group info will be stored.
-  * @return               0 on success, otherwise the MPI_Comm_dup error code.
-  */
-int ARMCIX_Group_dup(ARMCI_Group *parent, ARMCI_Group *new_group) {
-  int err;
-
-  err = MPI_Comm_dup(parent->comm, &new_group->comm);
-
-  if (err != MPI_SUCCESS)
-    return err;
-
-  ARMCII_Group_init_from_comm(new_group);
-
-  return 0;
-}
diff --git a/src/armci/src/init_finalize.c b/src/armci/src/init_finalize.c
deleted file mode 100644
index d616bb8..0000000
--- a/src/armci/src/init_finalize.c
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <mpi.h>
-
-#include <armci.h>
-#include <armci_internals.h>
-#include <debug.h>
-#include <gmr.h>
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Init = PARMCI_Init
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Init ARMCI_Init
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Init as PARMCI_Init
-#endif
-/* -- end weak symbols block -- */
-
-/** Initialize ARMCI.  MPI must be initialized before this can be called.  It
-  * is invalid to make ARMCI calls before initialization.  Collective on the world
-  * group.
-  *
-  * @return            Zero on success
-  */
-int PARMCI_Init(void) {
-  char *var;
-
-  /* GA/TCGMSG end up calling ARMCI_Init() multiple times. */
-  if (ARMCII_GLOBAL_STATE.init_count > 0) {
-    ARMCII_GLOBAL_STATE.init_count++;
-    return 0;
-  }
-
-  /* Check for MPI initialization */
-  {
-    int mpi_is_init, mpi_is_fin;
-    MPI_Initialized(&mpi_is_init);
-    MPI_Finalized(&mpi_is_fin);
-    if (!mpi_is_init || mpi_is_fin) 
-      ARMCII_Error("MPI must be initialized before calling ARMCI_Init");
-  }
-
-  /* Set defaults */
-#ifdef ARMCI_GROUP
-  ARMCII_GLOBAL_STATE.noncollective_groups = 1;
-#endif
-#ifdef NO_SEATBELTS
-  ARMCII_GLOBAL_STATE.iov_checks           = 0;
-#endif
-
-  /* Check for debugging flags */
-
-  ARMCII_GLOBAL_STATE.debug_alloc          = ARMCII_Getenv_bool("ARMCI_DEBUG_ALLOC", 0);
-  ARMCII_GLOBAL_STATE.debug_flush_barriers = ARMCII_Getenv_bool("ARMCI_FLUSH_BARRIERS", 1);
-  ARMCII_GLOBAL_STATE.verbose              = ARMCII_Getenv_bool("ARMCI_VERBOSE", 0);
-
-  /* Group formation options */
-
-  ARMCII_GLOBAL_STATE.cache_rank_translation=ARMCII_Getenv_bool("ARMCI_CACHE_RANK_TRANSLATION", 1);
-  if (ARMCII_Getenv("ARMCI_NONCOLLECTIVE_GROUPS"))
-    ARMCII_GLOBAL_STATE.noncollective_groups = ARMCII_Getenv_bool("ARMCI_NONCOLLECTIVE_GROUPS", 0);
-
-  /* Check for IOV flags */
-
-  ARMCII_GLOBAL_STATE.iov_checks           = ARMCII_Getenv_bool("ARMCI_IOV_CHECKS", 0);
-  ARMCII_GLOBAL_STATE.iov_batched_limit    = ARMCII_Getenv_int("ARMCI_IOV_BATCHED_LIMIT", 0);
-
-  if (ARMCII_GLOBAL_STATE.iov_batched_limit < 0) {
-    ARMCII_Warning("Ignoring invalid value for ARMCI_IOV_BATCHED_LIMIT (%d)\n", ARMCII_GLOBAL_STATE.iov_batched_limit);
-    ARMCII_GLOBAL_STATE.iov_batched_limit = 0;
-  }
-
-  var = ARMCII_Getenv("ARMCI_IOV_METHOD");
-
-  ARMCII_GLOBAL_STATE.iov_method = ARMCII_IOV_AUTO;
-
-  if (var != NULL) {
-    if (strcmp(var, "AUTO") == 0)
-      ARMCII_GLOBAL_STATE.iov_method = ARMCII_IOV_AUTO;
-    else if (strcmp(var, "CONSRV") == 0)
-      ARMCII_GLOBAL_STATE.iov_method = ARMCII_IOV_CONSRV;
-    else if (strcmp(var, "BATCHED") == 0)
-      ARMCII_GLOBAL_STATE.iov_method = ARMCII_IOV_BATCHED;
-    else if (strcmp(var, "DIRECT") == 0)
-      ARMCII_GLOBAL_STATE.iov_method = ARMCII_IOV_DIRECT;
-    else if (ARMCI_GROUP_WORLD.rank == 0)
-      ARMCII_Warning("Ignoring unknown value for ARMCI_IOV_METHOD (%s)\n", var);
-  }
-
-  /* Check for Strided flags */
-
-  var = ARMCII_Getenv("ARMCI_STRIDED_METHOD");
-
-  ARMCII_GLOBAL_STATE.strided_method = ARMCII_STRIDED_DIRECT;
-
-  if (var != NULL) {
-    if (strcmp(var, "IOV") == 0)
-      ARMCII_GLOBAL_STATE.strided_method = ARMCII_STRIDED_IOV;
-    else if (strcmp(var, "DIRECT") == 0)
-      ARMCII_GLOBAL_STATE.strided_method = ARMCII_STRIDED_DIRECT;
-    else if (ARMCI_GROUP_WORLD.rank == 0)
-      ARMCII_Warning("Ignoring unknown value for ARMCI_STRIDED_METHOD (%s)\n", var);
-  }
-
-  /* Shared buffer handling method */
-
-  var = ARMCII_Getenv("ARMCI_SHR_BUF_METHOD");
-
-  ARMCII_GLOBAL_STATE.shr_buf_method = ARMCII_SHR_BUF_COPY;
-
-  if (var != NULL) {
-    if (strcmp(var, "COPY") == 0)
-      ARMCII_GLOBAL_STATE.shr_buf_method = ARMCII_SHR_BUF_COPY;
-    else if (strcmp(var, "NOGUARD") == 0)
-      ARMCII_GLOBAL_STATE.shr_buf_method = ARMCII_SHR_BUF_NOGUARD;
-    else if (ARMCI_GROUP_WORLD.rank == 0)
-      ARMCII_Warning("Ignoring unknown value for ARMCI_SHR_BUF_METHOD (%s)\n", var);
-  }
-
-  /* Setup groups and communicators */
-
-  MPI_Comm_dup(MPI_COMM_WORLD, &ARMCI_GROUP_WORLD.comm);
-  ARMCII_Group_init_from_comm(&ARMCI_GROUP_WORLD);
-  ARMCI_GROUP_DEFAULT = ARMCI_GROUP_WORLD;
-
-  /* Create GOP operators */
-
-  MPI_Op_create(ARMCII_Absmin_op, 1 /* commute */, &MPI_ABSMIN_OP);
-  MPI_Op_create(ARMCII_Absmax_op, 1 /* commute */, &MPI_ABSMAX_OP);
-
-  MPI_Op_create(ARMCII_Msg_sel_min_op, 1 /* commute */, &MPI_SELMIN_OP);
-  MPI_Op_create(ARMCII_Msg_sel_max_op, 1 /* commute */, &MPI_SELMAX_OP);
-
-  ARMCII_GLOBAL_STATE.init_count++;
-
-  if (ARMCII_GLOBAL_STATE.verbose) {
-    if (ARMCI_GROUP_WORLD.rank == 0) {
-      int major, minor;
-
-      MPI_Get_version(&major, &minor);
-
-      printf("ARMCI-MPI initialized with %d process%s, MPI v%d.%d\n", ARMCI_GROUP_WORLD.size, ARMCI_GROUP_WORLD.size > 1 ? "es":"", major, minor);
-#ifdef NO_SEATBELTS
-      printf("  NO_SEATBELTS           = ENABLED\n");
-#endif
-      printf("  STRIDED_METHOD         = %s\n", ARMCII_Strided_methods_str[ARMCII_GLOBAL_STATE.strided_method]);
-      printf("  IOV_METHOD             = %s\n", ARMCII_Iov_methods_str[ARMCII_GLOBAL_STATE.iov_method]);
-
-      if (   ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_BATCHED
-          || ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_AUTO)
-      {
-        if (ARMCII_GLOBAL_STATE.iov_batched_limit > 0)
-          printf("  IOV_BATCHED_LIMIT      = %d\n", ARMCII_GLOBAL_STATE.iov_batched_limit);
-        else
-          printf("  IOV_BATCHED_LIMIT      = UNLIMITED\n");
-      }
-
-      printf("  IOV_CHECKS             = %s\n", ARMCII_GLOBAL_STATE.iov_checks             ? "TRUE" : "FALSE");
-      printf("  SHR_BUF_METHOD         = %s\n", ARMCII_Shr_buf_methods_str[ARMCII_GLOBAL_STATE.shr_buf_method]);
-      printf("  NONCOLLECTIVE_GROUPS   = %s\n", ARMCII_GLOBAL_STATE.noncollective_groups   ? "TRUE" : "FALSE");
-      printf("  CACHE_RANK_TRANSLATION = %s\n", ARMCII_GLOBAL_STATE.cache_rank_translation ? "TRUE" : "FALSE");
-      printf("  DEBUG_ALLOC            = %s\n", ARMCII_GLOBAL_STATE.debug_alloc            ? "TRUE" : "FALSE");
-      printf("  FLUSH_BARRIERS         = %s\n", ARMCII_GLOBAL_STATE.debug_flush_barriers   ? "TRUE" : "FALSE");
-      printf("\n");
-      fflush(NULL);
-    }
-
-    MPI_Barrier(ARMCI_GROUP_WORLD.comm);
-  }
-
-  return 0;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Init_args = PARMCI_Init_args
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Init_args ARMCI_Init_args
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Init_args as PARMCI_Init_args
-#endif
-/* -- end weak symbols block -- */
-
-/** Initialize ARMCI.  MPI must be initialized before this can be called.  It
-  * is invalid to make ARMCI calls before initialization.  Collective on the
-  * world group.
-  *
-  * @param[inout] argc Command line argument count
-  * @param[inout] argv Command line arguments
-  * @return            Zero on success
-  */
-int PARMCI_Init_args(int *argc, char ***argv) {
-  return PARMCI_Init();
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Initialized = PARMCI_Initialized
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Initialized ARMCI_Initialized
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Initialized as PARMCI_Initialized
-#endif
-/* -- end weak symbols block -- */
-
-/** Check if ARMCI has been initialized.
-  *
-  * @return Non-zero if ARMCI has been initialized.
-  */
-int PARMCI_Initialized(void) {
-  return ARMCII_GLOBAL_STATE.init_count > 0;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Finalize = PARMCI_Finalize
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Finalize ARMCI_Finalize
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Finalize as PARMCI_Finalize
-#endif
-/* -- end weak symbols block -- */
-
-/** Finalize ARMCI.  Must be called before MPI is finalized.  ARMCI calls are
-  * not valid after finalization.  Collective on world group.
-  *
-  * @return            Zero on success
-  */
-int PARMCI_Finalize(void) {
-  int nfreed;
-
-  /* GA/TCGMSG end up calling ARMCI_Finalize() multiple times. */
-  if (ARMCII_GLOBAL_STATE.init_count == 0) {
-    return 0;
-  }
-
-  ARMCII_GLOBAL_STATE.init_count--;
-
-  /* Only finalize on the last matching call */
-  if (ARMCII_GLOBAL_STATE.init_count > 0) {
-    return 0;
-  }
-
-  nfreed = gmr_destroy_all();
-
-  if (nfreed > 0 && ARMCI_GROUP_WORLD.rank == 0)
-    ARMCII_Warning("Freed %d leaked allocations\n", nfreed);
-
-  /* Free GOP operators */
-
-  MPI_Op_free(&MPI_ABSMIN_OP);
-  MPI_Op_free(&MPI_ABSMAX_OP);
-
-  MPI_Op_free(&MPI_SELMIN_OP);
-  MPI_Op_free(&MPI_SELMAX_OP);
-
-  ARMCI_Cleanup();
-
-  ARMCI_Group_free(&ARMCI_GROUP_WORLD);
-
-  return 0;
-}
-
-
-/** Cleanup ARMCI resources.  Call finalize instead.
-  */
-void ARMCI_Cleanup(void) {
-  return;
-}
-
diff --git a/src/armci/src/internals.c b/src/armci/src/internals.c
deleted file mode 100644
index aca571d..0000000
--- a/src/armci/src/internals.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-
-#include <armci.h>
-#include <armcix.h>
-#include <armci_internals.h>
-#include <debug.h>
-#include <gmr.h>
-
-/** ARMCI Internal global state */
-global_state_t ARMCII_GLOBAL_STATE = { 0 };
-
-/** Enum strings */
-char ARMCII_Strided_methods_str[][10] = { "IOV", "DIRECT" };
-char ARMCII_Iov_methods_str[][10]     = { "AUTO", "CONSRV", "BATCHED", "DIRECT" };
-char ARMCII_Shr_buf_methods_str[][10] = { "COPY", "NOGUARD" };
-
-/** Raise an internal fatal ARMCI error.
-  *
-  * @param[in] file Current file name (__FILE__)
-  * @param[in] line Current line number (__LINE__)
-  * @param[in] func Current function name (__func__)
-  * @param[in] msg  Message to be printed
-  * @param[in] code Exit error code
-  */
-void ARMCII_Error_impl(const char *file, const int line, const char *func, const char *msg, ...) {
-  va_list ap;
-  int  disp;
-  char string[500];
-
-  disp  = 0;
-  va_start(ap, msg);
-  disp += vsnprintf(string, 500, msg, ap);
-  va_end(ap);
-
-  fprintf(stderr, "[%d] ARMCI Internal error in %s (%s:%d)\n[%d] Messge: %s\n", ARMCI_GROUP_WORLD.rank, 
-      func, file, line, ARMCI_GROUP_WORLD.rank, string);
-  MPI_Abort(ARMCI_GROUP_WORLD.comm, 100);
-}
-
-
-/** Translate a world process rank to the corresponding process rank in the
-  * ARMCI group.
-  *
-  * @param[in] group      Group to translate to.
-  * @param[in] world_rank Rank of the process in the world group.
-  * @return               Rank in group or -1 if not in the group.
-  */
-int ARMCII_Translate_absolute_to_group(ARMCI_Group *group, int world_rank) {
-  int       group_rank;
-  MPI_Group world_group, sub_group;
-
-  ARMCII_Assert(world_rank >= 0 && world_rank < ARMCI_GROUP_WORLD.size);
-
-  /* Check if group is the world group */
-  if (group->comm == ARMCI_GROUP_WORLD.comm)
-    group_rank = world_rank;
-
-  /* Check for translation cache */
-  else if (group->grp_to_abs != NULL)
-    group_rank = group->abs_to_grp[world_rank];
-
-  else {
-    /* Translate the rank */
-    MPI_Comm_group(ARMCI_GROUP_WORLD.comm, &world_group);
-    MPI_Comm_group(group->comm, &sub_group);
-
-    MPI_Group_translate_ranks(world_group, 1, &world_rank, sub_group, &group_rank);
-
-    MPI_Group_free(&world_group);
-    MPI_Group_free(&sub_group);
-  }
-
-  /* Check if translation failed */
-  if (group_rank == MPI_UNDEFINED)
-    return -1;
-  else
-    return group_rank;
-}
-
-
-/** Translate an ARMCI accumulate data type into an MPI type so we can pass it
-  * to mem regions.
-  *
-  * @param[in]  armci_datatype ARMCI accumulate data type
-  * @param[out] mpi_type       MPI data type
-  * @param[out] type_size      Size of the MPI data type
-  */
-void ARMCII_Acc_type_translate(int armci_datatype, MPI_Datatype *mpi_type, int *type_size) {
-    // Determine the MPI type for the transfer
-    switch (armci_datatype) {
-      case ARMCI_ACC_INT:
-        *mpi_type = MPI_INT;
-        break;
-      case ARMCI_ACC_LNG:
-        *mpi_type = MPI_LONG;
-        break;
-      case ARMCI_ACC_FLT:
-        *mpi_type = MPI_FLOAT;
-        break;
-      case ARMCI_ACC_DBL:
-        *mpi_type = MPI_DOUBLE;
-        break;
-      case ARMCI_ACC_CPL:
-        *mpi_type = MPI_FLOAT;
-        break;
-      case ARMCI_ACC_DCP:
-        *mpi_type = MPI_DOUBLE;
-        break;
-      default:
-        ARMCII_Error("unknown data type", 100);
-        return;
-    }
-
-    MPI_Type_size(*mpi_type, type_size);
-}
-
-
-/** Synchronize all public and private windows.
-  */
-void ARMCII_Flush_local(void) {
-  gmr_t *cur_mreg = gmr_list;
-
-  while (cur_mreg) {
-    gmr_dla_lock(cur_mreg);
-    gmr_dla_unlock(cur_mreg);
-
-    cur_mreg = cur_mreg->next;
-  }
-}
diff --git a/src/armci/src/malloc.c b/src/armci/src/malloc.c
deleted file mode 100644
index ae41239..0000000
--- a/src/armci/src/malloc.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <mpi.h>
-
-#include <debug.h>
-#include <armci.h>
-#include <armci_internals.h>
-#include <gmr.h>
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Malloc = PARMCI_Malloc
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Malloc ARMCI_Malloc
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Malloc as PARMCI_Malloc
-#endif
-/* -- end weak symbols block -- */
-
-/** Allocate a shared memory segment.  Collective.
-  *
-  * @param[out] base_ptrs Array of length nproc that will contain pointers to
-  *                       the base address of each process' patch of the
-  *                       segment.
-  * @param[in]       size Number of bytes to allocate on the local process.
-  */
-int PARMCI_Malloc(void **ptr_arr, armci_size_t bytes) {
-  return ARMCI_Malloc_group(ptr_arr, bytes, &ARMCI_GROUP_WORLD);
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Free = PARMCI_Free
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Free ARMCI_Free
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Free as PARMCI_Free
-#endif
-/* -- end weak symbols block -- */
-
-/** Free a shared memory allocation.  Collective.
-  *
-  * @param[in] ptr Pointer to the local patch of the allocation
-  */
-int PARMCI_Free(void *ptr) {
-  return ARMCI_Free_group(ptr, &ARMCI_GROUP_WORLD);
-}
-
-
-/** Allocate a shared memory segment.  Collective.
-  *
-  * @param[out] base_ptrs Array that will contain pointers to the base address of
-  *                       each process' patch of the segment.  Array is of length
-  *                       equal to the number of processes in the group.
-  * @param[in]       size Number of bytes to allocate on the local process.
-  */
-int ARMCI_Malloc_group(void **base_ptrs, armci_size_t size, ARMCI_Group *group) {
-  int i;
-  gmr_t *mreg;
-
-  ARMCII_Assert(PARMCI_Initialized());
-
-  mreg = gmr_create(size, base_ptrs, group);
-
-  if (DEBUG_CAT_ENABLED(DEBUG_CAT_ALLOC)) {
-#define BUF_LEN 1000
-    char ptr_string[BUF_LEN];
-    int  count = 0;
-
-    if (mreg == NULL) {
-      strncpy(ptr_string, "NULL", 5);
-    } else {
-      for (i = 0; i < mreg->nslices && count < BUF_LEN; i++)
-        count += snprintf(ptr_string+count, BUF_LEN-count, 
-            (i == mreg->nslices-1) ? "%p" : "%p ", base_ptrs[i]);
-    }
-
-    ARMCII_Dbg_print(DEBUG_CAT_ALLOC, "base ptrs [%s]\n", ptr_string);
-#undef BUF_LEN
-  }
-
-  return 0;
-}
-
-
-/** Free a shared memory allocation.  Collective.
-  *
-  * @param[in] ptr Pointer to the local patch of the allocation
-  */
-int ARMCI_Free_group(void *ptr, ARMCI_Group *group) {
-  gmr_t *mreg;
-
-  if (ptr != NULL) {
-    mreg = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);
-    ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer");
-  } else {
-    ARMCII_Dbg_print(DEBUG_CAT_ALLOC, "given NULL\n");
-    mreg = NULL;
-  }
-
-  gmr_destroy(mreg, group);
-
-  return 0;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Malloc_local = PARMCI_Malloc_local
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Malloc_local ARMCI_Malloc_local
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Malloc_local as PARMCI_Malloc_local
-#endif
-/* -- end weak symbols block -- */
-
-/** Allocate a local buffer suitable for use in one-sided communication
-  *
-  * @param[in] size Number of bytes to allocate
-  * @return         Pointer to the local buffer
-  */
-void *PARMCI_Malloc_local(armci_size_t size) {
-  void *buf;
-
-  MPI_Alloc_mem((MPI_Aint) size, MPI_INFO_NULL, &buf);
-  ARMCII_Assert(buf != NULL);
-
-  if (ARMCII_GLOBAL_STATE.debug_alloc) {
-    ARMCII_Bzero(buf, size);
-  }
-
-  return buf;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Free_local = PARMCI_Free_local
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Free_local ARMCI_Free_local
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Free_local as PARMCI_Free_local
-#endif
-/* -- end weak symbols block -- */
-
-/** Free memory allocated with ARMCI_Malloc_local
-  *
-  * @param[in] buf Pointer to local buffer to free
-  */
-int PARMCI_Free_local(void *buf) {
-  MPI_Free_mem(buf);
-  return 0;
-}
diff --git a/src/armci/src/message.c b/src/armci/src/message.c
deleted file mode 100644
index 1fa113d..0000000
--- a/src/armci/src/message.c
+++ /dev/null
@@ -1,443 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <mpi.h>
-
-#include <debug.h>
-#include <armci.h>
-#include <armci_internals.h>
-
-/** Query process rank from messaging (MPI) layer.
-  */
-int armci_msg_me(void) {
-  int me;
-  MPI_Comm_rank(ARMCI_GROUP_WORLD.comm, &me);
-  return me;
-}
-
-
-/** Query number of processes.
-  */
-int armci_msg_nproc(void) {
-  int nproc;
-  MPI_Comm_size(ARMCI_GROUP_WORLD.comm, &nproc);
-  return nproc;
-}
-
-
-/** Abort the application.
-  *
-  * @param[in] code Exit error code
-  */
-void armci_msg_abort(int code) {
-  MPI_Abort(ARMCI_GROUP_WORLD.comm, code);
-}
-
-
-/** Get the wall clock time.
-  *
-  * @return Wall clock time
-  */
-double armci_timer(void) {
-  return MPI_Wtime();
-}
-
-
-/** Broadcast a message.  Collective.
-  *
-  * @param[in] buffer Source buffer on root, destination elsewhere.
-  * @param[in] len    Length of the message in bytes.
-  * @param[in] root   Rank of the root process.
-  */
-void armci_msg_bcast(void *buf_in, int len, int root) {
-  void **buf;
-
-  /* Is the buffer an input or an output? */
-  if (ARMCI_GROUP_WORLD.rank == root)
-    ARMCII_Buf_prepare_read_vec(&buf_in, &buf, 1, len);
-  else
-    ARMCII_Buf_prepare_write_vec(&buf_in, &buf, 1, len);
-
-  MPI_Bcast(buf[0], len, MPI_BYTE, root, ARMCI_GROUP_WORLD.comm);
-
-  if (ARMCI_GROUP_WORLD.rank == root)
-    ARMCII_Buf_finish_read_vec(&buf_in, buf, 1, len);
-  else
-    ARMCII_Buf_finish_write_vec(&buf_in, buf, 1, len);
-}
-
-
-/** Broadcast a message.  Collective.
-  *
-  * @param[in] buffer Source buffer on root, destination elsewhere.
-  * @param[in] len    Length of the message in bytes.
-  * @param[in] root   Rank of the root process.
-  */
-void armci_msg_brdcst(void *buffer, int len, int root) {
-  armci_msg_bcast(buffer, len, root);
-}
-
-
-/** Broadcast a message on the given scope.  Collective.
-  *
-  * @param[in] scope  Scope for the broadcast
-  * @param[in] buffer Source buffer on root, destination elsewhere.
-  * @param[in] len    Length of the message in bytes.
-  * @param[in] root   Rank of the root process.
-  */
-void armci_msg_bcast_scope(int scope, void *buffer, int len, int root) {
-  armci_msg_group_bcast_scope(scope, buffer, len, root, &ARMCI_GROUP_WORLD);
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak armci_msg_barrier = parmci_msg_barrier
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF parmci_msg_barrier armci_msg_barrier
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate armci_msg_barrier as parmci_msg_barrier
-#endif
-/* -- end weak symbols block -- */
-
-/** Barrier from the messaging layer.
-  */
-void parmci_msg_barrier(void) {
-  MPI_Barrier(ARMCI_GROUP_WORLD.comm);
-
-  if (ARMCII_GLOBAL_STATE.debug_flush_barriers) {
-    ARMCII_Flush_local();
-  }
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak armci_msg_group_barrier = parmci_msg_group_barrier
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF parmci_msg_group_barrier armci_msg_group_barrier
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate armci_msg_group_barrier as parmci_msg_group_barrier
-#endif
-/* -- end weak symbols block -- */
-
-/** Message barrier on a group.
-  *
-  * @param[in] group Group on which to perform barrier
-  */
-void parmci_msg_group_barrier(ARMCI_Group *group) {
-  MPI_Barrier(group->comm);
-
-  if (ARMCII_GLOBAL_STATE.debug_flush_barriers) {
-    ARMCII_Flush_local();
-  }
-}
-
-
-/** Broadcast on a group. Collective.
-  *
-  * @param[in]    scope ARMCI scope
-  * @param[inout] buf   Input on the root, output on all other processes
-  * @param[in]    len   Number of bytes in the message
-  * @param[in]    abs_root Absolute rank of the process at the root of the broadcast
-  * @param[in]    group ARMCI group on which to perform communication
-  */
-void armci_msg_group_bcast_scope(int scope, void *buf_in, int len, int abs_root, ARMCI_Group *group) {
-  int    grp_root;
-  void **buf;
-
-  if (scope == SCOPE_ALL || scope == SCOPE_MASTERS) {
-    /* Is the buffer an input or an output? */
-    if (ARMCI_GROUP_WORLD.rank == abs_root)
-      ARMCII_Buf_prepare_read_vec(&buf_in, &buf, 1, len);
-    else
-      ARMCII_Buf_prepare_write_vec(&buf_in, &buf, 1, len);
-
-    grp_root = ARMCII_Translate_absolute_to_group(group, abs_root);
-    ARMCII_Assert(grp_root >= 0 && grp_root < group->size);
-
-    MPI_Bcast(buf[0], len, MPI_BYTE, grp_root, group->comm);
-
-    if (ARMCI_GROUP_WORLD.rank == abs_root)
-      ARMCII_Buf_finish_read_vec(&buf_in, buf, 1, len);
-    else
-      ARMCII_Buf_finish_write_vec(&buf_in, buf, 1, len);
-  } else /* SCOPE_NODE */ {
-    grp_root = 0;
-
-    /* This is a self-broadcast, which is a no-op. */
-  }
-}
-
-
-/** Send a two-sided message.
-  *
-  * @param[in] tag    Message tag (must match on sender and receiver)
-  * @param[in] buf    Buffer containing the message
-  * @param[in] nbytes Length of the message in bytes
-  * @param[in] dest   Destination process id
-  */
-void armci_msg_snd(int tag, void *buf_in, int nbytes, int dest) {
-  void **buf;
-
-  ARMCII_Buf_prepare_read_vec(&buf_in, &buf, 1, nbytes);
-  MPI_Send(buf[0], nbytes, MPI_BYTE, dest, tag, ARMCI_GROUP_WORLD.comm);
-  ARMCII_Buf_finish_read_vec(&buf_in, buf, 1, nbytes);
-}
-
-
-/** Receive a two-sided message.
-  *
-  * @param[in]  tag    Message tag (must match on sender and receiver)
-  * @param[in]  buf    Buffer containing the message
-  * @param[in]  nbytes_buf Size of the buffer in bytes
-  * @param[out] nbytes_msg Length of the message received in bytes (NULL to ignore)
-  * @param[in]  src    Source process id
-  */
-void armci_msg_rcv(int tag, void *buf_out, int nbytes_buf, int *nbytes_msg, int src) {
-  void     **buf;
-  MPI_Status status;
-
-  ARMCII_Buf_prepare_write_vec(&buf_out, &buf, 1, nbytes_buf);
-  MPI_Recv(buf[0], nbytes_buf, MPI_BYTE, src, tag, ARMCI_GROUP_WORLD.comm, &status);
-  ARMCII_Buf_finish_write_vec(&buf_out, buf, 1, nbytes_buf);
-
-  if (nbytes_msg != NULL)
-    MPI_Get_count(&status, MPI_BYTE, nbytes_msg);
-}
-
-
-/** Receive a two-sided message from any source.
-  *
-  * @param[in]  tag    Message tag (must match on sender and receiver)
-  * @param[in]  buf    Buffer containing the message
-  * @param[in]  nbytes_buf Size of the buffer in bytes
-  * @param[out] nbytes_msg Length of the message received in bytes (NULL to ignore)
-  * @return            Rank of the message source
-  */
-int armci_msg_rcvany(int tag, void *buf_out, int nbytes_buf, int *nbytes_msg) {
-  void     **buf;
-  MPI_Status status;
-
-  ARMCII_Buf_prepare_write_vec(&buf_out, &buf, 1, nbytes_buf);
-  MPI_Recv(buf[0], nbytes_buf, MPI_BYTE, MPI_ANY_SOURCE, tag, ARMCI_GROUP_WORLD.comm, &status);
-  ARMCII_Buf_finish_write_vec(&buf_out, buf, 1, nbytes_buf);
-
-  if (nbytes_msg != NULL)
-    MPI_Get_count(&status, MPI_BYTE, nbytes_msg);
-
-  return status.MPI_SOURCE;
-}
-
-
-void armci_msg_reduce(void *x, int n, char *op, int type) {
-  armci_msg_reduce_scope(SCOPE_ALL, x, n, op, type);
-}
-
-
-void armci_msg_reduce_scope(int scope, void *x, int n, char *op, int type) {
-  ARMCII_Error("unimplemented"); // TODO
-}
-
-
-/** Map process IDs onto a binary tree.
-  *
-  * @param[in]  scope Scope of processes involved
-  * @param[out] root  Process id of the root
-  * @param[out] up    Process id of my parent
-  * @param[out] left  Process id of my left child
-  * @param[out] right Process id of my right child
-  */
-void armci_msg_bintree(int scope, int *root, int *up, int *left, int *right) {
-  int me, nproc;
-
-  if (scope == SCOPE_NODE) {
-    *root  = 0;
-    *left  = -1;
-    *right = -1;
-   
-    return;
-  }
-
-  me    = armci_msg_me();
-  nproc = armci_msg_nproc();
-
-  *root = 0;
-  *up   =  (me == 0) ? -1 : (me - 1) / 2;
-
-  *left = 2*me + 1;
-  if (*left >= nproc) *left = -1;
-
-  *right = 2*me + 2;
-  if (*right >= nproc) *right = -1;
-}
-
-
-/** Data packet for a select operation.  Data entry is a struct from GA's
-  * runtime where the first element is a value and later parts are indices of
-  * the element in the GA.
-  */
-typedef struct {
-  int     contribute;
-  int     type;
-  uint8_t data[1];
-} sel_data_t;
-
-
-/** Select operations to be used in allreduce.
-  */
-MPI_Op MPI_SELMIN_OP;
-MPI_Op MPI_SELMAX_OP;
-
-
-/** Min operator for armci_msg_sel
-  */
-void ARMCII_Msg_sel_min_op(void *data_in, void *data_inout, int *len, MPI_Datatype *datatype) {
-  sel_data_t *sd_1, *sd_2;
-
-  sd_1 = (sel_data_t*) data_in;
-  sd_2 = (sel_data_t*) data_inout;
-
-  if (sd_1->contribute && !sd_2->contribute) {
-    ARMCI_Copy(data_in, data_inout, *len);
-  }
-  
-  else if (sd_1->contribute && sd_2->contribute) {
-
-#define MSG_SEL_MIN_OP(X,Y,LEN,TYPE)                                      \
-  do {                                                                    \
-    if (*(TYPE*)((sel_data_t*)X)->data < *(TYPE*)((sel_data_t*)Y)->data)  \
-      ARMCI_Copy(X, Y, LEN);                                              \
-  } while (0)
-
-    switch (sd_1->type) {
-      case ARMCI_INT:
-        MSG_SEL_MIN_OP(data_in, data_inout, *len, int);
-        break;
-      case ARMCI_LONG:
-        MSG_SEL_MIN_OP(data_in, data_inout, *len, long);
-        break;
-      case ARMCI_LONG_LONG:
-        MSG_SEL_MIN_OP(data_in, data_inout, *len, long long);
-        break;
-      case ARMCI_FLOAT:
-        MSG_SEL_MIN_OP(data_in, data_inout, *len, float);
-        break;
-      case ARMCI_DOUBLE:
-        MSG_SEL_MIN_OP(data_in, data_inout, *len, double);
-        break;
-      default:
-        ARMCII_Error("Invalid data type (%d)", sd_1->type);
-    }
-
-#undef MSG_SEL_MIN_OP
-  }
-
-  /* else: no need to copy, data_inout already contains what we want to return */
-}
-
-
-/** Max operator for armci_msg_sel
-  */
-void ARMCII_Msg_sel_max_op(void *data_in, void *data_inout, int *len, MPI_Datatype *datatype) {
-  sel_data_t *sd_1, *sd_2;
-
-  sd_1 = (sel_data_t*) data_in;
-  sd_2 = (sel_data_t*) data_inout;
-
-  if (sd_1->contribute && !sd_2->contribute) {
-    ARMCI_Copy(data_in, data_inout, *len);
-  }
-  
-  else if (sd_1->contribute && sd_2->contribute) {
-
-#define MSG_SEL_MAX_OP(X,Y,LEN,TYPE)                                      \
-  do {                                                                    \
-    if (*(TYPE*)((sel_data_t*)X)->data > *(TYPE*)((sel_data_t*)Y)->data)  \
-      ARMCI_Copy(X, Y, LEN);                                              \
-  } while (0)
-
-    switch (sd_1->type) {
-      case ARMCI_INT:
-        MSG_SEL_MAX_OP(data_in, data_inout, *len, int);
-        break;
-      case ARMCI_LONG:
-        MSG_SEL_MAX_OP(data_in, data_inout, *len, long);
-        break;
-      case ARMCI_LONG_LONG:
-        MSG_SEL_MAX_OP(data_in, data_inout, *len, long long);
-        break;
-      case ARMCI_FLOAT:
-        MSG_SEL_MAX_OP(data_in, data_inout, *len, float);
-        break;
-      case ARMCI_DOUBLE:
-        MSG_SEL_MAX_OP(data_in, data_inout, *len, double);
-        break;
-      default:
-        ARMCII_Error("Invalid data type (%d)", sd_1->type);
-    }
-
-#undef MSG_SEL_MIN_OP
-  }
-
-  /* else: no need to copy, data_inout already contains what we want to return */
-}
-
-
-/** Collective index selection reduce operation.
-  */
-void armci_msg_sel(void *x, int n, char *op, int type, int contribute) {
-  armci_msg_sel_scope(SCOPE_ALL, x, n, op, type, contribute);
-}
-
-
-/** Collective index selection reduce operation (scoped).
-  */
-void armci_msg_sel_scope(int scope, void *x, int n, char* op, int type, int contribute) {
-  MPI_Comm    sel_comm;
-  sel_data_t *data_in, *data_out;
-  void      **x_buf;
-
-  /*
-  printf("[%d] armci_msg_sel_scope(scope=%d, x=%p, n=%d, op=%s, type=%d, contribute=%d)\n",
-      ARMCI_GROUP_WORLD.rank, scope, x, n, op, type, contribute);
-  */
-
-  /* Determine the scope of the collective operation */
-  if (scope == SCOPE_ALL || scope == SCOPE_MASTERS)
-    sel_comm = ARMCI_GROUP_WORLD.comm;
-  else
-    sel_comm = MPI_COMM_SELF;
-
-  data_in  = malloc(sizeof(sel_data_t)+n-1);
-  data_out = malloc(sizeof(sel_data_t)+n-1);
-
-  ARMCII_Assert(data_in != NULL && data_out != NULL);
-
-  ARMCII_Buf_prepare_read_vec(&x, &x_buf, 1, n);
-
-  data_in->contribute = contribute;
-  data_in->type       = type;
-
-  if (contribute)
-    ARMCI_Copy(x, data_in->data, n);
-
-  if (strncmp(op, "min", 3) == 0) {
-    MPI_Allreduce(data_in, data_out, sizeof(sel_data_t)+n-1, MPI_BYTE, MPI_SELMIN_OP, sel_comm);
-  } else if (strncmp(op, "max", 3) == 0) {
-    MPI_Allreduce(data_in, data_out, sizeof(sel_data_t)+n-1, MPI_BYTE, MPI_SELMAX_OP, sel_comm);
-  } else {
-      ARMCII_Error("Invalid operation (%s)", op);
-  }
-
-  ARMCI_Copy(data_out->data, x, n);
-
-  ARMCII_Buf_finish_write_vec(&x, x_buf, 1, n);
-
-  free(data_in);
-  free(data_out);
-}
diff --git a/src/armci/src/message.h b/src/armci/src/message.h
deleted file mode 100644
index 0ab48af..0000000
--- a/src/armci/src/message.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#ifndef HAVE_ARMCI_MSG_H
-#define HAVE_ARMCI_MSG_H
-
-#include <armci.h>
-
-/** Note on scopes:
-  *
-  * SCOPE_NODE    - Include all processes on the current node.  In the current
-  *                 implementation we use MPI_COMM_SELF for this.
-  * SCOPE_MASTERS - Includes one rank from every node.  Currently the same as
-                    SCOPE_ALL.
-  * SCOPE_ALL     - Includes all processes.
-  */
-enum armci_scope_e { SCOPE_ALL, SCOPE_NODE, SCOPE_MASTERS}; 
-
-enum armci_type_e  { ARMCI_INT, ARMCI_LONG, ARMCI_LONG_LONG, ARMCI_FLOAT, ARMCI_DOUBLE };
-
-/* Utility routines */
-
-int  armci_msg_me(void);
-int  armci_msg_nproc(void);
-
-void armci_msg_abort(int code);
-double armci_timer(void);
-
-/* Send/Recv */
-
-void armci_msg_snd(int tag, void *buffer, int len, int to);
-void armci_msg_rcv(int tag, void *buffer, int buflen, int *msglen, int from);
-int  armci_msg_rcvany(int tag, void *buffer, int buflen, int *msglen); 
-
-/* Assorted Collectives */
-
-void armci_msg_barrier(void);
-void armci_msg_group_barrier(ARMCI_Group *group);
-void armci_msg_bintree(int scope, int *Root, int *Up, int *Left, int *Right);
-
-void armci_msg_bcast(void *buffer, int len, int root);
-void armci_msg_bcast_scope(int scope, void *buffer, int len, int root);
-void armci_msg_brdcst(void *buffer, int len, int root);
-void armci_msg_group_bcast_scope(int scope, void *buf, int len, int root, ARMCI_Group *group);
-
-/* TODO */ void armci_msg_reduce(void *x, int n, char *op, int type); 
-/* TODO */ void armci_msg_reduce_scope(int scope, void *x, int n, char *op, int type); 
-
-void armci_msg_sel(void *x, int n, char *op, int type, int contribute);
-void armci_msg_sel_scope(int scope, void *x, int n, char *op, int type, int contribute);
-
-/* TODO */ void armci_exchange_address(void *ptr_ar[], int n);
-
-/* TODO */ void armci_msg_clus_brdcst(void *buf, int len);
-/* TODO */ void armci_msg_clus_igop(int *x, int n, char *op); 
-/* TODO */ void armci_msg_clus_fgop(float *x, int n, char *op); 
-/* TODO */ void armci_msg_clus_lgop(long *x, int n, char *op); 
-/* TODO */ void armci_msg_clus_llgop(long long *x, int n, char *op); 
-/* TODO */ void armci_msg_clus_dgop(double *x, int n, char *op); 
-
-/* TODO */ void armci_exchange_address_grp(void *ptr_arr[], int n, ARMCI_Group *group);
-/* TODO */ void armci_grp_clus_brdcst(void *buf, int len, int grp_master, int grp_clus_nproc,ARMCI_Group *mastergroup);
-
-/* Global Operations / Reduction Operations */
-
-void armci_msg_gop_scope(int scope, void *x, int n, char *op, int type);
-void armci_msg_igop(int *x, int n, char *op);
-void armci_msg_lgop(long *x, int n, char *op);
-void armci_msg_llgop(long long *x, int n, char *op);
-void armci_msg_fgop(float *x, int n, char *op);
-void armci_msg_dgop(double *x, int n, char *op);
-
-void armci_msg_group_gop_scope(int scope, void *x, int n, char *op, int type, ARMCI_Group *group);
-void armci_msg_group_igop(int *x, int n, char *op, ARMCI_Group *group);
-void armci_msg_group_lgop(long *x, int n, char *op, ARMCI_Group *group);
-void armci_msg_group_llgop(long long *x, int n, char *op, ARMCI_Group *group);
-void armci_msg_group_fgop(float *x, int n, char *op, ARMCI_Group *group);
-void armci_msg_group_dgop(double *x, int n,char *op, ARMCI_Group *group);
-
-#endif /* HAVE_ARMCI_MSG_H */
diff --git a/src/armci/src/message_gop.c b/src/armci/src/message_gop.c
deleted file mode 100644
index 0aa2fdc..0000000
--- a/src/armci/src/message_gop.c
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <mpi.h>
-
-#include <debug.h>
-#include <armci.h>
-#include <armci_internals.h>
-
-/* MPI Operations, registered in Init */
-MPI_Op MPI_ABSMIN_OP;
-MPI_Op MPI_ABSMAX_OP;
-
-#define IABS(X)  (((X) > 0  ) ? X : -X)
-#define FABS(X)  (((X) > 0.0) ? X : -X)
-#define MIN(X,Y) (((X) < (Y)) ? X : Y)
-#define MAX(X,Y) (((X) > (Y)) ? X : Y)
-
-#define ABSMIN(IN,INOUT,COUNT,DTYPE,ABSOP)      \
-      do {                                      \
-        int i;                                  \
-        DTYPE *in = (DTYPE *)IN;                \
-        DTYPE *io = (DTYPE *)INOUT;             \
-        for (i = 0; i < COUNT; i++) {           \
-          const DTYPE x = ABSOP(in[i]);         \
-          const DTYPE y = ABSOP(io[i]);         \
-          io[i] = MIN(x,y);                     \
-        }                                       \
-      } while (0)
-
-/** MPI reduction operator that computes the minimum absolute value.
-  */
-void ARMCII_Absmin_op(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
-  const int    count = *len;
-  MPI_Datatype dt    = *datatype;
-
-  if (dt == MPI_INT) {
-      ABSMIN(invec, inoutvec, count, int, IABS);
-  } else if (dt == MPI_LONG) {
-      ABSMIN(invec, inoutvec, count, long, IABS);
-  } else if (dt == MPI_LONG_LONG) {
-      ABSMIN(invec, inoutvec, count, long long, IABS);
-  } else if (dt == MPI_FLOAT) {
-      ABSMIN(invec, inoutvec, count, float, FABS);
-  } else if (dt == MPI_DOUBLE) {
-      ABSMIN(invec, inoutvec, count, double, FABS);
-  } else {
-      ARMCII_Error("unknown type (%d)", *datatype);
-  }
-}
-
-#undef ABSMIN
-
-
-#define ABSMAX(IN,INOUT,COUNT,DTYPE,ABSOP)      \
-      do {                                      \
-        int i;                                  \
-        DTYPE *in = (DTYPE *)IN;                \
-        DTYPE *io = (DTYPE *)INOUT;             \
-        for (i = 0; i < COUNT; i++) {           \
-          const DTYPE x = ABSOP(in[i]);         \
-          const DTYPE y = ABSOP(io[i]);         \
-          io[i] = MAX(x,y);                     \
-        }                                       \
-      } while (0)
-
-/** MPI reduction operator that computes the maximum absolute value.
-  */
-void ARMCII_Absmax_op(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
-  const int    count = *len;
-  MPI_Datatype dt    = *datatype;
-
-  if (dt == MPI_INT) {
-      ABSMAX(invec, inoutvec, count, int, IABS);
-  } else if (dt == MPI_LONG) {
-      ABSMAX(invec, inoutvec, count, long, IABS);
-  } else if (dt == MPI_LONG_LONG) {
-      ABSMAX(invec, inoutvec, count, long long, IABS);
-  } else if (dt == MPI_FLOAT) {
-      ABSMAX(invec, inoutvec, count, float, FABS);
-  } else if (dt == MPI_DOUBLE) {
-      ABSMAX(invec, inoutvec, count, double, FABS);
-  } else {
-      ARMCII_Error("unknown type (%d)", *datatype);
-  }
-}
-
-#undef ABSMAX
-
-
-#define ABSV(IN,INOUT,COUNT,DTYPE,ABSOP)        \
-      do {                                      \
-        int i;                                  \
-        DTYPE *in = (DTYPE *)IN;                \
-        DTYPE *io = (DTYPE *)INOUT;             \
-        for (i = 0; i < COUNT; i++)             \
-          io[i] = ABSOP(in[i]);                 \
-      } while (0)
-
-/** Compute the absolute value.
-  */
-void ARMCII_Absv_op(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
-  const int    count = *len;
-  MPI_Datatype dt    = *datatype;
-
-  if (dt == MPI_INT) {
-      ABSV(invec, inoutvec, count, int, IABS);
-  } else if (dt == MPI_LONG) {
-      ABSV(invec, inoutvec, count, long, IABS);
-  } else if (dt == MPI_LONG_LONG) {
-      ABSV(invec, inoutvec, count, long long, IABS);
-  } else if (dt == MPI_FLOAT) {
-      ABSV(invec, inoutvec, count, float, FABS);
-  } else if (dt == MPI_DOUBLE) {
-      ABSV(invec, inoutvec, count, double, FABS);
-  } else {
-      ARMCII_Error("unknown type (%d)", *datatype);
-  }
-}
-
-#undef ABSV
-
-
-/** General ARMCI global operation (reduction).  Collective on group.
-  *
-  * @param[in]    scope Scope in which to perform the GOP (only SCOPE_ALL is supported)
-  * @param[inout] x     Vector of n data elements, contains input and will contain output.
-  * @param[in]    n     Length of x
-  * @param[in]    op    One of '+', '*', 'max', 'min', 'absmax', 'absmin'
-  * @param[in]    type  Data type of x (e.g. ARMCI_INT, ...)
-  * @param[in]    group Group on which to perform the GOP
-  */
-void armci_msg_group_gop_scope(int scope, void *x, int n, char *op, int type, ARMCI_Group *group) {
-  void        *out, **x_buf;
-  MPI_Op       mpi_op;
-  MPI_Datatype mpi_type;
-  MPI_Comm     comm;
-  int          mpi_type_size;
-
-  if (scope == SCOPE_ALL || scope == SCOPE_MASTERS)
-    comm = group->comm;
-  else
-    comm = MPI_COMM_SELF;
-
-  if (op[0] == '+') {
-    mpi_op = MPI_SUM;
-  } else if (op[0] == '*') {
-    mpi_op = MPI_PROD;
-  } else if (strncmp(op, "max", 3) == 0) {
-    mpi_op = MPI_MAX;
-  } else if (strncmp(op, "min", 3) == 0) {
-    mpi_op = MPI_MIN;
-  } else if (strncmp(op, "or", 2) == 0) {
-    mpi_op = MPI_BOR;
-  } else if (strncmp(op, "absmax", 6) == 0) {
-    mpi_op = MPI_ABSMAX_OP;
-  } else if (strncmp(op, "absmin", 6) == 0) {
-    mpi_op = MPI_ABSMIN_OP;
-  } else {
-    ARMCII_Error("unknown operation \'%s\'", op);
-    return;
-  }
-
-  switch(type) {
-    case ARMCI_INT:
-      mpi_type = MPI_INT;
-      break;
-    case ARMCI_LONG:
-      mpi_type = MPI_LONG;
-      break;
-    case ARMCI_LONG_LONG:
-      mpi_type = MPI_LONG_LONG;
-      break;
-    case ARMCI_FLOAT:
-      mpi_type = MPI_FLOAT;
-      break;
-    case ARMCI_DOUBLE:
-      mpi_type = MPI_DOUBLE;
-      break;
-    default:
-      ARMCII_Error("unknown type (%d)", type);
-      return;
-  }
-
-  MPI_Type_size(mpi_type, &mpi_type_size);
-
-  ARMCII_Buf_prepare_read_vec(&x, &x_buf, 1, n*mpi_type_size);
-
-  // ABS MAX/MIN are unary as well as binary.  We need to also apply abs in the
-  // single processor case when reduce would normally just be a no-op.
-  if (group->size == 1 && (mpi_op == MPI_ABSMAX_OP || mpi_op == MPI_ABSMIN_OP)) {
-    ARMCII_Absv_op(x_buf[0], x_buf[0], &n, &mpi_type);
-  }
-
-  else {
-    out = malloc(n*mpi_type_size);
-    ARMCII_Assert(out != NULL);
-
-    MPI_Allreduce(x_buf[0], out, n, mpi_type, mpi_op, group->comm);
-
-    ARMCI_Copy(out, x_buf[0], n*mpi_type_size);
-    free(out);
-  }
-
-  ARMCII_Buf_finish_write_vec(&x, x_buf, 1, n*mpi_type_size);
-}
-
-void armci_msg_group_igop(int *x, int n, char *op, ARMCI_Group *group) {
-  armci_msg_group_gop_scope(SCOPE_ALL, x, n, op, ARMCI_INT, group);
-}
-
-void armci_msg_group_lgop(long *x, int n, char *op, ARMCI_Group *group) {
-  armci_msg_group_gop_scope(SCOPE_ALL, x, n, op, ARMCI_LONG, group);
-}
-
-void armci_msg_group_llgop(long long *x, int n, char *op, ARMCI_Group *group) {
-  armci_msg_group_gop_scope(SCOPE_ALL, x, n, op, ARMCI_LONG_LONG, group);
-}
-
-void armci_msg_group_fgop(float *x, int n, char *op, ARMCI_Group *group) {
-  armci_msg_group_gop_scope(SCOPE_ALL, x, n, op, ARMCI_FLOAT, group);
-}
-
-void armci_msg_group_dgop(double *x, int n, char *op, ARMCI_Group *group) {
-  armci_msg_group_gop_scope(SCOPE_ALL, x, n, op, ARMCI_DOUBLE, group);
-}
-
-void armci_msg_gop_scope(int scope, void *x, int n, char *op, int type) {
-  armci_msg_group_gop_scope(scope, x, n, op, type, &ARMCI_GROUP_WORLD);
-}
-
-void armci_msg_igop(int *x, int n, char *op) {
-  armci_msg_gop_scope(SCOPE_ALL, x, n, op, ARMCI_INT);
-}
-
-void armci_msg_lgop(long *x, int n, char *op) {
-  armci_msg_gop_scope(SCOPE_ALL, x, n, op, ARMCI_LONG);
-}
-
-void armci_msg_llgop(long long *x, int n, char *op) {
-  armci_msg_gop_scope(SCOPE_ALL, x, n, op, ARMCI_LONG_LONG);
-}
-
-void armci_msg_fgop(float *x, int n, char *op) {
-  armci_msg_gop_scope(SCOPE_ALL, x, n, op, ARMCI_FLOAT);
-}
-
-void armci_msg_dgop(double *x, int n, char *op) {
-  armci_msg_gop_scope(SCOPE_ALL, x, n, op, ARMCI_DOUBLE);
-}
-
diff --git a/src/armci/src/mp3.fh b/src/armci/src/mp3.fh
deleted file mode 100644
index 4d0d68c..0000000
--- a/src/armci/src/mp3.fh
+++ /dev/null
@@ -1,8 +0,0 @@
-#include "mpif.h"
-#define MP_TIMER() mpi_wtime()
-#define MP_FINALIZE() mpi_finalize(ierr)
-
-#ifndef MP_DEFINES_ONLY
-      integer ierr
-      call mpi_init(ierr)
-#endif
diff --git a/src/armci/src/mp3.h b/src/armci/src/mp3.h
deleted file mode 100644
index 81d0458..0000000
--- a/src/armci/src/mp3.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef _MP3_H_
-#define _MP3_H_
-
-#include <mpi.h>
-
-#define MP_INIT(ARGC,ARGV)   MPI_Init(&(ARGC),&(ARGV))
-#define MP_BARRIER()         MPI_Barrier(MPI_COMM_WORLD)
-#define MP_FINALIZE()        MPI_Finalize()
-#define MP_PROCS(X)          MPI_Comm_size(MPI_COMM_WORLD,X)
-#define MP_MYID(X)           MPI_Comm_rank(MPI_COMM_WORLD,X)
-#define MP_TIMER()           MPI_Wtime()
-
-#define GA_INIT(ARGC,ARGV)    GA_Initialize()
-#define ARMCI_INIT(ARGC,ARGV) ARMCI_Init()
-
-
-#endif /* _MP3_H_ */
diff --git a/src/armci/src/mp3def.fh b/src/armci/src/mp3def.fh
deleted file mode 100644
index 922b119..0000000
--- a/src/armci/src/mp3def.fh
+++ /dev/null
@@ -1,3 +0,0 @@
-#define MP_DEFINES_ONLY
-#include "mp3.fh"
-#undef MP_DEFINES_ONLY
diff --git a/src/armci/src/mutex.c b/src/armci/src/mutex.c
deleted file mode 100644
index c5cf91d..0000000
--- a/src/armci/src/mutex.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-
-#include <debug.h>
-#include <armci.h>
-#include <armci_internals.h>
-#include <armcix.h>
-
-#define MAX_TIMEOUT 1000
-#define TIMEOUT_MUL 2
-#define MIN(A,B) (((A) < (B)) ? (A) : (B))
-
-
-/** This is the handle for the "default" group of mutexes used by the
-  * standard ARMCI mutex API
-  */
-static armcix_mutex_hdl_t armci_mutex_hdl = NULL;
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Create_mutexes = PARMCI_Create_mutexes
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Create_mutexes ARMCI_Create_mutexes
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Create_mutexes as PARMCI_Create_mutexes
-#endif
-/* -- end weak symbols block -- */
-
-/** Create ARMCI mutexes.  Collective.
-  *
-  * @param[in] count Number of mutexes to create on the calling process
-  */
-int PARMCI_Create_mutexes(int count) {
-  if (armci_mutex_hdl != NULL)
-    ARMCII_Error("attempted to create ARMCI mutexes multiple times");
-
-  armci_mutex_hdl = ARMCIX_Create_mutexes_hdl(count, &ARMCI_GROUP_WORLD);
-
-  if (armci_mutex_hdl != NULL)
-    return 0;
-  else
-    return 1;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Destroy_mutexes = PARMCI_Destroy_mutexes
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Destroy_mutexes ARMCI_Destroy_mutexes
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Destroy_mutexes as PARMCI_Destroy_mutexes
-#endif
-/* -- end weak symbols block -- */
-
-/** Destroy/free ARMCI mutexes.  Collective.
-  */
-int PARMCI_Destroy_mutexes(void) {
-  int err;
-
-  if (armci_mutex_hdl == NULL)
-    ARMCII_Error("attempted to free unallocated ARMCI mutexes");
-  
-  err = ARMCIX_Destroy_mutexes_hdl(armci_mutex_hdl);
-  armci_mutex_hdl = NULL;
-
-  return err;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Lock = PARMCI_Lock
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Lock ARMCI_Lock
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Lock as PARMCI_Lock
-#endif
-/* -- end weak symbols block -- */
-
-/** Lock a mutex.
-  *
-  * @param[in] mutex Number of the mutex to lock
-  * @param[in] proc  Target process for the lock operation
-  */
-void PARMCI_Lock(int mutex, int proc) {
-  if (armci_mutex_hdl == NULL)
-    ARMCII_Error("attempted to lock on unallocated ARMCI mutexes");
-  
-  ARMCIX_Lock_hdl(armci_mutex_hdl, mutex, proc);
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Unlock = PARMCI_Unlock
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Unlock ARMCI_Unlock
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Unlock as PARMCI_Unlock
-#endif
-/* -- end weak symbols block -- */
-
-/** Unlock a mutex.
-  *
-  * @param[in] mutex Number of the mutex to unlock
-  * @param[in] proc  Target process for the unlock operation
-  */
-void PARMCI_Unlock(int mutex, int proc) {
-  if (armci_mutex_hdl == NULL)
-    ARMCII_Error("attempted to unlock on unallocated ARMCI mutexes");
-  
-  ARMCIX_Unlock_hdl(armci_mutex_hdl, mutex, proc);
-}
diff --git a/src/armci/src/mutex_hdl_queue.c b/src/armci/src/mutex_hdl_queue.c
deleted file mode 100644
index 1088fe3..0000000
--- a/src/armci/src/mutex_hdl_queue.c
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-
-#include <armci.h>
-#include <armci_internals.h>
-#include <armcix.h>
-#include <debug.h>
-
-#define ARMCI_MUTEX_TAG 100
-
-/* TODO: Make these all no-ops for sequential runs */
-
-/** Create a group of ARMCI mutexes.  Collective onthe ARMCI group.
-  *
-  * @param[in] count  Number of mutexes on the local process.
-  * @param[in] pgroup ARMCI group on which to create mutexes
-  * @return           Handle to the mutex group.
-  */
-armcix_mutex_hdl_t ARMCIX_Create_mutexes_hdl(int my_count, ARMCI_Group *pgroup) {
-  int rank, nproc, max_count, i;
-  armcix_mutex_hdl_t hdl;
-
-  hdl = malloc(sizeof(struct armcix_mutex_hdl_s));
-  ARMCII_Assert(hdl != NULL);
-
-  ARMCIX_Group_dup(pgroup, &hdl->grp);
-
-  MPI_Comm_rank(hdl->grp.comm, &rank);
-  MPI_Comm_size(hdl->grp.comm, &nproc);
-
-  hdl->my_count = my_count;
-
-  /* Find the max. count to determine how many windows we need. */
-  MPI_Allreduce(&my_count, &max_count, 1, MPI_INT, MPI_MAX, hdl->grp.comm);
-  ARMCII_Assert_msg(max_count > 0, "Invalid number of mutexes");
-
-  hdl->max_count = max_count;
-  hdl->windows = malloc(sizeof(MPI_Win)*max_count);
-
-  if (my_count > 0) {
-    hdl->bases = malloc(sizeof(uint8_t*)*my_count);
-  } else {
-    hdl->bases = NULL;
-  }
-
-  /* We need multiple windows here: one for each mutex.  Otherwise
-     performance will suffer due to exclusive access epochs. */
-  for (i = 0; i < max_count; i++) {
-    int   size = 0;
-    void *base = NULL;
-
-    if (i < my_count) {
-      MPI_Alloc_mem(nproc, MPI_INFO_NULL, &hdl->bases[i]);
-      ARMCII_Assert(hdl->bases[i] != NULL);
-      ARMCII_Bzero(hdl->bases[i], nproc);
-
-      base = hdl->bases[i];
-      size = nproc;
-    }
-
-    MPI_Win_create(base, size, sizeof(uint8_t), MPI_INFO_NULL, hdl->grp.comm, &hdl->windows[i]);
-  }
-
-  return hdl;
-}
-
-
-/** Destroy a group of ARMCI mutexes.  Collective.
-  *
-  * @param[in] hdl Handle to the group that should be destroyed.
-  * @return        Zero on success, non-zero otherwise.
-  */
-int ARMCIX_Destroy_mutexes_hdl(armcix_mutex_hdl_t hdl) {
-  int i;
-
-  for (i = 0; i < hdl->max_count; i++) {
-    MPI_Win_free(&hdl->windows[i]);
-  }
-    
-  if (hdl->bases != NULL) {
-    for (i = 0; i < hdl->my_count; i++)
-      MPI_Free_mem(hdl->bases[i]);
-
-    free(hdl->bases);
-  }
-
-  ARMCI_Group_free(&hdl->grp);
-  free(hdl->windows);
-  free(hdl);
-
-  return 0;
-}
-
-
-/** Lock a mutex.
-  * 
-  * @param[in] hdl        Mutex group that the mutex belongs to.
-  * @param[in] mutex      Desired mutex number [0..count-1]
-  * @param[in] world_proc Absolute ID of process where the mutex lives
-  */
-void ARMCIX_Lock_hdl(armcix_mutex_hdl_t hdl, int mutex, int world_proc) {
-  int       rank, nproc, already_locked, i, proc;
-  uint8_t *buf;
-
-  ARMCII_Assert(mutex >= 0 && mutex < hdl->max_count);
-
-  MPI_Comm_rank(hdl->grp.comm, &rank);
-  MPI_Comm_size(hdl->grp.comm, &nproc);
-
-  /* User gives us the absolute ID.  Translate to the rank in the mutex's group. */
-  proc = ARMCII_Translate_absolute_to_group(&hdl->grp, world_proc);
-  ARMCII_Assert(proc >= 0);
-
-  buf = malloc(nproc*sizeof(uint8_t));
-  ARMCII_Assert(buf != NULL);
-
-  buf[rank] = 1;
-
-  /* Get all data from the lock_buf, except the byte belonging to
-   * me. Set the byte belonging to me to 1. */
-  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->windows[mutex]);
-  
-  MPI_Put(&buf[rank], 1, MPI_BYTE, proc, rank, 1, MPI_BYTE, hdl->windows[mutex]);
-
-  /* Get data to the left of rank */
-  if (rank > 0) {
-    MPI_Get(buf, rank, MPI_BYTE, proc, 0, rank, MPI_BYTE, hdl->windows[mutex]);
-  }
-
-  /* Get data to the right of rank */
-  if (rank < nproc - 1) {
-    MPI_Get(&buf[rank+1], nproc-1-rank, MPI_BYTE, proc, rank + 1, nproc-1-rank, MPI_BYTE, hdl->windows[mutex]);
-  }
-  
-  MPI_Win_unlock(proc, hdl->windows[mutex]);
-
-  ARMCII_Assert(buf[rank] == 1);
-
-  for (i = already_locked = 0; i < nproc; i++)
-    if (buf[i] && i != rank)
-      already_locked = 1;
-
-  /* Wait for notification */
-  if (already_locked) {
-    MPI_Status status;
-    ARMCII_Dbg_print(DEBUG_CAT_MUTEX, "waiting for notification [proc = %d, mutex = %d]\n", proc, mutex);
-    MPI_Recv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE, ARMCI_MUTEX_TAG+mutex, hdl->grp.comm, &status);
-  }
-
-  ARMCII_Dbg_print(DEBUG_CAT_MUTEX, "lock acquired [proc = %d, mutex = %d]\n", proc, mutex);
-  free(buf);
-}
-
-
-/** Attempt to lock a mutex (implemented as a blocking call).
-  * 
-  * @param[in] hdl   Mutex group that the mutex belongs to.
-  * @param[in] mutex Desired mutex number [0..count-1]
-  * @param[in] world_proc Absolute ID of process where the mutex lives
-  * @return          0 on success, non-zero on failure
-  */
-int ARMCIX_Trylock_hdl(armcix_mutex_hdl_t hdl, int mutex, int world_proc) {
-  ARMCII_Assert(mutex >= 0 && mutex < hdl->max_count);
-
-  ARMCIX_Lock_hdl(hdl, mutex, world_proc);
-  return 0;
-}
-
-
-/** Unlock a mutex.
-  * 
-  * @param[in] hdl   Mutex group that the mutex belongs to.
-  * @param[in] mutex Desired mutex number [0..count-1]
-  * @param[in] world_proc Absolute ID of process where the mutex lives
-  */
-void ARMCIX_Unlock_hdl(armcix_mutex_hdl_t hdl, int mutex, int world_proc) {
-  int      rank, nproc, i, proc;
-  uint8_t *buf;
-
-  ARMCII_Assert(mutex >= 0 && mutex < hdl->max_count);
-
-  MPI_Comm_rank(hdl->grp.comm, &rank);
-  MPI_Comm_size(hdl->grp.comm, &nproc);
-
-  proc = ARMCII_Translate_absolute_to_group(&hdl->grp, world_proc);
-  ARMCII_Assert(proc >= 0);
-
-  buf = malloc(nproc*sizeof(uint8_t));
-
-  buf[rank] = 0;
-
-  /* Get all data from the lock_buf, except the byte belonging to
-   * me. Set the byte belonging to me to 0. */
-  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->windows[mutex]);
-  
-  MPI_Put(&buf[rank], 1, MPI_BYTE, proc, rank, 1, MPI_BYTE, hdl->windows[mutex]);
-
-  /* Get data to the left of rank */
-  if (rank > 0) {
-    MPI_Get(buf, rank, MPI_BYTE, proc, 0, rank, MPI_BYTE, hdl->windows[mutex]);
-  }
-
-  /* Get data to the right of rank */
-  if (rank < nproc - 1) {
-    MPI_Get(&buf[rank+1], nproc-1-rank, MPI_BYTE, proc, rank + 1, nproc-1-rank, MPI_BYTE, hdl->windows[mutex]);
-  }
-  
-  MPI_Win_unlock(proc, hdl->windows[mutex]);
-
-  ARMCII_Assert(buf[rank] == 0);
-
-  /* Notify the next waiting process, starting to my right for fairness */
-  for (i = 1; i < nproc; i++) {
-    int p = (rank + i) % nproc;
-    if (buf[p] == 1) {
-      ARMCII_Dbg_print(DEBUG_CAT_MUTEX, "notifying %d [proc = %d, mutex = %d]\n", p, proc, mutex);
-      MPI_Send(NULL, 0, MPI_BYTE, p, ARMCI_MUTEX_TAG+mutex, hdl->grp.comm);
-      break;
-    }
-  }
-
-  ARMCII_Dbg_print(DEBUG_CAT_MUTEX, "lock released [proc = %d, mutex = %d]\n", proc, mutex);
-  free(buf);
-}
diff --git a/src/armci/src/mutex_hdl_spin.c b/src/armci/src/mutex_hdl_spin.c
deleted file mode 100644
index c2107cf..0000000
--- a/src/armci/src/mutex_hdl_spin.c
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-/* These mutexes are built using only MPI-2 atomic accumulate.  The only
- * drawback is that they are vulnerable to livelock.  Here's how the lock
- * algorithm works:
- *
- * Let mutex be an integer that is initially 0.  I hold the mutex when after
- * adding my rank to it, it is equal to my rank.
- *
- * function lock(mutex, p):
- *
- *   acc(mutex, p, me)      // mutex = mutex + me
- *
- *   while (get(mutex, p) != me) {
- *     acc(mutex, p, -1*me) // -1*me is the value to be accumulated
- *     sleep(random)        // Try to avoid livelock/do some backoff
- *     acc(mutex, p, me)
- *   }
- *
- * function unlock(mutex, p)
- *   acc(mutex, p, -1*me)
- */
-
-// TODO: Should each mutex be in a different window?
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <mpi.h>
-
-#include <debug.h>
-#include <armci.h>
-#include <armcix.h>
-#include <armci_internals.h>
-
-#define MAX_TIMEOUT 1000
-#define TIMEOUT_MUL 2
-#define MIN(A,B) (((A) < (B)) ? (A) : (B))
-
-
-/** Create a mutex group.  Collective.
-  *
-  * @param[in] count Number of mutexes to create on the calling process
-  * @return          Handle to the mutex group
-  */
-armcix_mutex_hdl_t ARMCIX_Create_mutexes_hdl(int count, ARMCI_Group *pgroup) {
-  int         ierr, i;
-  armcix_mutex_hdl_t hdl;
-
-  hdl = malloc(sizeof(struct armcix_mutex_hdl_s));
-  ARMCII_Assert(hdl != NULL);
-
-  MPI_Comm_dup(pgroup->comm, &hdl->comm);
-
-  if (count > 0) {
-    MPI_Alloc_mem(count*sizeof(long), MPI_INFO_NULL, &hdl->base);
-    ARMCII_Assert(hdl->base != NULL);
-  } else {
-    hdl->base = NULL;
-  }
-
-  hdl->count = count;
-
-  // Initialize mutexes to 0
-  for (i = 0; i < count; i++)
-    hdl->base[i] = 0;
-
-  ierr = MPI_Win_create(hdl->base, count*sizeof(long), sizeof(long) /* displacement size */,
-                        MPI_INFO_NULL, hdl->comm, &hdl->window);
-  ARMCII_Assert(ierr == MPI_SUCCESS);
-
-  return hdl;
-}
-
-
-/** Destroy/free a mutex group.  Collective.
-  * 
-  * @param[in] hdl Group to destroy
-  */
-int ARMCIX_Destroy_mutexes_hdl(armcix_mutex_hdl_t hdl) {
-  MPI_Win_free(&hdl->window);
-  
-  if (hdl->base) 
-    MPI_Free_mem(hdl->base);
-
-  MPI_Comm_free(&hdl->comm);
-
-  free(hdl);
-  
-  return 0;
-}
-
-
-/** Lock a mutex.
-  * 
-  * @param[in] hdl         Mutex group that the mutex belongs to.
-  * @param[in] mutex       Desired mutex number [0..count-1]
-  * @param[in] world_proc  Absolute ID of process where the mutex lives
-  */
-void ARMCIX_Lock_hdl(armcix_mutex_hdl_t hdl, int mutex, int world_proc) {
-  int       rank, nproc, proc;
-  long      lock_val, unlock_val, lock_out;
-  int       timeout = 1;
-
-  MPI_Comm_rank(hdl->comm, &rank);
-  MPI_Comm_size(hdl->comm, &nproc);
-
-  /* User gives us the absolute ID.  Translate to the rank in the mutex's group. */
-  proc = ARMCII_Translate_absolute_to_group(hdl->comm, world_proc);
-  ARMCII_Assert(proc >= 0);
-
-  lock_val   = rank+1;    // Map into range 1..nproc
-  unlock_val = -1 * (rank+1);
-
-  /* mutex <- mutex + rank */
-  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
-  MPI_Accumulate(&lock_val, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
-  MPI_Win_unlock(proc, hdl->window);
-
-  for (;;) {
-    /* read mutex value */
-    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
-    MPI_Get(&lock_out, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, hdl->window);
-    MPI_Win_unlock(proc, hdl->window);
-
-    ARMCII_Assert(lock_out > 0);
-    ARMCII_Assert(lock_out <= nproc*(nproc+1)/2); // Must be < sum of all ranks
-
-    /* We are holding the mutex */
-    if (lock_out == rank+1)
-      break;
-
-    /* mutex <- mutex - rank */
-    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
-    MPI_Accumulate(&unlock_val, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
-    MPI_Win_unlock(proc, hdl->window);
-
-    /* Exponential backoff */
-    usleep(timeout + rand()%timeout);
-    timeout = MIN(timeout*TIMEOUT_MUL, MAX_TIMEOUT);
-    if (rand() % nproc == 0) // Chance to reset timeout
-      timeout = 1;
-
-    /* mutex <- mutex + rank */
-    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
-    MPI_Accumulate(&lock_val, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
-    MPI_Win_unlock(proc, hdl->window);
-  }
-}
-
-
-/** Attempt to lock a mutex (non-blocking).
-  * 
-  * @param[in] hdl         Mutex group that the mutex belongs to.
-  * @param[in] mutex       Desired mutex number [0..count-1]
-  * @param[in] world_proc  Absolute ID of process where the mutex lives
-  * @return                0 on success, non-zero on failure
-  */
-int ARMCIX_Trylock_hdl(armcix_mutex_hdl_t hdl, int mutex, int world_proc) {
-  int       rank, nproc, proc;
-  long      lock_val, unlock_val, lock_out;
-
-  ARMCII_Assert(mutex >= 0);
-
-  MPI_Comm_rank(hdl->comm, &rank);
-  MPI_Comm_size(hdl->comm, &nproc);
-
-  /* User gives us the absolute ID.  Translate to the rank in the mutex's group. */
-  proc = ARMCII_Translate_absolute_to_group(hdl->comm, world_proc);
-  ARMCII_Assert(proc >= 0);
-
-  lock_val   = rank+1;
-  unlock_val = -1 * (rank+1);
-
-  /* mutex <- mutex + rank */
-  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
-  MPI_Accumulate(&lock_val, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
-  MPI_Win_unlock(proc, hdl->window);
-
-  /* read mutex value */
-  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
-  MPI_Get(&lock_out, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, hdl->window);
-  MPI_Win_unlock(proc, hdl->window);
-
-  ARMCII_Assert(lock_out > 0);
-  ARMCII_Assert(lock_out <= nproc*(nproc+1)/2); // Must be < sum of all ranks
-
-  /* We are holding the mutex */
-  if (lock_out == rank+1)
-    return 0;
-
-  /* mutex <- mutex - rank */
-  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
-  MPI_Accumulate(&unlock_val, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
-  MPI_Win_unlock(proc, hdl->window);
-
-  return 1;
-}
-
-
-/** Unlock a mutex.
-  * 
-  * @param[in] hdl         Mutex group that the mutex belongs to.
-  * @param[in] mutex       Desired mutex number [0..count-1]
-  * @param[in] world_proc  Absolute ID of process where the mutex lives
-  */
-void ARMCIX_Unlock_hdl(armcix_mutex_hdl_t hdl, int mutex, int world_proc) {
-  int       rank, nproc, proc;
-  long      unlock_val;
-
-  ARMCII_Assert(mutex >= 0);
-
-  MPI_Comm_rank(hdl->comm, &rank);
-  MPI_Comm_size(hdl->comm, &nproc);
-
-  /* User gives us the absolute ID.  Translate to the rank in the mutex's group. */
-  proc = ARMCII_Translate_absolute_to_group(hdl->comm, world_proc);
-  ARMCII_Assert(proc >= 0);
-
-  unlock_val = -1 * (rank+1);
-
-  /* mutex <- mutex - rank */
-  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
-  MPI_Accumulate(&unlock_val, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
-  MPI_Win_unlock(proc, hdl->window);
-}
-
diff --git a/src/armci/src/onesided.c b/src/armci/src/onesided.c
deleted file mode 100644
index 72ded74..0000000
--- a/src/armci/src/onesided.c
+++ /dev/null
@@ -1,376 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-
-#include <armci.h>
-#include <armcix.h>
-#include <armci_internals.h>
-#include <debug.h>
-#include <gmr.h>
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Access_begin = PARMCI_Access_begin
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Access_begin ARMCI_Access_begin
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Access_begin as PARMCI_Access_begin
-#endif
-/* -- end weak symbols block -- */
-
-/** Declare the start of a local access epoch.  This allows direct access to
-  * data in local memory.
-  *
-  * @param[in] ptr Pointer to the allocation that will be accessed directly 
-  */
-void PARMCI_Access_begin(void *ptr) {
-  gmr_t *mreg;
-
-  mreg = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);
-  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");
-
-  ARMCII_Assert_msg((mreg->access_mode & ARMCIX_MODE_NO_LOAD_STORE) == 0,
-      "Direct local access is not permitted in the current access mode");
-
-  gmr_dla_lock(mreg);
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Access_end = PARMCI_Access_end
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Access_end ARMCI_Access_end
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Access_end as PARMCI_Access_end
-#endif
-/* -- end weak symbols block -- */
-
-/** Declare the end of a local access epoch.
-  *
-  * \note MPI-2 does not allow multiple locks at once, so you can have only one
-  * access epoch open at a time and cannot do put/get/acc while in an access
-  * region.
-  *
-  * @param[in] ptr Pointer to the allocation that was accessed directly 
-  */
-void PARMCI_Access_end(void *ptr) {
-  gmr_t *mreg;
-
-  mreg = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);
-  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");
-
-  gmr_dla_unlock(mreg);
-}
-
-
-/** Set the acess mode for the given allocation.  Collective across the
-  * allocation's group.  Waits for all processes, finishes all communication,
-  * and then sets the new access mode.
-  *
-  * @param[in] new_mode The new access mode.
-  * @param[in] ptr      Pointer within the allocation.
-  * @return             Zero upon success, error code otherwise.
-  */
-int ARMCIX_Mode_set(int new_mode, void *ptr, ARMCI_Group *group) {
-  gmr_t *mreg;
-
-  mreg = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);
-  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");
-
-  ARMCII_Assert(group->comm == mreg->group.comm);
-
-  ARMCII_Assert_msg(mreg->lock_state != GMR_LOCK_DLA,
-      "Cannot change the access mode; window is locked for local access.");
-  ARMCII_Assert_msg(mreg->lock_state == GMR_LOCK_UNLOCKED,
-      "Cannot change the access mode on a window that is locked.");
-
-  // Wait for all processes to complete any outstanding communication before we
-  // do the mode switch
-  MPI_Barrier(mreg->group.comm);
-
-  mreg->access_mode = new_mode;
-
-  return 0;
-}
-
-
-/** Query the access mode for the given allocation.  Non-collective.
-  *
-  * @param[in] ptr      Pointer within the allocation.
-  * @return             Current access mode.
-  */
-int ARMCIX_Mode_get(void *ptr) {
-  gmr_t *mreg;
-
-  mreg = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);
-  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");
-
-  return mreg->access_mode;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Get = PARMCI_Get
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Get ARMCI_Get
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Get as PARMCI_Get
-#endif
-/* -- end weak symbols block -- */
-
-/** One-sided get operation.
-  *
-  * @param[in] src    Source address (remote)
-  * @param[in] dst    Destination address (local)
-  * @param[in] size   Number of bytes to transfer
-  * @param[in] target Process id to target
-  * @return           0 on success, non-zero on failure
-  */
-int PARMCI_Get(void *src, void *dst, int size, int target) {
-  gmr_t *src_mreg, *dst_mreg;
-
-  src_mreg = gmr_lookup(src, target);
-
-  /* If NOGUARD is set, assume the buffer is not shared */
-  if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
-    dst_mreg = gmr_lookup(dst, ARMCI_GROUP_WORLD.rank);
-  else
-    dst_mreg = NULL;
-
-  ARMCII_Assert_msg(src_mreg != NULL, "Invalid remote pointer");
-
-  /* Local operation */
-  if (target == ARMCI_GROUP_WORLD.rank && dst_mreg == NULL) {
-    gmr_dla_lock(src_mreg);
-    ARMCI_Copy(src, dst, size);
-    gmr_dla_unlock(src_mreg);
-  }
-
-  /* Origin buffer is private */
-  else if (dst_mreg == NULL) {
-    gmr_lock(src_mreg, target);
-    gmr_get(src_mreg, src, dst, size, target);
-    gmr_unlock(src_mreg, target);
-  }
-
-  /* COPY: Either origin and target buffers are in the same window and we can't
-   * lock the same window twice (MPI semantics) or the user has requested
-   * always-copy mode. */
-  else {
-    void *dst_buf;
-
-    MPI_Alloc_mem(size, MPI_INFO_NULL, &dst_buf);
-    ARMCII_Assert(dst_buf != NULL);
-
-    gmr_lock(src_mreg, target);
-    gmr_get(src_mreg, src, dst_buf, size, target);
-    gmr_unlock(src_mreg, target);
-
-    gmr_dla_lock(dst_mreg);
-    ARMCI_Copy(dst_buf, dst, size);
-    MPI_Free_mem(dst_buf);
-    gmr_dla_unlock(dst_mreg);
-  }
-
-  return 0;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Put = PARMCI_Put
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Put ARMCI_Put
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Put as PARMCI_Put
-#endif
-/* -- end weak symbols block -- */
-
-/** One-sided put operation.
-  *
-  * @param[in] src    Source address (remote)
-  * @param[in] dst    Destination address (local)
-  * @param[in] size   Number of bytes to transfer
-  * @param[in] target Process id to target
-  * @return           0 on success, non-zero on failure
-  */
-int PARMCI_Put(void *src, void *dst, int size, int target) {
-  gmr_t *src_mreg, *dst_mreg;
-
-  dst_mreg = gmr_lookup(dst, target);
-
-  /* If NOGUARD is set, assume the buffer is not shared */
-  if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
-    src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank);
-  else
-    src_mreg = NULL;
-
-  ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer");
-
-  /* Local operation */
-  if (target == ARMCI_GROUP_WORLD.rank && src_mreg == NULL) {
-    gmr_dla_lock(dst_mreg);
-    ARMCI_Copy(src, dst, size);
-    gmr_dla_unlock(dst_mreg);
-  }
-
-  /* Origin buffer is private */
-  else if (src_mreg == NULL) {
-    gmr_lock(dst_mreg, target);
-    gmr_put(dst_mreg, src, dst, size, target);
-    gmr_unlock(dst_mreg, target);
-  }
-
-  /* COPY: Either origin and target buffers are in the same window and we can't
-   * lock the same window twice (MPI semantics) or the user has requested
-   * always-copy mode. */
-  else {
-    void *src_buf;
-
-    MPI_Alloc_mem(size, MPI_INFO_NULL, &src_buf);
-    ARMCII_Assert(src_buf != NULL);
-
-    gmr_dla_lock(src_mreg);
-    ARMCI_Copy(src, src_buf, size);
-    gmr_dla_unlock(src_mreg);
-
-    gmr_lock(dst_mreg, target);
-    gmr_put(dst_mreg, src_buf, dst, size, target);
-    gmr_unlock(dst_mreg, target);
-
-    MPI_Free_mem(src_buf);
-  }
-
-  return 0;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Acc = PARMCI_Acc
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Acc ARMCI_Acc
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Acc as PARMCI_Acc
-#endif
-/* -- end weak symbols block -- */
-
-/** One-sided accumulate operation.
-  *
-  * @param[in] datatype ARMCI data type for the accumulate operation (see armci.h)
-  * @param[in] scale    Pointer for a scalar of type datatype that will be used to
-  *                     scale values in the source buffer
-  * @param[in] src      Source address (remote)
-  * @param[in] dst      Destination address (local)
-  * @param[in] bytes    Number of bytes to transfer
-  * @param[in] proc     Process id to target
-  * @return             0 on success, non-zero on failure
-  */
-int PARMCI_Acc(int datatype, void *scale, void *src, void *dst, int bytes, int proc) {
-  void  *src_buf;
-  int    count, type_size, scaled, src_is_locked = 0;
-  MPI_Datatype type;
-  gmr_t *src_mreg, *dst_mreg;
-
-  /* If NOGUARD is set, assume the buffer is not shared */
-  if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
-    src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank);
-  else
-    src_mreg = NULL;
-
-  dst_mreg = gmr_lookup(dst, proc);
-
-  ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer");
-
-  /* Prepare the input data: Apply scaling if needed and acquire the DLA lock if
-   * needed.  We hold the DLA lock if (src_buf == src && src_mreg != NULL). */
-
-  scaled = ARMCII_Buf_acc_is_scaled(datatype, scale);
-
-  if (src_mreg) {
-    gmr_dla_lock(src_mreg);
-    src_is_locked = 1;
-  }
-
-  if (scaled) {
-      MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf);
-      ARMCII_Assert(src_buf != NULL);
-      ARMCII_Buf_acc_scale(src, src_buf, bytes, datatype, scale);
-  } else {
-    src_buf = src;
-  }
-
-  /* Check if we need to copy: user requested it or same mem region */
-  if (   (src_buf == src) /* buf_prepare didn't make a copy */
-      && (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY || src_mreg == dst_mreg) )
-  {
-    MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf);
-    ARMCII_Assert(src_buf != NULL);
-    ARMCI_Copy(src, src_buf, bytes);
-  }
-
-  /* Unlock early if src_buf is a copy */
-  if (src_buf != src && src_is_locked) {
-    gmr_dla_unlock(src_mreg);
-    src_is_locked = 0;
-  }
-
-  ARMCII_Acc_type_translate(datatype, &type, &type_size);
-  count = bytes/type_size;
-
-  ARMCII_Assert_msg(bytes % type_size == 0, 
-      "Transfer size is not a multiple of the datatype size");
-
-  /* TODO: Support a local accumulate operation more efficiently */
-
-  gmr_lock(dst_mreg, proc);
-  gmr_accumulate(dst_mreg, src_buf, dst, count, type, proc);
-  gmr_unlock(dst_mreg, proc);
-
-  if (src_is_locked) {
-    gmr_dla_unlock(src_mreg);
-    src_is_locked = 0;
-  }
-
-  if (src_buf != src)
-    MPI_Free_mem(src_buf);
-
-  return 0;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Put_flag = PARMCI_Put_flag
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Put_flag ARMCI_Put_flag
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Put_flag as PARMCI_Put_flag
-#endif
-/* -- end weak symbols block -- */
-
-/** One-sided copy of data from the source to the destination.  Set a flag on
-  * the remote process when the transfer is complete.
-  *
-  * @param[in] src   Source buffer
-  * @param[in] dst   Destination buffer on proc
-  * @param[in] size  Number of bytes to transfer
-  * @param[in] flag  Address of the flag buffer on proc
-  * @param[in] value Value to set the flag to
-  * @param[in] proc  Process id of the target
-  * @return          0 on success, non-zero on failure
-  */
-int PARMCI_Put_flag(void *src, void* dst, int size, int *flag, int value, int proc) {
-  PARMCI_Put(src, dst, size, proc);
-  PARMCI_Fence(proc);
-  PARMCI_Put(&value, flag, sizeof(int), proc);
-
-  return 0;
-}
diff --git a/src/armci/src/onesided_nb.c b/src/armci/src/onesided_nb.c
deleted file mode 100644
index 9c206c6..0000000
--- a/src/armci/src/onesided_nb.c
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <armci.h>
-
-
-/** Initialize Non-blocking handle.
-  */
-void ARMCI_INIT_HANDLE(armci_hdl_t *hdl) {
-  return;
-}
-
-
-/** Mark a handle as aggregate.
-  */
-void ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t *hdl) {
-  return;
-}
-
-
-/** Clear an aggregate handle.
-  */
-void ARMCI_UNSET_AGGREGATE_HANDLE(armci_hdl_t *hdl) {
-  return;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_NbPut = PARMCI_NbPut
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_NbPut ARMCI_NbPut
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_NbPut as PARMCI_NbPut
-#endif
-/* -- end weak symbols block -- */
-
-/** Non-blocking put operation.  Note: the implementation is not non-blocking
-  */
-int PARMCI_NbPut(void *src, void *dst, int bytes, int proc, armci_hdl_t *handle) {
-  return PARMCI_Put(src, dst, bytes, proc);
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_NbGet = PARMCI_NbGet
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_NbGet ARMCI_NbGet
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_NbGet as PARMCI_NbGet
-#endif
-/* -- end weak symbols block -- */
-
-/** Non-blocking get operation.  Note: the implementation is not non-blocking
-  */
-int PARMCI_NbGet(void *src, void *dst, int bytes, int proc, armci_hdl_t *handle) {
-  return PARMCI_Get(src, dst, bytes, proc);
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_NbAcc = PARMCI_NbAcc
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_NbAcc ARMCI_NbAcc
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_NbAcc as PARMCI_NbAcc
-#endif
-/* -- end weak symbols block -- */
-
-/** Non-blocking accumulate operation.  Note: the implementation is not non-blocking
-  */
-int PARMCI_NbAcc(int datatype, void *scale, void *src, void *dst, int bytes, int proc, armci_hdl_t *hdl) {
-  return PARMCI_Acc(datatype, scale, src, dst, bytes, proc);
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Wait = PARMCI_Wait
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Wait ARMCI_Wait
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Wait as PARMCI_Wait
-#endif
-/* -- end weak symbols block -- */
-
-/** Wait for a non-blocking operation to finish.
-  */
-int PARMCI_Wait(armci_hdl_t* handle) {
-  return 0;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Test = PARMCI_Test
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Test ARMCI_Test
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Test as PARMCI_Test
-#endif
-/* -- end weak symbols block -- */
-
-/** Check if a non-blocking operation has finished.
-  */
-int PARMCI_Test(armci_hdl_t* handle) {
-  return 0;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_WaitAll = PARMCI_WaitAll
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_WaitAll ARMCI_WaitAll
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_WaitAll as PARMCI_WaitAll
-#endif
-/* -- end weak symbols block -- */
-
-/** Wait for all non-blocking operations with implicit (NULL) handles to finish.
-  */
-int PARMCI_WaitAll(void) {
-  return 0;
-}
-
-
diff --git a/src/armci/src/parmci.c b/src/armci/src/parmci.c
deleted file mode 100644
index d7d3e4c..0000000
--- a/src/armci/src/parmci.c
+++ /dev/null
@@ -1,294 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-/* If no weak symbols support */
-#if !defined(HAVE_PRAGMA_WEAK) && !defined(HAVE_PRAGMA_HP_SEC_DEF) && !defined(HAVE_PRAGMA_CRI_DUP)
-
-#include "armci.h"
-
-#pragma weak ARMCI_Init
-int ARMCI_Init(void) {
-  return PARMCI_Init();
-}
-
-#pragma weak ARMCI_Init_args
-int ARMCI_Init_args(int *argc, char ***argv) {
-  return PARMCI_Init_args(argc, argv);
-}
-
-#pragma weak ARMCI_Initialized
-int ARMCI_Initialized(void) {
-  return PARMCI_Initialized();
-}
-
-#pragma weak ARMCI_Finalize
-int ARMCI_Finalize(void) {
-  return PARMCI_Finalize();
-}
-
-#pragma weak ARMCI_Malloc
-int ARMCI_Malloc(void **base_ptrs, armci_size_t size) {
-  return PARMCI_Malloc(base_ptrs, size);
-}
-
-#pragma weak ARMCI_Free
-int ARMCI_Free(void *ptr) {
-  return PARMCI_Free(ptr);
-}
-
-#pragma weak ARMCI_Malloc_local
-void *ARMCI_Malloc_local(armci_size_t size) {
-  return PARMCI_Malloc_local(size);
-}
-
-#pragma weak ARMCI_Free_local
-int ARMCI_Free_local(void *ptr) {
-  return PARMCI_Free_local(ptr);
-}
-
-#pragma weak ARMCI_Barrier
-void ARMCI_Barrier(void) {
-  PARMCI_Barrier();
-  return;
-}
-
-#pragma weak ARMCI_Fence
-void ARMCI_Fence(int proc) {
-  PARMCI_Fence(proc);
-  return;
-}
-
-#pragma weak ARMCI_AllFence
-void ARMCI_AllFence(void) {
-  PARMCI_AllFence();
-  return;
-}
-
-#pragma weak ARMCI_Access_begin
-void ARMCI_Access_begin(void *ptr) {
-  PARMCI_Access_begin(ptr);
-  return;
-}
-
-#pragma weak ARMCI_Access_end
-void ARMCI_Access_end(void *ptr) {
-  PARMCI_Access_end(ptr);
-  return;
-}
-
-#pragma weak ARMCI_Get
-int ARMCI_Get(void *src, void *dst, int size, int target) {
-  return PARMCI_Get(src, dst, size, target);
-}
-
-#pragma weak ARMCI_Put
-int ARMCI_Put(void *src, void *dst, int size, int target) {
-  return PARMCI_Put(src, dst, size, target);
-}
-
-#pragma weak ARMCI_Acc
-int ARMCI_Acc(int datatype, void *scale, void *src, void *dst, int bytes, int proc) {
-  return PARMCI_Acc(datatype, scale, src, dst, bytes, proc);
-}
-
-#pragma weak ARMCI_PutS
-int ARMCI_PutS(void *src_ptr, int src_stride_ar[], void *dst_ptr, int dst_stride_ar[], int count[], int stride_levels, int proc) {
-  return PARMCI_PutS(src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels, proc);
-}
-
-#pragma weak ARMCI_GetS
-int ARMCI_GetS(void *src_ptr, int src_stride_ar[], void *dst_ptr, int dst_stride_ar[], int count[], int stride_levels, int proc) {
-  return PARMCI_GetS(src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels, proc);
-}
-
-#pragma weak ARMCI_AccS
-int ARMCI_AccS(int datatype, void *scale, void *src_ptr, int src_stride_ar[], void *dst_ptr, int dst_stride_ar[], int count[], int stride_levels, int proc) {
-  return PARMCI_AccS(datatype, scale, src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels, proc);
-}
-
-#pragma weak ARMCI_Put_flag
-int ARMCI_Put_flag(void *src, void* dst, int size, int *flag, int value, int proc) {
-  return PARMCI_Put_flag(src, dst, size, flag, value, proc);
-}
-
-#pragma weak ARMCI_PutS_flag
-int ARMCI_PutS_flag(void *src_ptr, int src_stride_ar[], void *dst_ptr, int dst_stride_ar[], int count[], int stride_levels, int *flag, int value, int proc) {
-  return PARMCI_PutS_flag(src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels, flag, value, proc);
-}
-
-#pragma weak ARMCI_PutV
-int ARMCI_PutV(armci_giov_t *iov, int iov_len, int proc) {
-  return PARMCI_PutV(iov, iov_len, proc);
-}
-
-#pragma weak ARMCI_GetV
-int ARMCI_GetV(armci_giov_t *iov, int iov_len, int proc) {
-  return PARMCI_GetV(iov, iov_len, proc);
-}
-
-#pragma weak ARMCI_AccV
-int ARMCI_AccV(int datatype, void *scale, armci_giov_t *iov, int iov_len, int proc) {
-  return PARMCI_AccV(datatype, scale, iov, iov_len, proc);
-}
-
-#pragma weak ARMCI_Wait
-int ARMCI_Wait(armci_hdl_t* hdl) {
-  return PARMCI_Wait(hdl);
-}
-
-#pragma weak ARMCI_Test
-int ARMCI_Test(armci_hdl_t* hdl) {
-  return PARMCI_Test(hdl);
-}
-
-#pragma weak ARMCI_WaitAll
-int ARMCI_WaitAll(void) {
-  return PARMCI_WaitAll();
-}
-
-#pragma weak ARMCI_NbPut
-int ARMCI_NbPut(void *src, void *dst, int bytes, int proc, armci_hdl_t *hdl) {
-  return PARMCI_NbPut(src, dst, bytes, proc, hdl);
-}
-
-#pragma weak ARMCI_NbGet
-int ARMCI_NbGet(void *src, void *dst, int bytes, int proc, armci_hdl_t *hdl) {
-  return PARMCI_NbGet(src, dst, bytes, proc, hdl);
-}
-
-#pragma weak ARMCI_NbAcc
-int ARMCI_NbAcc(int datatype, void *scale, void *src, void *dst, int bytes, int proc, armci_hdl_t *hdl) {
-  return PARMCI_NbAcc(datatype, scale, src, dst, bytes, proc, hdl);
-}
-
-#pragma weak ARMCI_NbPutS
-int ARMCI_NbPutS(void *src_ptr, int src_stride_ar[], void *dst_ptr, int dst_stride_ar[], int count[], int stride_levels, int proc, armci_hdl_t *hdl) {
-  return PARMCI_NbPutS(src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels, proc, hdl);
-}
-
-#pragma weak ARMCI_NbGetS
-int ARMCI_NbGetS(void *src_ptr, int src_stride_ar[], void *dst_ptr, int dst_stride_ar[], int count[], int stride_levels, int proc, armci_hdl_t *hdl) {
-  return PARMCI_NbGetS(src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels, proc, hdl);
-}
-
-#pragma weak ARMCI_NbAccS
-int ARMCI_NbAccS(int datatype, void *scale, void *src_ptr, int src_stride_ar[], void *dst_ptr, int dst_stride_ar[], int count[], int stride_levels, int proc, armci_hdl_t *hdl) {
-  return PARMCI_NbAccS(datatype, scale, src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels, proc, hdl);
-}
-
-#pragma weak ARMCI_NbPutV
-int ARMCI_NbPutV(armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle) {
-  return PARMCI_NbPutV(iov, iov_len, proc, handle);
-}
-
-#pragma weak ARMCI_NbGetV
-int ARMCI_NbGetV(armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle) {
-  return PARMCI_NbGetV(iov, iov_len, proc, handle);
-}
-
-#pragma weak ARMCI_NbAccV
-int ARMCI_NbAccV(int datatype, void *scale, armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle) {
-  return PARMCI_NbAccV(datatype, scale, iov, iov_len, proc, handle);
-}
-
-#pragma weak ARMCI_PutValueInt
-int ARMCI_PutValueInt(int src, void *dst, int proc) {
-  return PARMCI_PutValueInt(src, dst, proc);
-}
-
-#pragma weak ARMCI_PutValueLong
-int ARMCI_PutValueLong(long src, void *dst, int proc) {
-  return PARMCI_PutValueLong(src, dst, proc);
-}
-
-#pragma weak ARMCI_PutValueFloat
-int ARMCI_PutValueFloat(float src, void *dst, int proc) {
-  return PARMCI_PutValueFloat(src, dst, proc);
-}
-
-#pragma weak ARMCI_PutValueDouble
-int ARMCI_PutValueDouble(double src, void *dst, int proc) {
-  return PARMCI_PutValueDouble(src, dst, proc);
-}
-
-#pragma weak ARMCI_NbPutValueInt
-int ARMCI_NbPutValueInt(int src, void *dst, int proc, armci_hdl_t *hdl) {
-  return PARMCI_NbPutValueInt(src, dst, proc, hdl);
-}
-
-#pragma weak ARMCI_NbPutValueLong
-int ARMCI_NbPutValueLong(long src, void *dst, int proc, armci_hdl_t *hdl) {
-  return PARMCI_NbPutValueLong(src, dst, proc, hdl);
-}
-
-#pragma weak ARMCI_NbPutValueFloat
-int ARMCI_NbPutValueFloat(float src, void *dst, int proc, armci_hdl_t *hdl) {
-  return PARMCI_NbPutValueFloat(src, dst, proc, hdl);
-}
-
-#pragma weak ARMCI_NbPutValueDouble
-int ARMCI_NbPutValueDouble(double src, void *dst, int proc, armci_hdl_t *hdl) {
-  return PARMCI_NbPutValueDouble(src, dst, proc, hdl);
-}
-
-#pragma weak ARMCI_GetValueInt
-int ARMCI_GetValueInt(void *src, int proc) {
-  return PARMCI_GetValueInt(src, proc);
-}
-
-#pragma weak ARMCI_GetValueLong
-long ARMCI_GetValueLong(void *src, int proc) {
-  return PARMCI_GetValueLong(src, proc);
-}
-
-#pragma weak ARMCI_GetValueFloat
-float ARMCI_GetValueFloat(void *src, int proc) {
-  return PARMCI_GetValueFloat(src, proc);
-}
-
-#pragma weak ARMCI_GetValueDouble
-double ARMCI_GetValueDouble(void *src, int proc) {
-  return PARMCI_GetValueDouble(src, proc);
-}
-
-#pragma weak ARMCI_Create_mutexes
-int ARMCI_Create_mutexes(int count) {
-  return PARMCI_Create_mutexes(count);
-}
-
-#pragma weak ARMCI_Destroy_mutexes
-int ARMCI_Destroy_mutexes(void) {
-  return PARMCI_Destroy_mutexes();
-}
-
-#pragma weak ARMCI_Lock
-void ARMCI_Lock(int mutex, int proc) {
-  PARMCI_Lock(mutex, proc);
-  return;
-}
-
-#pragma weak ARMCI_Unlock
-void ARMCI_Unlock(int mutex, int proc) {
-  PARMCI_Unlock(mutex, proc);
-  return;
-}
-
-#pragma weak ARMCI_Rmw
-int ARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) {
-  return PARMCI_Rmw(op, ploc, prem, value, proc);
-}
-
-#pragma weak armci_msg_barrier
-void armci_msg_barrier(void) {
-  parmci_msg_barrier();
-  return;
-}
-
-#pragma weak armci_msg_group_barrier
-void armci_msg_group_barrier(ARMCI_Group *group) {
-  parmci_msg_group_barrier(group);
-  return;
-}
-
-#endif
diff --git a/src/armci/src/rmw.c b/src/armci/src/rmw.c
deleted file mode 100644
index 946cbe1..0000000
--- a/src/armci/src/rmw.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <armci.h>
-#include <armci_internals.h>
-#include <gmr.h>
-#include <debug.h>
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Rmw = PARMCI_Rmw
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Rmw ARMCI_Rmw
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Rmw as PARMCI_Rmw
-#endif
-/* -- end weak symbols block -- */
-
-/** Perform atomic read-modify-write on the given integer or long location and
-  * return the location's original value.
-  *
-  * \note ARMCI RMW operations are atomic with respect to other RMW operations,
-  * but not with respect to other one-sided operations (get, put, acc, etc).
-  *
-  * @param[in]  op    Operation to be performed:
-  *                     ARMCI_FETCH_AND_ADD (int)
-  *                     ARMCI_FETCH_AND_ADD_LONG
-  *                     ARMCI_SWAP (int)
-  *                     ARMCI_SWAP_LONG
-  * @param[out] ploc  Location to store the original value.
-  * @param[in]  prem  Location on which to perform atomic operation.
-  * @param[in]  value Value to add to remote location (ignored for swap).
-  * @param[in]  proc  Process rank for the target buffer.
-  */
-int PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) {
-  int           is_long;
-  gmr_t *mreg;
-
-  mreg = gmr_lookup(prem, proc);
-  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");
-
-  if (op == ARMCI_SWAP_LONG || op == ARMCI_FETCH_AND_ADD_LONG)
-    is_long = 1;
-  else
-    is_long = 0;
-
-  if (op == ARMCI_SWAP || op == ARMCI_SWAP_LONG) {
-    long swap_val_l;
-    int  swap_val_i;
-
-    ARMCIX_Lock_hdl(mreg->rmw_mutex, 0, proc);
-    PARMCI_Get(prem, is_long ? (void*) &swap_val_l : (void*) &swap_val_i, 
-              is_long ? sizeof(long) : sizeof(int), proc);
-    PARMCI_Put(ploc, prem, is_long ? sizeof(long) : sizeof(int), proc);
-    ARMCIX_Unlock_hdl(mreg->rmw_mutex, 0, proc);
-
-    if (is_long)
-      *(long*) ploc = swap_val_l;
-    else
-      *(int*) ploc = swap_val_i;
-  }
-
-  else if (op == ARMCI_FETCH_AND_ADD || op == ARMCI_FETCH_AND_ADD_LONG) {
-    long fetch_val_l, new_val_l;
-    int  fetch_val_i, new_val_i;
-    
-    ARMCIX_Lock_hdl(mreg->rmw_mutex, 0, proc);
-    PARMCI_Get(prem, is_long ? (void*) &fetch_val_l : (void*) &fetch_val_i,
-              is_long ? sizeof(long) : sizeof(int), proc);
-    
-    if (is_long)
-      new_val_l = fetch_val_l + value;
-    else
-      new_val_i = fetch_val_i + value;
-
-    PARMCI_Put(is_long ? (void*) &new_val_l : (void*) &new_val_i, prem, 
-              is_long ? sizeof(long) : sizeof(int), proc);
-    ARMCIX_Unlock_hdl(mreg->rmw_mutex, 0, proc);
-
-    if (is_long)
-      *(long*) ploc = fetch_val_l;
-    else
-      *(int*) ploc = fetch_val_i;
-  }
-
-  else {
-    ARMCII_Error("invalid operation (%d)", op);
-  }
-
-  return 0;
-}
diff --git a/src/armci/src/strided.c b/src/armci/src/strided.c
deleted file mode 100644
index f2c5e94..0000000
--- a/src/armci/src/strided.c
+++ /dev/null
@@ -1,800 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <armci.h>
-#include <armci_internals.h>
-#include <gmr.h>
-#include <debug.h>
-
-
-/** Convert an ARMCI strided access description into an MPI datatype.
-  *
-  * Trailing dimensions whose count is 1 are dropped first.  If no strided
-  * level remains, the result is an MPI_Type_contiguous of
-  * count[0]/size(old_type) elements.  Otherwise an MPI_Type_create_subarray
-  * in C order is built, with the contiguous ARMCI dimension (index 0) mapped
-  * to the last, fastest-varying subarray dimension.  The new type is NOT
-  * committed by this function.
-  *
-  * @param[in]  stride_array    Array of strides; stride_array[0] must be a
-  *                             multiple of the size of old_type and each
-  *                             further stride a multiple of the previous one
-  * @param[in]  count           Array of transfer counts; count[0] must be a
-  *                             multiple of the size of old_type
-  * @param[in]  stride_levels   Number of levels of striding
-  * @param[in]  old_type        Type of the data element described by count and stride_array
-  * @param[out] new_type        New MPI type for the given strided access
-  */
-void ARMCII_Strided_to_dtype(int stride_array[/*stride_levels*/], int count[/*stride_levels+1*/],
-                             int stride_levels, MPI_Datatype old_type, MPI_Datatype *new_type)
-{
-  int sizes   [stride_levels+1];
-  int subsizes[stride_levels+1];
-  int starts  [stride_levels+1];
-  int i, old_type_size;
-
-  MPI_Type_size(old_type, &old_type_size);
-
-  /* Eliminate counts that don't count (all 1 counts at the end) */
-  for (i = stride_levels+1; i > 0 && stride_levels > 0 && count[i-1] == 1; i--)
-    stride_levels--;
-
-  /* A correct strided spec should be monotonically increasing and stride_array[i]
-     should be a multiple of stride_array[i-1]. */
-  if (stride_levels > 0) {
-    for (i = 1; i < stride_levels; i++)
-      ARMCII_Assert(stride_array[i] >= stride_array[i-1] && stride_array[i] % stride_array[i-1] == 0);
-  }
-
-  /* Test for a contiguous transfer */
-  if (stride_levels == 0) {
-    int elem_count = count[0]/old_type_size;
-
-    ARMCII_Assert(count[0] % old_type_size == 0);
-    MPI_Type_contiguous(elem_count, old_type, new_type);
-  }
-
-  /* Transfer is non-contiguous */
-  else {
-
-    for (i = 0; i < stride_levels+1; i++)
-      starts[i] = 0;
-
-    sizes   [stride_levels] = stride_array[0]/old_type_size; /* innermost extent, in elements */
-    subsizes[stride_levels] = count[0]/old_type_size;        /* innermost block, in elements */
-
-    ARMCII_Assert(stride_array[0] % old_type_size == 0 && count[0] % old_type_size == 0);
-
-    for (i = 1; i < stride_levels; i++) {
-      /* Convert strides into dimensions by dividing out contributions from lower dims */
-      sizes   [stride_levels-i] = stride_array[i]/stride_array[i-1];
-      subsizes[stride_levels-i] = count[i];
-
-      ARMCII_Assert_msg(stride_array[i] % stride_array[i-1] == 0, "Invalid striding");
-    }
-
-    sizes   [0] = count[stride_levels]; /* outermost dimension is taken in full */
-    subsizes[0] = count[stride_levels];
-
-    MPI_Type_create_subarray(stride_levels+1, sizes, subsizes, starts, MPI_ORDER_C, old_type, new_type);
-  }
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_PutS = PARMCI_PutS
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_PutS ARMCI_PutS
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_PutS as PARMCI_PutS
-#endif
-/* -- end weak symbols block -- */
-
-/** Blocking operation that transfers data from the calling process to the
-  * memory of the remote process.  The data transfer is strided and blocking.
-  *
-  * With the DIRECT strided method the transfer is expressed as one typed put:
-  * MPI datatypes are built for both sides, the target region is locked, and
-  * gmr_put_typed() moves the data.  If the COPY shared-buffer method is active
-  * and the source lies in a local GMR region, the source is first packed into
-  * a temporary contiguous buffer under the region's direct-local-access lock.
-  * With any other strided method the request is converted to an I/O vector
-  * and forwarded to PARMCI_PutV().
-  *
-  * @param[in] src_ptr         Source starting address of the data block to put.
-  * @param[in] src_stride_ar   Source array of stride distances in bytes.
-  * @param[in] dst_ptr         Destination starting address to put data.
-  * @param[in] dst_stride_ar   Destination array of stride distances in bytes.
-  * @param[in] count           Block size in each dimension. count[0] should be the
-  *                            number of bytes of contiguous data in leading dimension.
-  * @param[in] stride_levels   The level of strides.
-  * @param[in] proc            Remote process ID (destination).
-  *
-  * @return                    Zero on the direct path; otherwise the result
-  *                            of PARMCI_PutV().
-  */
-int PARMCI_PutS(void *src_ptr, int src_stride_ar[/*stride_levels*/],
-               void *dst_ptr, int dst_stride_ar[/*stride_levels*/], 
-               int count[/*stride_levels+1*/], int stride_levels, int proc) {
-
-  int err;
-
-  if (ARMCII_GLOBAL_STATE.strided_method == ARMCII_STRIDED_DIRECT) {
-    void         *src_buf = NULL;
-    gmr_t *mreg, *gmr_loc = NULL;
-    MPI_Datatype src_type, dst_type;
-
-    /* COPY: Guard shared buffers by packing the source into a private buffer */
-    if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY) {
-      gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank);
-
-      if (gmr_loc != NULL) {
-        int i, size;
-
-        for (i = 1, size = count[0]; i < stride_levels+1; i++) /* total bytes moved */
-          size *= count[i];
-
-        MPI_Alloc_mem(size, MPI_INFO_NULL, &src_buf);
-        ARMCII_Assert(src_buf != NULL);
-
-        gmr_dla_lock(gmr_loc);
-        armci_write_strided(src_ptr, stride_levels, src_stride_ar, count, src_buf);
-        gmr_dla_unlock(gmr_loc);
-
-        MPI_Type_contiguous(size, MPI_BYTE, &src_type); /* packed copy is contiguous */
-      }
-    }
-
-    /* NOGUARD: If src_buf hasn't been assigned to a copy, the strided source
-     * buffer is going to be used directly. */
-    if (src_buf == NULL) { 
-        src_buf = src_ptr;
-        ARMCII_Strided_to_dtype(src_stride_ar, count, stride_levels, MPI_BYTE, &src_type);
-    }
-
-    ARMCII_Strided_to_dtype(dst_stride_ar, count, stride_levels, MPI_BYTE, &dst_type);
-
-    MPI_Type_commit(&src_type);
-    MPI_Type_commit(&dst_type);
-
-    mreg = gmr_lookup(dst_ptr, proc);
-    ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer");
-
-    gmr_lock(mreg, proc);
-    gmr_put_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc);
-    gmr_unlock(mreg, proc);
-
-    MPI_Type_free(&src_type);
-    MPI_Type_free(&dst_type);
-
-    /* COPY: Free temporary buffer */
-    if (src_buf != src_ptr)
-      MPI_Free_mem(src_buf);
-
-    err = 0;
-
-  } else {
-    armci_giov_t iov;
-
-    ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels);
-    err = PARMCI_PutV(&iov, 1, proc);
-
-    free(iov.src_ptr_array); /* pointer arrays were allocated by ARMCII_Strided_to_iov */
-    free(iov.dst_ptr_array);
-  }
-
-  return err;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_GetS = PARMCI_GetS
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_GetS ARMCI_GetS
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_GetS as PARMCI_GetS
-#endif
-/* -- end weak symbols block -- */
-
-/** Blocking operation that transfers data from the remote process to the
-  * memory of the calling process.  The data transfer is strided and blocking.
-  *
-  * With the DIRECT strided method the transfer is expressed as one typed get
-  * on the locked remote region.  If the COPY shared-buffer method is active
-  * and the destination lies in a local GMR region, the data is first received
-  * into a temporary contiguous buffer and then unpacked into the destination
-  * under the region's direct-local-access lock.  With any other strided method
-  * the request is converted to an I/O vector and forwarded to PARMCI_GetV().
-  *
-  * @param[in] src_ptr         Source (remote) starting address of the data block to get.
-  * @param[in] src_stride_ar   Source array of stride distances in bytes.
-  * @param[in] dst_ptr         Destination (local) starting address to store data.
-  * @param[in] dst_stride_ar   Destination array of stride distances in bytes.
-  * @param[in] count           Block size in each dimension. count[0] should be the
-  *                            number of bytes of contiguous data in leading dimension.
-  * @param[in] stride_levels   The level of strides.
-  * @param[in] proc            Remote process ID (source).
-  *
-  * @return                    Zero on the direct path; otherwise the result
-  *                            of PARMCI_GetV().
-  */
-int PARMCI_GetS(void *src_ptr, int src_stride_ar[/*stride_levels*/],
-               void *dst_ptr, int dst_stride_ar[/*stride_levels*/], 
-               int count[/*stride_levels+1*/], int stride_levels, int proc) {
-
-  int err;
-
-  if (ARMCII_GLOBAL_STATE.strided_method == ARMCII_STRIDED_DIRECT) {
-    void         *dst_buf = NULL;
-    gmr_t *mreg, *gmr_loc = NULL;
-    MPI_Datatype src_type, dst_type;
-
-    /* COPY: Guard shared buffers by receiving into a private buffer */
-    if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY) {
-      gmr_loc = gmr_lookup(dst_ptr, ARMCI_GROUP_WORLD.rank);
-
-      if (gmr_loc != NULL) {
-        int i, size;
-
-        for (i = 1, size = count[0]; i < stride_levels+1; i++) /* total bytes moved */
-          size *= count[i];
-
-        MPI_Alloc_mem(size, MPI_INFO_NULL, &dst_buf);
-        ARMCII_Assert(dst_buf != NULL);
-
-        MPI_Type_contiguous(size, MPI_BYTE, &dst_type); /* staging buffer is contiguous */
-      }
-    }
-
-    /* NOGUARD: If dst_buf hasn't been assigned to a copy, the strided destination
-     * buffer is going to be used directly. */
-    if (dst_buf == NULL) { 
-        dst_buf = dst_ptr;
-        ARMCII_Strided_to_dtype(dst_stride_ar, count, stride_levels, MPI_BYTE, &dst_type);
-    }
-
-    ARMCII_Strided_to_dtype(src_stride_ar, count, stride_levels, MPI_BYTE, &src_type);
-
-    MPI_Type_commit(&src_type);
-    MPI_Type_commit(&dst_type);
-
-    mreg = gmr_lookup(src_ptr, proc);
-    ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer");
-
-    gmr_lock(mreg, proc);
-    gmr_get_typed(mreg, src_ptr, 1, src_type, dst_buf, 1, dst_type, proc);
-    gmr_unlock(mreg, proc);
-
-    /* COPY: Finish the transfer by unpacking the staging buffer */
-    if (dst_buf != dst_ptr) {
-      gmr_dla_lock(gmr_loc);
-      armci_read_strided(dst_ptr, stride_levels, dst_stride_ar, count, dst_buf);
-      gmr_dla_unlock(gmr_loc);
-      MPI_Free_mem(dst_buf);
-    }
-
-    MPI_Type_free(&src_type);
-    MPI_Type_free(&dst_type);
-
-    err = 0;
-
-  } else {
-    armci_giov_t iov;
-
-    ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels);
-    err = PARMCI_GetV(&iov, 1, proc);
-
-    free(iov.src_ptr_array); /* pointer arrays were allocated by ARMCII_Strided_to_iov */
-    free(iov.dst_ptr_array);
-  }
-
-  return err;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_AccS = PARMCI_AccS
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_AccS ARMCI_AccS
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_AccS as PARMCI_AccS
-#endif
-/* -- end weak symbols block -- */
-
-/** Blocking operation that accumulates data from the local process into the
-  * memory of the remote process.  The data transfer is strided and blocking.
-  *
-  * With the DIRECT strided method the transfer is expressed as one typed
-  * accumulate on the locked remote region.  When scaling is requested the
-  * source is copied and scaled into a temporary contiguous buffer; otherwise,
-  * if the COPY shared-buffer method is active and the source lies in a local
-  * GMR region, it is packed unscaled into a temporary buffer.  In all other
-  * cases the strided source is used in place.  With any other strided method
-  * the request is converted to an I/O vector and forwarded to PARMCI_AccV().
-  *
-  * @param[in] datatype        Type of data to be transferred.
-  * @param[in] scale           Pointer to the value that input data should be scaled by.
-  * @param[in] src_ptr         Source starting address of the data block to accumulate.
-  * @param[in] src_stride_ar   Source array of stride distances in bytes.
-  * @param[in] dst_ptr         Destination starting address to accumulate into.
-  * @param[in] dst_stride_ar   Destination array of stride distances in bytes.
-  * @param[in] count           Block size in each dimension. count[0] should be the
-  *                            number of bytes of contiguous data in leading dimension.
-  * @param[in] stride_levels   The level of strides.
-  * @param[in] proc            Remote process ID (destination).
-  *
-  * @return                    Zero on the direct path; otherwise the result
-  *                            of PARMCI_AccV().
-  */
-int PARMCI_AccS(int datatype, void *scale,
-               void *src_ptr, int src_stride_ar[/*stride_levels*/],
-               void *dst_ptr, int dst_stride_ar[/*stride_levels*/],
-               int count[/*stride_levels+1*/], int stride_levels, int proc) {
-
-  int err;
-
-  if (ARMCII_GLOBAL_STATE.strided_method == ARMCII_STRIDED_DIRECT) {
-    void         *src_buf = NULL;
-    gmr_t *mreg, *gmr_loc = NULL;
-    MPI_Datatype src_type, dst_type, mpi_datatype;
-    int          scaled, mpi_datatype_size;
-
-    ARMCII_Acc_type_translate(datatype, &mpi_datatype, &mpi_datatype_size);
-    scaled = ARMCII_Buf_acc_is_scaled(datatype, scale);
-
-    /* SCALE: copy and scale if requested */
-    if (scaled) {
-      armci_giov_t iov;
-      int i, nelem;
-
-      if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
-        gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank);
-
-      for (i = 1, nelem = count[0]/mpi_datatype_size; i < stride_levels+1; i++) /* total elements */
-        nelem *= count[i];
-
-      MPI_Alloc_mem(nelem*mpi_datatype_size, MPI_INFO_NULL, &src_buf);
-      ARMCII_Assert(src_buf != NULL);
-
-      if (gmr_loc != NULL) gmr_dla_lock(gmr_loc);
-
-      /* Shoehorn the strided information into an IOV */
-      ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, src_ptr, src_stride_ar, count, stride_levels);
-
-      for (i = 0; i < iov.ptr_array_len; i++) /* scale segment i into slot i of the packed buffer */
-        ARMCII_Buf_acc_scale(iov.src_ptr_array[i], ((uint8_t*)src_buf) + i*iov.bytes, iov.bytes, datatype, scale);
-
-      free(iov.src_ptr_array);
-      free(iov.dst_ptr_array);
-
-      if (gmr_loc != NULL) gmr_dla_unlock(gmr_loc);
-
-      MPI_Type_contiguous(nelem, mpi_datatype, &src_type);
-    }
-
-    /* COPY: Guard shared buffers */
-    else if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY) {
-      gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank);
-
-      if (gmr_loc != NULL) {
-        int i, nelem;
-
-        for (i = 1, nelem = count[0]/mpi_datatype_size; i < stride_levels+1; i++) /* total elements */
-          nelem *= count[i];
-
-        MPI_Alloc_mem(nelem*mpi_datatype_size, MPI_INFO_NULL, &src_buf);
-        ARMCII_Assert(src_buf != NULL);
-
-        gmr_dla_lock(gmr_loc);
-        armci_write_strided(src_ptr, stride_levels, src_stride_ar, count, src_buf);
-        gmr_dla_unlock(gmr_loc);
-
-        MPI_Type_contiguous(nelem, mpi_datatype, &src_type);
-      }
-    }
-
-    /* NOGUARD: If src_buf hasn't been assigned to a copy, the strided source
-     * buffer is going to be used directly. */
-    if (src_buf == NULL) { 
-        src_buf = src_ptr;
-        ARMCII_Strided_to_dtype(src_stride_ar, count, stride_levels, mpi_datatype, &src_type);
-    }
-
-    ARMCII_Strided_to_dtype(dst_stride_ar, count, stride_levels, mpi_datatype, &dst_type);
-
-    MPI_Type_commit(&src_type);
-    MPI_Type_commit(&dst_type);
-
-    int src_size, dst_size;
-
-    MPI_Type_size(src_type, &src_size);
-    MPI_Type_size(dst_type, &dst_size);
-
-    ARMCII_Assert(src_size == dst_size); /* both sides must describe the same amount of data */
-
-    mreg = gmr_lookup(dst_ptr, proc);
-    ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer");
-
-    gmr_lock(mreg, proc);
-    gmr_accumulate_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc);
-    gmr_unlock(mreg, proc);
-
-    MPI_Type_free(&src_type);
-    MPI_Type_free(&dst_type);
-
-    /* COPY/SCALE: Free temp buffer */
-    if (src_buf != src_ptr)
-      MPI_Free_mem(src_buf);
-
-    err = 0;
-
-  } else {
-    armci_giov_t iov;
-
-    ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels);
-    err = PARMCI_AccV(datatype, scale, &iov, 1, proc);
-
-    free(iov.src_ptr_array); /* pointer arrays were allocated by ARMCII_Strided_to_iov */
-    free(iov.dst_ptr_array);
-  }
-
-  return err;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_NbPutS = PARMCI_NbPutS
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_NbPutS ARMCI_NbPutS
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_NbPutS as PARMCI_NbPutS
-#endif
-/* -- end weak symbols block -- */
-
-/** Non-blocking strided put from the calling process into the memory of the
-  * remote process.  This implementation simply forwards to the blocking
-  * PARMCI_PutS(), so the transfer has finished when the call returns and the
-  * handle is not touched.
-  *
-  * @param[in] src_ptr         Source starting address of the data block to put.
-  * @param[in] src_stride_ar   Source array of stride distances in bytes.
-  * @param[in] dst_ptr         Destination starting address to put data.
-  * @param[in] dst_stride_ar   Destination array of stride distances in bytes.
-  * @param[in] count           Block size in each dimension. count[0] should be the
-  *                            number of bytes of contiguous data in leading dimension.
-  * @param[in] stride_levels   The level of strides.
-  * @param[in] proc            Remote process ID (destination).
-  * @param[in] hdl             Non-blocking handle (unused).
-  *
-  * @return                    Result of PARMCI_PutS().
-  */
-int PARMCI_NbPutS(void *src_ptr, int src_stride_ar[/*stride_levels*/],
-               void *dst_ptr, int dst_stride_ar[/*stride_levels*/],
-               int count[/*stride_levels+1*/], int stride_levels, int proc, armci_hdl_t *hdl) {
-  int rc;
-
-  (void) hdl;
-
-  rc = PARMCI_PutS(src_ptr, src_stride_ar,
-                   dst_ptr, dst_stride_ar,
-                   count, stride_levels, proc);
-  return rc;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_NbGetS = PARMCI_NbGetS
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_NbGetS ARMCI_NbGetS
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_NbGetS as PARMCI_NbGetS
-#endif
-/* -- end weak symbols block -- */
-
-/** Non-blocking strided get from the memory of the remote process into the
-  * calling process.  This implementation simply forwards to the blocking
-  * PARMCI_GetS(), so the transfer has finished when the call returns and the
-  * handle is not touched.
-  *
-  * @param[in] src_ptr         Source (remote) starting address of the data block to get.
-  * @param[in] src_stride_ar   Source array of stride distances in bytes.
-  * @param[in] dst_ptr         Destination (local) starting address to store data.
-  * @param[in] dst_stride_ar   Destination array of stride distances in bytes.
-  * @param[in] count           Block size in each dimension. count[0] should be the
-  *                            number of bytes of contiguous data in leading dimension.
-  * @param[in] stride_levels   The level of strides.
-  * @param[in] proc            Remote process ID (source).
-  * @param[in] hdl             Non-blocking handle (unused).
-  *
-  * @return                    Result of PARMCI_GetS().
-  */
-int PARMCI_NbGetS(void *src_ptr, int src_stride_ar[/*stride_levels*/],
-               void *dst_ptr, int dst_stride_ar[/*stride_levels*/],
-               int count[/*stride_levels+1*/], int stride_levels, int proc, armci_hdl_t *hdl) {
-  int rc;
-
-  (void) hdl;
-
-  rc = PARMCI_GetS(src_ptr, src_stride_ar,
-                   dst_ptr, dst_stride_ar,
-                   count, stride_levels, proc);
-  return rc;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_NbAccS = PARMCI_NbAccS
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_NbAccS ARMCI_NbAccS
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_NbAccS as PARMCI_NbAccS
-#endif
-/* -- end weak symbols block -- */
-
-/** Non-blocking strided accumulate from the local process into the memory of
-  * the remote process.  This implementation simply forwards to the blocking
-  * PARMCI_AccS(), so the transfer has finished when the call returns and the
-  * handle is not touched.
-  *
-  * @param[in] datatype        Type of data to be transferred.
-  * @param[in] scale           Pointer to the value that input data should be scaled by.
-  * @param[in] src_ptr         Source starting address of the data block to accumulate.
-  * @param[in] src_stride_ar   Source array of stride distances in bytes.
-  * @param[in] dst_ptr         Destination starting address to accumulate into.
-  * @param[in] dst_stride_ar   Destination array of stride distances in bytes.
-  * @param[in] count           Block size in each dimension. count[0] should be the
-  *                            number of bytes of contiguous data in leading dimension.
-  * @param[in] stride_levels   The level of strides.
-  * @param[in] proc            Remote process ID (destination).
-  * @param[in] hdl             Non-blocking handle (unused).
-  *
-  * @return                    Result of PARMCI_AccS().
-  */
-int PARMCI_NbAccS(int datatype, void *scale,
-               void *src_ptr, int src_stride_ar[/*stride_levels*/],
-               void *dst_ptr, int dst_stride_ar[/*stride_levels*/],
-               int count[/*stride_levels+1*/], int stride_levels, int proc, armci_hdl_t *hdl) {
-  int rc;
-
-  (void) hdl;
-
-  rc = PARMCI_AccS(datatype, scale,
-                   src_ptr, src_stride_ar,
-                   dst_ptr, dst_stride_ar,
-                   count, stride_levels, proc);
-  return rc;
-}
-
-
-/** Translate a strided operation into a more general IO Vector.
-  *
-  * One source/destination pointer pair is produced for every contiguous
-  * segment of count[0] bytes.  The two pointer arrays are allocated with
-  * malloc() here; the callers in this file release them with free().
-  *
-  * @param[out] iov            IO vector to fill in (bytes, ptr_array_len and
-  *                            both pointer arrays are set).
-  * @param[in] src_ptr         Source starting address of the data block to put.
-  * @param[in] src_stride_ar   Source array of stride distances in bytes.
-  * @param[in] dst_ptr         Destination starting address to put data.
-  * @param[in] dst_stride_ar   Destination array of stride distances in bytes.
-  * @param[in] count           Block size in each dimension. count[0] should be the
-  *                            number of bytes of contiguous data in leading dimension.
-  * @param[in] stride_levels   The level of strides.
-  */
-void ARMCII_Strided_to_iov(armci_giov_t *iov,
-               void *src_ptr, int src_stride_ar[/*stride_levels*/],
-               void *dst_ptr, int dst_stride_ar[/*stride_levels*/], 
-               int count[/*stride_levels+1*/], int stride_levels) {
-
-  int i;
-
-  iov->bytes = count[0];
-  iov->ptr_array_len = 1;
-
-  for (i = 0; i < stride_levels; i++) /* number of segments = product of the outer counts */
-    iov->ptr_array_len *= count[i+1];
-
-  iov->src_ptr_array = malloc(iov->ptr_array_len*sizeof(void*));
-  iov->dst_ptr_array = malloc(iov->ptr_array_len*sizeof(void*));
-
-  ARMCII_Assert(iov->src_ptr_array != NULL && iov->dst_ptr_array != NULL);
-
-  // Case 1: Non-strided transfer (a single segment at the base pointers)
-  if (stride_levels == 0) {
-    iov->src_ptr_array[0] = src_ptr;
-    iov->dst_ptr_array[0] = dst_ptr;
-
-  // Case 2: Strided transfer (idx[] is a multi-digit counter over the segments)
-  } else {
-    int idx[stride_levels];
-    int xfer;
-
-    for (i = 0; i < stride_levels; i++)
-      idx[i] = 0;
-
-    for (xfer = 0; idx[stride_levels-1] < count[stride_levels]; xfer++) {
-      int disp_src = 0;
-      int disp_dst = 0;
-
-      ARMCII_Assert(xfer < iov->ptr_array_len);
-
-      // Calculate displacements from base pointers
-      for (i = 0; i < stride_levels; i++) {
-        disp_src += src_stride_ar[i]*idx[i];
-        disp_dst += dst_stride_ar[i]*idx[i];
-      }
-
-      // Add to the IO Vector
-      iov->src_ptr_array[xfer] = ((uint8_t*)src_ptr) + disp_src;
-      iov->dst_ptr_array[xfer] = ((uint8_t*)dst_ptr) + disp_dst;
-
-      // Increment innermost index
-      idx[0] += 1;
-
-      // Propagate "carry" overflows outward.  We're done when the outermost
-      // index reaches the requested count (the outermost index never wraps).
-      for (i = 0; i < stride_levels-1; i++) {
-        if (idx[i] >= count[i+1]) {
-          idx[i]    = 0;
-          idx[i+1] += 1;
-        }
-      }
-    }
-
-    ARMCII_Assert(xfer == iov->ptr_array_len);
-  }
-}
-
-
-/** Translate a strided operation into a more general IO Vector iterator.
-  *
-  * The iterator keeps private copies of the stride and count arrays in one
-  * allocation (base_ptr) laid out as
-  *   [ src_stride_ar : stride_levels | dst_stride_ar : stride_levels |
-  *     count : stride_levels+1 | idx : stride_levels ].
-  * Release it with ARMCII_Iov_iter_free().
-  *
-  * @param[in] src_ptr         Source starting address of the data block to put.
-  * @param[in] src_stride_ar   Source array of stride distances in bytes.
-  * @param[in] dst_ptr         Destination starting address to put data.
-  * @param[in] dst_stride_ar   Destination array of stride distances in bytes.
-  * @param[in] count           Block size in each dimension. count[0] should be the
-  *                            number of bytes of contiguous data in leading dimension.
-  * @param[in] stride_levels   The level of strides.
-  *
-  * @return                    ARMCI IOV iterator corresponding to the strided parameters.
-  */
-armcii_iov_iter_t *ARMCII_Strided_to_iov_iter(
-               void *src_ptr, int src_stride_ar[/*stride_levels*/],
-               void *dst_ptr, int dst_stride_ar[/*stride_levels*/], 
-               int count[/*stride_levels+1*/], int stride_levels) {
-
-  int i;
-  armcii_iov_iter_t *it = malloc(sizeof(armcii_iov_iter_t));
-
-  ARMCII_Assert(it != NULL);
-
-  it->src = src_ptr;
-  it->dst = dst_ptr;
-  it->stride_levels = stride_levels;
-  it->base_ptr      = malloc(sizeof(int)*(4*stride_levels+1));
-  it->was_contiguous= 0;
-
-  ARMCII_Assert( it->base_ptr != NULL );
-
-  it->src_stride_ar = &it->base_ptr[0*stride_levels];
-  it->dst_stride_ar = &it->base_ptr[1*stride_levels];
-  it->count         = &it->base_ptr[2*stride_levels];
-  it->idx           = &it->base_ptr[3*stride_levels+1];
-
-  for (i = 0; i < stride_levels; i++) {
-    it->src_stride_ar[i] = src_stride_ar[i];
-    it->dst_stride_ar[i] = dst_stride_ar[i];
-    it->count[i]         = count[i];
-    it->idx[i]           = 0;
-  }
-
-  /* count has stride_levels+1 entries; the last one bounds the outermost
-   * index and is what the has_next test compares against, so it must be
-   * copied as well. */
-  it->count[stride_levels] = count[stride_levels];
-
-  return it;
-}
-
-
-/** Destroy an IOV iterator: releases the iterator's array storage
-  * (base_ptr) and then the iterator object itself.
-  *
-  * @param[in]  it      IOV iterator to release
-  */
-void ARMCII_Iov_iter_free(armcii_iov_iter_t *it) {
-  void *storage = it->base_ptr;
-
-  free(storage);
-  free(it);
-}
-
-
-/** Query whether the iterator has another iteration.
-  *
-  * A contiguous iterator (stride_levels == 0) has exactly one iteration,
-  * tracked by was_contiguous; it has no idx[] entries, so idx[] must not be
-  * consulted in that case.  A strided iterator is exhausted once its
-  * outermost index reaches the outermost count.
-  *
-  * @param[in]  it      IOV iterator
-  *
-  * @return             True if another iteration exists
-  */
-int ARMCII_Iov_iter_has_next(armcii_iov_iter_t *it) {
-  if (it->stride_levels == 0)
-    return !it->was_contiguous;
-
-  return (it->idx[it->stride_levels-1] < it->count[it->stride_levels] && !it->was_contiguous);
-}
-
-
-/** Get the next source/destination pointer pair from the IOV iterator.
-  *
-  * When the iterator is exhausted both outputs are set to NULL.
-  * 
-  * @param[in]  it      IOV iterator
-  * @param[out] src     Source address
-  * @param[out] dst     Destination address
-  *
-  * @return             True if another iteration existed
-  */
-int ARMCII_Iov_iter_next(armcii_iov_iter_t *it, void **src, void **dst) {
-
-  if (!ARMCII_Iov_iter_has_next(it)) {
-    *src = NULL;
-    *dst = NULL;
-    return 0;
-  }
-
-  // Case 1: Non-strided transfer; the only segment starts at the base
-  // pointers stored in the iterator.
-  if (it->stride_levels == 0) {
-    *src = it->src;
-    *dst = it->dst;
-    it->was_contiguous = 1;
-
-  // Case 2: Strided transfer
-  } else {
-    int i, disp_src = 0, disp_dst = 0;
-
-    // Calculate displacements from base pointers
-    for (i = 0; i < it->stride_levels; i++) {
-      disp_src += it->src_stride_ar[i]*it->idx[i];
-      disp_dst += it->dst_stride_ar[i]*it->idx[i];
-    }
-
-    // Hand out the current segment
-    *src = ((uint8_t*)it->src) + disp_src;
-    *dst = ((uint8_t*)it->dst) + disp_dst;
-
-    // Increment innermost index
-    it->idx[0] += 1;
-
-    // Propagate "carry" overflows outward.  We're done when the outermost
-    // index reaches the requested count.
-    for (i = 0; i < it->stride_levels-1; i++) {
-      if (it->idx[i] >= it->count[i+1]) {
-        it->idx[i]    = 0;
-        it->idx[i+1] += 1;
-      }
-    }
-  }
-
-  return 1;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_PutS_flag = PARMCI_PutS_flag
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_PutS_flag ARMCI_PutS_flag
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_PutS_flag as PARMCI_PutS_flag
-#endif
-/* -- end weak symbols block -- */
-
-/** Blocking operation that transfers data from the calling process to the
-  * memory of the remote process.  The data transfer is strided and blocking.
-  * After the transfer completes, the given flag is set on the remote process.
-  *
-  * The flag is written only if the strided put succeeded, so a set flag
-  * always means the data was sent.
-  *
-  * @param[in] src_ptr         Source starting address of the data block to put.
-  * @param[in] src_stride_ar   Source array of stride distances in bytes.
-  * @param[in] dst_ptr         Destination starting address to put data.
-  * @param[in] dst_stride_ar   Destination array of stride distances in bytes.
-  * @param[in] count           Block size in each dimension. count[0] should be the
-  *                            number of bytes of contiguous data in leading dimension.
-  * @param[in] stride_levels   The level of strides.
-  * @param[in] flag            Location of the flag buffer
-  * @param[in] value           Value to set the flag to
-  * @param[in] proc            Remote process ID (destination).
-  *
-  * @return                    Zero on success, error code otherwise.
-  */
-int PARMCI_PutS_flag(void *src_ptr, int src_stride_ar[/*stride_levels*/],
-                 void *dst_ptr, int dst_stride_ar[/*stride_levels*/], 
-                 int count[/*stride_levels+1*/], int stride_levels, 
-                 int *flag, int value, int proc) {
-
-  int err;
-
-  err = PARMCI_PutS(src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels, proc);
-  if (err != 0)
-    return err;
-
-  /* Order the data ahead of the flag update. */
-  PARMCI_Fence(proc);
-
-  /* Report the flag write's status; it is zero on success, matching the
-   * documented contract (the previous code returned 1 unconditionally). */
-  return PARMCI_Put(&value, flag, sizeof(int), proc);
-}
-
-
-/* Gather a strided local buffer into a contiguous one.  This is a local operation.
- *
- * The strided layout is expanded into a list of segment pointers and each
- * segment of count[0] bytes is copied, in order, to consecutive positions
- * of dst.
- *
- * @param[in] src            Pointer to the strided buffer
- * @param[in] stride_levels  Number of levels of striding
- * @param[in] src_stride_arr Array of length stride_levels of stride lengths
- * @param[in] count          Array of length stride_levels+1 of the number of
- *                           units at each stride level (lowest is contiguous)
- * @param[in] dst            Destination contiguous buffer
- */
-void armci_write_strided(void *src, int stride_levels, int src_stride_arr[],
-                         int count[], char *dst) {
-  armci_giov_t segs;
-  char *out = dst;
-  int k;
-
-  // Only the source side matters here, so it is passed for both sides
-  ARMCII_Strided_to_iov(&segs, src, src_stride_arr, src, src_stride_arr, count, stride_levels);
-
-  for (k = 0; k < segs.ptr_array_len; k++, out += count[0])
-    ARMCI_Copy(segs.src_ptr_array[k], out, segs.bytes);
-
-  free(segs.src_ptr_array);
-  free(segs.dst_ptr_array);
-}
-
-
-/* Scatter a contiguous local buffer into a strided one.  This is a local operation.
- *
- * The strided layout is expanded into a list of segment pointers and
- * consecutive chunks of count[0] bytes from src are copied, in order, to
- * each segment.
- *
- * @param[in] dst            Destination strided buffer
- * @param[in] stride_levels  Number of levels of striding
- * @param[in] dst_stride_arr Array of length stride_levels of stride lengths
- * @param[in] count          Array of length stride_levels+1 of the number of
- *                           units at each stride level (lowest is contiguous)
- * @param[in] src            Pointer to the contiguous buffer
- */
-void armci_read_strided(void *dst, int stride_levels, int dst_stride_arr[],
-                        int count[], char *src) {
-  armci_giov_t segs;
-  char *in = src;
-  int k;
-
-  // Only the destination side matters here, so it is passed for both sides
-  ARMCII_Strided_to_iov(&segs, dst, dst_stride_arr, dst, dst_stride_arr, count, stride_levels);
-
-  for (k = 0; k < segs.ptr_array_len; k++, in += count[0])
-    ARMCI_Copy(in, segs.dst_ptr_array[k], segs.bytes);
-
-  free(segs.src_ptr_array);
-  free(segs.dst_ptr_array);
-}
diff --git a/src/armci/src/topology.c b/src/armci/src/topology.c
deleted file mode 100644
index 950d125..0000000
--- a/src/armci/src/topology.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <armci.h>
-#include <armci_internals.h>
-#include <debug.h>
-
-/** NOTE: Domains are not implemented.  These dummy wrappers assume that all
-  * domains are of size 1. */
-
-/** Report how many processes a domain contains.  Domains are not
-  * implemented, so the answer is always 1.
-  *
-  * @param[in] domain    Desired domain (ignored).
-  * @param[in] domain_id Domain id or -1 for my domain (ignored).
-  *
-  * @return              Always 1.
-  */
-int armci_domain_nprocs(armci_domain_t domain, int domain_id) {
-  (void) domain;
-  (void) domain_id;
-
-  return 1;
-}
-
-/** Report the domain a process belongs to.  With size-1 domains the domain
-  * id equals the global process id.
-  */
-int armci_domain_id(armci_domain_t domain, int glob_proc_id) {
-  const int id = glob_proc_id;
-
-  (void) domain;
-
-  return id;
-}
-
-/** Map a (domain, local process) pair to a global process ID.  Only local
-  * id 0 is accepted; the global id is then the domain id.
-  */
-int armci_domain_glob_proc_id(armci_domain_t domain, int domain_id, int loc_proc_id) {
-  (void) domain;
-
-  // Domains hold exactly one process, so the only valid local id is 0
-  ARMCII_Assert(loc_proc_id == 0);
-
-  return domain_id;
-}
-
-/** Report the caller's domain id, which is its rank in the world group.
-  */
-int armci_domain_my_id(armci_domain_t domain) {
-  (void) domain;
-
-  return ARMCI_GROUP_WORLD.rank;
-}
-
-/** Report the number of domains, which is the size of the world group.
-  */
-int armci_domain_count(armci_domain_t domain) {
-  (void) domain;
-
-  return ARMCI_GROUP_WORLD.size;
-}
-
-/** Report whether the given process shares a domain with the caller, i.e.
-  * whether it is the caller itself.
-  */
-int armci_domain_same_id(armci_domain_t domain, int glob_proc_id) {
-  const int is_me = (ARMCI_GROUP_WORLD.rank == glob_proc_id);
-
-  (void) domain;
-
-  return is_me;
-}
-
-
-/** Report whether a process counts as being on the caller's node.  This
-  * implementation answers true only for the caller's own world rank.
-  *
-  * @param[in] proc Process id in question
-  *
-  * @return         Nonzero if proc is the calling process, zero otherwise.
-  */
-int ARMCI_Same_node(int proc) {
-  const int is_me = (ARMCI_GROUP_WORLD.rank == proc);
-
-  return is_me;
-}
diff --git a/src/armci/src/util.c b/src/armci/src/util.c
deleted file mode 100644
index 3121bd7..0000000
--- a/src/armci/src/util.c
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <mpi.h>
-
-#include <armci.h>
-#include <armci_internals.h>
-#include <debug.h>
-#include <gmr.h>
-
-
-/** Fatal error, print the message and abort the program with the provided
-  * error code.
-  */
-void ARMCI_Error(char *msg, int code) {
-  fprintf(stderr, "[%d] ARMCI Error: %s\n", ARMCI_GROUP_WORLD.rank, msg);
-  fflush(NULL);
-  MPI_Abort(ARMCI_GROUP_WORLD.comm, code);
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Barrier = PARMCI_Barrier
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Barrier ARMCI_Barrier
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Barrier as PARMCI_Barrier
-#endif
-/* -- end weak symbols block -- */
-
-/** Barrier synchronization.  Collective on the world group (not the default
-  * group!).
-  */
-void PARMCI_Barrier(void) {
-  PARMCI_AllFence();
-  MPI_Barrier(ARMCI_GROUP_WORLD.comm);
-
-  if (ARMCII_GLOBAL_STATE.debug_flush_barriers) {
-    ARMCII_Flush_local();
-  }
-}
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_Fence = PARMCI_Fence
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_Fence ARMCI_Fence
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_Fence as PARMCI_Fence
-#endif
-/* -- end weak symbols block -- */
-
-/** Wait for remote completion on one-sided operations targeting process proc.
-  * In MPI-2, this is a no-op since get/put/acc already guarantee remote
-  * completion.
-  *
-  * @param[in] proc Process to target
-  */
-void PARMCI_Fence(int proc) {
-  return;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_AllFence = PARMCI_AllFence
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_AllFence ARMCI_AllFence
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_AllFence as PARMCI_AllFence
-#endif
-/* -- end weak symbols block -- */
-
-/** Wait for remote completion on all one-sided operations.  In MPI-2, this is
-  * a no-op since get/put/acc already guarantee remote completion.
-  */
-void PARMCI_AllFence(void) {
-  return;
-}
-
-
-int ARMCI_Uses_shm(void) {
-  return 0;
-}
-
-
-void ARMCI_Set_shm_limit(unsigned long shmemlimit) {
-  return;
-}
-
-
-int ARMCI_Uses_shm_grp(ARMCI_Group *group) {
-  return 0;
-}
-
-
-/** Copy local data.
-  *
-  * @param[in]  src  Source buffer
-  * @param[out] dst  Destination buffer
-  * @param[in]  size Number of bytes to copy
-  */
-void ARMCI_Copy(void *src, void *dst, int size) {
-#ifndef COPY_WITH_SENDRECV
-  memcpy(dst, src, size);
-#else
-  static MPI_Comm copy_comm = MPI_COMM_NULL;
-
-  if (copy_comm == MPI_COMM_NULL)
-    MPI_Comm_dup(MPI_COMM_SELF, &copy_comm);
-
-  MPI_Sendrecv(src, size, MPI_BYTE,
-      0 /* rank */, 0 /* tag */,
-      dst, size, MPI_BYTE,
-      0 /* rank */, 0 /* tag */,
-      copy_comm, MPI_STATUS_IGNORE);
-#endif
-}
-
-
-/** Zero out the given buffer.
-  */
-void ARMCII_Bzero(void *buf, armci_size_t size) {
-  armci_size_t i;
-  uint8_t *buf_b = (uint8_t *)buf;
-
-  for (i = 0; i < size; i++)
-    buf_b[i] = 0;
-}
-
-
-static const unsigned char log2_table[256] = 
-    { 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
-      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-      5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
-      6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-      6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-      6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7,
-      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 };
-
-/** Calculate the base 2 logarithm of a given integer.
-  */
-int ARMCII_Log2(unsigned int val) {
-  unsigned int v16, v8;
-  int lg = 0;
-
-  if (val == 0) return -1;
-
-  if ((v16 = val >> 16))
-    lg = (v8 = v16 >> 8) ? log2_table[v8] + 24 : log2_table[v16] + 16;
-  else
-    lg = (v8 = val >> 8) ? log2_table[v8] + 8 : log2_table[val];
-
-  return lg;
-}
-
-
-/** Retrieve the value of a boolean environment variable.
-  */
-int ARMCII_Getenv_bool(char *varname, int default_value) {
-  char *var = getenv(varname);
-
-  if (var == NULL)
-    return default_value;
-  
-  if (var[0] == 'T' || var[0] == 't' || var[0] == '1' || var[0] == 'y' || var[0] == 'Y')
-    return 1;
-
-  else
-    return 0;
-}
-
-
-/** Retrieve the value of a environment variable.
-  */
-char *ARMCII_Getenv(char *varname) {
-  return getenv(varname);
-}
-
-
-/** Retrieve the value of an integer environment variable.
-  */
-int ARMCII_Getenv_int(char *varname, int default_value) {
-  char *var = getenv("ARMCI_IOV_BATCHED_LIMIT");
-  if (var) 
-    return atoi(var);
-  else
-    return default_value;
-}
diff --git a/src/armci/src/value_ops.c b/src/armci/src/value_ops.c
deleted file mode 100644
index c0f3966..0000000
--- a/src/armci/src/value_ops.c
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <armci.h>
-#include <debug.h>
-
-/* Put value operations */
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_PutValueInt = PARMCI_PutValueInt
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_PutValueInt ARMCI_PutValueInt
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_PutValueInt as PARMCI_PutValueInt
-#endif
-/* -- end weak symbols block -- */
-
-int PARMCI_PutValueInt(int src, void *dst, int proc) {
-  return PARMCI_Put(&src, dst, sizeof(int), proc);
-}
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_PutValueLong = PARMCI_PutValueLong
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_PutValueLong ARMCI_PutValueLong
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_PutValueLong as PARMCI_PutValueLong
-#endif
-/* -- end weak symbols block -- */
-
-int PARMCI_PutValueLong(long src, void *dst, int proc) {
-  return PARMCI_Put(&src, dst, sizeof(long), proc);
-}
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_PutValueFloat = PARMCI_PutValueFloat
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_PutValueFloat ARMCI_PutValueFloat
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_PutValueFloat as PARMCI_PutValueFloat
-#endif
-/* -- end weak symbols block -- */
-
-int PARMCI_PutValueFloat(float src, void *dst, int proc) {
-  return PARMCI_Put(&src, dst, sizeof(float), proc);
-}
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_PutValueDouble = PARMCI_PutValueDouble
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_PutValueDouble ARMCI_PutValueDouble
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_PutValueDouble as PARMCI_PutValueDouble
-#endif
-/* -- end weak symbols block -- */
-
-int PARMCI_PutValueDouble(double src, void *dst, int proc) {
-  return PARMCI_Put(&src, dst, sizeof(double), proc);
-}
-
-/* Non-blocking put operations */
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_NbPutValueInt = PARMCI_NbPutValueInt
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_NbPutValueInt ARMCI_NbPutValueInt
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_NbPutValueInt as PARMCI_NbPutValueInt
-#endif
-/* -- end weak symbols block -- */
-
-int PARMCI_NbPutValueInt(int src, void *dst, int proc, armci_hdl_t *hdl) {
-  return PARMCI_NbPut(&src, dst, sizeof(int), proc, hdl);
-}
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_NbPutValueLong = PARMCI_NbPutValueLong
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_NbPutValueLong ARMCI_NbPutValueLong
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_NbPutValueLong as PARMCI_NbPutValueLong
-#endif
-/* -- end weak symbols block -- */
-
-int PARMCI_NbPutValueLong(long src, void *dst, int proc, armci_hdl_t *hdl) {
-  return PARMCI_NbPut(&src, dst, sizeof(long), proc, hdl);
-}
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_NbPutValueFloat = PARMCI_NbPutValueFloat
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_NbPutValueFloat ARMCI_NbPutValueFloat
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_NbPutValueFloat as PARMCI_NbPutValueFloat
-#endif
-/* -- end weak symbols block -- */
-
-int PARMCI_NbPutValueFloat(float src, void *dst, int proc, armci_hdl_t *hdl) {
-  return PARMCI_NbPut(&src, dst, sizeof(float), proc, hdl);
-}
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_NbPutValueDouble = PARMCI_NbPutValueDouble
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_NbPutValueDouble ARMCI_NbPutValueDouble
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_NbPutValueDouble as PARMCI_NbPutValueDouble
-#endif
-/* -- end weak symbols block -- */
-
-int PARMCI_NbPutValueDouble(double src, void *dst, int proc, armci_hdl_t *hdl) {
-  return PARMCI_NbPut(&src, dst, sizeof(double), proc, hdl);
-}
-
-/* Get value operations */
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_GetValueInt = PARMCI_GetValueInt
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_GetValueInt ARMCI_GetValueInt
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_GetValueInt as PARMCI_GetValueInt
-#endif
-/* -- end weak symbols block -- */
-
-int    PARMCI_GetValueInt(void *src, int proc) {
-  int val;
-  PARMCI_Get(src, &val, sizeof(int), proc);
-  return val;
-}
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_GetValueLong = PARMCI_GetValueLong
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_GetValueLong ARMCI_GetValueLong
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_GetValueLong as PARMCI_GetValueLong
-#endif
-/* -- end weak symbols block -- */
-
-long   PARMCI_GetValueLong(void *src, int proc) {
-  long val;
-  PARMCI_Get(src, &val, sizeof(long), proc);
-  return val;
-}
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_GetValueFloat = PARMCI_GetValueFloat
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_GetValueFloat ARMCI_GetValueFloat
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_GetValueFloat as PARMCI_GetValueFloat
-#endif
-/* -- end weak symbols block -- */
-
-float  PARMCI_GetValueFloat(void *src, int proc) {     
-  float val;
-  PARMCI_Get(src, &val, sizeof(float), proc);
-  return val;
-}
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_GetValueDouble = PARMCI_GetValueDouble
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_GetValueDouble ARMCI_GetValueDouble
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_GetValueDouble as PARMCI_GetValueDouble
-#endif
-/* -- end weak symbols block -- */
-
-double PARMCI_GetValueDouble(void *src, int proc) {     
-  double val;
-  PARMCI_Get(src, &val, sizeof(double), proc);
-  return val;
-}
diff --git a/src/armci/src/vector.c b/src/armci/src/vector.c
deleted file mode 100644
index 738f4f8..0000000
--- a/src/armci/src/vector.c
+++ /dev/null
@@ -1,532 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <armci.h>
-#include <armci_internals.h>
-#include <debug.h>
-#include <gmr.h>
-
-#ifndef NO_USE_CTREE
-#include <conflict_tree.h>
-#endif
-
-
-/** Check an I/O vector operation's buffers for overlap.
-  *
-  * @param[in] iov      Vector of transfer information.
-  * @return             Logical true when regions overlap, 0 otherwise.
-  */
-int ARMCII_Iov_check_overlap(void **ptrs, int count, int size) {
-#ifndef NO_CHECK_OVERLAP
-#ifdef NO_USE_CTREE
-  int i, j;
-
-  if (!ARMCII_GLOBAL_STATE.iov_checks) return 0;
-
-  for (i = 0; i < count; i++) {
-    for (j = i+1; j < count; j++) {
-      const uint8_t *ptr_1_lo = ptrs[i];
-      const uint8_t *ptr_1_hi = ((uint8_t*)ptrs[i]) + size - 1;
-      const uint8_t *ptr_2_lo = ptrs[j];
-      const uint8_t *ptr_2_hi = ((uint8_t*)ptrs[j]) + size - 1;
-
-      if (   (ptr_1_lo >= ptr_2_lo && ptr_1_lo <= ptr_2_hi)
-          || (ptr_1_hi >= ptr_2_lo && ptr_1_hi <= ptr_2_hi)
-          || (ptr_1_lo <  ptr_2_lo && ptr_1_hi >  ptr_2_hi)) {
-        ARMCII_Dbg_print(DEBUG_CAT_IOV, "IOV regions overlap: [%p, %p] - [%p, %p]\n",
-            ptr_1_lo, ptr_1_hi, ptr_2_lo, ptr_2_hi);
-        return 1;
-      }
-    }
-  }
-#else
-  int i;
-  ctree_t ctree = CTREE_EMPTY;
-
-  if (!ARMCII_GLOBAL_STATE.iov_checks) return 0;
-
-  for (i = 0; i < count; i++) {
-    int conflict = ctree_insert(&ctree, ptrs[i], ((uint8_t*)ptrs[i]) + size - 1);
-
-    if (conflict) {
-      ctree_t cnode = ctree_locate(ctree, ptrs[i], ((uint8_t*)ptrs[i]) + size - 1);
-
-      ARMCII_Dbg_print(DEBUG_CAT_IOV, "IOV regions overlap: [%p, %p] - [%p, %p]\n",
-          ptrs[i], ((uint8_t*)ptrs[i]) + size - 1, cnode->lo, cnode->hi);
-
-      ctree_destroy(&ctree);
-      return 1;
-    }
-  }
-
-  ctree_destroy(&ctree);
-#endif /* NO_USE_CTREE */
-#endif /* NO_CHECK_OVERLAP */
-
-  return 0;
-}
-
-
-/** Check if a set of pointers all corresponds to the same allocation.
-  *
-  * @param[in] ptrs  An array of count shared pointers valid on proc.
-  * @param[in] count Size of the ptrs array.
-  * @param[in] proc  Process on which the pointers are valid.
-  * @return          Non-zero (true) on success, zero (false) otherwise.
-  */
-int ARMCII_Iov_check_same_allocation(void **ptrs, int count, int proc) {
-  int i;
-  gmr_t *mreg;
-  void *base, *extent;
-
-  if (!ARMCII_GLOBAL_STATE.iov_checks) return 1;
-
-  mreg = gmr_lookup(ptrs[0], proc);
-
-  /* If local, all must be local */
-  if (mreg == NULL) {
-    for (i = 1; i < count; i++) {
-      mreg = gmr_lookup(ptrs[i], proc);
-      if (mreg != NULL)
-        return 0;
-    }
-  }
-  /* If shared, all must fall in this region */
-  else {
-    base   = mreg->slices[proc].base;
-    extent = ((uint8_t*) base) + mreg->slices[proc].size;
-
-    for (i = 1; i < count; i++)
-      if ( !(ptrs[i] >= base && ptrs[i] < extent) )
-        return 0;
-  }
-
-  return 1;
-}
-
-
-/** Perform an I/O vector operation.  Local buffers must be private.
-  *
-  * @param[in] op          Operation to be performed (ARMCII_OP_PUT, ...)
-  * @param[in] src         Array of source pointers
-  * @param[in] dst         Array of destination pointers
-  * @param[in] count       Length of pointer arrays
-  * @param[in] size        Size of each transfer
-  * @param[in] datatype    Data type for accumulate op (ignored for all others)
-  * @param[in] overlapping Do remote regions overlap?
-  * @param[in] same_alloc  Do remote regions correspond to the same allocation?
-  * @param[in] proc        Target process
-  * @return                Zero on success, error code otherwise
-  */
-int ARMCII_Iov_op_dispatch(enum ARMCII_Op_e op, void **src, void **dst, int count, int size,
-    int datatype, int overlapping, int same_alloc, int proc) {
-
-  MPI_Datatype type;
-  int type_count, type_size;
-
-  if (op == ARMCII_OP_ACC) {
-    ARMCII_Acc_type_translate(datatype, &type, &type_size);
-    type_count = size/type_size;
-    ARMCII_Assert_msg(size % type_size == 0, "Transfer size is not a multiple of type size");
-  } else {
-    type = MPI_BYTE;
-    MPI_Type_size(type, &type_size);
-    type_count = size/type_size;
-    ARMCII_Assert_msg(size % type_size == 0, "Transfer size is not a multiple of type size");
-  }
-
-  // CONSERVATIVE CASE: If remote pointers overlap or remote pointers correspond to
-  // multiple allocations, use the safe implementation to avoid invalid MPI
-  // use.
-
-  if (overlapping || !same_alloc || ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_CONSRV) {
-    if (overlapping) ARMCII_Warning("IOV remote buffers overlap\n");
-    if (!same_alloc) ARMCII_Warning("IOV remote buffers are not within the same allocation\n");
-    return ARMCII_Iov_op_safe(op, src, dst, count, type_count, type, proc);
-  }
-
-  // OPTIMIZED CASE: It's safe for us to issue all the operations under a
-  // single lock.
-
-  else if (   ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_DIRECT
-           || ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_AUTO  ) {
-    return ARMCII_Iov_op_datatype(op, src, dst, count, type_count, type, proc);
-
-  } else if (ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_BATCHED) {
-    return ARMCII_Iov_op_batched(op, src, dst, count, type_count, type, proc);
-
-  } else {
-    ARMCII_Error("unknown iov method (%d)\n", ARMCII_GLOBAL_STATE.iov_method);
-    return 1;
-  }
-}
-
-
-/** Safe implementation of the ARMCI IOV operation
-  */
-int ARMCII_Iov_op_safe(enum ARMCII_Op_e op, void **src, void **dst, int count, int elem_count,
-    MPI_Datatype type, int proc) {
-  
-  int i;
-
-  for (i = 0; i < count; i++) {
-    gmr_t *mreg;
-    void *shr_ptr;
-
-    switch(op) {
-      case ARMCII_OP_PUT:
-        shr_ptr = dst[i];
-        break;
-      case ARMCII_OP_GET:
-        shr_ptr = src[i];
-        break;
-      case ARMCII_OP_ACC:
-        shr_ptr = dst[i];
-        break;
-      default:
-        ARMCII_Error("unknown operation (%d)", op);
-        return 1;
-    }
-
-    mreg = gmr_lookup(shr_ptr, proc);
-    ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");
-
-    gmr_lock(mreg, proc);
-
-    switch(op) {
-      case ARMCII_OP_PUT:
-        gmr_put(mreg, src[i], dst[i], elem_count, proc);
-        break;
-      case ARMCII_OP_GET:
-        gmr_get(mreg, src[i], dst[i], elem_count, proc);
-        break;
-      case ARMCII_OP_ACC:
-        gmr_accumulate(mreg, src[i], dst[i], elem_count, type, proc);
-        break;
-      default:
-        ARMCII_Error("unknown operation (%d)", op);
-        return 1;
-    }
-
-    gmr_unlock(mreg, proc);
-  }
-
-  return 0;
-}
-
-
-/** Optimized implementation of the ARMCI IOV operation that uses a single
-  * lock/unlock pair.
-  */
-int ARMCII_Iov_op_batched(enum ARMCII_Op_e op, void **src, void **dst, int count, int elem_count,
-    MPI_Datatype type, int proc) {
-
-  int i;
-  gmr_t *mreg;
-  void *shr_ptr;
-
-  switch(op) {
-    case ARMCII_OP_PUT:
-      shr_ptr = dst[0];
-      break;
-    case ARMCII_OP_GET:
-      shr_ptr = src[0];
-      break;
-    case ARMCII_OP_ACC:
-      shr_ptr = dst[0];
-      break;
-    default:
-      ARMCII_Error("unknown operation (%d)", op);
-      return 1;
-  }
-
-  mreg = gmr_lookup(shr_ptr, proc);
-  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");
-
-  gmr_lock(mreg, proc);
-
-  for (i = 0; i < count; i++) {
-
-    if (   ARMCII_GLOBAL_STATE.iov_batched_limit > 0 
-        && i % ARMCII_GLOBAL_STATE.iov_batched_limit == 0
-        && i > 0 )
-    {
-      gmr_unlock(mreg, proc);
-      gmr_lock(mreg, proc);
-    }
-
-    switch(op) {
-      case ARMCII_OP_PUT:
-        gmr_put(mreg, src[i], dst[i], elem_count, proc);
-        break;
-      case ARMCII_OP_GET:
-        gmr_get(mreg, src[i], dst[i], elem_count, proc);
-        break;
-      case ARMCII_OP_ACC:
-        gmr_accumulate(mreg, src[i], dst[i], elem_count, type, proc);
-        break;
-      default:
-        ARMCII_Error("unknown operation (%d)", op);
-        return 1;
-    }
-  }
-
-  gmr_unlock(mreg, proc);
-
-  return 0;
-}
-
-
-/** Optimized implementation of the ARMCI IOV operation that uses an MPI
-  * datatype to achieve a one-sided gather/scatter.
-  */
-int ARMCII_Iov_op_datatype(enum ARMCII_Op_e op, void **src, void **dst, int count, int elem_count,
-    MPI_Datatype type, int proc) {
-
-    gmr_t *mreg;
-    MPI_Datatype  type_loc, type_rem;
-    MPI_Aint      disp_loc[count];
-    int           disp_rem[count];
-    int           block_len[count];
-    void         *dst_win_base;
-    int           dst_win_size, i, type_size;
-    void        **buf_rem, **buf_loc;
-    MPI_Aint      base_rem;
-
-    switch(op) {
-      case ARMCII_OP_ACC:
-      case ARMCII_OP_PUT:
-        buf_rem = dst;
-        buf_loc = src;
-        break;
-      case ARMCII_OP_GET:
-        buf_rem = src;
-        buf_loc = dst;
-        break;
-      default:
-        ARMCII_Error("unknown operation (%d)", op);
-        return 1;
-    }
-
-    MPI_Type_size(type, &type_size);
-
-    mreg = gmr_lookup(buf_rem[0], proc);
-    ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");
-
-    dst_win_base = mreg->slices[proc].base;
-    dst_win_size = mreg->slices[proc].size;
-
-    MPI_Get_address(dst_win_base, &base_rem);
-
-    for (i = 0; i < count; i++) {
-      MPI_Aint target_rem;
-      MPI_Get_address(buf_loc[i], &disp_loc[i]);
-      MPI_Get_address(buf_rem[i], &target_rem);
-      disp_rem[i]  = (target_rem - base_rem)/type_size;
-      block_len[i] = elem_count;
-
-      ARMCII_Assert_msg((target_rem - base_rem) % type_size == 0, "Transfer size is not a multiple of type size");
-      ARMCII_Assert_msg(disp_rem[i] >= 0 && disp_rem[i] < dst_win_size, "Invalid remote pointer");
-      ARMCII_Assert_msg(((uint8_t*)buf_rem[i]) + block_len[i] <= ((uint8_t*)dst_win_base) + dst_win_size, "Transfer exceeds buffer length");
-    }
-
-    MPI_Type_create_hindexed(count, block_len, disp_loc, type, &type_loc);
-    MPI_Type_create_indexed_block(count, elem_count, disp_rem, type, &type_rem);
-
-    /* MPI_Type_create_indexed_block should be more efficient than this:
-       MPI_Type_indexed(count, block_len, disp_rem, type, &type_rem); */
-
-    MPI_Type_commit(&type_loc);
-    MPI_Type_commit(&type_rem);
-
-    gmr_lock(mreg, proc);
-
-    switch(op) {
-      case ARMCII_OP_ACC:
-        gmr_accumulate_typed(mreg, MPI_BOTTOM, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc);
-        break;
-      case ARMCII_OP_PUT:
-        gmr_put_typed(mreg, MPI_BOTTOM, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc);
-        break;
-      case ARMCII_OP_GET:
-        gmr_get_typed(mreg, MPI_BOTTOM, 1, type_rem, MPI_BOTTOM, 1, type_loc, proc);
-        break;
-      default:
-        ARMCII_Error("unknown operation (%d)", op);
-        return 1;
-    }
-
-    gmr_unlock(mreg, proc);
-
-    MPI_Type_free(&type_loc);
-    MPI_Type_free(&type_rem);
-
-    return 0;
-}    
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_PutV = PARMCI_PutV
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_PutV ARMCI_PutV
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_PutV as PARMCI_PutV
-#endif
-/* -- end weak symbols block -- */
-
-/** Generalized I/O vector one-sided put.
-  *
-  * @param[in] iov      Vector of transfer information.
-  * @param[in] iov_len  Length of iov.
-  * @param[in] proc     Target process.
-  * @return             Success 0, otherwise non-zero.
-  */
-int PARMCI_PutV(armci_giov_t *iov, int iov_len, int proc) {
-  int v;
-
-  for (v = 0; v < iov_len; v++) {
-    void **src_buf;
-    int    overlapping, same_alloc;
-
-    if (iov[v].ptr_array_len == 0) continue; // NOP //
-    if (iov[v].bytes == 0) continue; // NOP //
-
-    overlapping = ARMCII_Iov_check_overlap(iov[v].dst_ptr_array, iov[v].ptr_array_len, iov[v].bytes);
-    same_alloc  = ARMCII_Iov_check_same_allocation(iov[v].dst_ptr_array, iov[v].ptr_array_len, proc);
-
-    ARMCII_Buf_prepare_read_vec(iov[v].src_ptr_array, &src_buf, iov[v].ptr_array_len, iov[v].bytes);
-    ARMCII_Iov_op_dispatch(ARMCII_OP_PUT, src_buf, iov[v].dst_ptr_array, iov[v].ptr_array_len, iov[v].bytes, 0, overlapping, same_alloc, proc);
-    ARMCII_Buf_finish_read_vec(iov[v].src_ptr_array, src_buf, iov[v].ptr_array_len, iov[v].bytes);
-  }
-
-  return 0;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_GetV = PARMCI_GetV
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_GetV ARMCI_GetV
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_GetV as PARMCI_GetV
-#endif
-/* -- end weak symbols block -- */
-
-/** Generalized I/O vector one-sided get.
-  *
-  * @param[in] iov      Vector of transfer information.
-  * @param[in] iov_len  Length of iov.
-  * @param[in] proc     Target process.
-  * @return             Success 0, otherwise non-zero.
-  */
-int PARMCI_GetV(armci_giov_t *iov, int iov_len, int proc) {
-  int v;
-
-  for (v = 0; v < iov_len; v++) {
-    void **dst_buf;
-    int    overlapping, same_alloc;
-
-    if (iov[v].ptr_array_len == 0) continue; // NOP //
-    if (iov[v].bytes == 0) continue; // NOP //
-
-    // overlapping = ARMCII_Iov_check_overlap(iov[v].src_ptr_array, iov[v].ptr_array_len, iov[v].bytes);
-    overlapping = ARMCII_Iov_check_overlap(iov[v].dst_ptr_array, iov[v].ptr_array_len, iov[v].bytes);
-    same_alloc  = ARMCII_Iov_check_same_allocation(iov[v].src_ptr_array, iov[v].ptr_array_len, proc);
-
-    ARMCII_Buf_prepare_write_vec(iov[v].dst_ptr_array, &dst_buf, iov[v].ptr_array_len, iov[v].bytes);
-    ARMCII_Iov_op_dispatch(ARMCII_OP_GET, iov[v].src_ptr_array, dst_buf, iov[v].ptr_array_len, iov[v].bytes, 0, overlapping, same_alloc, proc);
-    ARMCII_Buf_finish_write_vec(iov[v].dst_ptr_array, dst_buf, iov[v].ptr_array_len, iov[v].bytes);
-  }
-
-  return 0;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_AccV = PARMCI_AccV
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_AccV ARMCI_AccV
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_AccV as PARMCI_AccV
-#endif
-/* -- end weak symbols block -- */
-
-/** Generalized I/O vector one-sided accumulate.
-  *
-  * @param[in] iov      Vector of transfer information.
-  * @param[in] iov_len  Length of iov.
-  * @param[in] proc     Target process.
-  * @return             Success 0, otherwise non-zero.
-  */
-int PARMCI_AccV(int datatype, void *scale, armci_giov_t *iov, int iov_len, int proc) {
-  int v;
-
-  for (v = 0; v < iov_len; v++) {
-    void **src_buf;
-    int    overlapping, same_alloc;
-
-    if (iov[v].ptr_array_len == 0) continue; // NOP //
-    if (iov[v].bytes == 0) continue; // NOP //
-
-    overlapping = ARMCII_Iov_check_overlap(iov[v].dst_ptr_array, iov[v].ptr_array_len, iov[v].bytes);
-    same_alloc  = ARMCII_Iov_check_same_allocation(iov[v].dst_ptr_array, iov[v].ptr_array_len, proc);
-
-    ARMCII_Buf_prepare_acc_vec(iov[v].src_ptr_array, &src_buf, iov[v].ptr_array_len, iov[v].bytes, datatype, scale);
-    ARMCII_Iov_op_dispatch(ARMCII_OP_ACC, src_buf, iov[v].dst_ptr_array, iov[v].ptr_array_len, iov[v].bytes, datatype, overlapping, same_alloc, proc);
-    ARMCII_Buf_finish_acc_vec(iov[v].src_ptr_array, src_buf, iov[v].ptr_array_len, iov[v].bytes);
-  }
-
-  return 0;
-}
-
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_NbPutV = PARMCI_NbPutV
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_NbPutV ARMCI_NbPutV
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_NbPutV as PARMCI_NbPutV
-#endif
-/* -- end weak symbols block -- */
-
-int PARMCI_NbPutV(armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle) {
-  return PARMCI_PutV(iov, iov_len, proc);
-}
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_NbGetV = PARMCI_NbGetV
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_NbGetV ARMCI_NbGetV
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_NbGetV as PARMCI_NbGetV
-#endif
-/* -- end weak symbols block -- */
-
-int PARMCI_NbGetV(armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle) {
-  return PARMCI_GetV(iov, iov_len, proc);
-}
-
-/* -- begin weak symbols block -- */
-#if defined(HAVE_PRAGMA_WEAK)
-#  pragma weak ARMCI_NbAccV = PARMCI_NbAccV
-#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-#  pragma _HP_SECONDARY_DEF PARMCI_NbAccV ARMCI_NbAccV
-#elif defined(HAVE_PRAGMA_CRI_DUP)
-#  pragma _CRI duplicate ARMCI_NbAccV as PARMCI_NbAccV
-#endif
-/* -- end weak symbols block -- */
-
-int PARMCI_NbAccV(int datatype, void *scale, armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle) {
-  return PARMCI_AccV(datatype, scale, iov, iov_len, proc);
-}
-
-
diff --git a/src/armci/tests/ARMCI_AccS_latency.c b/src/armci/tests/ARMCI_AccS_latency.c
deleted file mode 100644
index c0708b0..0000000
--- a/src/armci/tests/ARMCI_AccS_latency.c
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-#include <armci.h>
-
-#define MAX_XDIM 1024 
-#define MAX_YDIM 1024
-#define ITERATIONS 10
-#define SKIP 1
-
-#ifdef VANILLA_ARMCI
-#define ARMCI_Access_begin(X) ((void*)0)
-#define ARMCI_Access_end(X) ((void*)0)
-#endif 
-
-/* Check the leading xdim-by-ydim patch of buf against `expected`, then set
- * all nelem elements of buf back to `reset_value`.
- *
- * Consecutive rows are MAX_YDIM doubles apart, which is the stride main()
- * hands to ARMCI_AccS.  On the first mismatch the element is reported and
- * ARMCI_Error is called.  The whole sequence is bracketed by
- * ARMCI_Access_begin/ARMCI_Access_end on buf.
- */
-static void validate_and_reset(double *buf, size_t xdim, size_t ydim,
-                               size_t nelem, double expected, double reset_value)
-{
-    size_t i, j;
-
-    ARMCI_Access_begin(buf);
-
-    for (i = 0; i < xdim; i++)
-    {
-        for (j = 0; j < ydim; j++)
-        {
-            const double actual = *(buf + i * MAX_YDIM + j);
-
-            if (actual != expected)
-            {
-                /* Report the same expected value that was compared against */
-                printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
-                       (int) i, (int) j, expected, actual);
-                fflush(stdout);
-                ARMCI_Error("Bailing out", 1);
-            }
-        }
-    }
-
-    for (i = 0; i < nelem; i++)
-    {
-        buf[i] = reset_value;
-    }
-
-    ARMCI_Access_end(buf);
-}
-
-/* Strided accumulate latency benchmark.
- *
- * For every power-of-two patch size up to MAX_XDIM x MAX_YDIM, rank 0 issues
- * ITERATIONS + SKIP ARMCI_AccS calls targeting rank 1 in two phases:
- *   1. the timer is stopped before a single trailing ARMCI_Fence
- *      (printed under "Local Completion");
- *   2. ARMCI_Fence is called after every accumulate inside the timed loop
- *      (printed under "Remote completion").
- * After each phase rank 1 validates the accumulated patch and resets its
- * buffer.  All ranks take part in the four ARMCI_Barrier calls per patch
- * size, so the barrier sequence below must stay identical on both branches.
- */
-int main(int argc, char **argv)
-{
-
-    int i, rank, nranks, peer;
-    size_t xdim, ydim, k, nelem;
-    unsigned long bufsize;
-    double **buffer, *src_buf;
-    double t_start = 0.0, t_stop;
-    int count[2], src_stride, trg_stride, stride_level;
-    double scaling;
-    int provided;
-
-    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
-
-    if (nranks < 2) {
-        printf("%s: Must be run with at least 2 processes\n", argv[0]);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-    ARMCI_Init_args(&argc, &argv);
-
-    buffer = (double **) malloc(sizeof(double *) * nranks);
-
-    bufsize = MAX_XDIM * MAX_YDIM * sizeof(double);
-    nelem   = bufsize / sizeof(double);
-    ARMCI_Malloc((void **) buffer, bufsize);
-    src_buf = ARMCI_Malloc_local(bufsize);
-
-    if (rank == 0)
-    {
-        printf("ARMCI_AccS Latency - local and remote completions - in usec \n");
-        printf("%30s %22s %22s\n",
-               "Dimensions(array of double)",
-               "Local Completion",
-               "Remote completion");
-        fflush(stdout);
-    }
-
-    ARMCI_Access_begin(buffer[rank]);
-    for (k = 0; k < nelem; k++)
-    {
-      *(buffer[rank] + k) = 1.0 + rank;
-      *(src_buf + k) = 1.0 + rank;
-    }
-    ARMCI_Access_end(buffer[rank]);
-
-    scaling = 2.0;
-
-    src_stride = MAX_YDIM * sizeof(double);
-    trg_stride = MAX_YDIM * sizeof(double);
-    stride_level = 1;
-
-    ARMCI_Barrier();
-
-    for (xdim = 1; xdim <= MAX_XDIM; xdim *= 2)
-    {
-
-        count[1] = xdim;
-
-        for (ydim = 1; ydim <= MAX_YDIM; ydim *= 2)
-        {
-
-            count[0] = ydim * sizeof(double);
-
-            if (rank == 0)
-            {
-                /* Room for two full-width ints plus the separator */
-                char temp[32];
-
-                peer = 1;
-
-                /* Phase 1: the timer stops before the fence */
-                for (i = 0; i < ITERATIONS + SKIP; i++)
-                {
-
-                    if (i == SKIP) t_start = MPI_Wtime();
-
-                    ARMCI_AccS(ARMCI_ACC_DBL,
-                               (void *) &scaling,
-                               /* (void *) buffer[rank] */ src_buf,
-                               &src_stride,
-                               (void *) buffer[peer],
-                               &trg_stride,
-                               count,
-                               stride_level,
-                               peer);
-
-                }
-                t_stop = MPI_Wtime();
-                ARMCI_Fence(peer);
-
-                snprintf(temp, sizeof temp, "%dX%d", (int) xdim, (int) ydim);
-                printf("%30s %20.2f ", temp, ((t_stop - t_start) * 1000000)
-                        / ITERATIONS);
-                fflush(stdout);
-
-                ARMCI_Barrier();
-
-                ARMCI_Barrier();
-
-                /* Phase 2: a fence after every accumulate is part of the timing */
-                for (i = 0; i < ITERATIONS + SKIP; i++)
-                {
-
-                    if (i == SKIP) t_start = MPI_Wtime();
-
-                    ARMCI_AccS(ARMCI_ACC_DBL,
-                               (void *) &scaling,
-                               /* (void *) buffer[rank] */ src_buf,
-                               &src_stride,
-                               (void *) buffer[peer],
-                               &trg_stride,
-                               count,
-                               stride_level,
-                               peer);
-                    ARMCI_Fence(peer);
-
-                }
-                t_stop = MPI_Wtime();
-                printf("%20.2f \n", ((t_stop - t_start) * 1000000) / ITERATIONS);
-                fflush(stdout);
-
-                ARMCI_Barrier();
-
-                ARMCI_Barrier();
-
-            }
-            else
-            {
-                double expected;
-
-                peer = 0;
-
-                /* Initial value plus one scaled contribution per accumulate,
-                 * including the SKIP warm-up iterations */
-                expected = (1.0 + rank)
-                         + scaling * (1.0 + peer) * (ITERATIONS + SKIP);
-
-                ARMCI_Barrier();
-
-                if (rank == 1)
-                    validate_and_reset(buffer[rank], xdim, ydim, nelem,
-                                       expected, 1.0 + rank);
-
-                ARMCI_Barrier();
-
-                ARMCI_Barrier();
-
-                if (rank == 1)
-                    validate_and_reset(buffer[rank], xdim, ydim, nelem,
-                                       expected, 1.0 + rank);
-
-                ARMCI_Barrier();
-
-            }
-
-        }
-
-    }
-
-    ARMCI_Barrier();
-
-    ARMCI_Free((void *) buffer[rank]);
-    ARMCI_Free_local(src_buf);
-    free(buffer);
-
-    ARMCI_Finalize();
-
-    MPI_Finalize();
-
-    return 0;
-}
diff --git a/src/armci/tests/ARMCI_PutS_latency.c b/src/armci/tests/ARMCI_PutS_latency.c
deleted file mode 100644
index 0938653..0000000
--- a/src/armci/tests/ARMCI_PutS_latency.c
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-#include <armci.h>
-
-#define MAX_XDIM 1024 
-#define MAX_YDIM 1024
-#define ITERATIONS 100
-#define SKIP 10
-
-/* Compare the leading xdim-by-ydim patch of buf against `expected`.
- *
- * Consecutive rows are MAX_YDIM doubles apart, which is the stride main()
- * hands to ARMCI_PutS.  On the first mismatch the element is reported and
- * ARMCI_Error is called.
- */
-static void validate_patch(const double *buf, int xdim, int ydim, double expected)
-{
-   int i, j;
-
-   for(i=0; i<xdim; i++)
-   {
-     for(j=0; j<ydim; j++)
-     {
-       const double actual = *(buf + i*MAX_YDIM + j);
-
-       if(actual != expected)
-       {
-         printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
-             i, j, expected, actual);
-         fflush(stdout);
-         ARMCI_Error("Bailing out", 1);
-       }
-     }
-   }
-}
-
-/* Strided put latency benchmark.
- *
- * For every power-of-two patch size up to MAX_XDIM x MAX_YDIM, rank 0 issues
- * ITERATIONS + SKIP ARMCI_PutS calls targeting rank 1 in two phases: first
- * with the timer stopped before a single trailing ARMCI_Fence, then with an
- * ARMCI_Fence after every put inside the timed loop.  After each phase rank 1
- * validates the patch it received.  All ranks take part in the four
- * ARMCI_Barrier calls per patch size, so the barrier sequence must stay
- * identical on both branches.
- */
-int main(int argc, char *argv[]) {
-
-   int i, rank, nranks;
-   int xdim, ydim;
-   long bufsize;
-   size_t k, nelem;
-   double **buffer;
-   double t_start = 0.0, t_stop;
-   int count[2], src_stride, trg_stride, stride_level, peer;
-   double expected;
-   int provided;
-
-   MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
-   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-   MPI_Comm_size(MPI_COMM_WORLD, &nranks);
-
-    if (nranks < 2) {
-        printf("%s: Must be run with at least 2 processes\n", argv[0]);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-   ARMCI_Init_args(&argc, &argv);
-   
-   bufsize = MAX_XDIM * MAX_YDIM * sizeof(double);
-   nelem   = (size_t) bufsize / sizeof(double);
-   buffer = (double **) malloc(sizeof(double *) * nranks);
-   ARMCI_Malloc((void **) buffer, bufsize);
-
-   for(k=0; k<nelem; k++) {
-       *(buffer[rank] + k) = 1.0 + rank;
-   }
-
-   if(rank == 0) {
-     printf("ARMCI_PutS Latency - local and remote completions - in usec \n");
-     printf("%30s %22s %22s\n", "Dimensions(array of doubles)", "Latency-LocalCompletion", "Latency-RemoteCompletion");
-     fflush(stdout);
-   }
-
-   src_stride = MAX_YDIM*sizeof(double);
-   trg_stride = MAX_YDIM*sizeof(double);
-   stride_level = 1;
-
-   ARMCI_Barrier();
-
-   for(xdim=1; xdim<=MAX_XDIM; xdim*=2) {
-
-      count[1] = xdim;
-
-      for(ydim=1; ydim<=MAX_YDIM; ydim*=2) {
-
-        count[0] = ydim*sizeof(double); 
-      
-        if(rank == 0) 
-        {
-          /* Room for two full-width ints plus the separator */
-          char temp[32];
-
-          peer = 1;          
- 
-          /* Phase 1: the timer stops before the fence */
-          for(i=0; i<ITERATIONS+SKIP; i++) { 
-
-             if(i == SKIP)
-                 t_start = MPI_Wtime();
-
-             ARMCI_PutS((void *) buffer[rank], &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, peer); 
- 
-          }
-          t_stop = MPI_Wtime();
-          ARMCI_Fence(peer);
-          snprintf(temp, sizeof temp, "%dX%d", xdim, ydim);
-          printf("%30s %20.2f", temp, ((t_stop-t_start)*1000000)/ITERATIONS);
-          fflush(stdout);
-
-          ARMCI_Barrier();
-
-          ARMCI_Barrier();
-
-          /* Phase 2: a fence after every put is part of the timing */
-          for(i=0; i<ITERATIONS+SKIP; i++) {
-  
-             if(i == SKIP)
-                t_start = MPI_Wtime();
-
-             ARMCI_PutS((void *) buffer[rank], &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, peer); 
-             ARMCI_Fence(peer);
-
-          }
-          t_stop = MPI_Wtime();
-          printf("%20.2f \n", ((t_stop-t_start)*1000000)/ITERATIONS);
-          fflush(stdout);
-
-          ARMCI_Barrier();
-
-          ARMCI_Barrier();
-        }
-        else
-        {
-            peer = 0;
-
-            /* Rank 0 puts its own buffer contents, which hold 1.0 + its rank */
-            expected = (1.0 + (double) peer);
-
-            ARMCI_Barrier();
-            if (rank == 1)
-              validate_patch(buffer[rank], xdim, ydim, expected);
-
-            /* Every rank other than 0 restores its buffer after phase 1 */
-            for(k=0; k<nelem; k++) {
-              *(buffer[rank] + k) = 1.0 + rank;
-            }
-
-            ARMCI_Barrier();
-
-            ARMCI_Barrier();
-            if (rank == 1)
-            {
-              validate_patch(buffer[rank], xdim, ydim, expected);
-
-              /* After phase 2 only rank 1 restores its buffer */
-              for(k=0; k<nelem; k++) {
-                *(buffer[rank] + k) = 1.0 + rank;
-              }
-            }
-            ARMCI_Barrier();
-
-        }
-        
-      }
-
-   }
-
-   ARMCI_Barrier();
-
-   ARMCI_Free((void *) buffer[rank]);
-   free(buffer);
-
-   ARMCI_Finalize();
-
-   MPI_Finalize();
-
-   return 0;
-}
diff --git a/src/armci/tests/Makefile.mk b/src/armci/tests/Makefile.mk
deleted file mode 100644
index c392269..0000000
--- a/src/armci/tests/Makefile.mk
+++ /dev/null
@@ -1,80 +0,0 @@
-#
-# Copyright (C) 2010. See COPYRIGHT in top-level directory.
-#
-
-# Test programs appended to check_PROGRAMS.
-check_PROGRAMS += tests/test_onesided         \
-                  tests/test_onesided_shared  \
-                  tests/test_onesided_shared_dla \
-                  tests/test_mutex            \
-                  tests/test_mutex_rmw        \
-                  tests/test_mutex_trylock    \
-                  tests/test_malloc           \
-                  tests/test_malloc_irreg     \
-                  tests/ARMCI_PutS_latency    \
-                  tests/ARMCI_AccS_latency    \
-                  tests/test_groups           \
-                  tests/test_group_split      \
-                  tests/test_malloc_group     \
-                  tests/test_accs             \
-                  tests/test_accs_dla         \
-                  tests/test_puts             \
-                  tests/test_puts_gets        \
-                  tests/test_puts_gets_dla    \
-                  tests/test_assert           \
-                  tests/test_igop             \
-                  tests/test_rmw_fadd         \
-                  tests/test_parmci           \
-                  # end
-
-# Programs appended to TESTS.  This is the check_PROGRAMS list above minus
-# tests/test_assert.
-TESTS          += tests/test_onesided         \
-                  tests/test_onesided_shared  \
-                  tests/test_onesided_shared_dla \
-                  tests/test_mutex            \
-                  tests/test_mutex_rmw        \
-                  tests/test_mutex_trylock    \
-                  tests/test_malloc           \
-                  tests/test_malloc_irreg     \
-                  tests/ARMCI_PutS_latency    \
-                  tests/ARMCI_AccS_latency    \
-                  tests/test_groups           \
-                  tests/test_group_split      \
-                  tests/test_malloc_group     \
-                  tests/test_accs             \
-                  tests/test_accs_dla         \
-                  tests/test_puts             \
-                  tests/test_puts_gets        \
-                  tests/test_puts_gets_dla    \
-                  tests/test_igop             \
-                  tests/test_rmw_fadd         \
-                  tests/test_parmci           \
-                  # end
-
-# Tests expected to fail.
-# NOTE(review): tests/test_assert is listed here but not in TESTS above, so it
-# looks like it is built but never executed -- confirm whether that is intended.
-XFAIL_TESTS    += tests/test_assert           \
-                  # end
-
-# Every test program links against libarmci.la.
-tests_test_onesided_LDADD = libarmci.la
-tests_test_onesided_shared_LDADD = libarmci.la
-tests_test_onesided_shared_dla_LDADD = libarmci.la
-tests_test_mutex_LDADD = libarmci.la
-tests_test_mutex_rmw_LDADD = libarmci.la
-tests_test_mutex_trylock_LDADD = libarmci.la
-tests_test_malloc_LDADD = libarmci.la
-tests_test_malloc_irreg_LDADD = libarmci.la
-tests_ARMCI_PutS_latency_LDADD = libarmci.la
-tests_ARMCI_AccS_latency_LDADD = libarmci.la
-tests_test_groups_LDADD = libarmci.la
-tests_test_group_split_LDADD = libarmci.la
-tests_test_malloc_group_LDADD = libarmci.la
-tests_test_accs_LDADD = libarmci.la
-tests_test_accs_dla_LDADD = libarmci.la
-tests_test_puts_LDADD = libarmci.la
-tests_test_puts_gets_LDADD = libarmci.la
-tests_test_puts_gets_dla_LDADD = libarmci.la
-tests_test_assert_LDADD = libarmci.la
-tests_test_igop_LDADD = libarmci.la
-tests_test_rmw_fadd_LDADD = libarmci.la
-tests_test_parmci_LDADD = libarmci.la
-# test_parmci is the only program here built from two source files.
-tests_test_parmci_SOURCES = tests/test_parmci.c tests/test_parmci_lib.c
-
-# Pull in the test lists of the mpi/ and ctree/ subdirectories.
-include tests/mpi/Makefile.mk
-include tests/ctree/Makefile.mk
diff --git a/src/armci/tests/ctree/Makefile.mk b/src/armci/tests/ctree/Makefile.mk
deleted file mode 100644
index b0733ae..0000000
--- a/src/armci/tests/ctree/Makefile.mk
+++ /dev/null
@@ -1,16 +0,0 @@
-#
-# Copyright (C) 2010. See COPYRIGHT in top-level directory.
-#
-
-# Conflict-tree test programs appended to check_PROGRAMS.
-check_PROGRAMS += tests/ctree/ctree_test        \
-                  tests/ctree/ctree_test_rand   \
-                  tests/ctree/ctree_test_rand_interval
-
-# The same three programs are appended to TESTS.
-TESTS          += tests/ctree/ctree_test        \
-                  tests/ctree/ctree_test_rand   \
-                  tests/ctree/ctree_test_rand_interval
-
-
-# Each program links against libarmci.la and the math library (-lm).
-tests_ctree_ctree_test_LDADD = libarmci.la -lm
-tests_ctree_ctree_test_rand_LDADD = libarmci.la -lm
-tests_ctree_ctree_test_rand_interval_LDADD = libarmci.la -lm
diff --git a/src/armci/tests/ctree/ctree_test.c b/src/armci/tests/ctree/ctree_test.c
deleted file mode 100644
index fabe2a1..0000000
--- a/src/armci/tests/ctree/ctree_test.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <conflict_tree.h>
-
-#define MAX 20
-#define INC 4
-
-/* Conflict-tree smoke test over the fixed ranges [0,INC-1], [INC,2*INC-1], ...
- *
- * The ranges are inserted once in ascending and once in descending order.
- * After each pass every range is inserted again and must be reported as a
- * conflict.  Addresses are synthetic: they are produced from integer offsets
- * and never dereferenced here.
- *
- * Returns 0 when every insertion behaved as expected, 1 otherwise.
- */
-int main(int argc, char **argv) {
-  uintptr_t lo, hi;
-  int errors = 0;
-  ctree_t ctree = CTREE_EMPTY;
-
-  printf("========== FORWARD INSERT CHECK ==========\n");
-
-  for (lo = 0; lo <= MAX; lo += INC) {
-    uint8_t *first = (uint8_t *) lo;
-    uint8_t *last  = (uint8_t *) (lo + INC - 1);
-
-    printf("----- Inserting [%10p, %10p] -----\n", (void *) first, (void *) last);
-    int conflict = ctree_insert(&ctree, first, last);
-    ctree_print(ctree);
-
-    if (conflict) {
-      printf("Error, conflict inserting %p\n", (void *) first);
-      errors++;
-    }
-  }
-
-  for (lo = 0; lo <= MAX; lo += INC) {
-    uint8_t *first = (uint8_t *) lo;
-    uint8_t *last  = (uint8_t *) (lo + INC - 1);
-
-    printf("----- Checking [%10p, %10p] -----\n", (void *) first, (void *) last);
-    int conflict = ctree_insert(&ctree, first, last);
-
-    if (!conflict) {
-      printf("Error, no conflict inserting %p\n", (void *) first);
-      errors++;
-    }
-  }
-
-  ctree_destroy(&ctree);
-
-  printf("========== REVERSE INSERT CHECK ==========\n");
-
-  /* hi is one past the end of the range; counting down on an unsigned
-   * offset stops cleanly once the range starting at 0 has been inserted. */
-  for (hi = MAX + INC; hi >= INC; hi -= INC) {
-    uint8_t *first = (uint8_t *) (hi - INC);
-    uint8_t *last  = (uint8_t *) (hi - 1);
-
-    printf("----- Inserting [%10p, %10p] -----\n", (void *) first, (void *) last);
-    int conflict = ctree_insert(&ctree, first, last);
-    ctree_print(ctree);
-
-    if (conflict) {
-      printf("Error, conflict inserting %p\n", (void *) first);
-      errors++;
-    }
-  }
-
-  for (lo = 0; lo <= MAX; lo += INC) {
-    uint8_t *first = (uint8_t *) lo;
-    uint8_t *last  = (uint8_t *) (lo + INC - 1);
-
-    printf("----- Checking [%10p, %10p] -----\n", (void *) first, (void *) last);
-    int conflict = ctree_insert(&ctree, first, last);
-
-    if (!conflict) {
-      printf("Error, no conflict inserting %p\n", (void *) first);
-      errors++;
-    }
-  }
-
-  ctree_destroy(&ctree);
-
-  /* Report failures through the exit status so the test harness sees them */
-  return errors ? 1 : 0;
-}
diff --git a/src/armci/tests/ctree/ctree_test_rand.c b/src/armci/tests/ctree/ctree_test_rand.c
deleted file mode 100644
index 4e21723..0000000
--- a/src/armci/tests/ctree/ctree_test_rand.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <conflict_tree.h>
-
-#define NELT 1000
-
-uint8_t *data[NELT];
-
-/* Conflict-tree test with NELT single-address ranges in shuffled order.
- *
- * Each synthetic address 0..NELT-1 is inserted once as the range [a, a] and
- * must not conflict; inserting the same ranges a second time must conflict
- * every time.  The addresses are never dereferenced here.
- *
- * Exits with status 1 on the first unexpected result, returns 0 otherwise.
- */
-int main(int argc, char **argv) {
-  int i;
-  ctree_t ctree = CTREE_EMPTY;
-
-  srand(time(NULL));
-
-  // Build the addresses from integers instead of offsetting a null pointer
-  for (i = 0; i < NELT; i++)
-    data[i] = (uint8_t *) (uintptr_t) i;
-
-  // Perform NELT random swaps
-  for (i = 0; i < NELT; i++) {
-    int j = rand() % NELT;
-    int k = rand() % NELT;
-    uint8_t *tmp = data[j];
-    data[j] = data[k];
-    data[k] = tmp;
-  }
-
-  for (i = 0; i < NELT; i++) {
-    printf(" + Inserting [%p, %p]\n", (void *) data[i], (void *) data[i]);
-    int conflict = ctree_insert(&ctree, data[i], data[i]);
-
-    if (conflict) {
-      printf("*** Error, conflict inserting %p\n", (void *) data[i]);
-      ctree_print(ctree);
-      exit(1);
-    }
-  }
-
-  printf("\n");
-  ctree_print(ctree);
-  printf("\n");
-
-  for (i = 0; i < NELT; i++) {
-    printf(" + Checking [%p, %p]\n", (void *) data[i], (void *) data[i]);
-    int conflict = ctree_insert(&ctree, data[i], data[i]);
-
-    if (!conflict) {
-      printf("*** Error, no conflict inserting %p\n", (void *) data[i]);
-      ctree_print(ctree);
-      exit(1);
-    }
-  }
-
-  ctree_destroy(&ctree);
-
-  return 0;
-}
diff --git a/src/armci/tests/ctree/ctree_test_rand_interval.c b/src/armci/tests/ctree/ctree_test_rand_interval.c
deleted file mode 100644
index 5525aa7..0000000
--- a/src/armci/tests/ctree/ctree_test_rand_interval.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <conflict_tree.h>
-
-/* Smaller / larger of two values.  Every use of an argument is parenthesized
- * so that expression arguments expand correctly.  Each argument is still
- * evaluated twice, so avoid arguments with side effects. */
-#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
-#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
-
-#define MAX_INTVL 1000
-#define NELT      1000
-
-uint8_t *data[NELT][2];
-
-/* Conflict-tree test with randomly sized, non-overlapping intervals.
- *
- * NELT back-to-back intervals are generated, shuffled and inserted; none of
- * these insertions may conflict.  Afterwards NELT random intervals with both
- * ends below the last generated upper bound are inserted, and every one of
- * them must conflict.  The addresses are synthetic and never dereferenced
- * here.
- *
- * Exits with status 1 on the first unexpected result, returns 0 otherwise.
- */
-int main(int argc, char **argv) {
-  int i, next;
-  int upper_bound = 0;  // Overwritten on every pass of the generation loop
-  ctree_t ctree = CTREE_EMPTY;
-
-  srand(time(NULL));
-
-  // Generate random intervals that fully cover the space from [0,next)
-  for (i = next = 0; i < NELT; i++, next++) {
-    data[i][0]  = (uint8_t *) (uintptr_t) next;
-    next        = next + rand()%MAX_INTVL;
-    data[i][1]  = (uint8_t *) (uintptr_t) next;
-    upper_bound = next;
-  }
-
-  // Perform NELT random swaps so elements are inserted in random order
-  for (i = 0; i < NELT; i++) {
-    int j = rand() % NELT;
-    int k = rand() % NELT;
-    uint8_t *tmp[2];
-
-    tmp[0]  = data[j][0];
-    tmp[1]  = data[j][1];
-
-    data[j][0] = data[k][0];
-    data[j][1] = data[k][1];
-
-    data[k][0] = tmp[0];
-    data[k][1] = tmp[1];
-  }
-
-  // Build the conflict tree
-  for (i = 0; i < NELT; i++) {
-    printf(" + Inserting [%p, %p]\n", (void *) data[i][0], (void *) data[i][1]);
-    int conflict = ctree_insert(&ctree, data[i][0], data[i][1]);
-
-    if (conflict) {
-      printf("*** Error, conflict inserting [%p, %p]\n", (void *) data[i][0], (void *) data[i][1]);
-      ctree_print(ctree);
-      exit(1);
-    }
-  }
-
-  printf("\n");
-  ctree_print(ctree);
-  printf("\n");
-
-  // Generate random test samples
-  for (i = 0; i < NELT; i++) {
-    int x = rand() % upper_bound;
-    int y = rand() % upper_bound;
-
-    data[i][0]  = (uint8_t *) (uintptr_t) MIN(x,y);
-    data[i][1]  = (uint8_t *) (uintptr_t) MAX(x,y);
-  }
-
-  for (i = 0; i < NELT; i++) {
-    printf(" + Checking [%p, %p]\n", (void *) data[i][0], (void *) data[i][1]);
-    int conflict = ctree_insert(&ctree, data[i][0], data[i][1]);
-
-    if (!conflict) {
-      printf("*** Error, no conflict inserting [%p, %p]\n", (void *) data[i][0], (void *) data[i][1]);
-      ctree_print(ctree);
-      exit(1);
-    }
-  }
-
-  ctree_destroy(&ctree);
-
-  return 0;
-}
diff --git a/src/armci/tests/mpi/Makefile.mk b/src/armci/tests/mpi/Makefile.mk
deleted file mode 100644
index 2ef2d92..0000000
--- a/src/armci/tests/mpi/Makefile.mk
+++ /dev/null
@@ -1,31 +0,0 @@
-#
-# Copyright (C) 2010. See COPYRIGHT in top-level directory.
-#
-
-# MPI-only test programs appended to check_PROGRAMS.
-check_PROGRAMS += \
-                  tests/mpi/ping-pong-mpi               \
-                  tests/mpi/test_mpi_accs               \
-                  tests/mpi/test_mpi_indexed_accs       \
-                  tests/mpi/test_mpi_indexed_gets       \
-                  tests/mpi/test_mpi_indexed_puts_gets  \
-                  tests/mpi/test_mpi_subarray_accs      \
-                  tests/mpi/test_win_create             \
-                  # end
-
-# Programs appended to TESTS.  ping-pong-mpi is commented out at the end of
-# this list, so it appears in check_PROGRAMS above but not here.
-TESTS          += \
-                  tests/mpi/test_mpi_accs               \
-                  tests/mpi/test_mpi_indexed_accs       \
-                  tests/mpi/test_mpi_indexed_gets       \
-                  tests/mpi/test_mpi_indexed_puts_gets  \
-                  tests/mpi/test_mpi_subarray_accs      \
-                  tests/mpi/test_win_create             \
-                  #tests/mpi/ping-pong-mpi      \
-                  # end
-
-# Every program in this directory links against libarmci.la.
-tests_mpi_ping_pong_mpi_LDADD = libarmci.la
-tests_mpi_test_mpi_accs_LDADD = libarmci.la
-tests_mpi_test_mpi_indexed_accs_LDADD = libarmci.la
-tests_mpi_test_mpi_indexed_gets_LDADD = libarmci.la
-tests_mpi_test_mpi_indexed_puts_gets_LDADD = libarmci.la
-tests_mpi_test_mpi_subarray_accs_LDADD = libarmci.la
-tests_mpi_test_win_create_LDADD = libarmci.la
diff --git a/src/armci/tests/mpi/copy_bench.c b/src/armci/tests/mpi/copy_bench.c
deleted file mode 100644
index 2691d97..0000000
--- a/src/armci/tests/mpi/copy_bench.c
+++ /dev/null
@@ -1,60 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <mpi.h>
-
-#define NITER 100000
-/* Largest transfer size in bytes; parenthesized so the product stays intact
- * when the macro is used inside a larger expression. */
-#define MAXSZ (65536*2*2)
-
-/* Local copy benchmark: memcpy versus MPI_Sendrecv to self.
- *
- * For every power-of-two size from 1 to MAXSZ bytes, NITER copies from `in`
- * to `out` are timed, first with memcpy and then with MPI_Sendrecv on a
- * duplicate of MPI_COMM_SELF.  The total elapsed time for the NITER copies is
- * printed per size.
- */
-int main(int argc, char **argv) {
-  int            i, j;
-  unsigned char *in, *out;  /* unsigned so the 0xAA fill pattern fits */
-  double         t_start, t_stop;
-  MPI_Comm       copy_comm;
-
-  MPI_Init(&argc, &argv);
-
-  MPI_Comm_dup(MPI_COMM_SELF, &copy_comm);
-
-  in  = malloc(MAXSZ);
-  out = malloc(MAXSZ);
-
-  if (in == NULL || out == NULL) {
-    fprintf(stderr, "copy_bench: unable to allocate two %d byte buffers\n", MAXSZ);
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  for (i = 0; i < MAXSZ; i++) {
-    in[i]  = 0xAA;
-    out[i] = 0x55;
-  }
-
-  for (i = 1; i <= MAXSZ; i *= 2) {
-    t_start = MPI_Wtime();
-    for (j = 0; j < NITER; j++) {
-      memcpy(out, in, i);
-    }
-    t_stop = MPI_Wtime();
-    printf("MEMCPY: %7d\t%0.9f\n", i, t_stop-t_start);
-  }
-
-  /* Restore the fill patterns before the second measurement */
-  for (i = 0; i < MAXSZ; i++) {
-    in[i]  = 0xAA;
-    out[i] = 0x55;
-  }
-
-  for (i = 1; i <= MAXSZ; i *= 2) {
-    t_start = MPI_Wtime();
-    for (j = 0; j < NITER; j++) {
-      MPI_Sendrecv(in, i, MPI_BYTE,
-          0 /* rank */, 0 /* tag */,
-          out, i, MPI_BYTE,
-          0 /* rank */, 0 /* tag */,
-          copy_comm, MPI_STATUS_IGNORE);
-    }
-    t_stop = MPI_Wtime();
-    printf("SNDRCV: %7d\t%0.9f\n", i, t_stop-t_start);
-  }
-
-
-  free(out);
-  free(in);
-
-  MPI_Comm_free(&copy_comm);
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/mpi/pgroup_bench.c b/src/armci/tests/mpi/pgroup_bench.c
deleted file mode 100644
index 1085481..0000000
--- a/src/armci/tests/mpi/pgroup_bench.c
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#include <mpi.h>
-
-#define INTERCOMM_TAG 0
-#define NITER 1024
-
-
-void pgroup_create(int grp_size, int *pid_list, MPI_Comm *group_out);
-void pgroup_free(MPI_Comm *group);
-
-
-/** Release the communicator behind a pgroup.
-  *
-  * MPI_COMM_NULL and MPI_COMM_SELF are left untouched; any other handle is
-  * passed to MPI_Comm_free.
-  *
-  * @param[in,out] group Communicator handle to release
-  */
-void pgroup_free(MPI_Comm *group) {
-  /* Predefined handles may be compared directly */
-  if (*group != MPI_COMM_NULL && *group != MPI_COMM_SELF)
-    MPI_Comm_free(group);
-}
-
-
-/* Build a communicator over the processes named in pid_list.
- *
- * Only the listed processes take part.  Starting from MPI_COMM_SELF, each
- * round pairs neighbouring blocks of `span` members, joins them with
- * MPI_Intercomm_create and merges the result, doubling the block size until
- * it reaches grp_size.
- *
- * NOTE: pid_list list must be identical and sorted on all processes
- *
- * group_out receives MPI_COMM_NULL when grp_size is 0 or the caller is not in
- * pid_list, MPI_COMM_SELF when the caller is the only member, and the merged
- * communicator otherwise.
- */
-void pgroup_create(int grp_size, int *pid_list, MPI_Comm *group_out) {
-  int       idx, my_slot, my_rank, world_size, span;
-  MPI_Comm  merged, bridge;
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
-
-  /* Empty group */
-  if (grp_size == 0) {
-    *group_out = MPI_COMM_NULL;
-    return;
-  }
-
-  /* Group consisting of the caller alone */
-  if (grp_size == 1 && pid_list[0] == my_rank) {
-    *group_out = MPI_COMM_SELF;
-    return;
-  }
-
-  /* Find the caller's position in pid_list; stop at the first match */
-  my_slot = -1;
-  for (idx = 0; idx < grp_size && my_slot < 0; idx++) {
-    if (pid_list[idx] == my_rank)
-      my_slot = idx;
-  }
-
-  /* Non-members get MPI_COMM_NULL */
-  if (my_slot < 0) {
-    *group_out = MPI_COMM_NULL;
-    return;
-  }
-
-  merged = MPI_COMM_SELF;
-
-  for (span = 1; span < grp_size; span *= 2) {
-    int      block    = my_slot / span;
-    int      is_high  = (block % 2 != 0);
-    int      partner  = is_high ? block - 1 : block + 1;
-    MPI_Comm previous = merged;
-
-    /* An even block whose right-hand partner does not exist sits this round out */
-    if (!is_high && partner * span >= grp_size)
-      continue;
-
-    /* Even blocks merge as LOW (0), odd blocks as HIGH (1) */
-    MPI_Intercomm_create(merged, 0, MPI_COMM_WORLD, pid_list[partner * span], INTERCOMM_TAG, &bridge);
-    MPI_Intercomm_merge(bridge, is_high, &merged);
-
-    MPI_Comm_free(&bridge);
-    if (previous != MPI_COMM_SELF) MPI_Comm_free(&previous);
-  }
-
-  *group_out = merged;
-}
-
-
-/* Benchmark driver: pgroup_create versus MPI_Comm_create.
- *
- * For group sizes 1, 2, 4, ... up to the size of MPI_COMM_WORLD, the first
- * `gsize` world ranks form a group.  NITER communicators are created with
- * each method, and rank 0 prints the measured time per creation.
- */
-int main(int argc, char **argv) {
-  int        rank, world_size, gsize, iter, idx;
-  int       *members;
-  MPI_Comm   comms[NITER];
-  MPI_Group  world_grp;
-
-  MPI_Init(&argc, &argv);
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
-  MPI_Comm_group(MPI_COMM_WORLD, &world_grp);
-
-  /* Identity rank list: members[k] == k */
-  members = (int*) malloc(world_size*sizeof(int));
-
-  idx = 0;
-  while (idx < world_size) {
-    members[idx] = idx;
-    idx++;
-  }
-
-  if (rank == 0)
-    printf("Gsize\tPGgroup (sec)\tComm (sec)\n");
-
-  MPI_Barrier(MPI_COMM_WORLD);
-
-  for (gsize = 1; gsize <= world_size; gsize *= 2) {
-    double    t_begin, elapsed_pg, elapsed_comm;
-    MPI_Group subset;
-
-    /* Time NITER pgroup_create calls */
-
-    MPI_Barrier(MPI_COMM_WORLD);
-    t_begin = MPI_Wtime();
-
-    for (iter = 0; iter < NITER; iter++)
-      pgroup_create(gsize, members, &comms[iter]);
-
-    elapsed_pg = MPI_Wtime() - t_begin;
-
-    for (iter = 0; iter < NITER; iter++)
-      pgroup_free(&comms[iter]);
-
-    /* Time NITER MPI_Comm_create calls over the same members */
-
-    MPI_Group_incl(world_grp, gsize, members, &subset);
-    MPI_Barrier(MPI_COMM_WORLD);
-    t_begin = MPI_Wtime();
-
-    for (iter = 0; iter < NITER; iter++)
-      MPI_Comm_create(MPI_COMM_WORLD, subset, &comms[iter]);
-
-    elapsed_comm = MPI_Wtime() - t_begin;
-    MPI_Group_free(&subset);
-
-    for (iter = 0; iter < NITER; iter++)
-      pgroup_free(&comms[iter]);
-
-    if (rank == 0)
-      printf("%6d\t%0.9f\t%0.9f\n", gsize, elapsed_pg/NITER, elapsed_comm/NITER);
-
-  }
-
-  free(members);
-  MPI_Group_free(&world_grp);
-
-  MPI_Finalize();
-  return 0;
-}
diff --git a/src/armci/tests/mpi/ping-pong-mpi.c b/src/armci/tests/mpi/ping-pong-mpi.c
deleted file mode 100644
index c150f94..0000000
--- a/src/armci/tests/mpi/ping-pong-mpi.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <stdint.h>
-
-#include <mpi.h>
-
-#define MAX_SIZE   (128*1024)
-#define NUM_ROUNDS 10000
-#define PROBE_PROGRESS
-
-/* Passive-target RMA ping-pong latency benchmark.
- *
- * Rank 0 is paired with each other rank in turn.  For every power-of-two
- * message length up to MAX_SIZE the pair alternates roles for NUM_ROUNDS*2
- * rounds: the sender clears the first and last marker bytes of its own
- * receive buffer and puts msg_length bytes of 1s into the peer's window; the
- * receiver polls its own window until both marker bytes are non-zero.
- * Rank 0 prints the elapsed time divided by NUM_ROUNDS in microseconds.
- */
-int main(int argc, char **argv) {
-  int        me, nproc, target;
-  int        msg_length, round, i;
-  double     t_start, t_stop;
-  uint8_t  *snd_buf;  // Send buffer (byte array)
-  uint8_t  *rcv_buf;  // Receive buffer (byte array)
-  MPI_Win    window;
-
-  MPI_Init(&argc, &argv);
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &me);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (nproc < 2) {
-    if (me == 0) printf("This benchmark should be run on at least two processes\n");
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  if (me == 0)
-    printf("MPI-2 passive ping-pong latency test, performing %d rounds at each xfer size.\n", NUM_ROUNDS);
-
-  MPI_Alloc_mem(MAX_SIZE, MPI_INFO_NULL, &rcv_buf);
-  MPI_Alloc_mem(MAX_SIZE, MPI_INFO_NULL, &snd_buf);
-
-  // Fill both buffers before the receive buffer is exposed through the
-  // window.  The receiver polls for non-zero marker bytes, so rcv_buf has to
-  // start out as all zeros.
-  for (i = 0; i < MAX_SIZE; i++) {
-    snd_buf[i] = 1;
-    rcv_buf[i] = 0;
-  }
-
-  MPI_Win_create(rcv_buf, MAX_SIZE, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &window);
-
-  for (target = 1; target < nproc; target++) {
-    if (me == 0) printf("\n========== Process pair: %d and %d ==========\n\n", 0, target);
-
-    for (msg_length = 1; msg_length <= MAX_SIZE; msg_length *= 2) {
-      MPI_Barrier(MPI_COMM_WORLD);
-      t_start = MPI_Wtime();
-
-      if (me == 0 || me == target) {
-        // Perform NUM_ROUNDS ping-pongs
-        for (round = 0; round < NUM_ROUNDS*2; round++) {
-          int my_target = me == 0 ? target : 0;
-
-          // I am the sender
-          if ((round % 2 == 0 && me == 0) || (round % 2 != 0 && me != 0)) {
-            // Clear start and end markers for next round
-            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, me, 0, window);
-            rcv_buf[0] = 0;
-            rcv_buf[msg_length-1] = 0;
-            MPI_Win_unlock(me, window);
-
-            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, my_target, 0, window);
-            MPI_Put(snd_buf, msg_length, MPI_BYTE, my_target, 0, msg_length, MPI_BYTE, window);
-            MPI_Win_unlock(my_target, window);
-          }
-
-          // I am the receiver: Poll start and end markers
-          else {
-            uint8_t val;
-#ifdef PROBE_PROGRESS
-            // MPI_Iprobe stores a full int through its flag argument, so it
-            // needs an int of its own rather than the one-byte marker value
-            int     probe_flag;
-#endif
-
-            do {
-#ifdef PROBE_PROGRESS
-              /* Needed for progress in many MPI implementations.. */
-              MPI_Iprobe(0, 0, MPI_COMM_WORLD, &probe_flag, MPI_STATUS_IGNORE);
-#endif
-              MPI_Win_lock(MPI_LOCK_EXCLUSIVE, me, 0, window);
-              val = ((volatile uint8_t*)rcv_buf)[0];
-              MPI_Win_unlock(me, window);
-            } while (val == 0);
-
-            do {
-#ifdef PROBE_PROGRESS
-              MPI_Iprobe(0, 0, MPI_COMM_WORLD, &probe_flag, MPI_STATUS_IGNORE);
-#endif
-              MPI_Win_lock(MPI_LOCK_EXCLUSIVE, me, 0, window);
-              val = ((volatile uint8_t*)rcv_buf)[msg_length-1];
-              MPI_Win_unlock(me, window);
-            } while (val == 0);
-          }
-        }
-      }
-
-      MPI_Barrier(MPI_COMM_WORLD);
-      t_stop = MPI_Wtime();
-
-      if (me == 0)
-        printf("%8d bytes \t %12.8f us\n", msg_length, (t_stop-t_start)/NUM_ROUNDS*1.0e6);
-    }
-
-    MPI_Barrier(MPI_COMM_WORLD);
-  }
-
-  MPI_Win_free(&window);
-  MPI_Free_mem(snd_buf);
-  MPI_Free_mem(rcv_buf);
-
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/mpi/test_mpi_accs.c b/src/armci/tests/mpi/test_mpi_accs.c
deleted file mode 100644
index ee9b0a9..0000000
--- a/src/armci/tests/mpi/test_mpi_accs.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-
-#define XDIM 1024 
-#define YDIM 1024
-#define ITERATIONS 10
-
-int main(int argc, char **argv) {
-    int itr, i, j, rank, nranks, peer, bufsize, errors = 0;
-    double *buffer, *src_buf;
-    MPI_Win buf_win;
-
-    MPI_Init(&argc, &argv);
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
-
-    bufsize = XDIM * YDIM * sizeof(double);
-    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &buffer);
-    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);
-
-    if (rank == 0)
-        printf("MPI RMA Strided Accumulate Test:\n");
-
-    for (i = 0; i < XDIM*YDIM; i++) {
-        *(buffer  + i) = 1.0 + rank;
-        *(src_buf + i) = 1.0 + rank;
-    }
-
-    MPI_Win_create(buffer, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);
-
-    peer = (rank+1) % nranks;
-
-    for (itr = 0; itr < ITERATIONS; itr++) {
-
-      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
-
-      for (j = 0; j < YDIM; j++) {
-        MPI_Accumulate(src_buf + j*XDIM, XDIM, MPI_DOUBLE, peer,
-                       j*XDIM*sizeof(double), XDIM, MPI_DOUBLE, MPI_SUM, buf_win);
-      }
-
-      MPI_Win_unlock(peer, buf_win);
-    }
-
-    MPI_Barrier(MPI_COMM_WORLD);
-
-    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
-    for (i = 0; i < XDIM; i++) {
-      for (j = 0; j < YDIM; j++) {
-        const double actual   = *(buffer + i + j*XDIM);
-        const double expected = (1.0 + rank) + (1.0 + ((rank+nranks-1)%nranks)) * (ITERATIONS);
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    MPI_Win_unlock(rank, buf_win);
-
-    MPI_Win_free(&buf_win);
-    MPI_Free_mem(buffer);
-    MPI_Free_mem(src_buf);
-
-    MPI_Finalize();
-
-    if (errors == 0) {
-      printf("%d: Success\n", rank);
-      return 0;
-    } else {
-      printf("%d: Fail\n", rank);
-      return 1;
-    }
-}
diff --git a/src/armci/tests/mpi/test_mpi_indexed_accs.c b/src/armci/tests/mpi/test_mpi_indexed_accs.c
deleted file mode 100644
index 78c06bd..0000000
--- a/src/armci/tests/mpi/test_mpi_indexed_accs.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-/* One-Sided MPI 2-D Strided Accumulate Test
- *
- * Author: James Dinan <dinan at mcs.anl.gov> 
- * Date  : December, 2010
- *
- * This code performs N accumulates into a 2d patch of a shared array.  The
- * array has dimensions [X, Y] and the subarray has dimensions [SUB_X, SUB_Y]
- * and begins at index [0, 0].  The input and output buffers are specified
- * using an MPI indexed type.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-
-#define XDIM 16
-#define YDIM 16
-#define SUB_XDIM 8
-#define SUB_YDIM 8
-#define ITERATIONS 1
-
-int main(int argc, char **argv) {
-    int i, j, rank, nranks, peer, bufsize, errors;
-    double *win_buf, *src_buf;
-    MPI_Win buf_win;
-
-    MPI_Init(&argc, &argv);
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
-
-    bufsize = XDIM * YDIM * sizeof(double);
-    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &win_buf);
-    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);
-
-    if (rank == 0)
-        printf("MPI RMA Strided Accumulate Test:\n");
-
-    for (i = 0; i < XDIM*YDIM; i++) {
-        *(win_buf  + i) = 1.0 + rank;
-        *(src_buf + i) = 1.0 + rank;
-    }
-
-    MPI_Win_create(win_buf, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);
-
-    peer = (rank+1) % nranks;
-
-    // Perform ITERATIONS strided accumulate operations
-
-    for (i = 0; i < ITERATIONS; i++) {
-      MPI_Aint idx_loc[SUB_YDIM];
-      int idx_rem[SUB_YDIM];
-      int blk_len[SUB_YDIM];
-      MPI_Datatype src_type, dst_type;
-
-      for (i = 0; i < SUB_YDIM; i++) {
-        MPI_Get_address(&src_buf[i*XDIM], &idx_loc[i]);
-        idx_rem[i] = i*XDIM;
-        blk_len[i] = SUB_XDIM;
-      }
-
-#ifdef ABSOLUTE
-      MPI_Type_hindexed(SUB_YDIM, blk_len, idx_loc, MPI_DOUBLE, &src_type);
-#else
-      MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &src_type);
-#endif
-      MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &dst_type);
-
-      MPI_Type_commit(&src_type);
-      MPI_Type_commit(&dst_type);
-
-      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
-
-#ifdef ABSOLUTE
-      MPI_Accumulate(MPI_BOTTOM, 1, src_type, peer, 0, 1, dst_type, MPI_SUM, buf_win);
-#else
-      MPI_Accumulate(src_buf, 1, src_type, peer, 0, 1, dst_type, MPI_SUM, buf_win);
-#endif
-
-      MPI_Win_unlock(peer, buf_win);
-
-      MPI_Type_free(&src_type);
-      MPI_Type_free(&dst_type);
-    }
-
-    MPI_Barrier(MPI_COMM_WORLD);
-
-    // Verify that the results are correct
-
-    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
-    errors = 0;
-    for (i = 0; i < SUB_XDIM; i++) {
-      for (j = 0; j < SUB_YDIM; j++) {
-        const double actual   = *(win_buf + i + j*XDIM);
-        const double expected = (1.0 + rank) + (1.0 + ((rank+nranks-1)%nranks)) * (ITERATIONS);
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    for (i = SUB_XDIM; i < XDIM; i++) {
-      for (j = 0; j < SUB_YDIM; j++) {
-        const double actual   = *(win_buf + i + j*XDIM);
-        const double expected = 1.0 + rank;
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    for (i = 0; i < XDIM; i++) {
-      for (j = SUB_YDIM; j < YDIM; j++) {
-        const double actual   = *(win_buf + i + j*XDIM);
-        const double expected = 1.0 + rank;
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    MPI_Win_unlock(rank, buf_win);
-
-    MPI_Win_free(&buf_win);
-    MPI_Free_mem(win_buf);
-    MPI_Free_mem(src_buf);
-
-    MPI_Finalize();
-
-    if (errors == 0) {
-      printf("%d: Success\n", rank);
-      return 0;
-    } else {
-      printf("%d: Fail\n", rank);
-      return 1;
-    }
-}
diff --git a/src/armci/tests/mpi/test_mpi_indexed_gets.c b/src/armci/tests/mpi/test_mpi_indexed_gets.c
deleted file mode 100644
index dc1bd9d..0000000
--- a/src/armci/tests/mpi/test_mpi_indexed_gets.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-/* One-Sided MPI 2-D Strided Get Test
- *
- * Author: James Dinan <dinan at mcs.anl.gov> 
- * Date  : December, 2010
- *
- * This code performs N strided get operations from a 2d patch of a shared
- * array.  The array has dimensions [X, Y] and the subarray has dimensions
- * [SUB_X, SUB_Y] and begins at index [0, 0].  The input and output buffers are
- * specified using an MPI indexed type.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-
-#define XDIM 8
-#define YDIM 1024
-#define SUB_XDIM 8
-#define SUB_YDIM 256
-
-int main(int argc, char **argv) {
-    int i, j, rank, nranks, peer, bufsize, errors;
-    double *win_buf, *loc_buf;
-    MPI_Win buf_win;
-
-    MPI_Aint idx_loc[SUB_YDIM];
-    int idx_rem[SUB_YDIM];
-    int blk_len[SUB_YDIM];
-    MPI_Datatype loc_type, rem_type;
-
-    MPI_Init(&argc, &argv);
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
-
-    bufsize = XDIM * YDIM * sizeof(double);
-    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &win_buf);
-    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &loc_buf);
-
-    if (rank == 0)
-        printf("MPI RMA Strided Get Test:\n");
-
-    for (i = 0; i < XDIM*YDIM; i++)
-        *(win_buf + i) = 1.0 + rank;
-
-    MPI_Win_create(win_buf, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);
-
-    peer = (rank+1) % nranks;
-
-    // Build the datatype
-
-    for (i = 0; i < SUB_YDIM; i++) {
-      MPI_Get_address(&loc_buf[i*XDIM], &idx_loc[i]);
-      idx_rem[i] = i*XDIM;
-      blk_len[i] = SUB_XDIM;
-    }
-
-    MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &loc_type);
-    MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &rem_type);
-
-    MPI_Type_commit(&loc_type);
-    MPI_Type_commit(&rem_type);
-
-    // Perform get operation
-
-    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
-
-    MPI_Get(loc_buf, 1, loc_type, peer, 0, 1, rem_type, buf_win);
-
-    // Use the datatype only on the remote side (must have SUB_XDIM == XDIM)
-    // MPI_Get(loc_buf, SUB_XDIM*SUB_YDIM, MPI_DOUBLE, peer, 0, 1, rem_type, buf_win);
-
-    MPI_Win_unlock(peer, buf_win);
-
-    MPI_Type_free(&loc_type);
-    MPI_Type_free(&rem_type);
-
-    MPI_Barrier(MPI_COMM_WORLD);
-
-    // Verify that the results are correct
-
-    errors = 0;
-    for (i = 0; i < SUB_XDIM; i++) {
-      for (j = 0; j < SUB_YDIM; j++) {
-        const double actual   = *(loc_buf + i + j*XDIM);
-        const double expected = (1.0 + peer);
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    for (i = SUB_XDIM; i < XDIM; i++) {
-      for (j = 0; j < SUB_YDIM; j++) {
-        const double actual   = *(loc_buf + i + j*XDIM);
-        const double expected = 1.0 + rank;
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    for (i = 0; i < XDIM; i++) {
-      for (j = SUB_YDIM; j < YDIM; j++) {
-        const double actual   = *(loc_buf + i + j*XDIM);
-        const double expected = 1.0 + rank;
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-
-    MPI_Win_free(&buf_win);
-    MPI_Free_mem(win_buf);
-    MPI_Free_mem(loc_buf);
-
-    MPI_Finalize();
-
-    if (errors == 0) {
-      printf("%d: Success\n", rank);
-      return 0;
-    } else {
-      printf("%d: Fail\n", rank);
-      return 1;
-    }
-}
diff --git a/src/armci/tests/mpi/test_mpi_indexed_puts_gets.c b/src/armci/tests/mpi/test_mpi_indexed_puts_gets.c
deleted file mode 100644
index d14988c..0000000
--- a/src/armci/tests/mpi/test_mpi_indexed_puts_gets.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-/* One-Sided MPI 2-D Strided Accumulate Test
- *
- * Author: James Dinan <dinan at mcs.anl.gov> 
- * Date  : December, 2010
- *
- * This code performs N strided put operations followed by get operations into
- * a 2d patch of a shared array.  The array has dimensions [X, Y] and the
- * subarray has dimensions [SUB_X, SUB_Y] and begins at index [0, 0].  The
- * input and output buffers are specified using an MPI indexed type.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-
-#define XDIM 8
-#define YDIM 1024
-#define SUB_XDIM 8
-#define SUB_YDIM 255
-#define ITERATIONS 1
-
-int main(int argc, char **argv) {
-    int i, j, rank, nranks, peer, bufsize, errors;
-    double *win_buf, *src_buf, *dst_buf;
-    MPI_Win buf_win;
-
-    MPI_Init(&argc, &argv);
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
-
-    bufsize = XDIM * YDIM * sizeof(double);
-    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &win_buf);
-    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);
-    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &dst_buf);
-
-    if (rank == 0)
-        printf("MPI RMA Strided Accumulate Test:\n");
-
-    for (i = 0; i < XDIM*YDIM; i++) {
-        *(win_buf  + i) = 1.0 + rank;
-        *(src_buf + i) = 1.0 + rank;
-    }
-
-    MPI_Win_create(win_buf, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);
-
-    peer = (rank+1) % nranks;
-
-    // Perform ITERATIONS strided accumulate operations
-
-    for (i = 0; i < ITERATIONS; i++) {
-      MPI_Aint idx_loc[SUB_YDIM];
-      int idx_rem[SUB_YDIM];
-      int blk_len[SUB_YDIM];
-      MPI_Datatype src_type, dst_type;
-
-      for (i = 0; i < SUB_YDIM; i++) {
-        MPI_Get_address(&src_buf[i*XDIM], &idx_loc[i]);
-        idx_rem[i] = i*XDIM;
-        blk_len[i] = SUB_XDIM;
-      }
-
-      MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &src_type);
-      MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &dst_type);
-
-      MPI_Type_commit(&src_type);
-      MPI_Type_commit(&dst_type);
-
-      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
-      MPI_Put(src_buf, 1, src_type, peer, 0, 1, dst_type, buf_win);
-      MPI_Win_unlock(peer, buf_win);
-
-      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
-      MPI_Get(dst_buf, 1, src_type, peer, 0, 1, dst_type, buf_win);
-      MPI_Win_unlock(peer, buf_win);
-
-      MPI_Type_free(&src_type);
-      MPI_Type_free(&dst_type);
-    }
-
-    MPI_Barrier(MPI_COMM_WORLD);
-
-    // Verify that the results are correct
-
-    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
-    errors = 0;
-    for (i = 0; i < SUB_XDIM; i++) {
-      for (j = 0; j < SUB_YDIM; j++) {
-        const double actual   = *(win_buf + i + j*XDIM);
-        const double expected = (1.0 + ((rank+nranks-1)%nranks));
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    for (i = SUB_XDIM; i < XDIM; i++) {
-      for (j = 0; j < SUB_YDIM; j++) {
-        const double actual   = *(win_buf + i + j*XDIM);
-        const double expected = 1.0 + rank;
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    for (i = 0; i < XDIM; i++) {
-      for (j = SUB_YDIM; j < YDIM; j++) {
-        const double actual   = *(win_buf + i + j*XDIM);
-        const double expected = 1.0 + rank;
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    MPI_Win_unlock(rank, buf_win);
-
-    MPI_Win_free(&buf_win);
-    MPI_Free_mem(win_buf);
-    MPI_Free_mem(src_buf);
-    MPI_Free_mem(dst_buf);
-
-    MPI_Finalize();
-
-    if (errors == 0) {
-      printf("%d: Success\n", rank);
-      return 0;
-    } else {
-      printf("%d: Fail\n", rank);
-      return 1;
-    }
-}
diff --git a/src/armci/tests/mpi/test_mpi_subarray_accs.c b/src/armci/tests/mpi/test_mpi_subarray_accs.c
deleted file mode 100644
index 0eb5397..0000000
--- a/src/armci/tests/mpi/test_mpi_subarray_accs.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-/* One-Sided MPI 2-D Strided Accumulate Test
- *
- * Author: James Dinan <dinan at mcs.anl.gov> 
- * Date  : December, 2010
- *
- * This code performs N accumulates into a 2d patch of a shared array.  The
- * array has dimensions [X, Y] and the subarray has dimensions [SUB_X, SUB_Y]
- * and begins at index [0, 0].  The input and output buffers are specified
- * using an MPI subarray type.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-
-#define XDIM 1024 
-#define YDIM 1024
-#define SUB_XDIM 512
-#define SUB_YDIM 512
-#define ITERATIONS 10
-
-int main(int argc, char **argv) {
-    int i, j, rank, nranks, peer, bufsize, errors;
-    double *win_buf, *src_buf;
-    MPI_Win buf_win;
-
-    MPI_Init(&argc, &argv);
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
-
-    bufsize = XDIM * YDIM * sizeof(double);
-    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &win_buf);
-    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);
-
-    if (rank == 0)
-        printf("MPI RMA Strided Accumulate Test:\n");
-
-    for (i = 0; i < XDIM*YDIM; i++) {
-        *(win_buf  + i) = 1.0 + rank;
-        *(src_buf + i) = 1.0 + rank;
-    }
-
-    MPI_Win_create(win_buf, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);
-
-    peer = (rank+1) % nranks;
-
-    // Perform ITERATIONS strided accumulate operations
-
-    for (i = 0; i < ITERATIONS; i++) {
-      int ndims               = 2;
-      int src_arr_sizes[2]    = { XDIM, YDIM };
-      int src_arr_subsizes[2] = { SUB_XDIM, SUB_YDIM };
-      int src_arr_starts[2]   = {    0,    0 };
-      int dst_arr_sizes[2]    = { XDIM, YDIM };
-      int dst_arr_subsizes[2] = { SUB_XDIM, SUB_YDIM };
-      int dst_arr_starts[2]   = {    0,    0 };
-      MPI_Datatype src_type, dst_type;
-
-      MPI_Type_create_subarray(ndims, src_arr_sizes, src_arr_subsizes, src_arr_starts,
-          MPI_ORDER_C, MPI_DOUBLE, &src_type);
-
-      MPI_Type_create_subarray(ndims, dst_arr_sizes, dst_arr_subsizes, dst_arr_starts,
-          MPI_ORDER_C, MPI_DOUBLE, &dst_type);
-
-      MPI_Type_commit(&src_type);
-      MPI_Type_commit(&dst_type);
-
-      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
-
-      MPI_Accumulate(src_buf, 1, src_type, peer, 0, 1, dst_type, MPI_SUM, buf_win);
-
-      MPI_Win_unlock(peer, buf_win);
-
-      MPI_Type_free(&src_type);
-      MPI_Type_free(&dst_type);
-    }
-
-    MPI_Barrier(MPI_COMM_WORLD);
-
-    // Verify that the results are correct
-
-    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
-    errors = 0;
-    for (i = 0; i < SUB_XDIM; i++) {
-      for (j = 0; j < SUB_YDIM; j++) {
-        const double actual   = *(win_buf + i + j*XDIM);
-        const double expected = (1.0 + rank) + (1.0 + ((rank+nranks-1)%nranks)) * (ITERATIONS);
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    for (i = SUB_XDIM; i < XDIM; i++) {
-      for (j = 0; j < SUB_YDIM; j++) {
-        const double actual   = *(win_buf + i + j*XDIM);
-        const double expected = 1.0 + rank;
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    for (i = 0; i < XDIM; i++) {
-      for (j = SUB_YDIM; j < YDIM; j++) {
-        const double actual   = *(win_buf + i + j*XDIM);
-        const double expected = 1.0 + rank;
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    MPI_Win_unlock(rank, buf_win);
-
-    MPI_Win_free(&buf_win);
-    MPI_Free_mem(win_buf);
-    MPI_Free_mem(src_buf);
-
-    MPI_Finalize();
-
-    if (errors == 0) {
-      printf("%d: Success\n", rank);
-      return 0;
-    } else {
-      printf("%d: Fail\n", rank);
-      return 1;
-    }
-}
diff --git a/src/armci/tests/mpi/test_win_create.c b/src/armci/tests/mpi/test_win_create.c
deleted file mode 100644
index 20beadc..0000000
--- a/src/armci/tests/mpi/test_win_create.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <mpi.h>
-
-#define DATA_NELTS  1000
-#define NUM_WIN     1000   // Error starts at 17.  Up to 16 is ok.
-#define DATA_SZ     (DATA_NELTS*sizeof(int))
-
-int main(int argc, char ** argv) {
-  int      rank, nproc, i;
-  void    *base_ptrs[NUM_WIN];
-  MPI_Win  windows[NUM_WIN];
-
-  MPI_Init(&argc, &argv);
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (rank == 0) printf("Starting MPI window creation test with %d processes\n", nproc);
-
-  // Perform a pile of window creations
-  for (i = 0; i < NUM_WIN; i++) {
-    if (rank == 0) printf(" + Creating window %d\n", i);
-
-    MPI_Alloc_mem(DATA_SZ, MPI_INFO_NULL, &base_ptrs[i]);
-    MPI_Win_create(base_ptrs[i], DATA_SZ, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &windows[i]);
-  }
-
-  MPI_Barrier(MPI_COMM_WORLD);
-
-  // Free all the windows
-  for (i = 0; i < NUM_WIN; i++) {
-    if (rank == 0) printf(" + Freeing window %d\n", i);
-
-    MPI_Win_free(&windows[i]);
-    MPI_Free_mem(base_ptrs[i]);
-  }
-
-  if (rank == 0) printf("Test complete: PASS.\n");
-
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/test_accs.c b/src/armci/tests/test_accs.c
deleted file mode 100644
index 30fdb89..0000000
--- a/src/armci/tests/test_accs.c
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-#include <armci.h>
-
-#define XDIM 1024 
-#define YDIM 1024
-#define ITERATIONS 10
-
-int main(int argc, char **argv) {
-    int i, j, rank, nranks, peer, bufsize, errors, total_errors;
-    double **buffer, *src_buf;
-    int count[2], src_stride, trg_stride, stride_level;
-    double scaling, time;
-
-    MPI_Init(&argc, &argv);
-    ARMCI_Init();
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
-
-    buffer = (double **) malloc(sizeof(double *) * nranks);
-
-    bufsize = XDIM * YDIM * sizeof(double);
-    ARMCI_Malloc((void **) buffer, bufsize);
-    src_buf = ARMCI_Malloc_local(bufsize);
-
-    if (rank == 0)
-        printf("ARMCI Strided Accumulate Test:\n");
-
-    ARMCI_Access_begin(buffer[rank]);
-
-    for (i = 0; i < XDIM*YDIM; i++) {
-        *(buffer[rank] + i) = 1.0 + rank;
-        *(src_buf + i) = 1.0 + rank;
-    }
-
-    ARMCI_Access_end(buffer[rank]);
-
-    scaling = 2.0;
-
-    src_stride = XDIM * sizeof(double);
-    trg_stride = XDIM * sizeof(double);
-    stride_level = 1;
-
-    count[1] = YDIM;
-    count[0] = XDIM * sizeof(double);
-
-    ARMCI_Barrier();
-    time = MPI_Wtime();
-
-    peer = (rank+1) % nranks;
-
-    for (i = 0; i < ITERATIONS; i++) {
-
-      ARMCI_AccS(ARMCI_ACC_DBL,
-          (void *) &scaling,
-          src_buf,
-          &src_stride,
-          (void *) buffer[peer],
-          &trg_stride,
-          count,
-          stride_level,
-          peer);
-    }
-
-    ARMCI_Barrier();
-    time = MPI_Wtime() - time;
-
-    if (rank == 0) printf("Time: %f sec\n", time);
-
-    ARMCI_Access_begin(buffer[rank]);
-    for (i = errors = 0; i < XDIM; i++) {
-      for (j = 0; j < YDIM; j++) {
-        const double actual   = *(buffer[rank] + i + j*XDIM);
-        const double expected = (1.0 + rank) + scaling * (1.0 + ((rank+nranks-1)%nranks)) * (ITERATIONS);
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    ARMCI_Access_end(buffer[rank]);
-
-    MPI_Allreduce(&errors, &total_errors, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-
-    ARMCI_Free((void *) buffer[rank]);
-    ARMCI_Free_local(src_buf);
-    free(buffer);
-
-    ARMCI_Finalize();
-    MPI_Finalize();
-
-    if (total_errors == 0) {
-      if (rank == 0) printf("Success.\n");
-      return 0;
-    } else {
-      if (rank == 0) printf("Fail.\n");
-      return 1;
-    }
-}
diff --git a/src/armci/tests/test_accs_dla.c b/src/armci/tests/test_accs_dla.c
deleted file mode 100644
index 270e8d3..0000000
--- a/src/armci/tests/test_accs_dla.c
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-#include <armci.h>
-
-#define XDIM 1024 
-#define YDIM 1024
-#define ITERATIONS 10
-
-int main(int argc, char **argv) {
-    int i, j, rank, nranks, peer, bufsize, errors, total_errors;
-    double **buf_bvec, **src_bvec, *src_buf;
-    int count[2], src_stride, trg_stride, stride_level;
-    double scaling, time;
-
-    MPI_Init(&argc, &argv);
-    ARMCI_Init();
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
-
-    buf_bvec = (double **) malloc(sizeof(double *) * nranks);
-    src_bvec = (double **) malloc(sizeof(double *) * nranks);
-
-    bufsize = XDIM * YDIM * sizeof(double);
-    ARMCI_Malloc((void **) buf_bvec, bufsize);
-    ARMCI_Malloc((void **) src_bvec, bufsize);
-    src_buf = src_bvec[rank];
-
-    if (rank == 0)
-        printf("ARMCI Strided DLA Accumulate Test:\n");
-
-    ARMCI_Access_begin(buf_bvec[rank]);
-    ARMCI_Access_begin(src_buf);
-
-    for (i = 0; i < XDIM*YDIM; i++) {
-        *(buf_bvec[rank] + i) = 1.0 + rank;
-        *(src_buf + i) = 1.0 + rank;
-    }
-
-    ARMCI_Access_end(src_buf);
-    ARMCI_Access_end(buf_bvec[rank]);
-
-    scaling = 2.0;
-
-    src_stride = XDIM * sizeof(double);
-    trg_stride = XDIM * sizeof(double);
-    stride_level = 1;
-
-    count[1] = YDIM;
-    count[0] = XDIM * sizeof(double);
-
-    ARMCI_Barrier();
-    time = MPI_Wtime();
-
-    peer = (rank+1) % nranks;
-
-    for (i = 0; i < ITERATIONS; i++) {
-
-      ARMCI_AccS(ARMCI_ACC_DBL,
-          (void *) &scaling,
-          src_buf,
-          &src_stride,
-          (void *) buf_bvec[peer],
-          &trg_stride,
-          count,
-          stride_level,
-          peer);
-    }
-
-    ARMCI_Barrier();
-    time = MPI_Wtime() - time;
-
-    if (rank == 0) printf("Time: %f sec\n", time);
-
-    ARMCI_Access_begin(buf_bvec[rank]);
-    for (i = errors = 0; i < XDIM; i++) {
-      for (j = 0; j < YDIM; j++) {
-        const double actual   = *(buf_bvec[rank] + i + j*XDIM);
-        const double expected = (1.0 + rank) + scaling * (1.0 + ((rank+nranks-1)%nranks)) * (ITERATIONS);
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    ARMCI_Access_end(buf_bvec[rank]);
-
-    MPI_Allreduce(&errors, &total_errors, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-
-    ARMCI_Free((void *) buf_bvec[rank]);
-    ARMCI_Free((void *) src_bvec[rank]);
-
-    free(buf_bvec);
-    free(src_bvec);
-
-    ARMCI_Finalize();
-    MPI_Finalize();
-
-    if (total_errors == 0) {
-      if (rank == 0) printf("Success.\n");
-      return 0;
-    } else {
-      if (rank == 0) printf("Fail.\n");
-      return 1;
-    }
-}
diff --git a/src/armci/tests/test_assert.c b/src/armci/tests/test_assert.c
deleted file mode 100644
index 3085572..0000000
--- a/src/armci/tests/test_assert.c
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#include <mpi.h>
-#include <armci.h>
-
-int main(int argc, char ** argv) {
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  ARMCI_Get(NULL, NULL, 1, 0);
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/test_group_split.c b/src/armci/tests/test_group_split.c
deleted file mode 100644
index 1e87e8d..0000000
--- a/src/armci/tests/test_group_split.c
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <armci.h>
-#include <armcix.h>
-
-#define PART_SIZE 1
-
-int main(int argc, char **argv) {
-  int          me, nproc;
-  ARMCI_Group  g_world, g_new;
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &me);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (me == 0) printf("ARMCI Group test starting on %d procs\n", nproc);
-
-  ARMCI_Group_get_world(&g_world);
-  
-  if (me == 0) printf(" + Creating odd/even groups\n");
-
-  ARMCIX_Group_split(&g_world, me%2, me, &g_new);
-
-  ARMCI_Group_free(&g_new);
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/test_groups.c b/src/armci/tests/test_groups.c
deleted file mode 100644
index 078e0ad..0000000
--- a/src/armci/tests/test_groups.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <armci.h>
-
-#define PART_SIZE 1
-
-int main(int argc, char **argv) {
-  int                      me, nproc;
-  int                      i, *procs;
-  ARMCI_Group              g_world, g_odd, g_even;
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &me);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  procs = malloc(sizeof(int) * ( nproc/2 + (nproc % 2 ? 1 : 0 )));
-
-  if (me == 0) printf("ARMCI Group test starting on %d procs\n", nproc);
-
-  ARMCI_Group_get_world(&g_world);
-  
-  if (me == 0) printf(" + Creating odd group\n");
-
-  for (i = 1; i < nproc; i += 2) {
-    procs[i/2] = i;
-  }
-
-  ARMCI_Group_create_child(i/2, procs, &g_odd, &g_world);
-
-  if (me == 0) printf(" + Creating even group\n");
-
-  for (i = 0; i < nproc; i += 2) {
-    procs[i/2] = i;
-  }
-
-  ARMCI_Group_create_child(i/2, procs, &g_even, &g_world);
-
-  if (me == 0) printf(" + Freeing groups\n");
-
-  if (me % 2 > 0)
-    ARMCI_Group_free(&g_odd);
-  else
-    ARMCI_Group_free(&g_even);
-
-  free(procs);
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/test_igop.c b/src/armci/tests/test_igop.c
deleted file mode 100644
index 9b6fbd2..0000000
--- a/src/armci/tests/test_igop.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#include <mpi.h>
-#include <armci.h>
-
-#define DATA_SZ 100
-#define SHARED_BUF 1
-
-int main(int argc, char ** argv) {
-  int    rank, nproc, i;
-  int   *buf;
-#ifdef SHARED_BUF
-  void **base_ptrs;
-#endif
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (rank == 0) printf("Starting ARMCI GOP test with %d processes\n", nproc);
-
-#ifdef SHARED_BUF
-  base_ptrs = malloc(nproc*sizeof(void*));
-  ARMCI_Malloc(base_ptrs, DATA_SZ*sizeof(int));
-  buf = base_ptrs[rank];
-#else
-  buf = malloc(DATA_SZ*sizeof(int));
-#endif
-
-  if (rank == 0) printf(" - Testing ABSMIN\n");
-
-  for (i = 0; i < DATA_SZ; i++)
-    buf[i] = (rank+1) * ((i % 2) ? -1 : 1);
-
-  armci_msg_igop(buf, DATA_SZ, "absmin");
-
-  for (i = 0; i < DATA_SZ; i++)
-    if (buf[i] != 1) {
-      printf("Err: buf[%d] = %d expected 1\n", i, buf[i]);
-      ARMCI_Error("Fail", 1);
-    }
-
-  if (rank == 0) printf(" - Testing ABSMAX\n");
-
-  for (i = 0; i < DATA_SZ; i++)
-    buf[i] = (rank+1) * ((i % 2) ? -1 : 1);
-
-  armci_msg_igop(buf, DATA_SZ, "absmax");
-
-  for (i = 0; i < DATA_SZ; i++)
-    if (buf[i] != nproc) {
-      printf("Err: buf[%d] = %d expected %d\n", i, buf[i], nproc);
-      ARMCI_Error("Fail", 1);
-    }
-
-#ifdef SHARED_BUF
-  ARMCI_Free(base_ptrs[rank]);
-  free(base_ptrs);
-#else
-  free(buf);
-#endif
-
-  if (rank == 0) printf("Pass.\n");
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/test_malloc.c b/src/armci/tests/test_malloc.c
deleted file mode 100644
index fd0cbb2..0000000
--- a/src/armci/tests/test_malloc.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-/** ARMCI Malloc test -- James Dinan <dinan at mcs.anl.gov>
-  * 
-  * Perform a pile of allocations and then free them.
-  */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#include <mpi.h>
-#include <armci.h>
-
-#define DATA_NELTS     1000
-#define NUM_ITERATIONS 100
-#define DATA_SZ        (DATA_NELTS*sizeof(int))
-
-int main(int argc, char ** argv) {
-  int     rank, nproc, test_iter;
-  void ***base_ptrs;
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (rank == 0) printf("Starting ARMCI memory allocation test with %d processes\n", nproc);
-
-  base_ptrs = malloc(sizeof(void**)*NUM_ITERATIONS);
-
-  // Perform a pile of allocations
-  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
-    if (rank == 0) printf(" + allocation %d\n", test_iter);
-
-    base_ptrs[test_iter] = malloc(sizeof(void*)*nproc);
-    ARMCI_Malloc((void**)base_ptrs[test_iter], (test_iter % 4 == 0) ? 0 : DATA_SZ);
-  }
-
-  ARMCI_Barrier();
-
-  // Free all allocations
-  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
-    if (rank == 0) printf(" + free %d\n", test_iter);
-
-    ARMCI_Free(((void**)base_ptrs[test_iter])[rank]);
-    free(base_ptrs[test_iter]);
-  }
-
-  free(base_ptrs);
-
-  if (rank == 0) printf("Test complete: PASS.\n");
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/test_malloc_group.c b/src/armci/tests/test_malloc_group.c
deleted file mode 100644
index 6afa3c3..0000000
--- a/src/armci/tests/test_malloc_group.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <armci.h>
-#include <armcix.h>
-
-#define PART_SIZE 1
-#define DATA_SZ   100*sizeof(int)
-
-int main(int argc, char **argv) {
-  int          me, nproc, grp_me, grp_nproc;
-  ARMCI_Group  g_world, g_new;
-  void       **base_ptrs;
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &me);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  base_ptrs = malloc(sizeof(void*)*nproc);
-
-  if (me == 0) printf("ARMCI Group test starting on %d procs\n", nproc);
-
-  ARMCI_Group_get_world(&g_world);
-  
-  if (me == 0) printf(" + Creating odd/even groups\n");
-
-  ARMCIX_Group_split(&g_world, me%2, me, &g_new);
-
-  ARMCI_Group_rank(&g_new, &grp_me);
-  ARMCI_Group_size(&g_new, &grp_nproc);
-
-  if (me == 0) printf(" + Performing group allocation\n");
-  ARMCI_Malloc_group(base_ptrs, DATA_SZ, &g_new);
-  ARMCI_Barrier();
-
-  if (me == 0) printf(" + Freeing group allocation\n");
-
-  ARMCI_Free_group(base_ptrs[grp_me], &g_new);
-  ARMCI_Barrier();
-
-  if (me == 0) printf(" + Freeing group\n");
-
-  ARMCI_Group_free(&g_new);
-
-  if (me == 0) printf(" + done\n");
-
-  free(base_ptrs);
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/test_malloc_irreg.c b/src/armci/tests/test_malloc_irreg.c
deleted file mode 100644
index 7e011fc..0000000
--- a/src/armci/tests/test_malloc_irreg.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-/** ARMCI Irregular memory allocation test -- James Dinan <dinan at mcs.anl.gov>
-  * 
-  * Perform a series of allocations where all processes but one give zero bytes
-  * to ARMCI_Malloc.  The process that does a non-zero allocation initializes
-  * their shared memory and then the data is fetched by a neighbor and tested.
-  */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#include <mpi.h>
-#include <armci.h>
-
-#define DATA_NELTS     1000
-#define NUM_ITERATIONS 10
-#define DATA_SZ        (DATA_NELTS*sizeof(int))
-
-int main(int argc, char ** argv) {
-  int     rank, nproc, test_iter, i;
-  void ***base_ptrs;
-  int    *buf;
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (rank == 0) printf("Starting ARMCI irregular memory allocation test with %d processes\n", nproc);
-
-  buf       = malloc(DATA_SZ);
-  base_ptrs = malloc(sizeof(void**)*NUM_ITERATIONS);
-
-  // Perform a pile of allocations
-  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
-    if (rank == 0) printf(" + allocation %d\n", test_iter);
-
-    base_ptrs[test_iter] = malloc(sizeof(void*)*nproc);
-    ARMCI_Malloc((void**)base_ptrs[test_iter], (test_iter % nproc == rank) ? DATA_SZ : 0);
-  }
-
-  ARMCI_Barrier();
-
-  // Initialize data to my rank
-  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
-    if (test_iter % nproc == rank) {
-      ARMCI_Access_begin(base_ptrs[test_iter][rank]);
-      for (i = 0; i < DATA_NELTS; i++)
-        ((int*)base_ptrs[test_iter][rank])[i] = rank;
-      ARMCI_Access_end(base_ptrs[test_iter][rank]);
-    }
-  }
-
-  ARMCI_Barrier();
-
-  // Fetch and test
-  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
-    ARMCI_Get(base_ptrs[test_iter][test_iter%nproc], buf, DATA_SZ, test_iter%nproc);
-    for (i = 0; i < DATA_NELTS; i++) {
-      if (buf[i] != test_iter % nproc)
-        printf("Error: got %d expected %d\n", buf[i], test_iter%nproc);
-    }
-  }
-
-  // Free all allocations
-  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
-    if (rank == 0) printf(" + free %d\n", test_iter);
-
-    ARMCI_Free(((void**)base_ptrs[test_iter])[rank]);
-    free(base_ptrs[test_iter]);
-  }
-
-  ARMCI_Barrier();
-
-  free(base_ptrs);
-  free(buf);
-
-  if (rank == 0) printf("Test complete: PASS.\n");
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/test_mutex.c b/src/armci/tests/test_mutex.c
deleted file mode 100644
index 39244c5..0000000
--- a/src/armci/tests/test_mutex.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-/** ARMCI Mutex test -- James Dinan <dinan at mcs.anl.gov>
-  * 
-  * All processes create N mutexes then lock+unlock all mutexes on all
-  * processes.
-  */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#include <mpi.h>
-#include <armci.h>
-
-#define NUM_MUTEXES 10
-
-int main(int argc, char ** argv) {
-  int rank, nproc, i, j;
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (rank == 0) printf("Starting ARMCI mutex test with %d processes\n", nproc);
-
-  ARMCI_Create_mutexes(NUM_MUTEXES);
-
-  for (i = 0; i < nproc; i++)
-    for (j = 0; j < NUM_MUTEXES; j++) {
-      ARMCI_Lock(  j, (rank+i)%nproc);
-      ARMCI_Unlock(j, (rank+i)%nproc);
-    }
-
-  printf(" + %3d done\n", rank);
-  fflush(NULL);
-
-  ARMCI_Destroy_mutexes();
-
-  if (rank == 0) printf("Test complete: PASS.\n");
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/test_mutex_rmw.c b/src/armci/tests/test_mutex_rmw.c
deleted file mode 100644
index 830ba11..0000000
--- a/src/armci/tests/test_mutex_rmw.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-/** ARMCI Mutex RMW Test -- James Dinan <dinan at mcs.anl.gov>
-  * 
-  * A mutex and shared integer live on process 0.  All processes lock, add a
-  * value to the integer, and unlock.  Process 0 confirms the final result.
-  */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#include <mpi.h>
-#include <armci.h>
-
-#define NITER 1000
-#define ADDIN 5
-
-int main(int argc, char ** argv) {
-  int    rank, nproc, val, i;
-  void **base_ptrs;
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (rank == 0) printf("Starting ARMCI mutex read-modify-write test with %d processes\n", nproc);
-
-  base_ptrs = malloc(nproc*sizeof(void*));
-
-  ARMCI_Create_mutexes(rank == 0 ? 1 : 0);
-  ARMCI_Malloc(base_ptrs, (rank == 0) ? sizeof(int) : 0); // Proc 0 has a shared int
-
-  if (rank == 0) {
-    val = 0;
-    ARMCI_Put(&val, base_ptrs[0], sizeof(int), 0);
-  }
-
-  ARMCI_Barrier();
-
-  for (i = 0; i < NITER; i++) {
-    ARMCI_Lock(0, 0);
-
-    ARMCI_Get(base_ptrs[0], &val, sizeof(int), 0);
-    val += ADDIN;
-    ARMCI_Put(&val, base_ptrs[0], sizeof(int), 0);
-
-    ARMCI_Unlock(0, 0);
-  }
-
-  printf(" + %3d done\n", rank);
-  fflush(NULL);
-
-  ARMCI_Barrier();
-
-  if (rank == 0) {
-    ARMCI_Get(base_ptrs[0], &val, sizeof(int), 0);
-
-    if (val == ADDIN*nproc*NITER)
-      printf("Test complete: PASS.\n");
-    else
-      printf("Test complete: FAIL.  Got %d, expected %d.\n", val, ADDIN*nproc*NITER);
-  }
-
-  ARMCI_Free(base_ptrs[rank]);
-  ARMCI_Destroy_mutexes();
-  free(base_ptrs);
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/test_mutex_trylock.c b/src/armci/tests/test_mutex_trylock.c
deleted file mode 100644
index 7b9aa62..0000000
--- a/src/armci/tests/test_mutex_trylock.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-/** ARMCI Mutex test -- James Dinan <dinan at mcs.anl.gov>
-  * 
-  * All processes create N mutexes then lock+unlock all mutexes on all
-  * processes.  Locking is accomplished via trylock in a loop.
-  */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <unistd.h>
-
-#include <mpi.h>
-#include <armci.h>
-#include <armcix.h>
-
-#define NUM_MUTEXES 10
-
-int main(int argc, char ** argv) {
-  int rank, nproc, i, j;
-  armcix_mutex_hdl_t mhdl;
-  ARMCI_Group world_group;
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (rank == 0) printf("Starting ARMCIX mutex test with %d processes\n", nproc);
-
-  ARMCI_Group_get_world(&world_group);
-  mhdl = ARMCIX_Create_mutexes_hdl(NUM_MUTEXES, &world_group);
-
-  for (i = 0; i < nproc; i++)
-    for (j = 0; j < NUM_MUTEXES; j++) {
-      while (ARMCIX_Trylock_hdl(mhdl, j, (rank+i)%nproc))
-        ;
-      ARMCIX_Unlock_hdl(mhdl, j, (rank+i)%nproc);
-    }
-
-  printf(" + %3d done\n", rank);
-  fflush(NULL);
-
-  ARMCIX_Destroy_mutexes_hdl(mhdl);
-
-  if (rank == 0) printf("Test complete: PASS.\n");
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/test_onesided.c b/src/armci/tests/test_onesided.c
deleted file mode 100644
index b72b4f5..0000000
--- a/src/armci/tests/test_onesided.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#include <mpi.h>
-#include <armci.h>
-
-#define DATA_NELTS     1000
-#define NUM_ITERATIONS 10
-#define DATA_SZ        (DATA_NELTS*sizeof(int))
-
-int main(int argc, char ** argv) {
-  int    rank, nproc, i, test_iter;
-  int   *my_data, *buf;
-  void **base_ptrs;
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (rank == 0) printf("Starting ARMCI test with %d processes\n", nproc);
-
-  buf = malloc(DATA_SZ);
-  base_ptrs = malloc(sizeof(void*)*nproc);
-
-  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
-    if (rank == 0) printf(" + iteration %d\n", test_iter);
-
-    /*** Allocate the shared array ***/
-    ARMCI_Malloc(base_ptrs, DATA_SZ);
-    my_data = base_ptrs[rank];
-
-    /*** Get from our right neighbor and verify correct data ***/
-    ARMCI_Access_begin(my_data);
-    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank*test_iter;
-    ARMCI_Access_end(my_data);
-
-    ARMCI_Barrier(); // Wait for all updates to data to complete
-
-    ARMCI_Get(base_ptrs[(rank+1) % nproc], buf, DATA_SZ, (rank+1) % nproc);
-
-    for (i = 0; i < DATA_NELTS; i++) {
-      if (buf[i] != ((rank+1) % nproc)*test_iter) {
-        printf("%d: GET expected %d, got %d\n", rank, (rank+1) % nproc, buf[i]);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-
-    ARMCI_Barrier(); // Wait for all gets to complete
-
-    /*** Put to our left neighbor and verify correct data ***/
-    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank*test_iter;
-    ARMCI_Put(buf, base_ptrs[(rank+nproc-1) % nproc], DATA_SZ, (rank+nproc-1) % nproc);
-
-    ARMCI_Barrier(); // Wait for all updates to data to complete
-
-    ARMCI_Access_begin(my_data);
-    for (i = 0; i < DATA_NELTS; i++) {
-      if (my_data[i] != ((rank+1) % nproc)*test_iter) {
-        printf("%d: PUT expected %d, got %d\n", rank, (rank+1) % nproc, my_data[i]);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-    ARMCI_Access_end(my_data);
-
-    ARMCI_Barrier(); // Wait for all gets to complete
-
-    /*** Accumulate to our left neighbor and verify correct data ***/
-    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank;
-    
-    ARMCI_Access_begin(my_data);
-    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank;
-    ARMCI_Access_end(my_data);
-    ARMCI_Barrier();
-
-    int scale = test_iter;
-    ARMCI_Acc(ARMCI_ACC_INT, &scale, buf, base_ptrs[(rank+nproc-1) % nproc], DATA_SZ, (rank+nproc-1) % nproc);
-
-    ARMCI_Barrier(); // Wait for all updates to data to complete
-
-    ARMCI_Access_begin(my_data);
-    for (i = 0; i < DATA_NELTS; i++) {
-      if (my_data[i] != rank + ((rank+1) % nproc)*test_iter) {
-        printf("%d: ACC expected %d, got %d\n", rank, (rank+1) % nproc, my_data[i]);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-    ARMCI_Access_end(my_data);
-
-    ARMCI_Free(my_data);
-  }
-
-  free(buf);
-  free(base_ptrs);
-
-  if (rank == 0) printf("Test complete: PASS.\n");
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/test_onesided_shared.c b/src/armci/tests/test_onesided_shared.c
deleted file mode 100644
index 03578e6..0000000
--- a/src/armci/tests/test_onesided_shared.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#include <mpi.h>
-#include <armci.h>
-
-#define VERBOSE        0
-#define DATA_NELTS     1000
-#define NUM_ITERATIONS 10
-#define DATA_SZ        (DATA_NELTS*sizeof(int))
-
-int main(int argc, char ** argv) {
-  int    rank, nproc, i, test_iter;
-  int   *my_data, *buf;
-  void **base_ptrs;
-  void **buf_shared;
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (rank == 0) printf("Starting ARMCI test with %d processes\n", nproc);
-
-  base_ptrs  = malloc(sizeof(void*)*nproc);
-  buf_shared = malloc(sizeof(void*)*nproc);
-
-  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
-    if (rank == 0) printf(" + iteration %d\n", test_iter);
-
-    if (rank == 0 && VERBOSE) printf("   - Allocating shared buffers\n");
-
-    /*** Allocate the shared array ***/
-    ARMCI_Malloc(base_ptrs,  DATA_SZ);
-    ARMCI_Malloc(buf_shared, DATA_SZ);
-
-    buf     = buf_shared[rank];
-    my_data = base_ptrs[rank];
-
-    if (rank == 0 && VERBOSE) printf("   - Testing one-sided get\n");
-
-    /*** Get from our right neighbor and verify correct data ***/
-    ARMCI_Access_begin(my_data);
-    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank*test_iter;
-    ARMCI_Access_end(my_data);
-
-    ARMCI_Barrier(); // Wait for all updates to data to complete
-
-    ARMCI_Get(base_ptrs[(rank+1) % nproc], buf, DATA_SZ, (rank+1) % nproc);
-
-    ARMCI_Access_begin(buf);
-
-    for (i = 0; i < DATA_NELTS; i++) {
-      if (buf[i] != ((rank+1) % nproc)*test_iter) {
-        printf("%d: GET expected %d, got %d\n", rank, (rank+1) % nproc, buf[i]);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-
-    ARMCI_Access_end(buf);
-
-    ARMCI_Barrier(); // Wait for all gets to complete
-
-    if (rank == 0 && VERBOSE) printf("   - Testing one-sided put\n");
-
-    /*** Put to our left neighbor and verify correct data ***/
-    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank*test_iter;
-    ARMCI_Put(buf, base_ptrs[(rank+nproc-1) % nproc], DATA_SZ, (rank+nproc-1) % nproc);
-
-    ARMCI_Barrier(); // Wait for all updates to data to complete
-
-    ARMCI_Access_begin(my_data);
-    for (i = 0; i < DATA_NELTS; i++) {
-      if (my_data[i] != ((rank+1) % nproc)*test_iter) {
-        printf("%d: PUT expected %d, got %d\n", rank, (rank+1) % nproc, my_data[i]);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-    ARMCI_Access_end(my_data);
-
-    ARMCI_Barrier(); // Wait for all gets to complete
-
-    if (rank == 0 && VERBOSE) printf("   - Testing one-sided accumlate\n");
-
-    /*** Accumulate to our left neighbor and verify correct data ***/
-    ARMCI_Access_begin(buf);
-    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank;
-    ARMCI_Access_end(buf);
-    
-    ARMCI_Access_begin(my_data);
-    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank;
-    ARMCI_Access_end(my_data);
-    ARMCI_Barrier();
-
-    int scale = test_iter;
-    ARMCI_Acc(ARMCI_ACC_INT, &scale, buf, base_ptrs[(rank+nproc-1) % nproc], DATA_SZ, (rank+nproc-1) % nproc);
-
-    ARMCI_Barrier(); // Wait for all updates to data to complete
-
-    ARMCI_Access_begin(my_data);
-    for (i = 0; i < DATA_NELTS; i++) {
-      if (my_data[i] != rank + ((rank+1) % nproc)*test_iter) {
-        printf("%d: ACC expected %d, got %d\n", rank, (rank+1) % nproc, my_data[i]);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-    ARMCI_Access_end(my_data);
-
-    if (rank == 0 && VERBOSE) printf("   - Freeing shared buffers\n");
-
-    ARMCI_Free(my_data);
-    ARMCI_Free(buf);
-  }
-
-  free(base_ptrs);
-  free(buf_shared);
-
-  if (rank == 0) printf("Test complete: PASS.\n");
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/test_onesided_shared_dla.c b/src/armci/tests/test_onesided_shared_dla.c
deleted file mode 100644
index f438ed3..0000000
--- a/src/armci/tests/test_onesided_shared_dla.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#include <mpi.h>
-#include <armci.h>
-
-#define VERBOSE        0
-#define DATA_NELTS     1000
-#define NUM_ITERATIONS 10
-#define DATA_SZ        (DATA_NELTS*sizeof(int))
-
-int main(int argc, char ** argv) {
-  int    rank, nproc, i, test_iter;
-  int   *my_data, *buf;
-  void **base_ptrs;
-  void **buf_shared;
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (rank == 0) printf("Starting ARMCI test with %d processes\n", nproc);
-
-  base_ptrs  = malloc(sizeof(void*)*nproc);
-  buf_shared = malloc(sizeof(void*)*nproc);
-
-  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
-    if (rank == 0) printf(" + iteration %d\n", test_iter);
-
-    if (rank == 0 && VERBOSE) printf("   - Allocating shared buffers\n");
-
-    /*** Allocate the shared array ***/
-    ARMCI_Malloc(base_ptrs,  DATA_SZ);
-    ARMCI_Malloc(buf_shared, DATA_SZ);
-
-    buf     = buf_shared[rank];
-    my_data = base_ptrs[rank];
-
-    if (rank == 0 && VERBOSE) printf("   - Testing one-sided get\n");
-    
-    ARMCI_Access_begin(buf);
-
-    /*** Get from our right neighbor and verify correct data ***/
-    ARMCI_Access_begin(my_data);
-    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank*test_iter;
-    ARMCI_Access_end(my_data);
-
-    ARMCI_Barrier(); // Wait for all updates to data to complete
-
-    ARMCI_Get(base_ptrs[(rank+1) % nproc], buf, DATA_SZ, (rank+1) % nproc);
-
-    for (i = 0; i < DATA_NELTS; i++) {
-      if (buf[i] != ((rank+1) % nproc)*test_iter) {
-        printf("%d: GET expected %d, got %d\n", rank, (rank+1) % nproc, buf[i]);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-
-    ARMCI_Barrier();
-
-    if (rank == 0 && VERBOSE) printf("   - Testing one-sided put\n");
-
-    /*** Put to our left neighbor and verify correct data ***/
-    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank*test_iter;
-
-    ARMCI_Put(buf, base_ptrs[(rank+nproc-1) % nproc], DATA_SZ, (rank+nproc-1) % nproc);
-
-    ARMCI_Barrier();
-
-    ARMCI_Access_begin(my_data);
-    for (i = 0; i < DATA_NELTS; i++) {
-      if (my_data[i] != ((rank+1) % nproc)*test_iter) {
-        printf("%d: PUT expected %d, got %d\n", rank, (rank+1) % nproc, my_data[i]);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-    ARMCI_Access_end(my_data);
-
-    ARMCI_Barrier();
-
-    if (rank == 0 && VERBOSE) printf("   - Testing one-sided accumlate\n");
-
-    /*** Accumulate to our left neighbor and verify correct data ***/
-    ARMCI_Access_begin(my_data);
-    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank;
-    
-    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank;
-    ARMCI_Access_end(my_data);
-
-    ARMCI_Barrier();
-
-    int scale = test_iter;
-    ARMCI_Acc(ARMCI_ACC_INT, &scale, buf, base_ptrs[(rank+nproc-1) % nproc], DATA_SZ, (rank+nproc-1) % nproc);
-
-    ARMCI_Barrier();
-
-    ARMCI_Access_begin(my_data);
-    for (i = 0; i < DATA_NELTS; i++) {
-      if (my_data[i] != rank + ((rank+1) % nproc)*test_iter) {
-        printf("%d: ACC expected %d, got %d\n", rank, (rank+1) % nproc, my_data[i]);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-    ARMCI_Access_end(my_data);
-    
-    ARMCI_Access_end(buf);
-
-    if (rank == 0 && VERBOSE) printf("   - Freeing shared buffers\n");
-
-    ARMCI_Free(my_data);
-    ARMCI_Free(buf);
-  }
-
-  free(base_ptrs);
-  free(buf_shared);
-
-  if (rank == 0) printf("Test complete: PASS.\n");
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/test_parmci.c b/src/armci/tests/test_parmci.c
deleted file mode 100644
index 91e9ca1..0000000
--- a/src/armci/tests/test_parmci.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#include <mpi.h>
-#include <armci.h>
-
-#define DATA_NELTS     1000
-#define NUM_ITERATIONS 10
-#define DATA_SZ        (DATA_NELTS*sizeof(int))
-
-       int armci_calls = 0;
-extern int parmci_calls;
-
-int main(int argc, char ** argv) {
-  int    rank, nproc, i, test_iter;
-  int   *my_data, *buf;
-  void **base_ptrs;
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-  armci_calls++;
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (rank == 0) printf("Starting ARMCI test with %d processes\n", nproc);
-
-  buf = malloc(DATA_SZ);
-  base_ptrs = malloc(sizeof(void*)*nproc);
-
-  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
-    if (rank == 0) printf(" + iteration %d\n", test_iter);
-
-    /*** Allocate the shared array ***/
-    ARMCI_Malloc(base_ptrs, DATA_SZ);
-    my_data = base_ptrs[rank];
-
-    /*** Get from our right neighbor and verify correct data ***/
-    ARMCI_Access_begin(my_data);
-    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank*test_iter;
-    ARMCI_Access_end(my_data);
-
-    ARMCI_Barrier(); // Wait for all updates to data to complete
-    armci_calls++;
-
-    ARMCI_Get(base_ptrs[(rank+1) % nproc], buf, DATA_SZ, (rank+1) % nproc);
-    armci_calls++;
-
-    for (i = 0; i < DATA_NELTS; i++) {
-      if (buf[i] != ((rank+1) % nproc)*test_iter) {
-        printf("%d: GET expected %d, got %d\n", rank, (rank+1) % nproc, buf[i]);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-
-    ARMCI_Barrier(); // Wait for all gets to complete
-    armci_calls++;
-
-    /*** Put to our left neighbor and verify correct data ***/
-    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank*test_iter;
-    ARMCI_Put(buf, base_ptrs[(rank+nproc-1) % nproc], DATA_SZ, (rank+nproc-1) % nproc);
-    armci_calls++;
-
-    ARMCI_Barrier(); // Wait for all updates to data to complete
-    armci_calls++;
-
-    ARMCI_Access_begin(my_data);
-    for (i = 0; i < DATA_NELTS; i++) {
-      if (my_data[i] != ((rank+1) % nproc)*test_iter) {
-        printf("%d: PUT expected %d, got %d\n", rank, (rank+1) % nproc, my_data[i]);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-    ARMCI_Access_end(my_data);
-
-    ARMCI_Barrier(); // Wait for all gets to complete
-    armci_calls++;
-
-    /*** Accumulate to our left neighbor and verify correct data ***/
-    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank;
-    
-    ARMCI_Access_begin(my_data);
-    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank;
-    ARMCI_Access_end(my_data);
-    ARMCI_Barrier();
-    armci_calls++;
-
-    int scale = test_iter;
-    ARMCI_Acc(ARMCI_ACC_INT, &scale, buf, base_ptrs[(rank+nproc-1) % nproc], DATA_SZ, (rank+nproc-1) % nproc);
-
-    ARMCI_Barrier(); // Wait for all updates to data to complete
-    armci_calls++;
-
-    ARMCI_Access_begin(my_data);
-    for (i = 0; i < DATA_NELTS; i++) {
-      if (my_data[i] != rank + ((rank+1) % nproc)*test_iter) {
-        printf("%d: ACC expected %d, got %d\n", rank, (rank+1) % nproc, my_data[i]);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-    ARMCI_Access_end(my_data);
-
-    ARMCI_Free(my_data);
-  }
-
-  free(buf);
-  free(base_ptrs);
-  
-  if (armci_calls == parmci_calls) {
-    if (rank == 0) {
-      printf("Profiling check ok: %d recorded == %d profiled calls\n", armci_calls, parmci_calls);
-      printf("Test complete: PASS.\n");
-    }
-  } else {
-    printf("%d: Profiling check failed -- %d recorded != %d profiled calls\n", rank, armci_calls, parmci_calls);
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  ARMCI_Finalize();
-  armci_calls++;
-
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/armci/tests/test_parmci_lib.c b/src/armci/tests/test_parmci_lib.c
deleted file mode 100644
index 44056d2..0000000
--- a/src/armci/tests/test_parmci_lib.c
+++ /dev/null
@@ -1,28 +0,0 @@
-#include <armci.h>
-
-int parmci_calls = 0;
-
-int ARMCI_Init(void) {
-  parmci_calls++;
-  return PARMCI_Init();
-}
-   
-int ARMCI_Finalize(void) {
-  parmci_calls++;
-  return PARMCI_Finalize();
-}
-
-void ARMCI_Barrier(void) {
-  parmci_calls++;
-  return PARMCI_Barrier();
-}
-
-int ARMCI_Get(void *src, void *dst, int size, int target) {
-    parmci_calls++;
-    return PARMCI_Get(src, dst, size, target);
-}
-
-int ARMCI_Put(void *src, void *dst, int size, int target) {
-    parmci_calls++;
-    return PARMCI_Put(src, dst, size, target);
-}
diff --git a/src/armci/tests/test_puts.c b/src/armci/tests/test_puts.c
deleted file mode 100644
index 21a11bc..0000000
--- a/src/armci/tests/test_puts.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-#include <armci.h>
-
-#define XDIM 1024 
-#define YDIM 1024
-#define ITERATIONS 10
-
-int main(int argc, char **argv) {
-    int i, j, rank, nranks, peer, bufsize, errors;
-    double **buffer, *src_buf;
-    int count[2], src_stride, trg_stride, stride_level;
-
-    MPI_Init(&argc, &argv);
-    ARMCI_Init();
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
-
-    buffer = (double **) malloc(sizeof(double *) * nranks);
-
-    bufsize = XDIM * YDIM * sizeof(double);
-    ARMCI_Malloc((void **) buffer, bufsize);
-    src_buf = ARMCI_Malloc_local(bufsize);
-
-    if (rank == 0)
-        printf("ARMCI Strided Put Test:\n");
-
-    src_stride = XDIM * sizeof(double);
-    trg_stride = XDIM * sizeof(double);
-    stride_level = 1;
-
-    count[1] = YDIM;
-    count[0] = XDIM * sizeof(double);
-
-    ARMCI_Barrier();
-
-    peer = (rank+1) % nranks;
-
-    for (i = 0; i < ITERATIONS; i++) {
-
-      for (j = 0; j < XDIM*YDIM; j++) {
-        *(src_buf + j) = rank + i;
-      }
-
-      ARMCI_PutS(
-          src_buf,
-          &src_stride,
-          (void *) buffer[peer],
-          &trg_stride,
-          count,
-          stride_level,
-          peer);
-    }
-
-    ARMCI_Barrier();
-
-    ARMCI_Access_begin(buffer[rank]);
-    for (i = errors = 0; i < XDIM; i++) {
-      for (j = 0; j < YDIM; j++) {
-        const double actual   = *(buffer[rank] + i + j*XDIM);
-        const double expected = (1.0 + rank) + (1.0 + ((rank+nranks-1)%nranks)) + (ITERATIONS);
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    ARMCI_Access_end(buffer[rank]);
-
-    ARMCI_Free((void *) buffer[rank]);
-    ARMCI_Free_local(src_buf);
-    free(buffer);
-
-    ARMCI_Finalize();
-    MPI_Finalize();
-
-    if (errors == 0) {
-      printf("%d: Success\n", rank);
-      return 0;
-    } else {
-      printf("%d: Fail\n", rank);
-      return 1;
-    }
-}
diff --git a/src/armci/tests/test_puts_gets.c b/src/armci/tests/test_puts_gets.c
deleted file mode 100644
index aef6811..0000000
--- a/src/armci/tests/test_puts_gets.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-#include <armci.h>
-
-#define XDIM 5
-#define YDIM 98
-#define ITERATIONS 10
-
-int main(int argc, char **argv) {
-    int i, j, rank, nranks, peer, bufsize, errors;
-    double **buffer, *src_buf, *dst_buf;
-    int count[2], src_stride, trg_stride, stride_level;
-
-    MPI_Init(&argc, &argv);
-    ARMCI_Init();
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
-
-    buffer = (double **) malloc(sizeof(double *) * nranks);
-
-    bufsize = XDIM * YDIM * sizeof(double);
-    ARMCI_Malloc((void **) buffer, bufsize);
-    src_buf = ARMCI_Malloc_local(bufsize);
-    dst_buf = ARMCI_Malloc_local(bufsize);
-
-    if (rank == 0)
-        printf("ARMCI Strided Put Test:\n");
-
-    src_stride = XDIM * sizeof(double);
-    trg_stride = XDIM * sizeof(double);
-    stride_level = 1;
-
-    count[1] = YDIM;
-    count[0] = XDIM * sizeof(double);
-
-    ARMCI_Barrier();
-
-    peer = (rank+1) % nranks;
-
-    for (i = 0; i < ITERATIONS; i++) {
-
-      for (j = 0; j < XDIM*YDIM; j++) {
-        *(src_buf + j) = rank + i;
-      }
-
-      ARMCI_PutS(
-          src_buf,
-          &src_stride,
-          (void *) buffer[peer],
-          &trg_stride,
-          count,
-          stride_level,
-          peer);
-
-      ARMCI_GetS(
-          (void *) buffer[peer],
-          &trg_stride,
-          dst_buf,
-          &src_stride,
-          count,
-          stride_level,
-          peer);
-    }
-
-    ARMCI_Barrier();
-
-    ARMCI_Access_begin(buffer[rank]);
-    for (i = errors = 0; i < XDIM; i++) {
-      for (j = 0; j < YDIM; j++) {
-        const double actual   = *(buffer[rank] + i + j*XDIM);
-        const double expected = (1.0 + rank) + (1.0 + ((rank+nranks-1)%nranks)) + (ITERATIONS);
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    ARMCI_Access_end(buffer[rank]);
-
-    ARMCI_Free((void *) buffer[rank]);
-    ARMCI_Free_local(src_buf);
-    ARMCI_Free_local(dst_buf);
-    free(buffer);
-
-    ARMCI_Finalize();
-    MPI_Finalize();
-
-    if (errors == 0) {
-      printf("%d: Success\n", rank);
-      return 0;
-    } else {
-      printf("%d: Fail\n", rank);
-      return 1;
-    }
-}
diff --git a/src/armci/tests/test_puts_gets_dla.c b/src/armci/tests/test_puts_gets_dla.c
deleted file mode 100644
index 56d01de..0000000
--- a/src/armci/tests/test_puts_gets_dla.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-#include <armci.h>
-
-#define XDIM 5
-#define YDIM 98
-#define ITERATIONS 10
-
-int main(int argc, char **argv) {
-    int i, j, rank, nranks, peer, bufsize, errors;
-    double *src_buf, *dst_buf;
-    double **shr_bvec, **src_bvec, **dst_bvec;
-    int count[2], src_stride, trg_stride, stride_level;
-
-    MPI_Init(&argc, &argv);
-    ARMCI_Init();
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
-
-    shr_bvec = (double **) malloc(sizeof(double *) * nranks);
-    src_bvec = (double **) malloc(sizeof(double *) * nranks);
-    dst_bvec = (double **) malloc(sizeof(double *) * nranks);
-
-    bufsize = XDIM * YDIM * sizeof(double);
-    ARMCI_Malloc((void **) shr_bvec, bufsize);
-    ARMCI_Malloc((void **) src_bvec, bufsize);
-    ARMCI_Malloc((void **) dst_bvec, bufsize);
-
-    src_buf = src_bvec[rank];
-    dst_buf = dst_bvec[rank];
-
-    if (rank == 0)
-        printf("ARMCI Strided DLA Put Test:\n");
-
-    src_stride = XDIM * sizeof(double);
-    trg_stride = XDIM * sizeof(double);
-    stride_level = 1;
-
-    count[1] = YDIM;
-    count[0] = XDIM * sizeof(double);
-
-    ARMCI_Barrier();
-
-    peer = (rank+1) % nranks;
-
-    for (i = 0; i < ITERATIONS; i++) {
-
-      ARMCI_Access_begin(src_buf);
-      for (j = 0; j < XDIM*YDIM; j++) {
-        *(src_buf + j) = rank + i;
-      }
-      ARMCI_Access_end(src_buf);
-
-      ARMCI_PutS(
-          src_buf,
-          &src_stride,
-          (void *) shr_bvec[peer],
-          &trg_stride,
-          count,
-          stride_level,
-          peer);
-
-      ARMCI_GetS(
-          (void *) shr_bvec[peer],
-          &trg_stride,
-          dst_buf,
-          &src_stride,
-          count,
-          stride_level,
-          peer);
-    }
-
-    ARMCI_Barrier();
-
-    ARMCI_Access_begin(shr_bvec[rank]);
-    for (i = errors = 0; i < XDIM; i++) {
-      for (j = 0; j < YDIM; j++) {
-        const double actual   = *(shr_bvec[rank] + i + j*XDIM);
-        const double expected = (1.0 + rank) + (1.0 + ((rank+nranks-1)%nranks)) + (ITERATIONS);
-        if (actual - expected > 1e-10) {
-          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
-              rank, j, i, expected, actual);
-          errors++;
-          fflush(stdout);
-        }
-      }
-    }
-    ARMCI_Access_end(shr_bvec[rank]);
-
-    ARMCI_Free((void *) shr_bvec[rank]);
-    ARMCI_Free((void *) src_bvec[rank]);
-    ARMCI_Free((void *) dst_bvec[rank]);
-
-    free(shr_bvec);
-    free(src_bvec);
-    free(dst_bvec);
-
-    ARMCI_Finalize();
-    MPI_Finalize();
-
-    if (errors == 0) {
-      printf("%d: Success\n", rank);
-      return 0;
-    } else {
-      printf("%d: Fail\n", rank);
-      return 1;
-    }
-}
diff --git a/src/armci/tests/test_rmw_fadd.c b/src/armci/tests/test_rmw_fadd.c
deleted file mode 100644
index 518fc02..0000000
--- a/src/armci/tests/test_rmw_fadd.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (C) 2010. See COPYRIGHT in top-level directory.
- */
-
-/** ARMCI RMW-FADD test -- James Dinan <dinan at mcs.anl.gov>
-  * 
-  * All processes allocate one shared integer counter per process.  All
-  * processes perform NINC atomic fetch-and-add operations on every counter.
-  */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#include <mpi.h>
-#include <armci.h>
-
-#define NINC 100
-
-#ifdef USE_ARMCI_LONG
-#  define INC_TYPE long
-#  define ARMCI_OP ARMCI_FETCH_AND_ADD_LONG
-#else
-#  define INC_TYPE int
-#  define ARMCI_OP ARMCI_FETCH_AND_ADD
-#endif
-
-int main(int argc, char ** argv) {
-  int        errors = 0;
-  int        rank, nproc, i, j;
-  void     **base_ptrs;
-  INC_TYPE   val;
-
-  MPI_Init(&argc, &argv);
-  ARMCI_Init();
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
-
-  if (rank == 0) printf("Starting ARMCI RMW-FADD test with %d processes\n", nproc);
-
-  base_ptrs = malloc(sizeof(void*)*nproc);
-  ARMCI_Malloc(base_ptrs, sizeof(INC_TYPE));
-
-  ARMCI_Access_begin(base_ptrs[rank]);
-  *(int*) base_ptrs[rank] = 0;
-  ARMCI_Access_end(base_ptrs[rank]);
-
-  ARMCI_Barrier();
-
-  for (i = 0; i < NINC; i++) {
-    for (j = 0; j < nproc; j++) {
-      ARMCI_Rmw(ARMCI_OP, &val, base_ptrs[j], 1, j);
-    }
-  }
-
-  ARMCI_Barrier();
-
-  ARMCI_Access_begin(base_ptrs[rank]);
-  if (*(int*) base_ptrs[rank] != NINC*nproc) {
-    errors++;
-    printf("%3d -- Got %d, expected %d\n", rank, *(int*) base_ptrs[rank], NINC*nproc);
-  }
-  ARMCI_Access_end(base_ptrs[rank]);
-
-  armci_msg_igop(&errors, 1, "+");
-
-  if (rank == 0) {
-    if (errors == 0) printf("Test complete: PASS.\n");
-    else            printf("Test fail: %d errors.\n", errors);
-  }
-
-  ARMCI_Free(base_ptrs[rank]);
-  free(base_ptrs);
-
-  ARMCI_Finalize();
-  MPI_Finalize();
-
-  return 0;
-}

-----------------------------------------------------------------------

Summary of changes:
 autogen.sh                                       |    3 +-
 maint/cvardirs                                   |    2 +-
 maint/errmsgdirs                                 |    2 +-
 src/armci/COPYRIGHT                              |   49 --
 src/armci/Makefile.am                            |   55 --
 src/armci/README                                 |  152 ----
 src/armci/VERSION                                |   20 -
 src/armci/autogen.sh                             |    9 -
 src/armci/benchmarks/Makefile.mk                 |   22 -
 src/armci/benchmarks/bench_groups.c              |   90 ---
 src/armci/benchmarks/contiguous-bench.c          |  114 ---
 src/armci/benchmarks/ping-pong.c                 |  114 ---
 src/armci/benchmarks/ring-flood.c                |   84 ---
 src/armci/benchmarks/strided-bench.c             |  126 ----
 src/armci/configure.ac                           |  196 ------
 src/armci/src/armci.h                            |  282 --------
 src/armci/src/armci_internals.h                  |  185 -----
 src/armci/src/armcix.h                           |   57 --
 src/armci/src/buffer.c                           |  432 ------------
 src/armci/src/conflict_tree.c                    |  310 ---------
 src/armci/src/conflict_tree.h                    |   31 -
 src/armci/src/debug.c                            |  101 ---
 src/armci/src/debug.h                            |   52 --
 src/armci/src/gmr.c                              |  578 ----------------
 src/armci/src/gmr.h                              |   68 --
 src/armci/src/groups.c                           |  360 ----------
 src/armci/src/init_finalize.c                    |  279 --------
 src/armci/src/internals.c                        |  135 ----
 src/armci/src/malloc.c                           |  160 -----
 src/armci/src/message.c                          |  443 ------------
 src/armci/src/message.h                          |   81 ---
 src/armci/src/message_gop.c                      |  255 -------
 src/armci/src/mp3.fh                             |    8 -
 src/armci/src/mp3.h                              |   17 -
 src/armci/src/mp3def.fh                          |    3 -
 src/armci/src/mutex.c                            |  120 ----
 src/armci/src/mutex_hdl_queue.c                  |  229 ------
 src/armci/src/mutex_hdl_spin.c                   |  229 ------
 src/armci/src/onesided.c                         |  376 ----------
 src/armci/src/onesided_nb.c                      |  133 ----
 src/armci/src/parmci.c                           |  294 --------
 src/armci/src/rmw.c                              |   96 ---
 src/armci/src/strided.c                          |  800 ----------------------
 src/armci/src/topology.c                         |   62 --
 src/armci/src/util.c                             |  199 ------
 src/armci/src/value_ops.c                        |  191 -----
 src/armci/src/vector.c                           |  532 --------------
 src/armci/tests/ARMCI_AccS_latency.c             |  233 -------
 src/armci/tests/ARMCI_PutS_latency.c             |  175 -----
 src/armci/tests/Makefile.mk                      |   80 ---
 src/armci/tests/ctree/Makefile.mk                |   16 -
 src/armci/tests/ctree/ctree_test.c               |   59 --
 src/armci/tests/ctree/ctree_test_rand.c          |   61 --
 src/armci/tests/ctree/ctree_test_rand_interval.c |   87 ---
 src/armci/tests/mpi/Makefile.mk                  |   31 -
 src/armci/tests/mpi/copy_bench.c                 |   60 --
 src/armci/tests/mpi/pgroup_bench.c               |  156 -----
 src/armci/tests/mpi/ping-pong-mpi.c              |  114 ---
 src/armci/tests/mpi/test_mpi_accs.c              |   81 ---
 src/armci/tests/mpi/test_mpi_indexed_accs.c      |  147 ----
 src/armci/tests/mpi/test_mpi_indexed_gets.c      |  137 ----
 src/armci/tests/mpi/test_mpi_indexed_puts_gets.c |  143 ----
 src/armci/tests/mpi/test_mpi_subarray_accs.c     |  140 ----
 src/armci/tests/mpi/test_win_create.c            |   49 --
 src/armci/tests/test_accs.c                      |  107 ---
 src/armci/tests/test_accs_dla.c                  |  113 ---
 src/armci/tests/test_assert.c                    |   22 -
 src/armci/tests/test_group_split.c               |   37 -
 src/armci/tests/test_groups.c                    |   58 --
 src/armci/tests/test_igop.c                      |   77 --
 src/armci/tests/test_malloc.c                    |   61 --
 src/armci/tests/test_malloc_group.c              |   59 --
 src/armci/tests/test_malloc_irreg.c              |   89 ---
 src/armci/tests/test_mutex.c                     |   50 --
 src/armci/tests/test_mutex_rmw.c                 |   77 --
 src/armci/tests/test_mutex_trylock.c             |   56 --
 src/armci/tests/test_onesided.c                  |  108 ---
 src/armci/tests/test_onesided_shared.c           |  130 ----
 src/armci/tests/test_onesided_shared_dla.c       |  130 ----
 src/armci/tests/test_parmci.c                    |  130 ----
 src/armci/tests/test_parmci_lib.c                |   28 -
 src/armci/tests/test_puts.c                      |   92 ---
 src/armci/tests/test_puts_gets.c                 |  103 ---
 src/armci/tests/test_puts_gets_dla.c             |  114 ---
 src/armci/tests/test_rmw_fadd.c                  |   80 ---
 test/mpi/spawn/pgroup_intercomm_test.c           |    4 +-
 86 files changed, 5 insertions(+), 11625 deletions(-)
 delete mode 100644 src/armci/COPYRIGHT
 delete mode 100644 src/armci/Makefile.am
 delete mode 100644 src/armci/README
 delete mode 100644 src/armci/VERSION
 delete mode 100755 src/armci/autogen.sh
 delete mode 100644 src/armci/benchmarks/Makefile.mk
 delete mode 100644 src/armci/benchmarks/bench_groups.c
 delete mode 100644 src/armci/benchmarks/contiguous-bench.c
 delete mode 100644 src/armci/benchmarks/ping-pong.c
 delete mode 100644 src/armci/benchmarks/ring-flood.c
 delete mode 100644 src/armci/benchmarks/strided-bench.c
 delete mode 100644 src/armci/configure.ac
 delete mode 100644 src/armci/src/armci.h
 delete mode 100644 src/armci/src/armci_internals.h
 delete mode 100644 src/armci/src/armcix.h
 delete mode 100644 src/armci/src/buffer.c
 delete mode 100644 src/armci/src/conflict_tree.c
 delete mode 100644 src/armci/src/conflict_tree.h
 delete mode 100644 src/armci/src/debug.c
 delete mode 100644 src/armci/src/debug.h
 delete mode 100644 src/armci/src/gmr.c
 delete mode 100644 src/armci/src/gmr.h
 delete mode 100644 src/armci/src/groups.c
 delete mode 100644 src/armci/src/init_finalize.c
 delete mode 100644 src/armci/src/internals.c
 delete mode 100644 src/armci/src/malloc.c
 delete mode 100644 src/armci/src/message.c
 delete mode 100644 src/armci/src/message.h
 delete mode 100644 src/armci/src/message_gop.c
 delete mode 100644 src/armci/src/mp3.fh
 delete mode 100644 src/armci/src/mp3.h
 delete mode 100644 src/armci/src/mp3def.fh
 delete mode 100644 src/armci/src/mutex.c
 delete mode 100644 src/armci/src/mutex_hdl_queue.c
 delete mode 100644 src/armci/src/mutex_hdl_spin.c
 delete mode 100644 src/armci/src/onesided.c
 delete mode 100644 src/armci/src/onesided_nb.c
 delete mode 100644 src/armci/src/parmci.c
 delete mode 100644 src/armci/src/rmw.c
 delete mode 100644 src/armci/src/strided.c
 delete mode 100644 src/armci/src/topology.c
 delete mode 100644 src/armci/src/util.c
 delete mode 100644 src/armci/src/value_ops.c
 delete mode 100644 src/armci/src/vector.c
 delete mode 100644 src/armci/tests/ARMCI_AccS_latency.c
 delete mode 100644 src/armci/tests/ARMCI_PutS_latency.c
 delete mode 100644 src/armci/tests/Makefile.mk
 delete mode 100644 src/armci/tests/ctree/Makefile.mk
 delete mode 100644 src/armci/tests/ctree/ctree_test.c
 delete mode 100644 src/armci/tests/ctree/ctree_test_rand.c
 delete mode 100644 src/armci/tests/ctree/ctree_test_rand_interval.c
 delete mode 100644 src/armci/tests/mpi/Makefile.mk
 delete mode 100644 src/armci/tests/mpi/copy_bench.c
 delete mode 100644 src/armci/tests/mpi/pgroup_bench.c
 delete mode 100644 src/armci/tests/mpi/ping-pong-mpi.c
 delete mode 100644 src/armci/tests/mpi/test_mpi_accs.c
 delete mode 100644 src/armci/tests/mpi/test_mpi_indexed_accs.c
 delete mode 100644 src/armci/tests/mpi/test_mpi_indexed_gets.c
 delete mode 100644 src/armci/tests/mpi/test_mpi_indexed_puts_gets.c
 delete mode 100644 src/armci/tests/mpi/test_mpi_subarray_accs.c
 delete mode 100644 src/armci/tests/mpi/test_win_create.c
 delete mode 100644 src/armci/tests/test_accs.c
 delete mode 100644 src/armci/tests/test_accs_dla.c
 delete mode 100644 src/armci/tests/test_assert.c
 delete mode 100644 src/armci/tests/test_group_split.c
 delete mode 100644 src/armci/tests/test_groups.c
 delete mode 100644 src/armci/tests/test_igop.c
 delete mode 100644 src/armci/tests/test_malloc.c
 delete mode 100644 src/armci/tests/test_malloc_group.c
 delete mode 100644 src/armci/tests/test_malloc_irreg.c
 delete mode 100644 src/armci/tests/test_mutex.c
 delete mode 100644 src/armci/tests/test_mutex_rmw.c
 delete mode 100644 src/armci/tests/test_mutex_trylock.c
 delete mode 100644 src/armci/tests/test_onesided.c
 delete mode 100644 src/armci/tests/test_onesided_shared.c
 delete mode 100644 src/armci/tests/test_onesided_shared_dla.c
 delete mode 100644 src/armci/tests/test_parmci.c
 delete mode 100644 src/armci/tests/test_parmci_lib.c
 delete mode 100644 src/armci/tests/test_puts.c
 delete mode 100644 src/armci/tests/test_puts_gets.c
 delete mode 100644 src/armci/tests/test_puts_gets_dla.c
 delete mode 100644 src/armci/tests/test_rmw_fadd.c


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list