[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.1.3-126-g769ab3b

Service Account noreply at mpich.org
Wed Nov 5 00:57:31 CST 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  769ab3b40a77e6ae70f0c57fb0a829b64b75cbd4 (commit)
       via  bc3ad0430fa4cec565d47ad08a2fd0bf0beb2896 (commit)
       via  15e16af6e40fba1df0e2e5bc8f0749214fefd565 (commit)
       via  ce486ba0384633292b19ea124a07088dbd95872c (commit)
       via  d0b28aa71ebcd03673879c0672cd29097409d41b (commit)
       via  7887c04e1d398858d6f989fa42664cf698c312b3 (commit)
      from  db61d8d89ed21803983b557a46e48b0340404902 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/769ab3b40a77e6ae70f0c57fb0a829b64b75cbd4

commit 769ab3b40a77e6ae70f0c57fb0a829b64b75cbd4
Author: Ken Raffenetti <raffenet at mcs.anl.gov>
Date:   Tue Nov 4 22:28:19 2014 -0600

    portals4: prevent early request free
    
    The large send handler incorrectly assumed event ordering from portals.
    This could lead to a request being freed while pending events would
    still attempt to access it, causing a segfault or incorrect handler to
    execute.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_impl.h b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_impl.h
index d7974a5..f5c204d 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_impl.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_impl.h
@@ -45,6 +45,7 @@ typedef struct {
     ptl_handle_me_t put_me;
     ptl_handle_me_t *get_me_p;
     int num_gets;
+    int put_acked;
     ptl_size_t chunk_offset;
     void *chunk_buffer[MPID_NEM_PTL_NUM_CHUNK_BUFFERS];
     MPIDI_msg_sz_t bytes_put;
@@ -67,6 +68,7 @@ typedef struct {
         REQ_PTL(req_)->put_me        = PTL_INVALID_HANDLE;      \
         REQ_PTL(req_)->get_me_p      = NULL;                    \
         REQ_PTL(req_)->num_gets      = 0;                       \
+        REQ_PTL(req_)->put_acked     = 0;                       \
         REQ_PTL(req_)->event_handler = NULL;                    \
         REQ_PTL(req_)->chunk_offset  = 0;                       \
     } while (0)
diff --git a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c
index 6abffaa..e6bbc66 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c
@@ -111,12 +111,16 @@ static int handler_large(const ptl_event_t *e)
         /* truncated message */
         mpi_errno = handler_send_complete(e);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    } else if (e->type == PTL_EVENT_ACK) {
+        REQ_PTL(sreq)->put_acked = 1;
     } else if (e->type == PTL_EVENT_GET) {
         /* decrement the remaining get operations */
-        if (--REQ_PTL(sreq)->num_gets == 0)
-            mpi_errno = handler_send_complete(e);
+        REQ_PTL(sreq)->num_gets--;
     }
 
+    if (REQ_PTL(sreq)->num_gets == 0 && REQ_PTL(sreq)->put_acked)
+        mpi_errno = handler_send_complete(e);
+
  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_HANDLER_LARGE);
     return mpi_errno;

http://git.mpich.org/mpich.git/commitdiff/bc3ad0430fa4cec565d47ad08a2fd0bf0beb2896

commit bc3ad0430fa4cec565d47ad08a2fd0bf0beb2896
Author: Ken Raffenetti <raffenet at mcs.anl.gov>
Date:   Tue Oct 28 17:49:44 2014 -0500

    portals4: handle out-of-order reply events
    
    In large message cases, when multiple get operations are issued, the
    data may arrive out-of-order back at the initiator. A counter is required
    to ensure all operations have completed. In the temporary buffer case, we
    simply wait for all the data to arrive, and unpack in one operation.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_impl.h b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_impl.h
index ec599f2..d7974a5 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_impl.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_impl.h
@@ -44,6 +44,7 @@ typedef struct {
     ptl_handle_md_t md;
     ptl_handle_me_t put_me;
     ptl_handle_me_t *get_me_p;
+    int num_gets;
     ptl_size_t chunk_offset;
     void *chunk_buffer[MPID_NEM_PTL_NUM_CHUNK_BUFFERS];
     MPIDI_msg_sz_t bytes_put;
@@ -65,6 +66,7 @@ typedef struct {
         REQ_PTL(req_)->md            = PTL_INVALID_HANDLE;      \
         REQ_PTL(req_)->put_me        = PTL_INVALID_HANDLE;      \
         REQ_PTL(req_)->get_me_p      = NULL;                    \
+        REQ_PTL(req_)->num_gets      = 0;                       \
         REQ_PTL(req_)->event_handler = NULL;                    \
         REQ_PTL(req_)->chunk_offset  = 0;                       \
     } while (0)
diff --git a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_poll.c b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_poll.c
index 4c4b6a4..26a1eb2 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_poll.c
@@ -165,14 +165,11 @@ int MPID_nem_ptl_poll(int is_blocking_poll)
         case PTL_EVENT_ACK:
         case PTL_EVENT_REPLY:
         case PTL_EVENT_SEARCH: {
-            /* intermediate operations for large messages pass a NULL user_ptr. we can ignore these events */
-            if (event.user_ptr) {
-                MPID_Request * const req = event.user_ptr;
-                MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "req = %p", req);
-                MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "REQ_PTL(req)->event_handler = %p", REQ_PTL(req)->event_handler);
-                mpi_errno = REQ_PTL(req)->event_handler(&event);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-            }
+            MPID_Request * const req = event.user_ptr;
+            MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "req = %p", req);
+            MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "REQ_PTL(req)->event_handler = %p", REQ_PTL(req)->event_handler);
+            mpi_errno = REQ_PTL(req)->event_handler(&event);
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
             break;
         }
         case PTL_EVENT_AUTO_FREE:
diff --git a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_recv.c b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_recv.c
index 80576e7..2152da7 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_recv.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_recv.c
@@ -57,7 +57,7 @@ static int handler_recv_complete(const ptl_event_t *e)
     MPIDI_FUNC_ENTER(MPID_STATE_HANDLER_RECV_COMPLETE);
     
     MPIU_Assert(e->type == PTL_EVENT_REPLY || e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW);
-    
+
     if (REQ_PTL(rreq)->md != PTL_INVALID_HANDLE) {
         ret = PtlMDRelease(REQ_PTL(rreq)->md);
         MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdrelease", "**ptlmdrelease %s", MPID_nem_ptl_strerror(ret));
@@ -122,10 +122,10 @@ static int handler_recv_dequeue_complete(const ptl_event_t *e)
 }
 
 #undef FUNCNAME
-#define FUNCNAME handler_recv_unpack
+#define FUNCNAME handler_recv_big_get
 #undef FCNAME
 #define FCNAME MPIU_QUOTE(FUNCNAME)
-static int handler_recv_unpack(const ptl_event_t *e)
+static int handler_recv_big_get(const ptl_event_t *e)
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Request *const rreq = e->user_ptr;
@@ -137,14 +137,17 @@ static int handler_recv_unpack(const ptl_event_t *e)
 
     MPIU_Assert(e->type == PTL_EVENT_REPLY);
 
-    last = rreq->dev.segment_size;
-    MPID_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first, &last,
-                        (char *)REQ_PTL(rreq)->chunk_buffer[0] + REQ_PTL(rreq)->chunk_offset);
-
-    rreq->dev.segment_first += e->mlength;
-    REQ_PTL(rreq)->chunk_offset += e->mlength;
-    if (rreq->dev.segment_first == rreq->dev.segment_size)
+    /* decrement the number of remaining gets */
+    REQ_PTL(rreq)->num_gets--;
+    if (REQ_PTL(rreq)->num_gets == 0) {
+        /* if we used a temporary buffer, unpack the data */
+        if (REQ_PTL(rreq)->chunk_buffer[0]) {
+            last = rreq->dev.segment_size;
+            MPID_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, REQ_PTL(rreq)->chunk_buffer[0]);
+            MPIU_Assert(last == rreq->dev.segment_size);
+        }
         mpi_errno = handler_recv_complete(e);
+    }
 
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
@@ -164,25 +167,21 @@ static void big_get(void *buf, ptl_size_t left_to_get, MPIDI_VC_t *vc, ptl_match
     int ret;
     MPID_nem_ptl_vc_area *vc_ptl;
     ptl_size_t start, get_sz;
-    void *user_ptr = NULL;
 
     vc_ptl = VC_PTL(vc);
     start = (ptl_size_t)buf;
 
-    /* we need to handle all event if we are unpacking from the chunk_buffer */
-    if (REQ_PTL(rreq)->event_handler == handler_recv_unpack)
-        user_ptr = rreq;
+    /* we need to handle all events */
+    REQ_PTL(rreq)->event_handler = handler_recv_big_get;
 
     while (left_to_get > 0) {
         /* get up to the maximum allowed by the portals interface */
-        if (left_to_get > MPIDI_nem_ptl_ni_limits.max_msg_size) {
+        if (left_to_get > MPIDI_nem_ptl_ni_limits.max_msg_size)
             get_sz = MPIDI_nem_ptl_ni_limits.max_msg_size;
-        } else {
+        else
             get_sz = left_to_get;
-            /* attach the request to the final operation */
-            user_ptr = rreq;
-        }
-        ret = MPID_nem_ptl_rptl_get(MPIDI_nem_ptl_global_md, start, get_sz, vc_ptl->id, vc_ptl->ptg, match_bits, 0, user_ptr);
+
+        ret = MPID_nem_ptl_rptl_get(MPIDI_nem_ptl_global_md, start, get_sz, vc_ptl->id, vc_ptl->ptg, match_bits, 0, rreq);
         DBG_MSG_GET("global", get_sz, vc->pg_rank, match_bits);
         MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "   buf=%p", (char *)start);
         MPIU_Assert(ret == 0);
@@ -190,6 +189,7 @@ static void big_get(void *buf, ptl_size_t left_to_get, MPIDI_VC_t *vc, ptl_match
         /* account for what has been sent */
         start += get_sz;
         left_to_get -= get_sz;
+        REQ_PTL(rreq)->num_gets++;
     }
 }
 
@@ -304,8 +304,6 @@ static int handler_recv_dequeue_large(const ptl_event_t *e)
 
     /* we need to GET the rest of the data from the sender's buffer */
     if (dt_contig) {
-        REQ_PTL(rreq)->event_handler = handler_recv_complete;
-
         big_get((char *)rreq->dev.user_buf + dt_true_lb + PTL_LARGE_THRESHOLD, data_sz - PTL_LARGE_THRESHOLD,
                 vc, e->match_bits, rreq);
         goto fn_exit;
@@ -338,10 +336,9 @@ static int handler_recv_dequeue_large(const ptl_event_t *e)
         
     /* message won't fit in a single IOV, allocate buffer and unpack when received */
     /* FIXME: For now, allocate a single large buffer to hold entire message */
-    MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, rreq->dev.segment_size - rreq->dev.segment_first,
+    MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, data_sz - PTL_LARGE_THRESHOLD,
                         mpi_errno, "chunk_buffer");
-    REQ_PTL(rreq)->event_handler = handler_recv_unpack;
-    big_get(REQ_PTL(rreq)->chunk_buffer[0], rreq->dev.segment_size - rreq->dev.segment_first, vc, e->match_bits, rreq);
+    big_get(REQ_PTL(rreq)->chunk_buffer[0], data_sz - PTL_LARGE_THRESHOLD, vc, e->match_bits, rreq);
 
  fn_exit:
     MPIU_CHKPMEM_COMMIT();
@@ -396,7 +393,6 @@ static int handler_recv_dequeue_unpack_large(const ptl_event_t *e)
 
     MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, rreq->dev.segment_size - rreq->dev.segment_first,
                         mpi_errno, "chunk_buffer");
-    REQ_PTL(rreq)->event_handler = handler_recv_unpack;
     big_get(REQ_PTL(rreq)->chunk_buffer[0], rreq->dev.segment_size - rreq->dev.segment_first, vc, e->match_bits, rreq);
 
  fn_exit:
@@ -708,7 +704,6 @@ int MPID_nem_ptl_lmt_start_recv(MPIDI_VC_t *vc,  MPID_Request *rreq, MPID_IOV s_
     if (dt_contig) {
         void * real_user_buf = (char *)rreq->dev.user_buf + dt_true_lb;
 
-        REQ_PTL(rreq)->event_handler = handler_recv_complete;
         big_get((char *)real_user_buf + PTL_LARGE_THRESHOLD, data_sz - PTL_LARGE_THRESHOLD, vc, match_bits, rreq);
 
         /* The memcpy is done after the get purposely for overlapping */
@@ -756,7 +751,6 @@ int MPID_nem_ptl_lmt_start_recv(MPIDI_VC_t *vc,  MPID_Request *rreq, MPID_IOV s_
             /* FIXME: For now, allocate a single large buffer to hold entire message */
             MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, rreq->dev.segment_size - rreq->dev.segment_first,
                                 mpi_errno, "chunk_buffer");
-            REQ_PTL(rreq)->event_handler = handler_recv_unpack;
             big_get(REQ_PTL(rreq)->chunk_buffer[0], rreq->dev.segment_size - rreq->dev.segment_first, vc, match_bits, rreq);
         }
     }
diff --git a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c
index 152ff88..6abffaa 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c
@@ -13,7 +13,6 @@
 #define FCNAME MPIU_QUOTE(FUNCNAME)
 static void big_meappend(void *buf, ptl_size_t left_to_send, MPIDI_VC_t *vc, ptl_match_bits_t match_bits, MPID_Request *sreq)
 {
-    void * user_ptr = NULL;
     int i, ret;
     MPID_nem_ptl_vc_area *vc_ptl;
     ptl_me_t me;
@@ -39,19 +38,17 @@ static void big_meappend(void *buf, ptl_size_t left_to_send, MPIDI_VC_t *vc, ptl
         /* send up to the maximum allowed by the portals interface */
         if (left_to_send > MPIDI_nem_ptl_ni_limits.max_msg_size)
             me.length = MPIDI_nem_ptl_ni_limits.max_msg_size;
-        else {
+        else
             me.length = left_to_send;
-            /* attach the request to the final operation */
-            user_ptr = sreq;
-        }
 
-        ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, user_ptr, &REQ_PTL(sreq)->get_me_p[i]);
+        ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, sreq, &REQ_PTL(sreq)->get_me_p[i]);
         DBG_MSG_MEAPPEND("CTL", vc->pg_rank, me, sreq);
         MPIU_Assert(ret == 0);
 
         /* account for what has been sent */
         me.start = (char *)me.start + me.length;
         left_to_send -= me.length;
+        REQ_PTL(sreq)->num_gets++;
     }
 }
 
@@ -114,8 +111,10 @@ static int handler_large(const ptl_event_t *e)
         /* truncated message */
         mpi_errno = handler_send_complete(e);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-    } else {
-        REQ_PTL(sreq)->event_handler = handler_send_complete;
+    } else if (e->type == PTL_EVENT_GET) {
+        /* decrement the remaining get operations */
+        if (--REQ_PTL(sreq)->num_gets == 0)
+            mpi_errno = handler_send_complete(e);
     }
 
  fn_exit:
@@ -382,7 +381,8 @@ static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *
                 me.min_free = 0;
 
                 MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->get_me_p, ptl_handle_me_t *, sizeof(ptl_handle_me_t), mpi_errno, "get_me_p");
-                        
+
+                REQ_PTL(sreq)->num_gets = 1;
                 ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, sreq,
                                   &REQ_PTL(sreq)->get_me_p[0]);
                 MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeappend", "**ptlmeappend %s", MPID_nem_ptl_strerror(ret));

http://git.mpich.org/mpich.git/commitdiff/15e16af6e40fba1df0e2e5bc8f0749214fefd565

commit 15e16af6e40fba1df0e2e5bc8f0749214fefd565
Author: Ken Raffenetti <raffenet at mcs.anl.gov>
Date:   Thu Oct 23 10:54:13 2014 -0500

    add test for large non-contiguous datatype
    
    This is useful test case for netmods that use packing and/or break
    large messages into smaller chunks.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/test/mpi/datatype/Makefile.am b/test/mpi/datatype/Makefile.am
index 03c13b5..2840393 100644
--- a/test/mpi/datatype/Makefile.am
+++ b/test/mpi/datatype/Makefile.am
@@ -39,6 +39,7 @@ noinst_PROGRAMS =           \
     large-count             \
     large_type              \
     large_type_sendrec      \
+    large_vec	            \
     lbub                    \
     localpack               \
     longdouble              \
diff --git a/test/mpi/datatype/large_vec.c b/test/mpi/datatype/large_vec.c
new file mode 100644
index 0000000..0b7eafb
--- /dev/null
+++ b/test/mpi/datatype/large_vec.c
@@ -0,0 +1,81 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+
+/* tests non-contig send/recv of a message > 2GB. count=270M, type=long long
+   run with 3 processes to exercise both shared memory and TCP in Nemesis tests*/
+
+int main(int argc, char *argv[])
+{
+    int ierr, i, size, rank;
+    int elems = 270000000;
+    MPI_Status status;
+    MPI_Datatype dtype;
+    long long *cols;
+    int errs = 0;
+
+
+    MTest_Init(&argc, &argv);
+
+    /* need large memory */
+    if (sizeof(void *) < 8) {
+        MTest_Finalize(errs);
+        MPI_Finalize();
+        return 0;
+    }
+
+    ierr = MPI_Comm_size(MPI_COMM_WORLD, &size);
+    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    if (size != 3) {
+        fprintf(stderr, "[%d] usage: mpiexec -n 3 %s\n", rank, argv[0]);
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    cols = malloc(elems * sizeof(long long));
+    if (cols == NULL) {
+        printf("malloc of >2GB array failed\n");
+        errs++;
+        MTest_Finalize(errs);
+        MPI_Finalize();
+        return 0;
+    }
+
+    MPI_Type_vector(elems / 2, 1, 2, MPI_LONG_LONG_INT, &dtype);
+    MPI_Type_commit(&dtype);
+
+    if (rank == 0) {
+        for (i = 0; i < elems; i++)
+            cols[i] = i;
+        /* printf("[%d] sending...\n",rank); */
+        ierr = MPI_Send(cols, 1, dtype, 1, 0, MPI_COMM_WORLD);
+        ierr = MPI_Send(cols, 1, dtype, 2, 0, MPI_COMM_WORLD);
+    }
+    else {
+        /* printf("[%d] receiving...\n",rank); */
+        for (i = 0; i < elems; i++)
+            cols[i] = -1;
+        ierr = MPI_Recv(cols, 1, dtype, 0, 0, MPI_COMM_WORLD, &status);
+        /* ierr = MPI_Get_count(&status,MPI_LONG_LONG_INT,&cnt);
+         * Get_count still fails because count is not 64 bit */
+        for (i = 0; i < elems; i++) {
+            if (i % 2)
+                continue;
+            if (cols[i] != i) {
+                printf("Rank %d, cols[i]=%lld, should be %d\n", rank, cols[i], i);
+                errs++;
+            }
+        }
+    }
+
+    MPI_Type_free(&dtype);
+
+    MTest_Finalize(errs);
+    MPI_Finalize();
+    return 0;
+}
diff --git a/test/mpi/datatype/testlist.in b/test/mpi/datatype/testlist.in
index 549c3eb..7fee6a4 100644
--- a/test/mpi/datatype/testlist.in
+++ b/test/mpi/datatype/testlist.in
@@ -59,3 +59,4 @@ cxx-types 1 mpiversion=3.0
 @largetest at large_type 1 mpiversion=3.0
 @largetest at large_type_sendrec 2 arg=31 mpiversion=3.0
 @largetest at large_type_sendrec 2 arg=32 mpiversion=3.0 timeLimit=360
+ at largetest@large_vec 3 mpiversion=3.0

http://git.mpich.org/mpich.git/commitdiff/ce486ba0384633292b19ea124a07088dbd95872c

commit ce486ba0384633292b19ea124a07088dbd95872c
Author: Ken Raffenetti <raffenet at mcs.anl.gov>
Date:   Fri Oct 24 14:35:27 2014 -0500

    portals4: use helper function for big sends
    
    Move some duplicate code for posting multiple get operations into a
    dedicated helper function.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c
index 95e244e..152ff88 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c
@@ -8,6 +8,54 @@
 #include "rptl.h"
 
 #undef FUNCNAME
+#define FUNCNAME big_meappend
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static void big_meappend(void *buf, ptl_size_t left_to_send, MPIDI_VC_t *vc, ptl_match_bits_t match_bits, MPID_Request *sreq)
+{
+    void * user_ptr = NULL;
+    int i, ret;
+    MPID_nem_ptl_vc_area *vc_ptl;
+    ptl_me_t me;
+
+    vc_ptl = VC_PTL(vc);
+
+    me.start = buf;
+    me.ct_handle = PTL_CT_NONE;
+    me.uid = PTL_UID_ANY;
+    me.options = ( PTL_ME_OP_PUT | PTL_ME_OP_GET | PTL_ME_USE_ONCE | PTL_ME_IS_ACCESSIBLE | PTL_ME_EVENT_LINK_DISABLE |
+                   PTL_ME_EVENT_UNLINK_DISABLE );
+    me.match_id = vc_ptl->id;
+    me.match_bits = match_bits;
+    me.ignore_bits = 0;
+    me.min_free = 0;
+
+    /* allocate enough handles to cover all get operations */
+    REQ_PTL(sreq)->get_me_p = MPIU_Malloc(sizeof(ptl_handle_me_t) *
+                                        ((left_to_send / MPIDI_nem_ptl_ni_limits.max_msg_size) + 1));
+
+    /* queue up as many entries as necessary to describe the entire message */
+    for (i = 0; left_to_send > 0; i++) {
+        /* send up to the maximum allowed by the portals interface */
+        if (left_to_send > MPIDI_nem_ptl_ni_limits.max_msg_size)
+            me.length = MPIDI_nem_ptl_ni_limits.max_msg_size;
+        else {
+            me.length = left_to_send;
+            /* attach the request to the final operation */
+            user_ptr = sreq;
+        }
+
+        ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, user_ptr, &REQ_PTL(sreq)->get_me_p[i]);
+        DBG_MSG_MEAPPEND("CTL", vc->pg_rank, me, sreq);
+        MPIU_Assert(ret == 0);
+
+        /* account for what has been sent */
+        me.start = (char *)me.start + me.length;
+        left_to_send -= me.length;
+    }
+}
+
+#undef FUNCNAME
 #define FUNCNAME handler_send_complete
 #undef FCNAME
 #define FCNAME MPIU_QUOTE(FUNCNAME)
@@ -277,47 +325,9 @@ static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *
     /* Large message.  Send first chunk of data and let receiver get the rest */
     if (dt_contig) {
         /* create ME for buffer so receiver can issue a GET for the data */
-        ptl_size_t left_to_send;
-        void * user_ptr = NULL;
-        int i;
-
         MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Large contig message");
-        me.start = (char *)buf + dt_true_lb + PTL_LARGE_THRESHOLD;
-        left_to_send = data_sz - PTL_LARGE_THRESHOLD;
-        me.ct_handle = PTL_CT_NONE;
-        me.uid = PTL_UID_ANY;
-        me.options = ( PTL_ME_OP_PUT | PTL_ME_OP_GET | PTL_ME_USE_ONCE | PTL_ME_IS_ACCESSIBLE | PTL_ME_EVENT_LINK_DISABLE |
-                       PTL_ME_EVENT_UNLINK_DISABLE );
-        me.match_id = vc_ptl->id;
-        me.match_bits = NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank);
-        me.ignore_bits = 0;
-        me.min_free = 0;
-
-        /* allocate enough handles to cover all get operations */
-        MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->get_me, ptl_handle_me_t *,
-                            sizeof(ptl_handle_me_t) * ((left_to_send / MPIDI_nem_ptl_ni_limits.max_msg_size) + 1),
-                            mpi_errno, "get_me");
-
-        /* queue up as many entries as necessary to describe the entire message */
-        for (i = 0; left_to_send > 0; i++) {
-            /* send up to the maximum allowed by the portals interface */
-            if (left_to_send > MPIDI_nem_ptl_ni_limits.max_msg_size)
-                me.length = MPIDI_nem_ptl_ni_limits.max_msg_size;
-            else {
-                me.length = left_to_send;
-                /* attach the request to the final operation */
-                user_ptr = sreq;
-            }
-
-            ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, user_ptr, &REQ_PTL(sreq)->get_me[i]);
-            MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeappend", "**ptlmeappend %s", MPID_nem_ptl_strerror(ret));
-            DBG_MSG_MEAPPEND("CTL", vc->pg_rank, me, sreq);
-
-            /* account for what has been sent */
-            me.start = (char *)me.start + me.length;
-            left_to_send -= me.length;
-        }
-
+        big_meappend((char *)buf + dt_true_lb + PTL_LARGE_THRESHOLD, data_sz - PTL_LARGE_THRESHOLD, vc,
+                     NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), sreq);
         REQ_PTL(sreq)->large = TRUE;
 
         REQ_PTL(sreq)->event_handler = handler_large;
@@ -402,11 +412,6 @@ static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *
         /* Don't handle this case separately */
     }
 
-    /* same code as large contig */
-    ptl_size_t left_to_send;
-    void * user_ptr = NULL;
-    int i;
-
     /* allocate a temporary buffer and copy all the data to send */
     MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->chunk_buffer[0], void *, data_sz, mpi_errno, "tmpbuf");
 
@@ -414,42 +419,8 @@ static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *
     MPID_Segment_pack(sreq->dev.segment_ptr, 0, &last, REQ_PTL(sreq)->chunk_buffer[0]);
     MPIU_Assert(last == data_sz);
 
-    me.start = (char *)REQ_PTL(sreq)->chunk_buffer[0] + PTL_LARGE_THRESHOLD;
-    left_to_send = data_sz - PTL_LARGE_THRESHOLD;
-    me.ct_handle = PTL_CT_NONE;
-    me.uid = PTL_UID_ANY;
-    me.options = ( PTL_ME_OP_PUT | PTL_ME_OP_GET | PTL_ME_USE_ONCE | PTL_ME_IS_ACCESSIBLE | PTL_ME_EVENT_LINK_DISABLE |
-                   PTL_ME_EVENT_UNLINK_DISABLE );
-    me.match_id = vc_ptl->id;
-    me.match_bits = NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank);
-    me.ignore_bits = 0;
-    me.min_free = 0;
-
-    /* allocate enough handles to cover all get operations */
-    MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->get_me, ptl_handle_me_t *,
-                        sizeof(ptl_handle_me_t) * ((left_to_send / MPIDI_nem_ptl_ni_limits.max_msg_size) + 1),
-                        mpi_errno, "get_me");
-
-    /* queue up as many entries as necessary to describe the entire message */
-    for (i = 0; left_to_send > 0; i++) {
-        /* send up to the maximum allowed by the portals interface */
-        if (left_to_send > MPIDI_nem_ptl_ni_limits.max_msg_size)
-            me.length = MPIDI_nem_ptl_ni_limits.max_msg_size;
-        else {
-            me.length = left_to_send;
-            /* attach the request to the final operation */
-            user_ptr = sreq;
-        }
-
-        ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, user_ptr, &REQ_PTL(sreq)->get_me[i]);
-        MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeappend", "**ptlmeappend %s", MPID_nem_ptl_strerror(ret));
-        DBG_MSG_MEAPPEND("CTL", vc->pg_rank, me, sreq);
-
-        /* account for what has been sent */
-        me.start = (char *)me.start + me.length;
-        left_to_send -= me.length;
-    }
-
+    big_meappend((char *)REQ_PTL(sreq)->chunk_buffer[0] + PTL_LARGE_THRESHOLD, data_sz - PTL_LARGE_THRESHOLD, vc,
+                 NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), sreq);
     REQ_PTL(sreq)->large = TRUE;
 
     REQ_PTL(sreq)->event_handler = handler_large;

http://git.mpich.org/mpich.git/commitdiff/d0b28aa71ebcd03673879c0672cd29097409d41b

commit d0b28aa71ebcd03673879c0672cd29097409d41b
Author: Ken Raffenetti <raffenet at mcs.anl.gov>
Date:   Thu Oct 23 10:53:45 2014 -0500

    portals4: add support for large non-contig
    
    Large messages (either larger than max_msg_size or > MPID_IOV_LIMIT), will
    be packed into a temporary buffer. These need to be optimized.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_impl.h b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_impl.h
index 10c66f1..ec599f2 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_impl.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_impl.h
@@ -44,6 +44,7 @@ typedef struct {
     ptl_handle_md_t md;
     ptl_handle_me_t put_me;
     ptl_handle_me_t *get_me_p;
+    ptl_size_t chunk_offset;
     void *chunk_buffer[MPID_NEM_PTL_NUM_CHUNK_BUFFERS];
     MPIDI_msg_sz_t bytes_put;
     int found; /* used in probes with PtlMESearch() */
@@ -65,6 +66,7 @@ typedef struct {
         REQ_PTL(req_)->put_me        = PTL_INVALID_HANDLE;      \
         REQ_PTL(req_)->get_me_p      = NULL;                    \
         REQ_PTL(req_)->event_handler = NULL;                    \
+        REQ_PTL(req_)->chunk_offset  = 0;                       \
     } while (0)
 
 #define MPID_nem_ptl_request_create_sreq(sreq_, errno_, comm_) do {                                             \
diff --git a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_recv.c b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_recv.c
index 2f4345d..80576e7 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_recv.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_recv.c
@@ -122,6 +122,78 @@ static int handler_recv_dequeue_complete(const ptl_event_t *e)
 }
 
 #undef FUNCNAME
+#define FUNCNAME handler_recv_unpack
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static int handler_recv_unpack(const ptl_event_t *e)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPID_Request *const rreq = e->user_ptr;
+    MPI_Aint last;
+
+    MPIDI_STATE_DECL(MPID_STATE_HANDLER_RECV_UNPACK);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_HANDLER_RECV_UNPACK);
+
+    MPIU_Assert(e->type == PTL_EVENT_REPLY);
+
+    last = rreq->dev.segment_size;
+    MPID_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first, &last,
+                        (char *)REQ_PTL(rreq)->chunk_buffer[0] + REQ_PTL(rreq)->chunk_offset);
+
+    rreq->dev.segment_first += e->mlength;
+    REQ_PTL(rreq)->chunk_offset += e->mlength;
+    if (rreq->dev.segment_first == rreq->dev.segment_size)
+        mpi_errno = handler_recv_complete(e);
+
+    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_HANDLER_RECV_UNPACK);
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME big_get
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+static void big_get(void *buf, ptl_size_t left_to_get, MPIDI_VC_t *vc, ptl_match_bits_t match_bits, MPID_Request *rreq)
+{
+    int ret;
+    MPID_nem_ptl_vc_area *vc_ptl;
+    ptl_size_t start, get_sz;
+    void *user_ptr = NULL;
+
+    vc_ptl = VC_PTL(vc);
+    start = (ptl_size_t)buf;
+
+    /* we need to handle all event if we are unpacking from the chunk_buffer */
+    if (REQ_PTL(rreq)->event_handler == handler_recv_unpack)
+        user_ptr = rreq;
+
+    while (left_to_get > 0) {
+        /* get up to the maximum allowed by the portals interface */
+        if (left_to_get > MPIDI_nem_ptl_ni_limits.max_msg_size) {
+            get_sz = MPIDI_nem_ptl_ni_limits.max_msg_size;
+        } else {
+            get_sz = left_to_get;
+            /* attach the request to the final operation */
+            user_ptr = rreq;
+        }
+        ret = MPID_nem_ptl_rptl_get(MPIDI_nem_ptl_global_md, start, get_sz, vc_ptl->id, vc_ptl->ptg, match_bits, 0, user_ptr);
+        DBG_MSG_GET("global", get_sz, vc->pg_rank, match_bits);
+        MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "   buf=%p", (char *)start);
+        MPIU_Assert(ret == 0);
+
+        /* account for what has been sent */
+        start += get_sz;
+        left_to_get -= get_sz;
+    }
+}
+
+#undef FUNCNAME
 #define FUNCNAME handler_recv_unpack_complete
 #undef FCNAME
 #define FCNAME MPIU_QUOTE(FUNCNAME)
@@ -232,33 +304,10 @@ static int handler_recv_dequeue_large(const ptl_event_t *e)
 
     /* we need to GET the rest of the data from the sender's buffer */
     if (dt_contig) {
-        /* recv buffer is contig */
-        ptl_size_t start, left_to_get, get_sz;
-        void * user_ptr = NULL;
-
         REQ_PTL(rreq)->event_handler = handler_recv_complete;
 
-        start = (ptl_size_t)((char *)rreq->dev.user_buf + dt_true_lb + PTL_LARGE_THRESHOLD);
-        left_to_get = data_sz - PTL_LARGE_THRESHOLD;
-
-        while (left_to_get > 0) {
-            /* get up to the maximum allowed by the portals interface */
-            if (left_to_get > MPIDI_nem_ptl_ni_limits.max_msg_size) {
-                get_sz = MPIDI_nem_ptl_ni_limits.max_msg_size;
-            } else {
-                get_sz = left_to_get;
-                /* attach the request to the final operation */
-                user_ptr = rreq;
-            }
-            ret = MPID_nem_ptl_rptl_get(MPIDI_nem_ptl_global_md, start, get_sz, vc_ptl->id, vc_ptl->ptg, e->match_bits, 0, user_ptr);
-            DBG_MSG_GET("global", get_sz, vc->pg_rank, e->match_bits);
-            MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "   buf=%p", (char *)start);
-            MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlget", "**ptlget %s", MPID_nem_ptl_strerror(ret));
-
-            /* account for what has been sent */
-            start += get_sz;
-            left_to_get -= get_sz;
-        }
+        big_get((char *)rreq->dev.user_buf + dt_true_lb + PTL_LARGE_THRESHOLD, data_sz - PTL_LARGE_THRESHOLD,
+                vc, e->match_bits, rreq);
         goto fn_exit;
     }
 
@@ -268,7 +317,7 @@ static int handler_recv_dequeue_large(const ptl_event_t *e)
     rreq->dev.iov_count = MPID_IOV_LIMIT;
     MPID_Segment_pack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, rreq->dev.iov, &rreq->dev.iov_count);
 
-    if (last == rreq->dev.segment_size) {
+    if (last == rreq->dev.segment_size && rreq->dev.segment_size <= MPIDI_nem_ptl_ni_limits.max_msg_size + PTL_LARGE_THRESHOLD) {
         /* Rest of message fits in one IOV */
         ptl_md_t md;
 
@@ -289,12 +338,10 @@ static int handler_recv_dequeue_large(const ptl_event_t *e)
         
     /* message won't fit in a single IOV, allocate buffer and unpack when received */
     /* FIXME: For now, allocate a single large buffer to hold entire message */
-    MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, rreq->dev.segment_size - rreq->dev.segment_first, mpi_errno, "chunk_buffer");
-
-    REQ_PTL(rreq)->event_handler = handler_recv_unpack_complete;
-    ret = MPID_nem_ptl_rptl_get(MPIDI_nem_ptl_global_md, (ptl_size_t)REQ_PTL(rreq)->chunk_buffer[0],
-                 rreq->dev.segment_size - rreq->dev.segment_first, vc_ptl->id, vc_ptl->ptg, e->match_bits, 0, rreq);
-    MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlget", "**ptlget %s", MPID_nem_ptl_strerror(ret));
+    MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, rreq->dev.segment_size - rreq->dev.segment_first,
+                        mpi_errno, "chunk_buffer");
+    REQ_PTL(rreq)->event_handler = handler_recv_unpack;
+    big_get(REQ_PTL(rreq)->chunk_buffer[0], rreq->dev.segment_size - rreq->dev.segment_first, vc, e->match_bits, rreq);
 
  fn_exit:
     MPIU_CHKPMEM_COMMIT();
@@ -316,8 +363,7 @@ static int handler_recv_dequeue_unpack_large(const ptl_event_t *e)
     int mpi_errno = MPI_SUCCESS;
     MPID_Request *const rreq = e->user_ptr;
     MPIDI_VC_t *vc;
-    MPID_nem_ptl_vc_area *vc_ptl;
-    int ret;
+    MPI_Aint last;
     void *buf;
     MPIU_CHKPMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_LARGE);
@@ -326,7 +372,6 @@ static int handler_recv_dequeue_unpack_large(const ptl_event_t *e)
     MPIU_Assert(e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW);
 
     MPIDI_Comm_get_vc(rreq->comm, NPTL_MATCH_GET_RANK(e->match_bits), &vc);
-    vc_ptl = VC_PTL(vc);
 
     dequeue_req(e);
 
@@ -343,18 +388,16 @@ static int handler_recv_dequeue_unpack_large(const ptl_event_t *e)
         buf = REQ_PTL(rreq)->chunk_buffer[0];
 
     MPIU_Assert(e->mlength == PTL_LARGE_THRESHOLD);
-    mpi_errno = MPID_nem_ptl_unpack_byte(rreq->dev.segment_ptr, rreq->dev.segment_first, PTL_LARGE_THRESHOLD,
-                                         buf, &REQ_PTL(rreq)->overflow[0]);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    last = PTL_LARGE_THRESHOLD;
+    MPID_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, buf);
+    MPIU_Assert(last == PTL_LARGE_THRESHOLD);
     rreq->dev.segment_first += PTL_LARGE_THRESHOLD;
     MPIU_Free(REQ_PTL(rreq)->chunk_buffer[0]);
 
-    MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, rreq->dev.segment_size - rreq->dev.segment_first, mpi_errno, "chunk_buffer");
-
-    REQ_PTL(rreq)->event_handler = handler_recv_unpack_complete;
-    ret = MPID_nem_ptl_rptl_get(MPIDI_nem_ptl_global_md, (ptl_size_t)REQ_PTL(rreq)->chunk_buffer[0],
-                 rreq->dev.segment_size - rreq->dev.segment_first, vc_ptl->id, vc_ptl->ptg, e->match_bits, 0, rreq);
-    MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlget", "**ptlget %s", MPID_nem_ptl_strerror(ret));
+    MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, rreq->dev.segment_size - rreq->dev.segment_first,
+                        mpi_errno, "chunk_buffer");
+    REQ_PTL(rreq)->event_handler = handler_recv_unpack;
+    big_get(REQ_PTL(rreq)->chunk_buffer[0], rreq->dev.segment_size - rreq->dev.segment_first, vc, e->match_bits, rreq);
 
  fn_exit:
     MPIU_CHKPMEM_COMMIT();
@@ -666,12 +709,8 @@ int MPID_nem_ptl_lmt_start_recv(MPIDI_VC_t *vc,  MPID_Request *rreq, MPID_IOV s_
         void * real_user_buf = (char *)rreq->dev.user_buf + dt_true_lb;
 
         REQ_PTL(rreq)->event_handler = handler_recv_complete;
-        ret = MPID_nem_ptl_rptl_get(MPIDI_nem_ptl_global_md, (ptl_size_t)((char *)real_user_buf + PTL_LARGE_THRESHOLD),
-                     data_sz - PTL_LARGE_THRESHOLD, vc_ptl->id, vc_ptl->ptg, match_bits, 0, rreq);
-        DBG_MSG_GET("global", data_sz - PTL_LARGE_THRESHOLD, vc->pg_rank, match_bits);
-        MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "   buf=%p", (char *)real_user_buf + PTL_LARGE_THRESHOLD);
-        MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlget", "**ptlget %s",
-                             MPID_nem_ptl_strerror(ret));
+        big_get((char *)real_user_buf + PTL_LARGE_THRESHOLD, data_sz - PTL_LARGE_THRESHOLD, vc, match_bits, rreq);
+
         /* The memcpy is done after the get purposely for overlapping */
         MPIU_Memcpy(real_user_buf, rreq->dev.tmpbuf, PTL_LARGE_THRESHOLD);
     }
@@ -684,16 +723,16 @@ int MPID_nem_ptl_lmt_start_recv(MPIDI_VC_t *vc,  MPID_Request *rreq, MPID_IOV s_
         MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype,
                           rreq->dev.segment_ptr, 0);
         rreq->dev.segment_first = 0;
-        rreq->dev.segment_size = data_sz - PTL_LARGE_THRESHOLD;
+        rreq->dev.segment_size = data_sz;
         last = PTL_LARGE_THRESHOLD;
         MPID_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, rreq->dev.tmpbuf);
         MPIU_Assert(last == PTL_LARGE_THRESHOLD);
         rreq->dev.segment_first = PTL_LARGE_THRESHOLD;
-        last = data_sz - PTL_LARGE_THRESHOLD;
+        last = rreq->dev.segment_size;
         rreq->dev.iov_count = MPID_IOV_LIMIT;
         MPID_Segment_pack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, rreq->dev.iov,
                                  &rreq->dev.iov_count);
-        if (last == rreq->dev.segment_size) {
+        if (last == rreq->dev.segment_size && last <= MPIDI_nem_ptl_ni_limits.max_msg_size + PTL_LARGE_THRESHOLD) {
             /* Rest of message fits in one IOV */
             ptl_md_t md;
 
@@ -707,22 +746,18 @@ int MPID_nem_ptl_lmt_start_recv(MPIDI_VC_t *vc,  MPID_Request *rreq, MPID_IOV s_
                                  MPID_nem_ptl_strerror(ret));
 
             REQ_PTL(rreq)->event_handler = handler_recv_complete;
-            ret = MPID_nem_ptl_rptl_get(REQ_PTL(rreq)->md, 0, rreq->dev.segment_size, vc_ptl->id, vc_ptl->ptg,
-                         match_bits, PTL_LARGE_THRESHOLD, rreq);
+            ret = MPID_nem_ptl_rptl_get(REQ_PTL(rreq)->md, 0, rreq->dev.segment_size - rreq->dev.segment_first,
+                                        vc_ptl->id, vc_ptl->ptg, match_bits, 0, rreq);
             MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlget", "**ptlget %s",
                                  MPID_nem_ptl_strerror(ret));
         }
         else {
             /* message won't fit in a single IOV, allocate buffer and unpack when received */
             /* FIXME: For now, allocate a single large buffer to hold entire message */
-            MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, rreq->dev.segment_size,
+            MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, rreq->dev.segment_size - rreq->dev.segment_first,
                                 mpi_errno, "chunk_buffer");
-            REQ_PTL(rreq)->event_handler = handler_recv_unpack_complete;
-            ret = MPID_nem_ptl_rptl_get(MPIDI_nem_ptl_global_md, (ptl_size_t)REQ_PTL(rreq)->chunk_buffer[0],
-                         rreq->dev.segment_size, vc_ptl->id, vc_ptl->ptg, match_bits,
-                         PTL_LARGE_THRESHOLD, rreq);
-            MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlget", "**ptlget %s",
-                                 MPID_nem_ptl_strerror(ret));
+            REQ_PTL(rreq)->event_handler = handler_recv_unpack;
+            big_get(REQ_PTL(rreq)->chunk_buffer[0], rreq->dev.segment_size - rreq->dev.segment_first, vc, match_bits, rreq);
         }
     }
     MPIU_Free(rreq->dev.tmpbuf);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c
index d2cee52..95e244e 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c
@@ -317,9 +317,9 @@ static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *
             me.start = (char *)me.start + me.length;
             left_to_send -= me.length;
         }
-        
+
         REQ_PTL(sreq)->large = TRUE;
-            
+
         REQ_PTL(sreq)->event_handler = handler_large;
         ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)((char *)buf + dt_true_lb), PTL_LARGE_THRESHOLD, PTL_ACK_REQ, vc_ptl->id, vc_ptl->pt,
                      NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq,
@@ -356,7 +356,7 @@ static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *
                                      &sreq->dev.iov[initial_iov_count], &sreq->dev.iov_count);
             remaining_iov_count = sreq->dev.iov_count;
 
-            if (last == sreq->dev.segment_size) {
+            if (last == sreq->dev.segment_size && last <= MPIDI_nem_ptl_ni_limits.max_msg_size + PTL_LARGE_THRESHOLD) {
                 /* Entire message fit in one IOV */
                 MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "    rest of message fits in one IOV");
                 /* Create ME for remaining data */
@@ -388,7 +388,7 @@ static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *
                 MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdbind", "**ptlmdbind %s", MPID_nem_ptl_strerror(ret));
 
                 REQ_PTL(sreq)->large = TRUE;
-                        
+
                 REQ_PTL(sreq)->event_handler = handler_large;
                 ret = MPID_nem_ptl_rptl_put(REQ_PTL(sreq)->md, 0, PTL_LARGE_THRESHOLD, PTL_ACK_REQ, vc_ptl->id, vc_ptl->pt,
                              NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq,
@@ -402,16 +402,20 @@ static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *
         /* Don't handle this case separately */
     }
 
-    /* Message doesn't fit in IOV, pack into buffers */
-    MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "    Message doesn't fit in IOV: use bounce buffer");
+    /* same code as large contig */
+    ptl_size_t left_to_send;
+    void * user_ptr = NULL;
+    int i;
+
+    /* allocate a temporary buffer and copy all the data to send */
+    MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->chunk_buffer[0], void *, data_sz, mpi_errno, "tmpbuf");
 
-    /* FIXME: For now, allocate a single large buffer to hold entire message */
-    MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->chunk_buffer[0], void *, data_sz, mpi_errno, "chunk_buffer");
-    MPI_nem_ptl_pack_byte(sreq->dev.segment_ptr, 0, data_sz, REQ_PTL(sreq)->chunk_buffer[0], &REQ_PTL(sreq)->overflow[0]);
+    last = data_sz;
+    MPID_Segment_pack(sreq->dev.segment_ptr, 0, &last, REQ_PTL(sreq)->chunk_buffer[0]);
+    MPIU_Assert(last == data_sz);
 
-    /* create ME for buffer so receiver can issue a GET for the data */
     me.start = (char *)REQ_PTL(sreq)->chunk_buffer[0] + PTL_LARGE_THRESHOLD;
-    me.length = data_sz - PTL_LARGE_THRESHOLD;
+    left_to_send = data_sz - PTL_LARGE_THRESHOLD;
     me.ct_handle = PTL_CT_NONE;
     me.uid = PTL_UID_ANY;
     me.options = ( PTL_ME_OP_PUT | PTL_ME_OP_GET | PTL_ME_USE_ONCE | PTL_ME_IS_ACCESSIBLE | PTL_ME_EVENT_LINK_DISABLE |
@@ -421,61 +425,39 @@ static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *
     me.ignore_bits = 0;
     me.min_free = 0;
 
-    MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->get_me, ptl_handle_me_t *, sizeof(ptl_handle_me_t), mpi_errno, "get_me");
+    /* allocate enough handles to cover all get operations */
+    MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->get_me, ptl_handle_me_t *,
+                        sizeof(ptl_handle_me_t) * ((left_to_send / MPIDI_nem_ptl_ni_limits.max_msg_size) + 1),
+                        mpi_errno, "get_me");
+
+    /* queue up as many entries as necessary to describe the entire message */
+    for (i = 0; left_to_send > 0; i++) {
+        /* send up to the maximum allowed by the portals interface */
+        if (left_to_send > MPIDI_nem_ptl_ni_limits.max_msg_size)
+            me.length = MPIDI_nem_ptl_ni_limits.max_msg_size;
+        else {
+            me.length = left_to_send;
+            /* attach the request to the final operation */
+            user_ptr = sreq;
+        }
+
+        ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, user_ptr, &REQ_PTL(sreq)->get_me[i]);
+        MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeappend", "**ptlmeappend %s", MPID_nem_ptl_strerror(ret));
+        DBG_MSG_MEAPPEND("CTL", vc->pg_rank, me, sreq);
 
-    DBG_MSG_MEAPPEND("CTL", vc->pg_rank, me, sreq);
-    ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, sreq, &REQ_PTL(sreq)->get_me[0]);
-    MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeappend", "**ptlmeappend %s", MPID_nem_ptl_strerror(ret));
+        /* account for what has been sent */
+        me.start = (char *)me.start + me.length;
+        left_to_send -= me.length;
+    }
 
     REQ_PTL(sreq)->large = TRUE;
-    
+
     REQ_PTL(sreq)->event_handler = handler_large;
-    ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)REQ_PTL(sreq)->chunk_buffer[0], PTL_LARGE_THRESHOLD, PTL_ACK_REQ,
-                 vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq,
-                                NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz), 1);
+    ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)REQ_PTL(sreq)->chunk_buffer[0], PTL_LARGE_THRESHOLD,
+                                PTL_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank),
+                                0, sreq, NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz), 1);
     MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret));
     DBG_MSG_PUT("global", PTL_LARGE_THRESHOLD, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz));
-    goto fn_exit;
-
-#if 0
-    sreq->dev.segment_first = 0;
-
-    /* Pack first chunk of message */
-    MPIU_CHKPMEM_MALLOC(req_PTL(sreq_)->chunk_buffer, void *, PTL_LARGE_THRESHOLD, mpi_errno, "chunk_buffer");
-    MPI_nem_ptl_pack_byte(sreq->dev.segment_ptr, 0, PTL_LARGE_THRESHOLD, REQ_PTL(sreq_)->chunk_buffer[0],
-              &REQ_PTL(sreq)->overflow[0]);
-    sreq->dev.segment_first = PTL_LARGE_THRESHOLD;
-            
-    /* Pack second chunk of message */
-    MPIU_CHKPMEM_MALLOC(req_PTL(sreq_)->chunk_buffer, void *, PTL_LARGE_THRESHOLD, mpi_errno, "chunk_buffer");
-    MPI_nem_ptl_pack_byte(sreq->dev.segment_ptr, sreq->dev.segment_first, sreq->dev.segment_first + PTL_LARGE_THRESHOLD,
-              REQ_PTL(sreq_)->chunk_buffer[1], &REQ_PTL(sreq)->overflow[1]);
-    sreq->dev.segment_first += PTL_LARGE_THRESHOLD;
-
-    /* create ME for second chunk */
-    me.start = REQ_PTL(sreq_)->chunk_buffer[1];
-    me.length = PTL_LARGE_THRESHOLD;
-    me.ct_handle = PTL_CT_NONE;
-    me.uid = PTL_UID_ANY;
-    me.options = ( PTL_ME_OP_PUT | PTL_ME_OP_GET | PTL_ME_USE_ONCE | PTL_ME_IS_ACCESSIBLE | PTL_ME_EVENT_LINK_DISABLE |
-                   PTL_ME_EVENT_UNLINK_DISABLE );
-    me.match_id = vc_ptl->id;
-    me.match_bits = NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank);
-    me.ignore_bits = 0;
-    me.min_free = 0;
-            
-    ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, sreq, &REQ_PTL(sreq)->me);
-    MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeappend", "**ptlmeappend %s", MPID_nem_ptl_strerror(ret));
-
-
-    REQ_PTL(sreq)->large = TRUE;
-                        
-    REQ_PTL(sreq)->event_handler = handler_large_multi;
-    ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)REQ_PTL(sreq_)->chunk_buffer[0], PTL_LARGE_THRESHOLD, PTL_ACK_REQ, vc_ptl->id,
-                 vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq,
-                                NPTL_HEADER(ssend_flag | NPTL_LARGE | NPTL_MULTIPLE, data_sz), 1);
-    MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret));
-#endif
     
  fn_exit:
     *request = sreq;

http://git.mpich.org/mpich.git/commitdiff/7887c04e1d398858d6f989fa42664cf698c312b3

commit 7887c04e1d398858d6f989fa42664cf698c312b3
Author: Ken Raffenetti <raffenet at mcs.anl.gov>
Date:   Fri Oct 10 16:31:39 2014 -0500

    portals4: support for large contiguous messages
    
    If a message is larger than the max_msg_size limit, issue multiple MEs
    for the remainder of the message. Completion events for the intermediate
    operations will be ignored. Only the final operation will trigger the
    event handler to tell MPI that communication is complete.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_impl.h b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_impl.h
index 46d33f7..10c66f1 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_impl.h
+++ b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_impl.h
@@ -42,7 +42,8 @@ typedef struct {
     int noncontig;
     int large;
     ptl_handle_md_t md;
-    ptl_handle_me_t me;
+    ptl_handle_me_t put_me;
+    ptl_handle_me_t *get_me_p;
     void *chunk_buffer[MPID_NEM_PTL_NUM_CHUNK_BUFFERS];
     MPIDI_msg_sz_t bytes_put;
     int found; /* used in probes with PtlMESearch() */
@@ -61,7 +62,8 @@ typedef struct {
         REQ_PTL(req_)->noncontig     = FALSE;                   \
         REQ_PTL(req_)->large         = FALSE;                   \
         REQ_PTL(req_)->md            = PTL_INVALID_HANDLE;      \
-        REQ_PTL(req_)->me            = PTL_INVALID_HANDLE;      \
+        REQ_PTL(req_)->put_me        = PTL_INVALID_HANDLE;      \
+        REQ_PTL(req_)->get_me_p      = NULL;                    \
         REQ_PTL(req_)->event_handler = NULL;                    \
     } while (0)
 
diff --git a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_poll.c b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_poll.c
index 26a1eb2..4c4b6a4 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_poll.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_poll.c
@@ -165,11 +165,14 @@ int MPID_nem_ptl_poll(int is_blocking_poll)
         case PTL_EVENT_ACK:
         case PTL_EVENT_REPLY:
         case PTL_EVENT_SEARCH: {
-            MPID_Request * const req = event.user_ptr;
-            MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "req = %p", req);
-            MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "REQ_PTL(req)->event_handler = %p", REQ_PTL(req)->event_handler);
-            mpi_errno = REQ_PTL(req)->event_handler(&event);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            /* intermediate operations for large messages pass a NULL user_ptr. we can ignore these events */
+            if (event.user_ptr) {
+                MPID_Request * const req = event.user_ptr;
+                MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "req = %p", req);
+                MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "REQ_PTL(req)->event_handler = %p", REQ_PTL(req)->event_handler);
+                mpi_errno = REQ_PTL(req)->event_handler(&event);
+                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            }
             break;
         }
         case PTL_EVENT_AUTO_FREE:
diff --git a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_probe.c b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_probe.c
index 3d88225..9a583e5 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_probe.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_probe.c
@@ -73,7 +73,7 @@ static int handle_mprobe(const ptl_event_t *e)
 
     /* At this point we know the ME is unlinked. Invalidate the handle to
        prevent further accesses, e.g. an attempted cancel. */
-    REQ_PTL(req)->me = PTL_INVALID_HANDLE;
+    REQ_PTL(req)->put_me = PTL_INVALID_HANDLE;
     req->dev.recv_pending_count = 1;
 
   fn_exit:
diff --git a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_recv.c b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_recv.c
index cca6d4c..2f4345d 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_recv.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_recv.c
@@ -19,7 +19,7 @@ static void dequeue_req(const ptl_event_t *e)
 
     /* At this point we know the ME is unlinked. Invalidate the handle to
        prevent further accesses, e.g. an attempted cancel. */
-    REQ_PTL(rreq)->me = PTL_INVALID_HANDLE;
+    REQ_PTL(rreq)->put_me = PTL_INVALID_HANDLE;
 
     found = MPIDI_CH3U_Recvq_DP(rreq);
     MPIU_Assert(found);
@@ -233,12 +233,32 @@ static int handler_recv_dequeue_large(const ptl_event_t *e)
     /* we need to GET the rest of the data from the sender's buffer */
     if (dt_contig) {
         /* recv buffer is contig */
+        ptl_size_t start, left_to_get, get_sz;
+        void * user_ptr = NULL;
+
         REQ_PTL(rreq)->event_handler = handler_recv_complete;
-        ret = MPID_nem_ptl_rptl_get(MPIDI_nem_ptl_global_md, (ptl_size_t)((char *)rreq->dev.user_buf + dt_true_lb + PTL_LARGE_THRESHOLD),
-                     data_sz - PTL_LARGE_THRESHOLD, vc_ptl->id, vc_ptl->ptg, e->match_bits, 0, rreq);
-        DBG_MSG_GET("global", data_sz - PTL_LARGE_THRESHOLD, vc->pg_rank, e->match_bits);
-        MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "   buf=%p", (char *)rreq->dev.user_buf + dt_true_lb + PTL_LARGE_THRESHOLD);
-        MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlget", "**ptlget %s", MPID_nem_ptl_strerror(ret));
+
+        start = (ptl_size_t)((char *)rreq->dev.user_buf + dt_true_lb + PTL_LARGE_THRESHOLD);
+        left_to_get = data_sz - PTL_LARGE_THRESHOLD;
+
+        while (left_to_get > 0) {
+            /* get up to the maximum allowed by the portals interface */
+            if (left_to_get > MPIDI_nem_ptl_ni_limits.max_msg_size) {
+                get_sz = MPIDI_nem_ptl_ni_limits.max_msg_size;
+            } else {
+                get_sz = left_to_get;
+                /* attach the request to the final operation */
+                user_ptr = rreq;
+            }
+            ret = MPID_nem_ptl_rptl_get(MPIDI_nem_ptl_global_md, start, get_sz, vc_ptl->id, vc_ptl->ptg, e->match_bits, 0, user_ptr);
+            DBG_MSG_GET("global", get_sz, vc->pg_rank, e->match_bits);
+            MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "   buf=%p", (char *)start);
+            MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlget", "**ptlget %s", MPID_nem_ptl_strerror(ret));
+
+            /* account for what has been sent */
+            start += get_sz;
+            left_to_get -= get_sz;
+        }
         goto fn_exit;
     }
 
@@ -479,7 +499,7 @@ int MPID_nem_ptl_recv_posted(MPIDI_VC_t *vc, MPID_Request *rreq)
         
     }
 
-    ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_pt, &me, PTL_PRIORITY_LIST, rreq, &REQ_PTL(rreq)->me);
+    ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_pt, &me, PTL_PRIORITY_LIST, rreq, &REQ_PTL(rreq)->put_me);
     MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeappend", "**ptlmeappend %s", MPID_nem_ptl_strerror(ret));
     DBG_MSG_MEAPPEND("REG", vc ? vc->pg_rank : MPI_ANY_SOURCE, me, rreq);
     MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "    buf=%p", me.start);
@@ -533,8 +553,8 @@ static int cancel_recv(MPID_Request *rreq, int *cancelled)
     /* An invalid handle indicates the operation has been completed
        and the matching list entry unlinked. At that point, the operation
        cannot be cancelled. */
-    if (REQ_PTL(rreq)->me != PTL_INVALID_HANDLE) {
-        ptl_err = PtlMEUnlink(REQ_PTL(rreq)->me);
+    if (REQ_PTL(rreq)->put_me != PTL_INVALID_HANDLE) {
+        ptl_err = PtlMEUnlink(REQ_PTL(rreq)->put_me);
         if (ptl_err == PTL_OK)
             *cancelled = TRUE;
         /* FIXME: if we properly invalidate matching list entry handles, we should be
diff --git a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c
index 796f559..d2cee52 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_send.c
@@ -31,6 +31,9 @@ static int handler_send_complete(const ptl_event_t *e)
     for (i = 0; i < MPID_NEM_PTL_NUM_CHUNK_BUFFERS; ++i)
         if (REQ_PTL(sreq)->chunk_buffer[i])
             MPIU_Free(REQ_PTL(sreq)->chunk_buffer[i]);
+
+    if (REQ_PTL(sreq)->get_me_p)
+        MPIU_Free(REQ_PTL(sreq)->get_me_p);
     
     MPIDI_CH3U_Request_complete(sreq);
 
@@ -274,9 +277,13 @@ static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *
     /* Large message.  Send first chunk of data and let receiver get the rest */
     if (dt_contig) {
         /* create ME for buffer so receiver can issue a GET for the data */
+        ptl_size_t left_to_send;
+        void * user_ptr = NULL;
+        int i;
+
         MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Large contig message");
         me.start = (char *)buf + dt_true_lb + PTL_LARGE_THRESHOLD;
-        me.length = data_sz - PTL_LARGE_THRESHOLD;
+        left_to_send = data_sz - PTL_LARGE_THRESHOLD;
         me.ct_handle = PTL_CT_NONE;
         me.uid = PTL_UID_ANY;
         me.options = ( PTL_ME_OP_PUT | PTL_ME_OP_GET | PTL_ME_USE_ONCE | PTL_ME_IS_ACCESSIBLE | PTL_ME_EVENT_LINK_DISABLE |
@@ -286,9 +293,30 @@ static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *
         me.ignore_bits = 0;
         me.min_free = 0;
 
-        ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, sreq, &REQ_PTL(sreq)->me);
-        MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeappend", "**ptlmeappend %s", MPID_nem_ptl_strerror(ret));
-        DBG_MSG_MEAPPEND("CTL", vc->pg_rank, me, sreq);
+        /* allocate enough handles to cover all get operations */
+        MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->get_me, ptl_handle_me_t *,
+                            sizeof(ptl_handle_me_t) * ((left_to_send / MPIDI_nem_ptl_ni_limits.max_msg_size) + 1),
+                            mpi_errno, "get_me");
+
+        /* queue up as many entries as necessary to describe the entire message */
+        for (i = 0; left_to_send > 0; i++) {
+            /* send up to the maximum allowed by the portals interface */
+            if (left_to_send > MPIDI_nem_ptl_ni_limits.max_msg_size)
+                me.length = MPIDI_nem_ptl_ni_limits.max_msg_size;
+            else {
+                me.length = left_to_send;
+                /* attach the request to the final operation */
+                user_ptr = sreq;
+            }
+
+            ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, user_ptr, &REQ_PTL(sreq)->get_me[i]);
+            MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeappend", "**ptlmeappend %s", MPID_nem_ptl_strerror(ret));
+            DBG_MSG_MEAPPEND("CTL", vc->pg_rank, me, sreq);
+
+            /* account for what has been sent */
+            me.start = (char *)me.start + me.length;
+            left_to_send -= me.length;
+        }
         
         REQ_PTL(sreq)->large = TRUE;
             
@@ -342,9 +370,11 @@ static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *
                 me.match_bits = NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank);
                 me.ignore_bits = 0;
                 me.min_free = 0;
+
+                MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->get_me_p, ptl_handle_me_t *, sizeof(ptl_handle_me_t), mpi_errno, "get_me_p");
                         
                 ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, sreq,
-                                  &REQ_PTL(sreq)->me);
+                                  &REQ_PTL(sreq)->get_me_p[0]);
                 MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeappend", "**ptlmeappend %s", MPID_nem_ptl_strerror(ret));
                 DBG_MSG_MEAPPEND("CTL", vc->pg_rank, me, sreq);
 
@@ -391,8 +421,10 @@ static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *
     me.ignore_bits = 0;
     me.min_free = 0;
 
+    MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->get_me, ptl_handle_me_t *, sizeof(ptl_handle_me_t), mpi_errno, "get_me");
+
     DBG_MSG_MEAPPEND("CTL", vc->pg_rank, me, sreq);
-    ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, sreq, &REQ_PTL(sreq)->me);
+    ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, sreq, &REQ_PTL(sreq)->get_me[0]);
     MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeappend", "**ptlmeappend %s", MPID_nem_ptl_strerror(ret));
 
     REQ_PTL(sreq)->large = TRUE;

-----------------------------------------------------------------------

Summary of changes:
 .../channels/nemesis/netmod/portals4/ptl_impl.h    |   12 ++-
 .../channels/nemesis/netmod/portals4/ptl_probe.c   |    2 +-
 .../channels/nemesis/netmod/portals4/ptl_recv.c    |  149 ++++++++++++------
 .../channels/nemesis/netmod/portals4/ptl_send.c    |  165 +++++++++-----------
 test/mpi/datatype/Makefile.am                      |    1 +
 test/mpi/datatype/large_vec.c                      |   81 ++++++++++
 test/mpi/datatype/testlist.in                      |    1 +
 7 files changed, 270 insertions(+), 141 deletions(-)
 create mode 100644 test/mpi/datatype/large_vec.c


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list