[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.1rc2-147-g4b07ac8

mysql vizuser noreply at mpich.org
Wed Jan 8 14:28:50 CST 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  4b07ac8ab4ba5203f228348b05a9a2da71f13396 (commit)
       via  c03e766f497f91de0f75a37c945a8e356f1d3788 (commit)
       via  039c2290692c6fff053fd62ec2a3b5da700c7163 (commit)
       via  e25dc31f0ba40a35086b60e76afe1069af876b58 (commit)
       via  31ac6f01a51883c466255cd8ae80ae32a9d7a6a9 (commit)
       via  c6d910c783380440d1946ab366e5f2496eaed042 (commit)
      from  399e546369c3938ada9cc0b6c3c218f685c7613b (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/4b07ac8ab4ba5203f228348b05a9a2da71f13396

commit 4b07ac8ab4ba5203f228348b05a9a2da71f13396
Author: Sameer Kumar <sameerk at us.ibm.com>
Date:   Tue Jan 7 04:29:13 2014 -0600

    Bug fix for strided datatypes.
    
    Full fix
    
    Fix for get accumulate that sends contig ack back and then scatters result buffer on the src node.
    
    Remove unused params.
    
    Signed-off-by: Michael Blocksome <blocksom at us.ibm.com>

diff --git a/src/mpid/pamid/src/onesided/mpid_1s.c b/src/mpid/pamid/src/onesided/mpid_1s.c
index 3b89aab..6e64c3f 100644
--- a/src/mpid/pamid/src/onesided/mpid_1s.c
+++ b/src/mpid/pamid/src/onesided/mpid_1s.c
@@ -45,7 +45,9 @@ MPIDI_Win_DoneCB(pami_context_t  context,
                                      req->origin.count,
                                      req->origin.datatype);
           MPID_assert(mpi_errno == MPI_SUCCESS);
+#ifndef USE_PAMI_RDMA
           MPIDI_Win_datatype_unmap(&req->target.dt);
+#endif
           MPID_Datatype_release(req->origin.dt.pointer);
           MPIU_Free(req->buffer);
           MPIU_Free(req->user_buffer);
diff --git a/src/mpid/pamid/src/onesided/mpid_win_get_accumulate.c b/src/mpid/pamid/src/onesided/mpid_win_get_accumulate.c
index b8779e1..8f8d07c 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_get_accumulate.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_get_accumulate.c
@@ -38,28 +38,21 @@ MPIDI_Win_GetAccumSendAck(pami_context_t   context,
   pami_result_t rc = PAMI_SUCCESS;
 
   //Copy from msginfo->addr to a contiguous buffer
-  MPIDI_Datatype result_dt;
   char *buffer = NULL;
 
-  MPIDI_Win_datatype_basic(msginfo->result_count,
-			   msginfo->result_datatype,
-			   &result_dt);
-
-  int use_map = 0;
-  buffer      = MPIU_Malloc(result_dt.size);
-  if (result_dt.contig)
-    memcpy(buffer, (msginfo->addr + result_dt.true_lb), result_dt.size);
+  buffer      = MPIU_Malloc(msginfo->size);
+  MPID_assert(buffer != NULL);
+  
+  if (msginfo->num_contig == 1)
+    memcpy(buffer, msginfo->addr, msginfo->size);
   else
     {
-      use_map = 1;
-      MPID_assert(buffer != NULL);
-
       int mpi_errno = 0;
       mpi_errno = MPIR_Localcopy(msginfo->addr,
                                  msginfo->count,
                                  msginfo->type,
                                  buffer,
-                                 result_dt.size,
+                                 msginfo->size,
                                  MPI_CHAR);
       MPID_assert(mpi_errno == MPI_SUCCESS);      
     }
@@ -69,6 +62,7 @@ MPIDI_Win_GetAccumSendAck(pami_context_t   context,
   pami_send_t params = {
     .send = {
       .header = {
+	 .iov_base = msginfo,
 	 .iov_len = sizeof(MPIDI_Win_GetAccMsgInfo),
        },
       .dispatch = MPIDI_Protocols_WinGetAccumAck,
@@ -80,30 +74,14 @@ MPIDI_Win_GetAccumSendAck(pami_context_t   context,
      },
   };
 
-  int index  = 0;
-  size_t local_offset = 0;
-  //Set the map
-  MPIDI_Win_datatype_map(&result_dt);
-  MPID_assert(result_dt.num_contig == msginfo->result_num_contig);
-
-  while (index < result_dt.num_contig) {  
-    params.send.header.iov_base = msginfo;
-    params.send.data.iov_len    = result_dt.map[index].DLOOP_VECTOR_LEN;
-    params.send.data.iov_base   = buffer + local_offset;
-    
-    rc = PAMI_Send(context, &params);
-    MPID_assert(rc == PAMI_SUCCESS);
-    local_offset += params.send.data.iov_len;
-    ++index;    
-  }    
-
-  /** Review PAMI_Send semantics and consider moving the free calls to
-      the completion callback*/
-  //free msginfo
-  MPIU_Free(msginfo);
+  params.send.data.iov_len    = msginfo->size;
+  params.send.data.iov_base   = buffer;
+  
+  rc = PAMI_Send(context, &params);
+  MPID_assert(rc == PAMI_SUCCESS);
   
-  if (use_map && result_dt.map != &result_dt.__map)
-    MPIU_Free (result_dt.map);    
+  //free msginfo
+  //MPIU_Free(msginfo);  
 }
 
 void
@@ -158,8 +136,7 @@ MPIDI_Win_GetAccDoneCB(pami_context_t  context,
   ++req->win->mpid.sync.complete;
   ++req->origin.completed;
 
-  if (req->origin.completed == 
-      (req->result_num_contig + req->target.dt.num_contig))
+  if (req->origin.completed == req->target.dt.num_contig + 1)
     {
       if(req->req_handle)
           MPID_cc_set(req->req_handle->cc_ptr, 0);
@@ -178,6 +155,29 @@ MPIDI_Win_GetAccDoneCB(pami_context_t  context,
 }
 
 void
+MPIDI_Win_GetAccAckDoneCB(pami_context_t   context,
+			  void           * _msginfo,
+			  pami_result_t    result)
+{
+  MPIDI_Win_GetAccMsgInfo * msginfo =(MPIDI_Win_GetAccMsgInfo *)_msginfo;
+  MPIDI_Win_request *req = (MPIDI_Win_request *) msginfo->request;
+
+  if (req->result_num_contig > 1) {
+    MPIR_Localcopy(req->result.addr,
+		   req->result.count,
+		   req->result.datatype,
+		   msginfo->result_addr,
+		   msginfo->size,
+		   MPI_CHAR);
+    MPIU_Free(msginfo->result_addr);
+  }
+  MPIU_Free(msginfo);
+  
+  MPIDI_Win_GetAccDoneCB(context, req, result);
+}
+
+
+void
 MPIDI_WinGetAccumAckCB(pami_context_t    context,
 		       void            * cookie,
 		       const void      * _msginfo,
@@ -189,24 +189,22 @@ MPIDI_WinGetAccumAckCB(pami_context_t    context,
 {
   MPID_assert(recv   != NULL);
   MPID_assert(sndbuf == NULL);
-  MPID_assert(msginfo_size == sizeof(MPIDI_Win_GetAccMsgInfo));
   MPID_assert(_msginfo != NULL);
-  const MPIDI_Win_GetAccMsgInfo * msginfo =(const MPIDI_Win_GetAccMsgInfo *)_msginfo;
-
-  int null=0;
-  pami_type_t         pami_type;
-  pami_data_function  pami_op;
-  MPI_Op op = msginfo->op;
-
-  MPIDI_Datatype_to_pami(msginfo->result_datatype, &pami_type, op, &pami_op, &null);
-
-  recv->addr        = msginfo->result_addr;
-  recv->type        = pami_type;
+  MPIDI_Win_GetAccMsgInfo * msginfo =MPIU_Malloc(sizeof(MPIDI_Win_GetAccMsgInfo));
+  *msginfo = *(const MPIDI_Win_GetAccMsgInfo *)_msginfo;
+  MPIDI_Win_request *req = (MPIDI_Win_request *) msginfo->request;
+  
+  msginfo->result_addr = NULL;
+  recv->addr = req->result.addr;
+  if (req->result_num_contig > 1)
+    recv->addr = msginfo->result_addr = MPIU_Malloc(msginfo->size); 
+  
+  recv->type        = PAMI_TYPE_BYTE;
   recv->offset      = 0;
   recv->data_fn     = PAMI_DATA_COPY;
   recv->data_cookie = NULL;
-  recv->local_fn    = MPIDI_Win_GetAccDoneCB;
-  recv->cookie      = msginfo->req;
+  recv->local_fn    = MPIDI_Win_GetAccAckDoneCB;
+  recv->cookie      = msginfo;
 }
 
 static pami_result_t
@@ -455,13 +453,15 @@ MPID_Get_accumulate(const void   * origin_addr,
 
 
   MPIDI_Win_datatype_map(&req->target.dt);
-  MPIDI_Datatype result_dt;
-  MPIDI_Win_datatype_basic(result_count, result_datatype, &result_dt);
-  req->result_num_contig = 1;
-  if (!result_dt.contig)
-    req->result_num_contig =result_dt.pointer->max_contig_blocks*result_count+1;
+  req->result.addr = result_addr;
+  req->result.count = result_count;
+  req->result.datatype = result_datatype;
+  MPIDI_Win_datatype_basic(result_count, result_datatype, &req->result.dt);
+  MPIDI_Win_datatype_map(&req->result.dt);
+  req->result_num_contig = req->result.dt.num_contig;
+  
   //We wait for #messages depending on target and result_datatype
-  win->mpid.sync.total += (req->result_num_contig + req->target.dt.num_contig);
+  win->mpid.sync.total += (1 + req->target.dt.num_contig);
 
   {
     MPI_Datatype basic_type = MPI_DATATYPE_NULL;
@@ -478,20 +478,6 @@ MPID_Get_accumulate(const void   * origin_addr,
       }
     MPID_assert(basic_type != MPI_DATATYPE_NULL);
 
-    MPI_Datatype result_basic_type = MPI_DATATYPE_NULL;
-    MPID_Datatype_get_basic_type(result_datatype, result_basic_type);
-    /* MPID_Datatype_get_basic_type() doesn't handle the struct types */
-    if ((result_datatype == MPI_FLOAT_INT)  ||
-	(result_datatype == MPI_DOUBLE_INT) ||
-	(result_datatype == MPI_LONG_INT)   ||
-	(result_datatype == MPI_SHORT_INT)  ||
-	(result_datatype == MPI_LONG_DOUBLE_INT))
-      {
-	MPID_assert(result_basic_type == MPI_DATATYPE_NULL);
-	result_basic_type = result_datatype;
-      }
-    MPID_assert(result_basic_type != MPI_DATATYPE_NULL);
-    
     unsigned index;
     MPIDI_Win_GetAccMsgInfo * headers = MPIU_Calloc0(req->target.dt.num_contig, MPIDI_Win_GetAccMsgInfo);
     req->accum_headers = headers;
@@ -505,11 +491,8 @@ MPID_Get_accumulate(const void   * origin_addr,
      headers[index].count            = target_count;
      headers[index].counter          = index;
      headers[index].num_contig       = req->target.dt.num_contig;
+     headers[index].size             = req->target.dt.size;
      headers[index].request          = req;
-     headers[index].result_addr      = result_addr;
-     headers[index].result_count     = result_count;
-     headers[index].result_datatype  = result_basic_type;
-     headers[index].result_num_contig= req->result_num_contig;
     }
 
   }
diff --git a/src/mpid/pamid/src/onesided/mpidi_onesided.h b/src/mpid/pamid/src/onesided/mpidi_onesided.h
index 6969351..180aaad 100644
--- a/src/mpid/pamid/src/onesided/mpidi_onesided.h
+++ b/src/mpid/pamid/src/onesided/mpidi_onesided.h
@@ -199,11 +199,9 @@ typedef struct
   int            count;
   int            counter;
   int            num_contig;
+  int            size;
   void         * request;
   void         * result_addr;
-  int            result_count;
-  MPI_Datatype   result_datatype;
-  int            result_num_contig;		      
   pami_endpoint_t src_endpoint;    
 } MPIDI_Win_GetAccMsgInfo;
 
@@ -245,13 +243,19 @@ typedef struct _mpidi_win_request
     MPIDI_Datatype   dt;
   } target;
 
+  struct
+  {
+    void            *addr;
+    int              count;
+    MPI_Datatype     datatype;
+    MPIDI_Datatype   dt;
+  } result;
+
   void     *user_buffer;
-  void     *compare_buffer;     /* anchor of compare buffer for compare and swap */
+  void     *compare_buffer;    /* anchor of compare buffer for compare and swap */
   uint32_t  buffer_free;
   void     *buffer;
   struct _mpidi_win_request *next; 
-  void     * compare_addr;
-  void     * result_addr;  
   MPI_Op     op;
   int        result_num_contig;   
 

http://git.mpich.org/mpich.git/commitdiff/c03e766f497f91de0f75a37c945a8e356f1d3788

commit c03e766f497f91de0f75a37c945a8e356f1d3788
Author: Su Huang <suhuang at us.ibm.com>
Date:   Mon Jan 6 15:25:55 2014 -0500

    PAMID: fix memory leak problem
    
    Signed-off-by: Michael Blocksome <blocksom at us.ibm.com>

diff --git a/src/mpid/pamid/src/mpid_finalize.c b/src/mpid/pamid/src/mpid_finalize.c
index 2ed8633..82da86f 100644
--- a/src/mpid/pamid/src/mpid_finalize.c
+++ b/src/mpid/pamid/src/mpid_finalize.c
@@ -37,12 +37,19 @@ extern conn_info  *_conn_info_list;
 
 
 void MPIDI_close_pe_extension() {
+     extern MPIDI_printenv_t  *mpich_env;
+     extern MPIX_stats_t *mpid_statp;
      int rc;
      /* PAMI_Extension_open in pami_init   */
      rc = PAMI_Extension_close (pe_extension);
      if (rc != PAMI_SUCCESS) {
          TRACE_ERR("ERROR close PAMI_Extension failed rc %d", rc);
      }
+     if (mpich_env)
+         MPIU_Free(mpich_env);
+     if (mpid_statp)
+         MPIU_Free(mpid_statp);
+
 }
 #endif
 
diff --git a/src/mpid/pamid/src/mpidi_util.c b/src/mpid/pamid/src/mpidi_util.c
index 70f2a60..1c6cf3c 100644
--- a/src/mpid/pamid/src/mpidi_util.c
+++ b/src/mpid/pamid/src/mpidi_util.c
@@ -45,7 +45,7 @@
 #define PAMI_ASYNC_EXT_ATTR 2000
 
 #if (MPIDI_PRINTENV || MPIDI_STATISTICS || MPIDI_BANNER)
-MPIDI_printenv_t  *mpich_env;
+MPIDI_printenv_t  *mpich_env=NULL;
 extern char* mp_euilib;
 char mp_euidevice[20];
 extern pami_extension_t pe_extension;
diff --git a/src/mpid/pamid/src/onesided/mpid_1s.c b/src/mpid/pamid/src/onesided/mpid_1s.c
index dd3b234..3b89aab 100644
--- a/src/mpid/pamid/src/onesided/mpid_1s.c
+++ b/src/mpid/pamid/src/onesided/mpid_1s.c
@@ -54,7 +54,9 @@ MPIDI_Win_DoneCB(pami_context_t  context,
     }
 
 
-    if (req->origin.completed == req->target.dt.num_contig)
+    if ((req->origin.completed == req->target.dt.num_contig) || 
+        ((req->type >= MPIDI_WIN_REQUEST_COMPARE_AND_SWAP) && 
+         (req->origin.completed == req->origin.dt.num_contig)))
     {
       if(req->req_handle)
           MPID_cc_set(req->req_handle->cc_ptr, 0);

http://git.mpich.org/mpich.git/commitdiff/039c2290692c6fff053fd62ec2a3b5da700c7163

commit 039c2290692c6fff053fd62ec2a3b5da700c7163
Author: Su Huang <suhuang at us.ibm.com>
Date:   Mon Jan 6 14:10:59 2014 -0500

    Fix multiple FCNAME declarations error.
    
    Also clean up warnings.
    
      src/mpid/pamid/src/dyntask/mpidi_port.c:53: warning: useless storage class
      specifier in empty declaration
    
    Signed-off-by: Michael Blocksome <blocksom at us.ibm.com>

diff --git a/src/mpid/pamid/src/dyntask/mpid_comm_spawn_multiple.c b/src/mpid/pamid/src/dyntask/mpid_comm_spawn_multiple.c
index d470475..2b4bd86 100644
--- a/src/mpid/pamid/src/dyntask/mpid_comm_spawn_multiple.c
+++ b/src/mpid/pamid/src/dyntask/mpid_comm_spawn_multiple.c
@@ -112,7 +112,6 @@ int MPID_Comm_spawn_multiple(int count, char *array_of_commands[],
 			     int array_of_errcodes[])
 {
     int mpi_errno = MPI_SUCCESS;
-    static char FCNAME[] = "MPID_Comm_spawn_multiple";
 
     if(mpidi_dynamic_tasking == 0) {
 	fprintf(stderr, "Received spawn request for non-dynamic jobs\n");
@@ -359,7 +358,10 @@ int MPIDI_Comm_spawn_multiple(int count, char **commands,
 static char *parent_port_name = 0;    /* Name of parent port if this
 					 process was spawned (and is root
 					 of comm world) or null */
-
+#undef FUNCNAME
+#define FUNCNAME MPIDI_GetParentPort
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
 int MPIDI_GetParentPort(char ** parent_port)
 {
     int mpi_errno = MPI_SUCCESS;
diff --git a/src/mpid/pamid/src/dyntask/mpid_port.c b/src/mpid/pamid/src/dyntask/mpid_port.c
index b0415b5..e6f8810 100644
--- a/src/mpid/pamid/src/dyntask/mpid_port.c
+++ b/src/mpid/pamid/src/dyntask/mpid_port.c
@@ -98,7 +98,6 @@ int MPID_Comm_accept(const char * port_name, MPID_Info * info, int root,
 		     MPID_Comm * comm, MPID_Comm ** newcomm_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
-     static char FCNAME[] = "MPID_Comm_accept";
 
     if(mpidi_dynamic_tasking == 0) {
 	fprintf(stderr, "Dynamic tasking API is called on non-dynamic jobs\n");
@@ -127,7 +126,6 @@ int MPID_Comm_connect(const char * port_name, MPID_Info * info, int root,
 		      MPID_Comm * comm, MPID_Comm ** newcomm_ptr)
 {
     int mpi_errno=MPI_SUCCESS;
-     static char FCNAME[] = "MPID_Comm_connect";
 
     if(mpidi_dynamic_tasking == 0) {
 	fprintf(stderr, "Dynamic tasking API is called on non-dynamic jobs\n");
diff --git a/src/mpid/pamid/src/dyntask/mpidi_port.c b/src/mpid/pamid/src/dyntask/mpidi_port.c
index 2d4fecf..fcdbee1 100644
--- a/src/mpid/pamid/src/dyntask/mpidi_port.c
+++ b/src/mpid/pamid/src/dyntask/mpidi_port.c
@@ -50,7 +50,6 @@ static int maxAcceptQueueSize = 0;
 static int AcceptQueueSize    = 0;
 
 pthread_mutex_t rem_connlist_mutex = PTHREAD_MUTEX_INITIALIZER;
-extern struct transactionID;
 
 /* FIXME: If dynamic processes are not supported, this file will contain
    no code and some compilers may warn about an "empty translation unit" */

http://git.mpich.org/mpich.git/commitdiff/e25dc31f0ba40a35086b60e76afe1069af876b58

commit e25dc31f0ba40a35086b60e76afe1069af876b58
Author: Su Huang <suhuang at us.ibm.com>
Date:   Tue Dec 31 12:44:13 2013 -0500

    PAMID: segfault occurred in MPIDI_Win_DoneCB
    
    The segfault occurred while trying to free an already freed request
    handle in MPIDI_Win_DoneCB. To fix the problem, the second MPIU_Free(req)
    is removed from the routine.
    
    Signed-off-by: Michael Blocksome <blocksom at us.ibm.com>

diff --git a/src/mpid/pamid/src/onesided/mpid_1s.c b/src/mpid/pamid/src/onesided/mpid_1s.c
index b111546..dd3b234 100644
--- a/src/mpid/pamid/src/onesided/mpid_1s.c
+++ b/src/mpid/pamid/src/onesided/mpid_1s.c
@@ -69,9 +69,6 @@ MPIDI_Win_DoneCB(pami_context_t  context,
       if (!((req->type > MPIDI_WIN_REQUEST_GET_ACCUMULATE) && (req->type <=MPIDI_WIN_REQUEST_RGET_ACCUMULATE)))
           MPIU_Free(req);
     }
-
-    if ( (req->origin.completed == req->origin.dt.num_contig) && ( (req->type == MPIDI_WIN_REQUEST_FETCH_AND_OP) || (req->type == MPIDI_WIN_REQUEST_COMPARE_AND_SWAP) ) )
-          MPIU_Free(req);
   MPIDI_Progress_signal();
 }
 

http://git.mpich.org/mpich.git/commitdiff/31ac6f01a51883c466255cd8ae80ae32a9d7a6a9

commit 31ac6f01a51883c466255cd8ae80ae32a9d7a6a9
Author: Su Huang <suhuang at us.ibm.com>
Date:   Wed Oct 30 16:30:59 2013 -0400

    PAMID: fixes for MPI_Win_*
    
    PAMID: fixes for RMA shared related, lock_all and unlock_all functions
    
    The following has been updated:
    - MPID_Win_allocated_shared
    - MPID_Win_shared_query
    - several functions in mpid_win_lock_all.c, mpid_win_lock.c, mpid_win_free.c and
      structures in mpidi_onesided.h, mpidi_datatypes.h have been modified for
      scalability support for MPID_Win_lock_all() and MPID_Win_unlock_all()
    
    PAMID: fix MPID_Win_allocate_shared, MPID_Win_lock_all/unlock_all and MPID_Win_shared_query
    
    Add changes based on code review feedback from the team:
    1) update getPageSize to ensure that the pagesize is obtained from the range of
       passed in address.
    2) don't call dispatcher if lockQ[index].done == 1
       if (!lockQ[index].done)
         MPID_PROGRESS_WAIT_WHILE(lockQ[index].done == 0);
    3) in mpid_win_shared_query()
       replace
         MPID_assert(win->create_flavor == MPI_WIN_FLAVOR_SHARED);
       by
         MPIU_ERR_CHKANDSTMT((win->create_flavor != MPI_WIN_FLAVOR_SHARED), mpi_errno,
                             MPI_ERR_RMA_FLAVOR, return mpi_errno, "**rmaflavor");
    3) some minor fixes.
    
    PAMID: modify MPI_Win_flush_local, MPI_Win_lock etc for better performance
    
    The current implementation for the subject mentioned function is to allocate
    two counters for each rank in the window group. The design could cause a scaling
    issue.  The fix is to update MPI_Win_flush_local etc with two counters per
    window approach.
    
    The changes also include the follwoing:
    - provides mutex_lock/mutex_unlock for atomic operations in shared window
    - fixes some bugs in handling shared window.
    - removes the shared segment with IPC_RMID in MPI_Win_free.
    
    PAMID: fixed base address for each rank in a window group
    
    For shared window, the base address for each rank should not be
    exchanged among ranks in a window group. Without the fix, the job
    will be terminated with segfault.
    
    (ibm) F189033
    (ibm) D194640
    
    Signed-off-by: Michael Blocksome <blocksom at us.ibm.com>

diff --git a/src/mpid/pamid/include/mpidi_datatypes.h b/src/mpid/pamid/include/mpidi_datatypes.h
index 2d98dcf..b773ce7 100644
--- a/src/mpid/pamid/include/mpidi_datatypes.h
+++ b/src/mpid/pamid/include/mpidi_datatypes.h
@@ -388,6 +388,7 @@ struct MPIDI_Win_lock
   unsigned               rank;
   MPIDI_LOCK_TYPE_t      mtype;    /* MPIDI_REQUEST_LOCK or MPIDI_REQUEST_LOCKALL    */
   int                    type;
+  void                   *flagAddr;
 };
 struct MPIDI_Win_queue
 {
@@ -415,12 +416,6 @@ typedef struct MPIDI_Win_info_args {
     int alloc_shared_noncontig;
 } MPIDI_Win_info_args;
 
-
-typedef  struct {
-       int nStarted;
-       int nCompleted;
-} RMA_nOps_t;
-
 typedef struct workQ_t {
    void *msgQ;
    int  count;
@@ -446,13 +441,18 @@ typedef struct MPIDI_Win_info
   uint32_t           memregion_used;
 } MPIDI_Win_info;
 
+typedef pthread_mutex_t MPIDI_SHM_MUTEX;
+
 typedef struct MPIDI_Win_shm_t
 {
     int allocated;                  /* flag: TRUE iff this window has a shared memory
                                                  region associated with it */
     void *base_addr;                /* base address of shared memory region */
     MPI_Aint segment_len;           /* size of shared memory region         */
-    uint32_t  shm_key;              /* shared memory key                    */
+    uint32_t  shm_id;                /* shared memory id                    */
+    int       *shm_count;
+    MPIDI_SHM_MUTEX *mutex_lock;    /* shared memory windows -- lock for    */
+                                     /*     accumulate/atomic operations     */
 } MPIDI_Win_shm_t;
 
 /**
@@ -465,7 +465,7 @@ struct MPIDI_Win
   void             ** shm_base_addrs; /* base address shared by all process in comm      */
   MPIDI_Win_shm_t  *shm;             /* shared memory info                             */
   workQ_t work;
-  RMA_nOps_t *origin;
+  int   max_ctrlsends;
   struct MPIDI_Win_sync
   {
 #if 0
diff --git a/src/mpid/pamid/src/dyntask/mpid_comm_spawn_multiple.c b/src/mpid/pamid/src/dyntask/mpid_comm_spawn_multiple.c
index be71200..d470475 100644
--- a/src/mpid/pamid/src/dyntask/mpid_comm_spawn_multiple.c
+++ b/src/mpid/pamid/src/dyntask/mpid_comm_spawn_multiple.c
@@ -112,6 +112,7 @@ int MPID_Comm_spawn_multiple(int count, char *array_of_commands[],
 			     int array_of_errcodes[])
 {
     int mpi_errno = MPI_SUCCESS;
+    static char FCNAME[] = "MPID_Comm_spawn_multiple";
 
     if(mpidi_dynamic_tasking == 0) {
 	fprintf(stderr, "Received spawn request for non-dynamic jobs\n");
diff --git a/src/mpid/pamid/src/dyntask/mpid_port.c b/src/mpid/pamid/src/dyntask/mpid_port.c
index e6f8810..b0415b5 100644
--- a/src/mpid/pamid/src/dyntask/mpid_port.c
+++ b/src/mpid/pamid/src/dyntask/mpid_port.c
@@ -98,6 +98,7 @@ int MPID_Comm_accept(const char * port_name, MPID_Info * info, int root,
 		     MPID_Comm * comm, MPID_Comm ** newcomm_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
+     static char FCNAME[] = "MPID_Comm_accept";
 
     if(mpidi_dynamic_tasking == 0) {
 	fprintf(stderr, "Dynamic tasking API is called on non-dynamic jobs\n");
@@ -126,6 +127,7 @@ int MPID_Comm_connect(const char * port_name, MPID_Info * info, int root,
 		      MPID_Comm * comm, MPID_Comm ** newcomm_ptr)
 {
     int mpi_errno=MPI_SUCCESS;
+     static char FCNAME[] = "MPID_Comm_connect";
 
     if(mpidi_dynamic_tasking == 0) {
 	fprintf(stderr, "Dynamic tasking API is called on non-dynamic jobs\n");
diff --git a/src/mpid/pamid/src/onesided/mpid_1s.c b/src/mpid/pamid/src/onesided/mpid_1s.c
index 618f01f..b111546 100644
--- a/src/mpid/pamid/src/onesided/mpid_1s.c
+++ b/src/mpid/pamid/src/onesided/mpid_1s.c
@@ -56,7 +56,6 @@ MPIDI_Win_DoneCB(pami_context_t  context,
 
     if (req->origin.completed == req->target.dt.num_contig)
     {
-      req->win->mpid.origin[target_rank].nCompleted++;
       if(req->req_handle)
           MPID_cc_set(req->req_handle->cc_ptr, 0);
 
diff --git a/src/mpid/pamid/src/onesided/mpid_win_accumulate.c b/src/mpid/pamid/src/onesided/mpid_win_accumulate.c
index d6c3224..7792d8f 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_accumulate.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_accumulate.c
@@ -251,7 +251,6 @@ MPID_Accumulate(void         *origin_addr,
         MPIU_Free(req);
       return MPI_SUCCESS;
     }
-  win->mpid.origin[target_rank].nStarted++;
 
   req->target.rank = target_rank;
 
diff --git a/src/mpid/pamid/src/onesided/mpid_win_allocate.c b/src/mpid/pamid/src/onesided/mpid_win_allocate.c
index d17ff26..f9ff368 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_allocate.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_allocate.c
@@ -86,7 +86,7 @@ MPID_Win_allocate(MPI_Aint     size,
   winfo->win = win;
   winfo->disp_unit = disp_unit;
 
-  rc= MPIDI_Win_allgather(baseP,size,win_ptr);
+  rc= MPIDI_Win_allgather(size,win_ptr);
   if (rc != MPI_SUCCESS)
       return rc;
   *(void**) base_ptr = (void *) win->base;
diff --git a/src/mpid/pamid/src/onesided/mpid_win_allocate_shared.c b/src/mpid/pamid/src/onesided/mpid_win_allocate_shared.c
index 97525f5..d0f687b 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_allocate_shared.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_allocate_shared.c
@@ -36,10 +36,12 @@ extern int mpidi_dynamic_tasking;
 #define MPIDI_PAGESIZE ((MPI_Aint)pageSize)
 #define MPIDI_PAGESIZE_MASK (~(MPIDI_PAGESIZE-1))
 #define MPIDI_ROUND_UP_PAGESIZE(x) ((((MPI_Aint)x)+(~MPIDI_PAGESIZE_MASK)) & MPIDI_PAGESIZE_MASK)
+#define ALIGN_BOUNDARY 128     /* Align data structures to cache line */
+#define PAD_SIZE(s) (ALIGN_BOUNDARY - (sizeof(s) & (ALIGN_BOUNDARY-1)))
 
 
 int CheckRankOnNode(MPID_Comm  * comm_ptr,int *onNode ) {
-      int rank,comm_size;
+      int rank,comm_size,i;
       int mpi_errno=PAMI_SUCCESS;
 
 
@@ -47,9 +49,13 @@ int CheckRankOnNode(MPID_Comm  * comm_ptr,int *onNode ) {
       comm_size = comm_ptr->local_size;
       rank      = comm_ptr->rank;
 
-      *onNode=0;  
-      if (comm_ptr->intranode_table[rank] != -1) 
-           *onNode=1;
+      *onNode=1;
+      for (i=0; i< comm_size; i++) {
+          if (comm_ptr->intranode_table[i] == -1) {
+               *onNode=0;
+               break;
+          }
+      }
      if (*onNode== 0) {
       MPIU_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_CONFLICT,
                           return mpi_errno, "**rmaconflict");
@@ -109,7 +115,7 @@ int GetPageSize(void *addr, ulong *pageSize)
   while(fgets(Line,200,fp)) {
     i++;
     sscanf(Line,"%s  %s %s %s \n",A1,A2,A3,A4);
-    if (memcmp(A1,"KernelPageSize",14)==0) {
+    if ((found == 1) && (memcmp(A1,"KernelPageSize",14)==0)) {
          j=atoi(A2);
          if ((A3[0]=='k') || (A3[0]=='K'))
                k=1024;
@@ -118,13 +124,29 @@ int GetPageSize(void *addr, ulong *pageSize)
          else if ((A3[0]=='g') || (A3[0]=='G'))
                k=0x40000000;  /* 1 GB  */
          else {
-             printf("ERROR unrecognized unit A3=%s\n",A3);
+             TRACE_ERR("ERROR unrecognized unit A3=%s\n",A3);
              break;
          }
          *pageSize = (ulong)(j * k);
          TRACE_ERR(" addr=%p pageSize=%ld %s(%d)\n", addr,*pageSize,__FILE__,__LINE__);
          break;
     }
+    if ((strlen(A2) == 4) && ((A2[0]=='r') || (A2[3]=='p'))) {
+         len = strlen(A1);
+       #ifndef REDHAT
+         t1=strtok(A1,search);
+       #else
+         t1=strtok(A1,"-");
+       #endif
+         t2 = A1+strlen(t1)+1;
+         sscanf(t1,"%p \n",&beg);
+         sscanf(t2,"%p \n",&end);
+         if (((ulong) addr >= (ulong)beg) && ((ulong)addr <= (ulong)end)) {
+             found=1;
+             TRACE_ERR("found addr=%p i=%d between beg=%p and end=%p in %s\n",
+                    addr,i,beg,end,fileName);
+         }
+    }
   }
   fclose(fp);
   if (*pageSize == 0) {
@@ -135,34 +157,28 @@ int GetPageSize(void *addr, ulong *pageSize)
   return 0;
 }
 
-#define MPIDI_PAGESIZE ((MPI_Aint)pageSize)
-#define MPIDI_PAGESIZE_MASK (~(MPIDI_PAGESIZE-1))
-#define MPIDI_ROUND_UP_PAGESIZE(x) ((((MPI_Aint)x)+(~MPIDI_PAGESIZE_MASK)) & MPIDI_PAGESIZE_MASK)
-
-
-
 int
-MPID_getSharedSegment(MPI_Aint     size,
+MPID_getSharedSegment(MPI_Aint        size,
                          int          disp_unit,
-                         MPID_Info  * info,
                          MPID_Comm  * comm_ptr,
-                         void **base_ptr,
-                         MPID_Win  ** win_ptr)
+                         void       **base_ptr,
+                         MPID_Win   **win_ptr,
+                         MPI_Aint      *pSize,
+                         int        *noncontig)
 {
     int mpi_errno = MPI_SUCCESS;
-    void **base_pp = (void **) base_ptr;
+    void **base_pp = base_ptr;
     int i, k, comm_size, rank;
-    int  shm_id;
     uint32_t shm_key; 
     int  node_rank;
+    int  shm_id;
     MPI_Aint *node_sizes;
-    void * base_addr;
     MPI_Aint *tmp_buf;
     int errflag = FALSE;
-    int noncontig = FALSE;
-    MPI_Aint pageSize, len,new_size;
+    MPI_Aint pageSize,pageSize2, len,new_size;
     char *cp;
     MPID_Win  *win;
+    int    padSize;
     MPIDI_Win_info *winfo;
     int shm_flag = IPC_CREAT | IPC_EXCL | S_IRUSR | S_IWUSR;
 
@@ -171,14 +187,14 @@ MPID_getSharedSegment(MPI_Aint     size,
     rank = win->comm_ptr->rank;
     tmp_buf = MPIU_Malloc( 2*comm_size*sizeof(MPI_Aint));
 
-    mpi_errno=CheckSpaceType(win_ptr,info,&noncontig);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
-    GetPageSize((void *) win_ptr, &pageSize);
+    GetPageSize((void *) win_ptr, (ulong *) &pageSize);
+    *pSize = pageSize;
     win->mpid.shm->segment_len = 0;
     if (comm_size == 1) {
          if (size > 0) {
-             if (noncontig) 
+             if (*noncontig) 
                  new_size = MPIDI_ROUND_UP_PAGESIZE(size);
              else 
                  new_size = size;
@@ -196,6 +212,7 @@ MPID_getSharedSegment(MPI_Aint     size,
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
          win->mpid.shm->segment_len = new_size;
          win->mpid.info[rank].base_addr = *base_pp;
+         win->base = *base_pp;
      } else {
          tmp_buf[rank]   = (MPI_Aint) size;
          mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
@@ -206,12 +223,13 @@ MPID_getSharedSegment(MPI_Aint     size,
          /* calculate total number of bytes needed */
          for (i = 0; i < comm_size; ++i) {
              len = tmp_buf[i];
-             if (noncontig)
+             if (*noncontig)
                 /* Round up to next page size */
                  win->mpid.shm->segment_len += MPIDI_ROUND_UP_PAGESIZE(len); 
              else
                  win->mpid.shm->segment_len += len;
           }
+          len = len + 128; /* needed for mutex_lock etc */
           /* get shared segment   */
 
           shm_key=-1;
@@ -266,7 +284,13 @@ MPID_getSharedSegment(MPI_Aint     size,
               shm_id = shmget(shm_key, win->mpid.shm->segment_len, shm_flag);
               MPIU_ERR_CHKANDJUMP((shm_id == -1), mpi_errno, MPI_ERR_RMA_SHARED, "**rmashared");
               win->mpid.shm->base_addr = (void *) shmat(shm_id,0,0);
-              MPIU_ERR_CHKANDJUMP((base_addr == NULL), mpi_errno,MPI_ERR_BUFFER, "**bufnull");
+              MPIU_ERR_CHKANDJUMP((win->mpid.shm->base_addr == NULL), mpi_errno,MPI_ERR_BUFFER, "**bufnull");
+              GetPageSize((void *) win->mpid.shm->base_addr, &pageSize2);
+              MPID_assert(pageSize == pageSize2);
+              /* set mutex_lock address and initialize it   */
+              win->mpid.shm->mutex_lock = (pthread_mutex_t *) win->mpid.shm->base_addr;
+              win->mpid.shm->shm_count=(int *)((MPI_Aint) win->mpid.shm->mutex_lock + (MPI_Aint) sizeof(pthread_mutex_t));
+              MPIDI_SHM_MUTEX_INIT(win);
               win->mpid.shm->allocated = 1;
               /* successfully created shm segment */
                mpi_errno = MPIR_Bcast_impl((void *) &shm_key, sizeof(int), MPI_CHAR, 0, comm_ptr, &errflag);
@@ -278,35 +302,22 @@ MPID_getSharedSegment(MPI_Aint     size,
                if (shm_id != -1) { /* shm segment is available */
                    win->mpid.shm->base_addr = (void *) shmat(shm_id,0,0);
                    win->mpid.shm->allocated = 1;
-                   MPIU_ERR_CHKANDJUMP((base_addr == (void *) -1), mpi_errno, MPI_ERR_RMA_SHARED, "**rmashared");
+                   MPIU_ERR_CHKANDJUMP((win->mpid.shm->base_addr == (void *) -1), mpi_errno, MPI_ERR_RMA_SHARED, "**rmashared");
                } else { /* node leader failed, no need to try here */
                   MPIU_ERR_CHKANDJUMP((shm_id == -1), mpi_errno, MPI_ERR_RMA_SHARED, "**rmashared");
                }
+               win->mpid.shm->mutex_lock = (pthread_mutex_t *) win->mpid.shm->base_addr;
+               win->mpid.shm->shm_count=(int *)((MPI_Aint) win->mpid.shm->mutex_lock + (MPI_Aint) sizeof(pthread_mutex_t));
               }
-         mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);
+         win->mpid.shm->shm_id = shm_id;
+         OPA_fetch_and_add_int((OPA_int_t *) win->mpid.shm->shm_count,1);
+         while(*win->mpid.shm->shm_count != comm_size) MPIDI_QUICKSLEEP;  /* wait for all ranks complete shmat */
          /* compute the base addresses of each process within the shared memory segment */
         {
-         win->base = win->mpid.shm->base_addr;
-         winfo = &win->mpid.info[rank];
-         winfo->win = win;
-         winfo->disp_unit = disp_unit;
-         win->mpid.info[0].base_addr = win->mpid.shm->base_addr;
-         char *cur_base = (*win_ptr)->mpid.shm->base_addr;
-         for (i = 1; i < comm_size; ++i) {
-             size = tmp_buf[i];
-             if (size) {
-                if (noncontig)  
-                   /* Round up to next page size */
-                    win->mpid.info[i].base_addr =(void *) ((MPI_Aint) cur_base + (MPI_Aint) MPIDI_ROUND_UP_PAGESIZE(size));
-                else
-                    win->mpid.info[i].base_addr = (void *) ((MPI_Aint) cur_base + size);
-                    cur_base = win->mpid.info[i].base_addr;
-              } else {
-                    win->mpid.info[i].base_addr = NULL; 
-              }
-          }
-          }
-          *base_pp = win->mpid.info[rank].base_addr;
+         padSize=sizeof(pthread_mutex_t) + sizeof(OPA_int_t);
+         win->base = (void *) ((long) win->mpid.shm->base_addr + (long ) PAD_SIZE(padSize));
+         }
+          *base_pp = win->base;
      }
 
 fn_exit:
@@ -352,6 +363,11 @@ fn_fail:
  * \param[out] win_ptr  window object returned by the call (handle)
  * \return MPI_SUCCESS, MPI_ERR_ARG, MPI_ERR_COMM, MPI_ERR_INFO. MPI_ERR_OTHER,
  *         MPI_ERR_SIZE
+ *
+ *  win->mpid.shm->base_addr  \* return address from shmat                                *\
+ *  win->base                 \* address for data starts here == win->mpid.shm->base_addr *\          
+ *                            \* + space for mutex_lock and shm_count                     *\
+ *
  */
 int
 MPID_Win_allocate_shared(MPI_Aint     size,   
@@ -365,35 +381,49 @@ MPID_Win_allocate_shared(MPI_Aint     size,
   void **baseP = base_ptr;
   MPIDI_Win_info  *winfo;
   MPID_Win    *win;
-  int         rank, comm_size;
-  int         onNode;
+  int         rank, comm_size,i;
+  int         onNode,noncontig=FALSE;
+  MPI_Aint    pageSize=0;
+ 
   
   
   mpi_errno =MPIDI_Win_init(size,disp_unit,win_ptr, info, comm_ptr, MPI_WIN_FLAVOR_SHARED, MPI_WIN_UNIFIED);
   if (mpi_errno) MPIU_ERR_POP(mpi_errno);
   win = *win_ptr;
   mpi_errno=CheckRankOnNode(comm_ptr,&onNode);
+  if (mpi_errno) MPIU_ERR_POP(mpi_errno);
   MPIU_ERR_CHKANDJUMP((onNode == 0), mpi_errno, MPI_ERR_RMA_SHARED, "**rmashared");
-
+  mpi_errno=CheckSpaceType(win_ptr,info,&noncontig);
   rank     = (*win_ptr)->comm_ptr->rank;
   comm_size = (*win_ptr)->comm_ptr->local_size;
   win->mpid.shm = MPIU_Malloc(sizeof(MPIDI_Win_shm_t));
   win->mpid.shm->allocated=0;
   MPID_assert(win->mpid.shm != NULL);
-  MPID_getSharedSegment(size, disp_unit,info,comm_ptr,baseP, win_ptr);
+  MPID_getSharedSegment(size, disp_unit,comm_ptr,baseP, win_ptr,(ulong *)&pageSize,(int *)&noncontig);
 
-  win->base = *baseP;
   winfo = &win->mpid.info[rank];
   winfo->win = win;
   winfo->disp_unit = disp_unit;
-  win->base = (void *) MPIU_PtrToAint(winfo->base_addr);
-  winfo->base_addr = win->base;
-
-  mpi_errno = MPIDI_Win_allgather(*baseP,size,win_ptr);
+  mpi_errno = MPIDI_Win_allgather(size,win_ptr);
   if (mpi_errno != MPI_SUCCESS)
       return mpi_errno;
-
-  *(void**) base_ptr = (void *) win->base;
+  win->mpid.info[0].base_addr = win->base;
+  if (comm_size > 1) {
+     char *cur_base = (*win_ptr)->base;
+     for (i = 1; i < comm_size; ++i) {
+          if (size) {
+              if (noncontig)  
+                  /* Round up to next page size */
+                   win->mpid.info[i].base_addr =(void *) ((MPI_Aint) cur_base + (MPI_Aint) MPIDI_ROUND_UP_PAGESIZE(size));
+                else
+                    win->mpid.info[i].base_addr = (void *) ((MPI_Aint) cur_base + (MPI_Aint) size);
+                cur_base = win->mpid.info[i].base_addr;
+           } else {
+                 win->mpid.info[i].base_addr = NULL; 
+           }
+      }
+  }
+  *(void**) base_ptr = (void *) win->mpid.info[rank].base_addr;
 
   mpi_errno = MPIR_Barrier_impl(comm_ptr, &mpi_errno);
 
diff --git a/src/mpid/pamid/src/onesided/mpid_win_compare_and_swap.c b/src/mpid/pamid/src/onesided/mpid_win_compare_and_swap.c
index 8022bc7..3f7d925 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_compare_and_swap.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_compare_and_swap.c
@@ -112,7 +112,6 @@ int MPID_Compare_and_swap(const void *origin_addr, const void *compare_addr,
         int disp_unit;
         int len;
 
-#ifdef PENDING_SHM_WIN
         if (win->create_flavor == MPI_WIN_FLAVOR_SHARED) {
             MPIDI_SHM_MUTEX_LOCK(win);
             shm_locked = 1;
@@ -121,12 +120,9 @@ int MPID_Compare_and_swap(const void *origin_addr, const void *compare_addr,
             disp_unit = win->disp_unit;
         }
         else {
-#endif
             base = win->base;
             disp_unit = win->disp_unit;
-#ifdef PENDING_SHM_WIN
         }
-#endif
 
         dest_addr = (char *) base + disp_unit * target_disp;
 
@@ -136,12 +132,10 @@ int MPID_Compare_and_swap(const void *origin_addr, const void *compare_addr,
         if (MPIR_Compare_equal(compare_addr, dest_addr, datatype))
             MPIU_Memcpy(dest_addr, origin_addr, len); 
 
-#ifdef PENDING_SHM_WIN
         if (shm_locked) {
             MPIDI_SHM_MUTEX_UNLOCK(win);
             shm_locked = 0;
         }
-#endif
         MPIU_Free(req);
     } 
   else {
diff --git a/src/mpid/pamid/src/onesided/mpid_win_create.c b/src/mpid/pamid/src/onesided/mpid_win_create.c
index 38f072a..e3d2b9d 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_create.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_create.c
@@ -78,7 +78,6 @@ MPIDI_Win_init( MPI_Aint length,
       MPID_assert(mpi_errno == 0);
   }
   MPID_assert(mpi_errno == 0);
-  win->mpid.origin = MPIU_Calloc0(size, RMA_nOps_t);
 
 
     /* Initialize the info (hint) flags per window */
@@ -106,7 +105,7 @@ MPIDI_Win_init( MPI_Aint length,
 /*                                                                         */
 /***************************************************************************/
 int
-MPIDI_Win_allgather(void *base, MPI_Aint size, MPID_Win **win_ptr )
+MPIDI_Win_allgather( MPI_Aint size, MPID_Win **win_ptr )
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Win *win;
@@ -208,7 +207,7 @@ MPID_Win_create(void       * base,
   winfo->win = win;
   winfo->disp_unit = disp_unit;
 
-  rc= MPIDI_Win_allgather(base,size,win_ptr);
+  rc= MPIDI_Win_allgather(size,win_ptr);
   if (rc != MPI_SUCCESS)
       return rc;
 
diff --git a/src/mpid/pamid/src/onesided/mpid_win_create_dynamic.c b/src/mpid/pamid/src/onesided/mpid_win_create_dynamic.c
index 5343362..74e10ca 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_create_dynamic.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_create_dynamic.c
@@ -57,7 +57,7 @@ MPID_Win_create_dynamic( MPID_Info  * info,
   winfo = &win->mpid.info[rank];
   winfo->win = win;
 
-  rc= MPIDI_Win_allgather(MPI_BOTTOM,0,win_ptr);
+  rc= MPIDI_Win_allgather(0,win_ptr);
   if (rc != MPI_SUCCESS)
       return rc;
 
diff --git a/src/mpid/pamid/src/onesided/mpid_win_fetch_and_op.c b/src/mpid/pamid/src/onesided/mpid_win_fetch_and_op.c
index 153ef88..9f47b9b 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_fetch_and_op.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_fetch_and_op.c
@@ -293,7 +293,6 @@ int MPID_Fetch_and_op(const void *origin_addr, void *result_addr,
         int disp_unit;
         int len, one;
 
-#ifdef PENDING_SHM_WIN
         if (win->create_flavor == MPI_WIN_FLAVOR_SHARED) {
             MPIDI_SHM_MUTEX_LOCK(win);
             shm_locked = 1;
@@ -302,12 +301,9 @@ int MPID_Fetch_and_op(const void *origin_addr, void *result_addr,
 
         }
         else {
-#endif
             base = win->base;
             disp_unit = win->disp_unit;
-#ifdef PENDING_SHM_WIN
         }
-#endif
 
         dest_addr = (char *) base + disp_unit * target_disp;
 
@@ -319,12 +315,10 @@ int MPID_Fetch_and_op(const void *origin_addr, void *result_addr,
 
         (*uop)((void *) origin_addr, dest_addr, &one, &datatype);
 
-#ifdef PENDING_SHM_WIN
         if (shm_locked) {
             MPIDI_SHM_MUTEX_UNLOCK(win);
             shm_locked = 0;
         }
-#endif
 
         MPIU_Free(req);
 
diff --git a/src/mpid/pamid/src/onesided/mpid_win_flush.c b/src/mpid/pamid/src/onesided/mpid_win_flush.c
index 4fb3dde..c2e7e29 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_flush.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_flush.c
@@ -52,12 +52,11 @@ MPID_Win_flush(int       rank,
       MPIU_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC,
                         return mpi_errno, "**rmasync");
      }
-  MPID_PROGRESS_WAIT_WHILE(win->mpid.origin[rank].nStarted != win->mpid.origin[rank].nCompleted);
-                                        
   sync = &win->mpid.sync;
-  win->mpid.origin[rank].nStarted=0;
-  win->mpid.origin[rank].nCompleted=0;
-
+  MPID_PROGRESS_WAIT_WHILE(sync->total != sync->complete);
+  sync->total    = 0;
+  sync->started  = 0;
+  sync->complete = 0;
 
   return mpi_errno;
 }
@@ -80,7 +79,6 @@ int
 MPID_Win_flush_all(MPID_Win *win)
 {
   int mpi_errno = MPI_SUCCESS;
-  int nTasks,i;
   struct MPIDI_Win_sync* sync;
   static char FCNAME[] = "MPID_Win_flush_all";
 
@@ -91,15 +89,11 @@ MPID_Win_flush_all(MPID_Win *win)
                         return mpi_errno, "**rmasync");
      }
 
-   sync = &win->mpid.sync;
-   MPID_PROGRESS_WAIT_WHILE(sync->total != sync->complete);
-   sync->total    = 0;
-   sync->started  = 0;
-   sync->complete = 0;
-   for (i = 0; i < MPIR_Comm_size(win->comm_ptr); i++) {
-        win->mpid.origin[i].nStarted=0;
-        win->mpid.origin[i].nCompleted=0;
-   }
+  sync = &win->mpid.sync;
+  MPID_PROGRESS_WAIT_WHILE(sync->total != sync->complete);
+  sync->total    = 0;
+  sync->started  = 0;
+  sync->complete = 0;
   return mpi_errno;
 
 }
@@ -114,6 +108,10 @@ MPID_Win_flush_all(MPID_Win *win)
  * the cng process to the target rank on the given window. The user may
  * reuse any buffers after this routine returns.
  *
+ * It has been determined that the routine uses only counters for each window
+ * and not for each rank because the overhead of tracking each rank could be great
+ * if a window group contains a large number of ranks. 
+ *
  * \param[in] rank      rank of target window
  * \param[in] win       window object
  * \return MPI_SUCCESS, MPI_ERR_OTHER
@@ -123,6 +121,7 @@ int
 MPID_Win_flush_local(int rank, MPID_Win *win)
 {
   int mpi_errno = MPI_SUCCESS;
+  struct MPIDI_Win_sync* sync;
   static char FCNAME[] = "MPID_Win_flush_local";
 
   if((win->mpid.sync.origin_epoch_type != MPID_EPOTYPE_LOCK) &&
@@ -131,9 +130,11 @@ MPID_Win_flush_local(int rank, MPID_Win *win)
       MPIU_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC,
                         return mpi_errno, "**rmasync");
      }
-   MPID_PROGRESS_WAIT_WHILE(win->mpid.origin[rank].nStarted != win->mpid.origin[rank].nCompleted);
-   win->mpid.origin[rank].nStarted=0;
-   win->mpid.origin[rank].nCompleted=0;
+   sync = &win->mpid.sync;
+   MPID_PROGRESS_WAIT_WHILE(sync->total != sync->complete);
+   sync->total    = 0;
+   sync->started  = 0;
+   sync->complete = 0;
 
   return mpi_errno;
 }
@@ -155,8 +156,8 @@ int
 MPID_Win_flush_local_all(MPID_Win *win)
 {
   int mpi_errno = MPI_SUCCESS;
+  struct MPIDI_Win_sync* sync;
   static char FCNAME[] = "MPID_Win_flush";
-  int size,i;
 
   if((win->mpid.sync.origin_epoch_type != MPID_EPOTYPE_LOCK) &&
      (win->mpid.sync.origin_epoch_type != MPID_EPOTYPE_LOCK_ALL))
@@ -164,14 +165,11 @@ MPID_Win_flush_local_all(MPID_Win *win)
       MPIU_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC,
                         return mpi_errno, "**rmasync");
      }
-  size = MPIR_Comm_size(win->comm_ptr);
-  for (i=0; i < size; i++) {
-      MPID_PROGRESS_WAIT_WHILE(win->mpid.origin[i].nStarted != win->mpid.origin[i].nCompleted);
-  }
-  for (i=0; i < size; i++) {
-     win->mpid.origin[i].nStarted=0;
-     win->mpid.origin[i].nCompleted=0;
-  }
+  sync = &win->mpid.sync;
+  MPID_PROGRESS_WAIT_WHILE(sync->total != sync->complete);
+  sync->total    = 0;
+  sync->started  = 0;
+  sync->complete = 0;
 
   return mpi_errno;
 }
diff --git a/src/mpid/pamid/src/onesided/mpid_win_free.c b/src/mpid/pamid/src/onesided/mpid_win_free.c
index 2ba83d9..c8a8867 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_free.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_free.c
@@ -20,16 +20,34 @@
  * \brief ???
  */
 #include "mpidi_onesided.h"
+#include <sys/shm.h>
+#include <sys/ipc.h>
+#include <sys/stat.h>
+
 
 int MPIDI_SHM_Win_free(MPID_Win **win_ptr)
 {
+  static char FCNAME[] = "MPID_SHM_Win_free";
+  int    rc;
   int mpi_errno = MPI_SUCCESS;
+
     /* Free shared memory region */
     /* free shm_base_addrs that's only used for shared memory windows */
-    if ((*win_ptr)->mpid.shm->allocated)
-        mpi_errno = shmdt((*win_ptr)->base);
+    if ((*win_ptr)->mpid.shm->allocated) {
+        OPA_fetch_and_add_int((OPA_int_t *) (*win_ptr)->mpid.shm->shm_count,-1);
+        while(*(*win_ptr)->mpid.shm->shm_count) MPIDI_QUICKSLEEP;
+        if ((*win_ptr)->comm_ptr->rank == 0) {
+            MPIDI_SHM_MUTEX_DESTROY(*win_ptr);
+        }
+       mpi_errno = shmdt((*win_ptr)->mpid.shm->base_addr);
+       if ((*win_ptr)->comm_ptr->rank == 0) {
+            rc=shmctl((*win_ptr)->mpid.shm->shm_id,IPC_RMID,NULL);
+            MPIU_ERR_CHKANDJUMP((rc == -1), errno,MPI_ERR_RMA_SHARED, "**shmctl");
+        }
+    }
     MPIU_Free((*win_ptr)->mpid.shm);
     (*win_ptr)->mpid.shm = NULL;
+    fn_fail:
     return mpi_errno;
 }
 
@@ -62,10 +80,12 @@ MPID_Win_free(MPID_Win **win_ptr)
 
   mpi_errno = MPIR_Barrier_impl(win->comm_ptr, &errflag);
   MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**mpi_bcast");
- 
-  if (win->create_flavor == MPI_WIN_FLAVOR_SHARED)   
+
+  if (win->create_flavor == MPI_WIN_FLAVOR_SHARED)
        mpi_errno=MPIDI_SHM_Win_free(win_ptr);
 
+
+
   struct MPIDI_Win_info *winfo = &win->mpid.info[rank];
 #ifdef USE_PAMI_RDMA
   if (win->size != 0)
@@ -82,12 +102,10 @@ MPID_Win_free(MPID_Win **win_ptr)
       MPID_assert(rc == PAMI_SUCCESS);
     }
 #endif
+
   MPIU_Free(win->mpid.info);
-  MPIU_Free(win->mpid.origin);
   if (win->mpid.work.msgQ) 
       MPIU_Free(win->mpid.work.msgQ);
-  if (win->create_flavor == MPI_WIN_FLAVOR_SHARED) 
-      MPIU_Free(win->base);
 
   MPIR_Comm_release(win->comm_ptr, 0);
 
diff --git a/src/mpid/pamid/src/onesided/mpid_win_get.c b/src/mpid/pamid/src/onesided/mpid_win_get.c
index 0d5b6a2..b56e2b2 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_get.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_get.c
@@ -258,13 +258,11 @@ MPID_Get(void         *origin_addr,
          MPIU_Free(req);
       return MPI_SUCCESS;
     }
-  win->mpid.origin[target_rank].nStarted++;
 
   /* If the get is a local operation, do it here */
   if (target_rank == win->comm_ptr->rank)
     {
       size_t offset = req->offset;
-      win->mpid.origin[target_rank].nCompleted++;
       if(req->req_handle)
         MPID_cc_set(req->req_handle->cc_ptr, 0);
       else
diff --git a/src/mpid/pamid/src/onesided/mpid_win_lock.c b/src/mpid/pamid/src/onesided/mpid_win_lock.c
index 259ab75..5871ddc 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_lock.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_lock.c
@@ -91,8 +91,10 @@ MPIDI_WinLockReq_proc(pami_context_t              context,
   struct MPIDI_Win_lock* lock = MPIU_Calloc0(1, struct MPIDI_Win_lock);
   if (info->type == MPIDI_WIN_MSGTYPE_LOCKREQ)
        lock->mtype = MPIDI_REQUEST_LOCK;
-  else if (info->type == MPIDI_WIN_MSGTYPE_LOCKALLREQ)
+  else if (info->type == MPIDI_WIN_MSGTYPE_LOCKALLREQ) {
        lock->mtype = MPIDI_REQUEST_LOCKALL;
+       lock->flagAddr = (void *) info->flagAddr;
+  }
   lock->rank = info->rank;
   lock->type = info->data.lock.type;
 
@@ -206,9 +208,10 @@ MPID_Win_unlock(int       rank,
    }
   if (rank == MPI_PROC_NULL) goto fn_exit;
   struct MPIDI_Win_sync* sync = &win->mpid.sync;
-  MPID_PROGRESS_WAIT_DO_WHILE(win->mpid.origin[rank].nStarted != win->mpid.origin[rank].nCompleted);
-  win->mpid.origin[rank].nCompleted=0;
-  win->mpid.origin[rank].nStarted=0;
+  MPID_PROGRESS_WAIT_DO_WHILE(sync->total != sync->complete);
+  sync->total    = 0;
+  sync->started  = 0;
+  sync->complete = 0;
 
   MPIDI_WinLock_info info = {
   .done = 0,
diff --git a/src/mpid/pamid/src/onesided/mpid_win_lock_all.c b/src/mpid/pamid/src/onesided/mpid_win_lock_all.c
index 74248e4..5fdf67f 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_lock_all.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_lock_all.c
@@ -29,6 +29,7 @@ MPIDI_WinLockAllReq_post(pami_context_t   context,
   MPIDI_WinLock_info* info = (MPIDI_WinLock_info*)_info;
   MPIDI_Win_control_t msg = {
     .type       = MPIDI_WIN_MSGTYPE_LOCKALLREQ,
+    .flagAddr   = info,
     .data       = {
       .lock       = {
         .type = info->lock_type,
@@ -42,19 +43,17 @@ MPIDI_WinLockAllReq_post(pami_context_t   context,
 
 
 static pami_result_t
-MPIDI_WinUnlockAll_post(pami_context_t   context,
+MPIDI_WinUnlockAllReq_post(pami_context_t   context,
                         void           * _info)
 {
   MPID_Win  *win;  
   MPIDI_WinLock_info* info = (MPIDI_WinLock_info*)_info;
   MPIDI_Win_control_t msg = {
   .type       = MPIDI_WIN_MSGTYPE_UNLOCKALL,
+  .flagAddr   = info,
   };
   win=info->win; 
   MPIDI_WinCtrlSend(context, &msg, info->peer, info->win);
-  /* current request is the last one in the group */
-  if (win->mpid.sync.lock.remote.allLocked == 1) 
-        info->done = 1; 
   return PAMI_SUCCESS;
 }
 
@@ -88,6 +87,8 @@ MPID_Win_lock_all(int      assert,
   int mpi_errno = MPI_SUCCESS;
   int i,size;
   MPIDI_WinLock_info *lockQ;
+  char *cp=NULL;
+  int  nMask,index;
   struct MPIDI_Win_sync_lock* slock = &win->mpid.sync.lock;
   static char FCNAME[] = "MPID_Win_lock_all";
 
@@ -97,19 +98,47 @@ MPID_Win_lock_all(int      assert,
                         return mpi_errno, "**rmasync");
    }
    size = (MPIR_Comm_size(win->comm_ptr));
+   win->mpid.max_ctrlsends = MAX_NUM_CTRLSEND;
+   nMask= win->mpid.max_ctrlsends - 1;
+   if (cp=getenv("MP_MAX_NUM_CTRLSEND")) {
+       win->mpid.max_ctrlsends = atoi(cp);
+   }
+   nMask=(win->mpid.max_ctrlsends - 1);
    if (!win->mpid.work.msgQ) {
-       win->mpid.work.msgQ = (void *) MPIU_Calloc0(size, MPIDI_WinLock_info);
+       if (size < (win->mpid.max_ctrlsends)) {
+           win->mpid.work.msgQ = (void *) MPIU_Calloc0(size, MPIDI_WinLock_info);
+       }  else {
+           win->mpid.work.msgQ = (void *) MPIU_Calloc0((win->mpid.max_ctrlsends), MPIDI_WinLock_info);
+       }
        MPID_assert(win->mpid.work.msgQ != NULL);
-        win->mpid.work.count=0;
+       win->mpid.work.count=0;
    }
    lockQ = (MPIDI_WinLock_info *) win->mpid.work.msgQ;
-   for (i = 0; i < size; i++) {
-       lockQ[i].done=0;
-       lockQ[i].peer=i;               
-       lockQ[i].win=win;               
-       lockQ[i].lock_type=MPI_LOCK_SHARED;               
-       MPIDI_Context_post(MPIDI_Context[0], &lockQ[i].work, MPIDI_WinLockAllReq_post, &lockQ[i]);
-   }
+   if (size < win->mpid.max_ctrlsends) {
+      for (i = 0; i < size; i++) {
+           lockQ[i].done=0;
+           lockQ[i].peer=i;
+           lockQ[i].win=win;
+           lockQ[i].lock_type=MPI_LOCK_SHARED;
+           MPIDI_Context_post(MPIDI_Context[0], &lockQ[i].work, MPIDI_WinLockAllReq_post, &lockQ[i]);
+      }
+    } else {
+      for (i = 0; i < size; i++) {
+           if (i < win->mpid.max_ctrlsends)
+               index=i;
+           else {
+               index = i & nMask;
+               if (!lockQ[index].done) {
+                   MPID_PROGRESS_WAIT_WHILE(lockQ[index].done == 0);
+               }
+           }
+           lockQ[index].done=0;
+           lockQ[index].peer=i;
+           lockQ[index].win=win;
+           lockQ[index].lock_type=MPI_LOCK_SHARED;
+           MPIDI_Context_post(MPIDI_Context[0], &lockQ[index].work, MPIDI_WinLockAllReq_post, &lockQ[index]);
+      }
+    }
     /* wait for the lock is granted for all tasks in the window */
    MPID_PROGRESS_WAIT_WHILE(size != slock->remote.allLocked);
 
@@ -123,8 +152,9 @@ int
 MPID_Win_unlock_all(MPID_Win *win)
 {
   int mpi_errno = MPI_SUCCESS;
-  int i;
+  int i,size;
   MPIDI_WinLock_info *lockQ;
+  int  nMask,index;
   struct MPIDI_Win_sync* sync;
   static char FCNAME[] = "MPID_Win_unlock_all";
 
@@ -138,17 +168,32 @@ MPID_Win_unlock_all(MPID_Win *win)
   sync->total    = 0;
   sync->started  = 0;
   sync->complete = 0;
-  for (i = 0; i < MPIR_Comm_size(win->comm_ptr); i++) {
-       win->mpid.origin[i].nStarted=0;
-       win->mpid.origin[i].nCompleted=0;
-  }
   MPID_assert(win->mpid.work.msgQ != NULL);
   lockQ = (MPIDI_WinLock_info *) win->mpid.work.msgQ;
-  for (i = 0; i < MPIR_Comm_size(win->comm_ptr); i++) {
-       lockQ[i].done=0;
-       lockQ[i].peer=i;
-       lockQ[i].win=win;
-       MPIDI_Context_post(MPIDI_Context[0], &lockQ[i].work, MPIDI_WinUnlockAll_post, &lockQ[i]);
+  size = MPIR_Comm_size(win->comm_ptr);
+  nMask = (win->mpid.max_ctrlsends - 1);
+  if (size < win->mpid.max_ctrlsends) {
+      for (i = 0; i < size; i++) {
+           lockQ[i].done=0;
+           lockQ[i].peer=i;
+           lockQ[i].win=win;
+           MPIDI_Context_post(MPIDI_Context[0], &lockQ[i].work, MPIDI_WinUnlockAllReq_post, &lockQ[i]);
+      }
+   } else {
+      for (i = 0; i < size; i++) {
+           if (i < win->mpid.max_ctrlsends)
+               index=i;
+           else {
+               index = (i & nMask);
+               if (!lockQ[index].done) {
+                  MPID_PROGRESS_WAIT_WHILE(lockQ[index].done == 0);
+               }
+           }
+           lockQ[index].done=0;
+           lockQ[index].peer=i;
+           lockQ[index].win=win;
+           MPIDI_Context_post(MPIDI_Context[0], &lockQ[index].work, MPIDI_WinUnlockAllReq_post, &lockQ[index]);
+      }
   }
   
   MPID_PROGRESS_WAIT_WHILE(sync->lock.remote.allLocked);
diff --git a/src/mpid/pamid/src/onesided/mpid_win_put.c b/src/mpid/pamid/src/onesided/mpid_win_put.c
index 6e8d38b..67f9d84 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_put.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_put.c
@@ -274,13 +274,11 @@ MPID_Put(void         *origin_addr,
       return MPI_SUCCESS;
     }
 
-  win->mpid.origin[target_rank].nStarted++;
 
   /* If the get is a local operation, do it here */
   if (target_rank == win->comm_ptr->rank)
     {
       size_t offset = req->offset;
-      win->mpid.origin[target_rank].nCompleted++;
       if(req->req_handle)
         MPID_cc_set(req->req_handle->cc_ptr, 0);
       else
diff --git a/src/mpid/pamid/src/onesided/mpid_win_shared_query.c b/src/mpid/pamid/src/onesided/mpid_win_shared_query.c
index fb304ee..21e70fd 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_shared_query.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_shared_query.c
@@ -40,8 +40,10 @@ MPID_Win_shared_query(MPID_Win *win, int rank, MPI_Aint *size,
                            int *disp_unit, void *base_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
-
-    *(void**) base_ptr = (void *) win->base;
+    static char FCNAME[] = "MPID_Win_shared_query";
+    MPIU_ERR_CHKANDSTMT((win->create_flavor != MPI_WIN_FLAVOR_SHARED), mpi_errno,
+                         MPI_ERR_RMA_FLAVOR, return mpi_errno, "**rmaflavor");
+    *((void **) base_ptr) = (void *) win->base;
     *size             = win->size;
     *disp_unit        = win->disp_unit;
 
diff --git a/src/mpid/pamid/src/onesided/mpidi_onesided.h b/src/mpid/pamid/src/onesided/mpidi_onesided.h
index 57aabca..6969351 100644
--- a/src/mpid/pamid/src/onesided/mpidi_onesided.h
+++ b/src/mpid/pamid/src/onesided/mpidi_onesided.h
@@ -20,6 +20,7 @@
  * \brief ???
  */
 #include <mpidimpl.h>
+#include "opa_primitives.h"
 
 #ifndef __src_onesided_mpidi_onesided_h__
 #define __src_onesided_mpidi_onesided_h__
@@ -33,6 +34,54 @@ pami_send_immediate_t   zero_send_immediate_parms;
 pami_recv_t   zero_recv_parms;
 pami_rmw_t   zero_rmw_parms;
 
+#define MPIDI_QUICKSLEEP     usleep(1);
+#define MAX_NUM_CTRLSEND  1024          /* no more than 1024 outstanding control sends */
+
+
+#define MPIDI_SHM_MUTEX_LOCK(win)                                                       \
+    do {                                                                                \
+        pthread_mutex_t *shm_mutex = win->mpid.shm->mutex_lock;                         \
+        int rval = pthread_mutex_lock(shm_mutex);                                       \
+        MPIU_ERR_CHKANDJUMP1(rval, mpi_errno, MPI_ERR_OTHER, "**pthread_lock",          \
+                             "**pthread_lock %s", strerror(rval));                      \
+    } while (0)
+
+#define MPIDI_SHM_MUTEX_UNLOCK(win)                                                     \
+    do {                                                                                \
+        pthread_mutex_t *shm_mutex = win->mpid.shm->mutex_lock;                         \
+        int rval = pthread_mutex_unlock(shm_mutex);                                     \
+        MPIU_ERR_CHKANDJUMP1(rval, mpi_errno, MPI_ERR_OTHER, "**pthread_unlock",        \
+                             "**pthread_unlock %s", strerror(rval));                    \
+    } while (0)
+
+#define MPIDI_SHM_MUTEX_INIT(win)                                                       \
+    do {                                                                                \
+        int rval=0;                                                                     \
+        pthread_mutexattr_t attr;                                                       \
+        pthread_mutex_t *shm_mutex = win->mpid.shm->mutex_lock;                         \
+                                                                                        \
+        rval = pthread_mutexattr_init(&attr);                                           \
+        MPIU_ERR_CHKANDJUMP1(rval, mpi_errno, MPI_ERR_OTHER, "**pthread_mutex",         \
+                             "**pthread_mutex %s", strerror(rval));                     \
+        rval = pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);             \
+        MPIU_ERR_CHKANDJUMP1(rval, mpi_errno, MPI_ERR_OTHER, "**pthread_mutex",         \
+                             "**pthread_mutex %s", strerror(rval));                     \
+        rval = pthread_mutex_init(shm_mutex, &attr);                                    \
+        MPIU_ERR_CHKANDJUMP1(rval, mpi_errno, MPI_ERR_OTHER, "**pthread_mutex",         \
+                             "**pthread_mutex %s", strerror(rval));                     \
+        rval = pthread_mutexattr_destroy(&attr);                                        \
+        MPIU_ERR_CHKANDJUMP1(rval, mpi_errno, MPI_ERR_OTHER, "**pthread_mutex",         \
+                             "**pthread_mutex %s", strerror(rval));                     \
+    } while (0);
+
+#define MPIDI_SHM_MUTEX_DESTROY(win)                                                    \
+    do {                                                                                \
+        pthread_mutex_t *shm_mutex = (win)->mpid.shm->mutex_lock;                       \
+        int rval = pthread_mutex_destroy(shm_mutex);                                    \
+        MPIU_ERR_CHKANDJUMP1(rval, mpi_errno, MPI_ERR_OTHER, "**pthread_mutex",         \
+                             "**pthread_mutex %s", strerror(rval));                     \
+    } while (0);
+
 /**
  * \brief One-sided Message Types
  */
@@ -75,6 +124,7 @@ typedef struct
   MPIDI_Win_msgtype_t type;
   MPID_Win *win;
   int      rank;          /* MPI rank */
+  void     *flagAddr;
   union
   {
     struct
@@ -104,6 +154,7 @@ typedef struct MPIDI_WinLock_info
   unsigned            peer;
   int                 lock_type;
   struct MPID_Win   * win;
+  void                *flagAddr;
   volatile unsigned   done;
   pami_work_t         work;
 } MPIDI_WinLock_info;
@@ -280,8 +331,7 @@ MPIDI_Win_init( MPI_Aint length,
                 int create_flavor,
                 int model);
 int
-MPIDI_Win_allgather(void *base,
-                    MPI_Aint size,
+MPIDI_Win_allgather(MPI_Aint size,
                     MPID_Win **win_ptr);
 
 void
diff --git a/src/mpid/pamid/src/onesided/mpidi_win_control.c b/src/mpid/pamid/src/onesided/mpidi_win_control.c
index 8c537ff..56016be 100644
--- a/src/mpid/pamid/src/onesided/mpidi_win_control.c
+++ b/src/mpid/pamid/src/onesided/mpidi_win_control.c
@@ -29,6 +29,7 @@ MPIDI_WinCtrlSend(pami_context_t       context,
                   MPID_Win            *win)
 {
   pami_task_t  taskid;
+  MPIDI_WinLock_info *winLock;
   control->win = win->mpid.info[rank].win;
   control->rank = win->comm_ptr->rank;
   taskid=MPID_VCR_GET_LPID(win->comm_ptr->vcr,rank);
@@ -72,6 +73,10 @@ MPIDI_WinCtrlSend(pami_context_t       context,
     };
     rc = PAMI_Send_immediate(context, &params);
   }
+  if ((control->type == MPIDI_WIN_MSGTYPE_LOCKALLREQ) || (control->type == MPIDI_WIN_MSGTYPE_UNLOCKALL)) {
+      winLock = (MPIDI_WinLock_info *) control->flagAddr;
+      winLock->done = 1;
+  }
   MPID_assert(rc == PAMI_SUCCESS);
 
 }

http://git.mpich.org/mpich.git/commitdiff/c6d910c783380440d1946ab366e5f2496eaed042

commit c6d910c783380440d1946ab366e5f2496eaed042
Author: Su Huang <suhuang at us.ibm.com>
Date:   Wed Oct 16 14:27:49 2013 -0400

    PAMID: MPID_Win_allocate_shared and MPID_Win_shared_query support
    
    added MPID_Win_allocate_shared and MPID_Win_shared_query support and
    fixes on passing incorrect rank to MPIDI_WinCtrlSend in MPIDI_WinPost_post
    and MPIDI_WinComplete_post
    
    minor fixes for 'win allocate shared'
    
    - compiles for bgq
    - UNRESOLVED: undefined shm key on bgq, or any other time the required
      environment variables are not set
    - UNRESOLVED: multiple calls to 'win allocate shared' will use the same
      shm key
    
    (ibm) F189033
    
    Signed-off-by: Michael Blocksome <blocksom at us.ibm.com>

diff --git a/src/mpid/pamid/include/mpidi_datatypes.h b/src/mpid/pamid/include/mpidi_datatypes.h
index 203961e..2d98dcf 100644
--- a/src/mpid/pamid/include/mpidi_datatypes.h
+++ b/src/mpid/pamid/include/mpidi_datatypes.h
@@ -445,6 +445,16 @@ typedef struct MPIDI_Win_info
   pami_memregion_t   memregion;     /**< Memory region descriptor for each node               */
   uint32_t           memregion_used;
 } MPIDI_Win_info;
+
+typedef struct MPIDI_Win_shm_t
+{
+    int allocated;                  /* flag: TRUE iff this window has a shared memory
+                                                 region associated with it */
+    void *base_addr;                /* base address of shared memory region */
+    MPI_Aint segment_len;           /* size of shared memory region         */
+    uint32_t  shm_key;              /* shared memory key                    */
+} MPIDI_Win_shm_t;
+
 /**
  * \brief Structure of PAMI extensions to MPID_Win structure
  */
@@ -453,7 +463,8 @@ struct MPIDI_Win
   struct MPIDI_Win_info     *info;          /**< allocated array of collective info             */
   MPIDI_Win_info_args info_args;
   void             ** shm_base_addrs; /* base address shared by all process in comm      */
-  workQ_t work;    
+  MPIDI_Win_shm_t  *shm;             /* shared memory info                             */
+  workQ_t work;
   RMA_nOps_t *origin;
   struct MPIDI_Win_sync
   {
diff --git a/src/mpid/pamid/src/misc/mpid_unimpl.c b/src/mpid/pamid/src/misc/mpid_unimpl.c
index 44bda50..1009566 100644
--- a/src/mpid/pamid/src/misc/mpid_unimpl.c
+++ b/src/mpid/pamid/src/misc/mpid_unimpl.c
@@ -90,17 +90,3 @@ int MPID_Comm_group_failed(MPID_Comm *comm_ptr, MPID_Group **failed_group_ptr)
   MPID_abort();
   return 0;
 }
-
-int MPID_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Info *info_ptr, MPID_Comm *comm_ptr,
-                             void **base_ptr, MPID_Win **win_ptr)
-{
-  MPID_abort();
-  return 0;
-}
-
-int MPID_Win_shared_query(MPID_Win *win, int rank, MPI_Aint *size, int *disp_unit,
-                          void *baseptr)
-{
-  MPID_abort();
-  return 0;
-}
diff --git a/src/mpid/pamid/src/onesided/Makefile.mk b/src/mpid/pamid/src/onesided/Makefile.mk
index cb526db..9a6cd37 100644
--- a/src/mpid/pamid/src/onesided/Makefile.mk
+++ b/src/mpid/pamid/src/onesided/Makefile.mk
@@ -36,7 +36,9 @@ lib_lib at MPILIBNAME@_la_SOURCES +=                                    \
   src/mpid/pamid/src/onesided/mpid_win_lock_all.c                    \
   src/mpid/pamid/src/onesided/mpid_win_pscw.c                        \
   src/mpid/pamid/src/onesided/mpid_win_put.c                         \
+  src/mpid/pamid/src/onesided/mpid_win_shared_query.c                \
   src/mpid/pamid/src/onesided/mpid_win_create_dynamic.c              \
+  src/mpid/pamid/src/onesided/mpid_win_allocate_shared.c             \
   src/mpid/pamid/src/onesided/mpid_win_flush.c                       \
   src/mpid/pamid/src/onesided/mpid_win_allocate.c                    \
   src/mpid/pamid/src/onesided/mpid_win_sync.c                        \
diff --git a/src/mpid/pamid/src/onesided/mpid_win_allocate_shared.c b/src/mpid/pamid/src/onesided/mpid_win_allocate_shared.c
new file mode 100644
index 0000000..97525f5
--- /dev/null
+++ b/src/mpid/pamid/src/onesided/mpid_win_allocate_shared.c
@@ -0,0 +1,409 @@
+/* begin_generated_IBM_copyright_prolog                             */
+/*                                                                  */
+/* This is an automatically generated copyright prolog.             */
+/* After initializing,  DO NOT MODIFY OR MOVE                       */
+/*  --------------------------------------------------------------- */
+/* Licensed Materials - Property of IBM                             */
+/* Blue Gene/Q 5765-PER 5765-PRP                                    */
+/*                                                                  */
+/* (C) Copyright IBM Corp. 2011, 2012 All Rights Reserved           */
+/* US Government Users Restricted Rights -                          */
+/* Use, duplication, or disclosure restricted                       */
+/* by GSA ADP Schedule Contract with IBM Corp.                      */
+/*                                                                  */
+/*  --------------------------------------------------------------- */
+/*                                                                  */
+/* end_generated_IBM_copyright_prolog                               */
+/*  (C)Copyright IBM Corp.  2007, 2011  */
+/**
+ * \file src/onesided/mpid_win_allocate_shared.c
+ * \brief
+ */
+#include "mpidi_onesided.h"
+#include <sys/shm.h>
+#include <sys/ipc.h>
+#include <sys/stat.h>
+
+#undef FUNCNAME 
+#define FUNCNAME MPID_Win_allocate_shared
+#undef FCNAME
+#define FCNAME MPIU_QUOTE(FUNCNAME)
+
+#define SHM_KEY_TAIL 0xfdbc         /* If this value is changed, change  */
+                                      /* in PMD (mp_pmd.c) as well.        */
+
+extern int mpidi_dynamic_tasking;
+#define MPIDI_PAGESIZE ((MPI_Aint)pageSize)
+#define MPIDI_PAGESIZE_MASK (~(MPIDI_PAGESIZE-1))
+#define MPIDI_ROUND_UP_PAGESIZE(x) ((((MPI_Aint)x)+(~MPIDI_PAGESIZE_MASK)) & MPIDI_PAGESIZE_MASK)
+
+
+int CheckRankOnNode(MPID_Comm  * comm_ptr,int *onNode ) {
+      int rank,comm_size;
+      int mpi_errno=PAMI_SUCCESS;
+
+
+      rank=comm_ptr->rank;
+      comm_size = comm_ptr->local_size;
+      rank      = comm_ptr->rank;
+
+      *onNode=0;  
+      if (comm_ptr->intranode_table[rank] != -1) 
+           *onNode=1;
+     if (*onNode== 0) {
+      MPIU_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_CONFLICT,
+                          return mpi_errno, "**rmaconflict");
+     }
+
+     return mpi_errno;
+}
+
+int CheckSpaceType(MPID_Win **win_ptr, MPID_Info *info,int *noncontig) {
+    int mpi_errno=MPI_SUCCESS;
+  /* Check if we are allowed to allocate space non-contiguously */
+    if (info != NULL) {
+        int alloc_shared_nctg_flag = 0;
+        char alloc_shared_nctg_value[MPI_MAX_INFO_VAL+1];
+        MPIR_Info_get_impl(info, "alloc_shared_noncontig", MPI_MAX_INFO_VAL,
+                           alloc_shared_nctg_value, &alloc_shared_nctg_flag);
+        if ((alloc_shared_nctg_flag == 1)) {
+            if (!strncmp(alloc_shared_nctg_value, "true", strlen("true")))
+                (*win_ptr)->mpid.info_args.alloc_shared_noncontig = 1;
+            if (!strncmp(alloc_shared_nctg_value, "false", strlen("false")))
+                (*win_ptr)->mpid.info_args.alloc_shared_noncontig = 0;
+        }
+    }
+     *noncontig = (*win_ptr)->mpid.info_args.alloc_shared_noncontig;
+    return mpi_errno;
+}
+
+int GetPageSize(void *addr, ulong *pageSize)
+{
+  pid_t pid;
+  FILE *fp;
+  char Line[201],A1[50],A2[50];
+  char fileName[100];
+  char A3[50],A4[50];
+  int  i=0;
+  char *t1,*t2;
+  int  len;
+  #ifndef REDHAT
+  char a='-';
+  char *search = &a;
+  #endif
+  void *beg, *end;
+  int  found=0;
+  int  ps;
+  int   j,k;
+
+  *pageSize = 0;
+  pid = getpid();
+  sprintf(fileName,"/proc/%d/smaps",pid);
+  /* linux-2.6.29 or greater, KernelPageSize in /proc/pid/smaps includes 4K, 16 MB */
+  TRACE_ERR("fileName = %s   addr=%p\n",fileName,addr);
+  fp = fopen(fileName,"r");
+  if (fp == NULL) {
+      TRACE_ERR("fileName = %s open failed errno=%d\n",fileName,errno);
+      return errno;
+  }
+  while(fgets(Line,200,fp)) {
+    i++;
+    sscanf(Line,"%s  %s %s %s \n",A1,A2,A3,A4);
+    if (memcmp(A1,"KernelPageSize",14)==0) {
+         j=atoi(A2);
+         if ((A3[0]=='k') || (A3[0]=='K'))
+               k=1024;
+         else if ((A3[0]=='m') || (A3[0]=='M'))
+               k=1048576;
+         else if ((A3[0]=='g') || (A3[0]=='G'))
+               k=0x40000000;  /* 1 GB  */
+         else {
+             printf("ERROR unrecognized unit A3=%s\n",A3);
+             break;
+         }
+         *pageSize = (ulong)(j * k);
+         TRACE_ERR(" addr=%p pageSize=%ld %s(%d)\n", addr,*pageSize,__FILE__,__LINE__);
+         break;
+    }
+  }
+  fclose(fp);
+  if (*pageSize == 0) {
+       ps = getpagesize();
+       *pageSize = (ulong) ps;
+       TRACE_ERR("LinuxPageSize %p not in %s  getpagesize=%ld\n", addr,fileName,*pageSize);
+  }
+  return 0;
+}
+
+#define MPIDI_PAGESIZE ((MPI_Aint)pageSize)
+#define MPIDI_PAGESIZE_MASK (~(MPIDI_PAGESIZE-1))
+#define MPIDI_ROUND_UP_PAGESIZE(x) ((((MPI_Aint)x)+(~MPIDI_PAGESIZE_MASK)) & MPIDI_PAGESIZE_MASK)
+
+
+
+int
+MPID_getSharedSegment(MPI_Aint     size,
+                         int          disp_unit,
+                         MPID_Info  * info,
+                         MPID_Comm  * comm_ptr,
+                         void **base_ptr,
+                         MPID_Win  ** win_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    void **base_pp = (void **) base_ptr;
+    int i, k, comm_size, rank;
+    int  shm_id;
+    uint32_t shm_key; 
+    int  node_rank;
+    MPI_Aint *node_sizes;
+    void * base_addr;
+    MPI_Aint *tmp_buf;
+    int errflag = FALSE;
+    int noncontig = FALSE;
+    MPI_Aint pageSize, len,new_size;
+    char *cp;
+    MPID_Win  *win;
+    MPIDI_Win_info *winfo;
+    int shm_flag = IPC_CREAT | IPC_EXCL | S_IRUSR | S_IWUSR;
+
+    win =  *win_ptr;
+    comm_size = win->comm_ptr->local_size;
+    rank = win->comm_ptr->rank;
+    tmp_buf = MPIU_Malloc( 2*comm_size*sizeof(MPI_Aint));
+
+    mpi_errno=CheckSpaceType(win_ptr,info,&noncontig);
+    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+    GetPageSize((void *) win_ptr, &pageSize);
+    win->mpid.shm->segment_len = 0;
+    if (comm_size == 1) {
+         if (size > 0) {
+             if (noncontig) 
+                 new_size = MPIDI_ROUND_UP_PAGESIZE(size);
+             else 
+                 new_size = size;
+             *base_pp = MPIU_Malloc(new_size);
+             #ifndef MPIDI_NO_ASSERT
+                     MPID_assert(*base_pp != NULL);
+             #else
+              MPIU_ERR_CHKANDJUMP((*base_pp == NULL), mpi_errno, MPI_ERR_BUFFER, "**bufnull");
+             #endif
+         } else if (size == 0) {
+                   *base_pp = NULL;
+         } else {
+               MPIU_ERR_CHKANDSTMT(size >=0 , mpi_errno, MPI_ERR_SIZE,return mpi_errno, "**rmasize");
+         }
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+         win->mpid.shm->segment_len = new_size;
+         win->mpid.info[rank].base_addr = *base_pp;
+     } else {
+         tmp_buf[rank]   = (MPI_Aint) size;
+         mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
+                                         tmp_buf, 1 * sizeof(MPI_Aint), MPI_BYTE,
+                                         (*win_ptr)->comm_ptr, &errflag);
+         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+         /* calculate total number of bytes needed */
+         for (i = 0; i < comm_size; ++i) {
+             len = tmp_buf[i];
+             if (noncontig)
+                /* Round up to next page size */
+                 win->mpid.shm->segment_len += MPIDI_ROUND_UP_PAGESIZE(len); 
+             else
+                 win->mpid.shm->segment_len += len;
+          }
+          /* get shared segment   */
+
+          shm_key=-1;
+          if (rank == 0) {
+             #ifdef DYNAMIC_TASKING
+             /* generate an appropriate key */
+             if (!mpidi_dynamic_tasking) {
+                cp = getenv("MP_I_PMD_PID");
+                if (cp) {
+                    shm_key = atoi(cp);
+                    shm_key = shm_key & 0x07ffffff;
+                    shm_key = shm_key | 0x80000000;
+                 } else {
+                    cp = getenv("MP_PARTITION");
+                    if (cp ) {
+                       shm_key = atol(cp);
+                       shm_key = (shm_key << 16) + SHM_KEY_TAIL;
+                    } else {
+                       TRACE_ERR("ERROR MP_PARTITION not set \n"); 
+                    }
+                  }
+              } else {
+                cp = getenv("MP_I_KEY_RANGE");
+                if (cp) {
+                    sscanf(cp, "0x%x", &shm_key);
+                    shm_key = shm_key | 0x80;
+                } else {
+                    TRACE_ERR("ERROR MP_I_KEY_RANGE not set \n"); 
+                }
+               }
+              #else 
+              cp = getenv("MP_I_PMD_PID");
+              if (cp) {
+                  shm_key = atoi(cp);
+                  shm_key = shm_key & 0x07ffffff;
+                  shm_key = shm_key | 0x80000000;
+              } else {
+                  cp = getenv("MP_PARTITION");
+                  if (cp ) {
+                      shm_key = atol(cp);
+#ifdef __PE__
+                      shm_key = (shm_key << 16) + SHMCC_KEY_TAIL;
+#else
+                      shm_key = (shm_key << 16);
+#endif
+                  } else {
+                      TRACE_ERR("ERROR MP_PARTITION not set \n"); 
+                  }
+               }
+              #endif
+              MPID_assert(shm_key != -1);
+              shm_id = shmget(shm_key, win->mpid.shm->segment_len, shm_flag);
+              MPIU_ERR_CHKANDJUMP((shm_id == -1), mpi_errno, MPI_ERR_RMA_SHARED, "**rmashared");
+              win->mpid.shm->base_addr = (void *) shmat(shm_id,0,0);
+              MPIU_ERR_CHKANDJUMP((base_addr == NULL), mpi_errno,MPI_ERR_BUFFER, "**bufnull");
+              win->mpid.shm->allocated = 1;
+              /* successfully created shm segment */
+               mpi_errno = MPIR_Bcast_impl((void *) &shm_key, sizeof(int), MPI_CHAR, 0, comm_ptr, &errflag);
+             } else { /* task other than task 0  */
+               mpi_errno = MPIR_Bcast_impl((void *) &shm_key,  sizeof(int), MPI_CHAR, 0, comm_ptr, &errflag);
+               MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+               MPID_assert(shm_key != -1);
+               shm_id = shmget(shm_key, 0, 0);
+               if (shm_id != -1) { /* shm segment is available */
+                   win->mpid.shm->base_addr = (void *) shmat(shm_id,0,0);
+                   win->mpid.shm->allocated = 1;
+                   MPIU_ERR_CHKANDJUMP((base_addr == (void *) -1), mpi_errno, MPI_ERR_RMA_SHARED, "**rmashared");
+               } else { /* node leader failed, no need to try here */
+                  MPIU_ERR_CHKANDJUMP((shm_id == -1), mpi_errno, MPI_ERR_RMA_SHARED, "**rmashared");
+               }
+              }
+         mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);
+         /* compute the base addresses of each process within the shared memory segment */
+        {
+         win->base = win->mpid.shm->base_addr;
+         winfo = &win->mpid.info[rank];
+         winfo->win = win;
+         winfo->disp_unit = disp_unit;
+         win->mpid.info[0].base_addr = win->mpid.shm->base_addr;
+         char *cur_base = (*win_ptr)->mpid.shm->base_addr;
+         for (i = 1; i < comm_size; ++i) {
+             size = tmp_buf[i];
+             if (size) {
+                if (noncontig)  
+                   /* Round up to next page size */
+                    win->mpid.info[i].base_addr =(void *) ((MPI_Aint) cur_base + (MPI_Aint) MPIDI_ROUND_UP_PAGESIZE(size));
+                else
+                    win->mpid.info[i].base_addr = (void *) ((MPI_Aint) cur_base + size);
+                    cur_base = win->mpid.info[i].base_addr;
+              } else {
+                    win->mpid.info[i].base_addr = NULL; 
+              }
+          }
+          }
+          *base_pp = win->mpid.info[rank].base_addr;
+     }
+
+fn_exit:
+    MPIU_Free(tmp_buf);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+fn_fail:
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+
+}
+
+/**
+ * \brief MPI-PAMI glue for MPI_Win_allocate_shared function
+ *
+ * Create a window object. Allocates a MPID_Win object and initializes it,
+ * then allocates the collective info array, initalizes our entry, and
+ * performs an Allgather to distribute/collect the rest of the array entries.
+ * On each process, it allocates memory of at least size bytes that is shared
+ * among all processes in comm, and returns a pointer to the locally allocated
+ * segment in base_ptr that can be used for load/store accesses on the calling
+ * process. The locally allocated memory can be the target of load/store accessses
+ * by remote processes; the base pointers for other processes can be queried using
+ * the function 'MPI_Win_shared_query'.
+ *
+ * The call also returns a window object that can be used by all processes in comm
+ * to perform RMA operations. The size argument may be different at each process
+ * and size = 0 is valid. It is the user''s responsibility to ensure that the
+ * communicator comm represents a group of processes that can create a shared
+ * memory segment that can be accessed by all processes in the group. The
+ * allocated memory is contiguous across process ranks unless the info key
+ * alloc_shared_noncontig is specified. Contiguous across process ranks means that
+ * the first address in the memory segment of process i is consecutive with the
+ * last address in the memory segment of process i - 1.  This may enable the user
+ * to calculate remote address offsets with local information only.
+ *
+ * Input Parameters:
+ * \param[in] size      size of window in bytes (nonnegative integer)
+ * \param[in] disp_unit local unit size for displacements, in bytes (positive integer)
+ * \param[in] info      info argument (handle))
+ * \param[in] comm_ptr  intra-Communicator (handle)
+ * \param[out] base_ptr  address of local allocated window segment
+ * \param[out] win_ptr  window object returned by the call (handle)
+ * \return MPI_SUCCESS, MPI_ERR_ARG, MPI_ERR_COMM, MPI_ERR_INFO. MPI_ERR_OTHER,
+ *         MPI_ERR_SIZE
+ */
+int
+MPID_Win_allocate_shared(MPI_Aint     size,   
+                         int          disp_unit,
+                         MPID_Info  * info,
+                         MPID_Comm  * comm_ptr,
+                         void **base_ptr,
+                         MPID_Win  ** win_ptr)
+{
+  int mpi_errno  = MPI_SUCCESS;
+  void **baseP = base_ptr;
+  MPIDI_Win_info  *winfo;
+  MPID_Win    *win;
+  int         rank, comm_size;
+  int         onNode;
+  
+  
+  mpi_errno =MPIDI_Win_init(size,disp_unit,win_ptr, info, comm_ptr, MPI_WIN_FLAVOR_SHARED, MPI_WIN_UNIFIED);
+  if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+  win = *win_ptr;
+  mpi_errno=CheckRankOnNode(comm_ptr,&onNode);
+  MPIU_ERR_CHKANDJUMP((onNode == 0), mpi_errno, MPI_ERR_RMA_SHARED, "**rmashared");
+
+  rank     = (*win_ptr)->comm_ptr->rank;
+  comm_size = (*win_ptr)->comm_ptr->local_size;
+  win->mpid.shm = MPIU_Malloc(sizeof(MPIDI_Win_shm_t));
+  win->mpid.shm->allocated=0;
+  MPID_assert(win->mpid.shm != NULL);
+  MPID_getSharedSegment(size, disp_unit,info,comm_ptr,baseP, win_ptr);
+
+  win->base = *baseP;
+  winfo = &win->mpid.info[rank];
+  winfo->win = win;
+  winfo->disp_unit = disp_unit;
+  win->base = (void *) MPIU_PtrToAint(winfo->base_addr);
+  winfo->base_addr = win->base;
+
+  mpi_errno = MPIDI_Win_allgather(*baseP,size,win_ptr);
+  if (mpi_errno != MPI_SUCCESS)
+      return mpi_errno;
+
+  *(void**) base_ptr = (void *) win->base;
+
+  mpi_errno = MPIR_Barrier_impl(comm_ptr, &mpi_errno);
+
+fn_exit:
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+fn_fail:
+    MPIU_Free(win->mpid.shm);
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+
+}
+
diff --git a/src/mpid/pamid/src/onesided/mpid_win_free.c b/src/mpid/pamid/src/onesided/mpid_win_free.c
index 5155ce8..2ba83d9 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_free.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_free.c
@@ -21,6 +21,17 @@
  */
 #include "mpidi_onesided.h"
 
+int MPIDI_SHM_Win_free(MPID_Win **win_ptr)
+{
+  int mpi_errno = MPI_SUCCESS;
+    /* Free shared memory region */
+    /* free shm_base_addrs that's only used for shared memory windows */
+    if ((*win_ptr)->mpid.shm->allocated)
+        mpi_errno = shmdt((*win_ptr)->base);
+    MPIU_Free((*win_ptr)->mpid.shm);
+    (*win_ptr)->mpid.shm = NULL;
+    return mpi_errno;
+}
 
 /**
  * \brief MPI-PAMI glue for MPI_Win_free function
@@ -41,6 +52,7 @@ MPID_Win_free(MPID_Win **win_ptr)
 
   MPID_Win *win = *win_ptr;
   size_t rank = win->comm_ptr->rank;
+  int errflag = FALSE;
 
   if(win->mpid.sync.origin_epoch_type != win->mpid.sync.target_epoch_type ||
      (win->mpid.sync.origin_epoch_type != MPID_EPOTYPE_NONE &&
@@ -48,9 +60,11 @@ MPID_Win_free(MPID_Win **win_ptr)
     MPIU_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, return mpi_errno, "**rmasync");
   }
 
-  mpi_errno = MPIR_Barrier_impl(win->comm_ptr, &mpi_errno);
-  if (mpi_errno != MPI_SUCCESS)
-    return mpi_errno;
+  mpi_errno = MPIR_Barrier_impl(win->comm_ptr, &errflag);
+  MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**mpi_bcast");
+ 
+  if (win->create_flavor == MPI_WIN_FLAVOR_SHARED)   
+       mpi_errno=MPIDI_SHM_Win_free(win_ptr);
 
   struct MPIDI_Win_info *winfo = &win->mpid.info[rank];
 #ifdef USE_PAMI_RDMA
@@ -68,9 +82,12 @@ MPID_Win_free(MPID_Win **win_ptr)
       MPID_assert(rc == PAMI_SUCCESS);
     }
 #endif
-
   MPIU_Free(win->mpid.info);
   MPIU_Free(win->mpid.origin);
+  if (win->mpid.work.msgQ) 
+      MPIU_Free(win->mpid.work.msgQ);
+  if (win->create_flavor == MPI_WIN_FLAVOR_SHARED) 
+      MPIU_Free(win->base);
 
   MPIR_Comm_release(win->comm_ptr, 0);
 
diff --git a/src/mpid/pamid/src/onesided/mpid_win_pscw.c b/src/mpid/pamid/src/onesided/mpid_win_pscw.c
index 1ee4b4a..ff22460 100644
--- a/src/mpid/pamid/src/onesided/mpid_win_pscw.c
+++ b/src/mpid/pamid/src/onesided/mpid_win_pscw.c
@@ -36,7 +36,7 @@ MPIDI_WinPost_post(pami_context_t   context,
                    void           * _info)
 {
   MPIDI_WinPSCW_info * info = (MPIDI_WinPSCW_info*)_info;
-  unsigned peer, index;
+  unsigned peer, index, pid,i;
   MPID_Group *group = info->win->mpid.sync.pw.group;
   MPID_assert(group != NULL);
   MPIDI_Win_control_t msg = {
@@ -44,7 +44,13 @@ MPIDI_WinPost_post(pami_context_t   context,
   };
 
   for (index=0; index < group->size; ++index) {
-    peer = group->lrank_to_lpid[index].lrank;
+      pid = group->lrank_to_lpid[index].lpid;
+      for (i=0;i < ((info->win)->comm_ptr->local_size); i++) {
+         if ((info->win)->comm_ptr->local_group->lrank_to_lpid[i].lpid == pid) {
+             peer = ((info->win)->comm_ptr->local_group->lrank_to_lpid[i].lrank);
+             break;
+         }
+      }
     MPIDI_WinCtrlSend(context, &msg, peer, info->win);
   }
 
@@ -67,7 +73,7 @@ MPIDI_WinComplete_post(pami_context_t   context,
                        void           * _info)
 {
   MPIDI_WinPSCW_info * info = (MPIDI_WinPSCW_info*)_info;
-  unsigned peer, index;
+  unsigned peer, index,pid,i;
   MPID_Group *group = info->win->mpid.sync.sc.group;
   MPID_assert(group != NULL);
   MPIDI_Win_control_t msg = {
@@ -75,8 +81,14 @@ MPIDI_WinComplete_post(pami_context_t   context,
   };
 
   for (index=0; index < group->size; ++index) {
-    peer = group->lrank_to_lpid[index].lrank;
-    MPIDI_WinCtrlSend(context, &msg, peer, info->win);
+     pid = group->lrank_to_lpid[index].lpid;
+     for (i=0;i < ((info->win)->comm_ptr->local_size); i++) {
+         if ((info->win)->comm_ptr->local_group->lrank_to_lpid[i].lpid == pid) {
+            peer = ((info->win)->comm_ptr->local_group->lrank_to_lpid[i].lrank);
+            break;
+         }
+     }
+     MPIDI_WinCtrlSend(context, &msg, peer, info->win);
   }
 
   info->done = 1;
diff --git a/src/mpid/pamid/src/onesided/mpid_win_shared_query.c b/src/mpid/pamid/src/onesided/mpid_win_shared_query.c
new file mode 100644
index 0000000..fb304ee
--- /dev/null
+++ b/src/mpid/pamid/src/onesided/mpid_win_shared_query.c
@@ -0,0 +1,50 @@
+/* begin_generated_IBM_copyright_prolog                             */
+/*                                                                  */
+/* This is an automatically generated copyright prolog.             */
+/* After initializing,  DO NOT MODIFY OR MOVE                       */
+/*  --------------------------------------------------------------- */
+/* Licensed Materials - Property of IBM                             */
+/* Blue Gene/Q 5765-PER 5765-PRP                                    */
+/*                                                                  */
+/* (C) Copyright IBM Corp. 2011, 2012 All Rights Reserved           */
+/* US Government Users Restricted Rights -                          */
+/* Use, duplication, or disclosure restricted                       */
+/* by GSA ADP Schedule Contract with IBM Corp.                      */
+/*                                                                  */
+/*  --------------------------------------------------------------- */
+/*                                                                  */
+/* end_generated_IBM_copyright_prolog                               */
+/*  (C)Copyright IBM Corp.  2007, 2011  */
+/**
+ * \file src/onesided/mpid_win_shared_query.c
+ * \brief queries the process-local address for remote memory segments
+ *        created with MPI_Win_allocate_shared.                       
+ */
+#include "mpidi_onesided.h"
+
+/**
+ * \brief MPI-PAMI glue for MPI_Win_shared_query function
+ * 
+ * Query the size and base pointer for a patch of a shared memory window
+ * 
+ * \param[in]  win       shared memory window object
+ * \param[in]  rank      rank in the group of window win or MPI_PROC_NULL
+ * \param[out] size      size of the window segment (non-negative integer)
+ * \param[out] disp_unit local unit size for displacements, in bytes  
+ * \param[out] base_ptr   address for load/store access to window segment
+ * \return MPI_SUCCESS, MPI_ERR_OTHER, or error returned from
+ */
+
+int
+MPID_Win_shared_query(MPID_Win *win, int rank, MPI_Aint *size,
+                           int *disp_unit, void *base_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    *(void**) base_ptr = (void *) win->base;
+    *size             = win->size;
+    *disp_unit        = win->disp_unit;
+
+    return mpi_errno;
+}
+

-----------------------------------------------------------------------

Summary of changes:
 src/mpid/pamid/include/mpidi_datatypes.h           |   27 +-
 .../pamid/src/dyntask/mpid_comm_spawn_multiple.c   |    5 +-
 src/mpid/pamid/src/dyntask/mpidi_port.c            |    1 -
 src/mpid/pamid/src/misc/mpid_unimpl.c              |   14 -
 src/mpid/pamid/src/mpid_finalize.c                 |    7 +
 src/mpid/pamid/src/mpidi_util.c                    |    2 +-
 src/mpid/pamid/src/onesided/Makefile.mk            |    2 +
 src/mpid/pamid/src/onesided/mpid_1s.c              |   10 +-
 src/mpid/pamid/src/onesided/mpid_win_accumulate.c  |    1 -
 src/mpid/pamid/src/onesided/mpid_win_allocate.c    |    2 +-
 .../pamid/src/onesided/mpid_win_allocate_shared.c  |  439 ++++++++++++++++++++
 .../pamid/src/onesided/mpid_win_compare_and_swap.c |    6 -
 src/mpid/pamid/src/onesided/mpid_win_create.c      |    5 +-
 .../pamid/src/onesided/mpid_win_create_dynamic.c   |    2 +-
 .../pamid/src/onesided/mpid_win_fetch_and_op.c     |    6 -
 src/mpid/pamid/src/onesided/mpid_win_flush.c       |   52 ++--
 src/mpid/pamid/src/onesided/mpid_win_free.c        |   43 ++-
 src/mpid/pamid/src/onesided/mpid_win_get.c         |    2 -
 .../pamid/src/onesided/mpid_win_get_accumulate.c   |  135 +++----
 src/mpid/pamid/src/onesided/mpid_win_lock.c        |   11 +-
 src/mpid/pamid/src/onesided/mpid_win_lock_all.c    |   91 +++-
 src/mpid/pamid/src/onesided/mpid_win_pscw.c        |   22 +-
 src/mpid/pamid/src/onesided/mpid_win_put.c         |    2 -
 .../{mpid_win_attach.c => mpid_win_shared_query.c} |   46 +--
 src/mpid/pamid/src/onesided/mpidi_onesided.h       |   70 +++-
 src/mpid/pamid/src/onesided/mpidi_win_control.c    |    5 +
 26 files changed, 783 insertions(+), 225 deletions(-)
 create mode 100644 src/mpid/pamid/src/onesided/mpid_win_allocate_shared.c
 copy src/mpid/pamid/src/onesided/{mpid_win_attach.c => mpid_win_shared_query.c} (56%)


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list