[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.0.4-165-g5c4f5de

mysql vizuser noreply at mpich.org
Wed May 8 09:59:40 CDT 2013


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master, has been updated
       via  5c4f5de7563db2bee452a102614c53b8ed9583ed (commit)
      from  07de139fa7b3cba2894ddf61cc73854384f3e50f (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/5c4f5de7563db2bee452a102614c53b8ed9583ed

commit 5c4f5de7563db2bee452a102614c53b8ed9583ed
Author: Haizhu Liu <haizhu at us.ibm.com>
Date:   Fri Mar 22 21:09:05 2013 -0400

    Fix seg fault in _mpi_world_exiting_handler, fix hang at MPI_Finalize, and stop calling PAMI_Context_advance in dispatch.
    
    (ibm) D189340
    (ibm) 7c57f7ed7a8aa1f0156ba5112664a7c2cd35f227
    
    Signed-off-by: Bob Cernohous <bobc at us.ibm.com>

diff --git a/src/mpid/pamid/src/dyntask/mpid_comm_disconnect.c b/src/mpid/pamid/src/dyntask/mpid_comm_disconnect.c
index 094ca0f..3e5378e 100644
--- a/src/mpid/pamid/src/dyntask/mpid_comm_disconnect.c
+++ b/src/mpid/pamid/src/dyntask/mpid_comm_disconnect.c
@@ -137,29 +137,23 @@ void MPIDI_wait_for_AM(long long tranid, int expected_AM, int whichAM)
   double starttime, currtime, elapsetime;
   int    rc, curr_AMcntr;
 
-  MPIU_THREAD_CS_EXIT(ALLFUNC,);
   rc = PAMI_Context_advance(MPIDI_Context[0], (size_t)100);
-  MPIU_THREAD_CS_ENTER(ALLFUNC,);
   if(whichAM == LAST_AM) {
     CURTIME(starttime)
     do {
       CURTIME(currtime)
       elapsetime = currtime - starttime;
 
-      MPIU_THREAD_CS_EXIT(ALLFUNC,);
       rc = PAMI_Context_advance(MPIDI_Context[0], (size_t)100);
-      MPIU_THREAD_CS_ENTER(ALLFUNC,);
       curr_AMcntr = MPIDI_get_AM_cntr_for_tranid(tranid, whichAM);
-      TRACE_ERR("_try_to_disconnect: Looping in timer for TranID %lld, whichAM %d expected_AM = %d, Current AM = %d\n",tranid,whichAM,expected_AM,curr_AMcntr);
+      /*TRACE_ERR("_try_to_disconnect: Looping in timer for TranID %lld, whichAM %d expected_AM = %d, Current AM = %d\n",tranid,whichAM,expected_AM,curr_AMcntr); */
     }while(curr_AMcntr != expected_AM && elapsetime < DISCONNECT_LAPI_XFER_TIMEOUT);
   }
   else {
     do {
-      MPIU_THREAD_CS_EXIT(ALLFUNC,);
       rc = PAMI_Context_advance(MPIDI_Context[0], (size_t)100);
-      MPIU_THREAD_CS_ENTER(ALLFUNC,);
       curr_AMcntr = MPIDI_get_AM_cntr_for_tranid(tranid, whichAM);
-      TRACE_ERR("_try_to_disconnect: Looping in timer for TranID %lld, whichAM %d expected_AM = %d, Current AM = %d\n",tranid,whichAM,expected_AM,curr_AMcntr);
+      /*TRACE_ERR("_try_to_disconnect: Looping in timer for TranID %lld, whichAM %d expected_AM = %d, Current AM = %d\n",tranid,whichAM,expected_AM,curr_AMcntr);*/
     }while(curr_AMcntr != expected_AM);
   }
 }
diff --git a/src/mpid/pamid/src/dyntask/mpidi_pg.c b/src/mpid/pamid/src/dyntask/mpidi_pg.c
index d588882..f59890d 100644
--- a/src/mpid/pamid/src/dyntask/mpidi_pg.c
+++ b/src/mpid/pamid/src/dyntask/mpidi_pg.c
@@ -170,9 +170,9 @@ int MPIDI_PG_Finalize(void)
 
    MPIU_Free(root_wid_barray); /* root_wid_barray is now NULL for non-root */
 
-
+#if 0
    pthread_create(&finalize_req_thread, NULL, mpidi_finalize_req, NULL);
-   MPIU_THREAD_CS_EXIT(ALLFUNC,);
+   /*MPIU_THREAD_CS_EXIT(ALLFUNC,); */
    while (mpidi_sync_done !=1) {
      mpi_errno=PAMI_Context_advance(MPIDI_Context[0], 1000);
      if (mpi_errno == PAMI_EAGAIN) {
@@ -183,6 +183,9 @@ int MPIDI_PG_Finalize(void)
    if (mpi_errno = pthread_join(finalize_req_thread, NULL) ) {
          TRACE_ERR("error returned from pthread_join() mpi_errno=%d\n",mpi_errno);
    }
+#endif
+   MPIU_THREAD_CS_EXIT(ALLFUNC,);
+   PMI2_Finalize();
    MPIU_THREAD_CS_ENTER(ALLFUNC,);
 
    if(_conn_info_list) {
diff --git a/src/mpid/pamid/src/mpid_init.c b/src/mpid/pamid/src/mpid_init.c
index 022a2fd..4f85bd6 100644
--- a/src/mpid/pamid/src/mpid_init.c
+++ b/src/mpid/pamid/src/mpid_init.c
@@ -33,6 +33,8 @@
 #define MAX_JOBID_LEN                1024
 int     world_rank;
 int     world_size;
+extern int (*mp_world_exiting_handler)(int);
+extern int _mpi_world_exiting_handler(int);
 #endif
 int mpidi_dynamic_tasking = 0;
 
@@ -1009,10 +1011,8 @@ MPIDI_VCRT_init(int rank, int size, char *world_tasks, MPIDI_PG_t *pg)
     world_tasks_save = MPIU_Strdup(world_tasks);
     if(world_tasks != NULL) {
       comm->vcr[0]->taskid = atoi(strtok(world_tasks, ":"));
-      TRACE_ERR("comm->vcr[0]->taskid =%d\n", comm->vcr[0]->taskid);
       while( (cp=strtok(NULL, ":")) != NULL) {
         comm->vcr[++i]->taskid= atoi(cp);
-        TRACE_ERR("comm->vcr[i]->taskid =%d\n", comm->vcr[i]->taskid);
       }
     }
     MPIU_Free(world_tasks_save);
@@ -1228,6 +1228,7 @@ int MPID_Init(int * argc,
 	/* FIXME: Check that this intercommunicator gets freed in MPI_Finalize
 	   if not already freed.  */
    }
+  mp_world_exiting_handler = &(_mpi_world_exiting_handler);
 #endif
   /* ------------------------------- */
   /* Initialize timer data           */
diff --git a/src/pmi/pmi2/poe/poe2pmi.c b/src/pmi/pmi2/poe/poe2pmi.c
index e37563e..4fa1fa0 100644
--- a/src/pmi/pmi2/poe/poe2pmi.c
+++ b/src/pmi/pmi2/poe/poe2pmi.c
@@ -69,8 +69,7 @@ static MPID_Thread_cond_t cond;
 #endif
 
 extern int mpidi_finalized;
-extern int (*mp_world_exiting_handler)(int);
-extern int _mpi_world_exiting_handler(int);
+int _mpi_world_exiting_handler(int);
 
 void *poeptr = NULL;
 
@@ -98,7 +97,7 @@ int PMI2_Init(int *spawned, int *size, int *rank, int *appnum)
     }
 
     ret = (*pmi2_init)(spawned, size, rank, appnum);
-    mp_world_exiting_handler = &(_mpi_world_exiting_handler);
+    /*mp_world_exiting_handler = &(_mpi_world_exiting_handler); */
     return ret;
 }
 
@@ -271,7 +270,7 @@ int PMI2_Info_GetJobAttr(const char name[], char value[], int valuelen, int *fla
  * This is the mpi level of callback that get invoked when a task get notified
  * of a world's exiting
  */
-int _mpi_world_exiting_handler_wrapper(pami_context_t context, void *cookie)
+int _mpi_world_exiting_handler(int world_id)
 {
   /* check the reference count associated with that remote world
      if the reference count is zero, the task will call LAPI_Purge_totask on
@@ -286,10 +285,10 @@ int _mpi_world_exiting_handler_wrapper(pami_context_t context, void *cookie)
   char world_id_str[32];
   int mpi_errno = MPI_SUCCESS;
   pami_endpoint_t dest;
-  struct worldExitReq *req = (struct worldExitReq *)cookie;
-  int world_id = req->world_id;
+/*  struct worldExitReq *req = (struct worldExitReq *)cookie; */
   MPID_Comm *comm = MPIR_Process.comm_world;
 
+  MPIU_THREAD_CS_ENTER(ALLFUNC,);
   ref_count = MPIDI_get_refcnt_of_world(world_id);
   TRACE_ERR("_mpi_world_exiting_handler: invoked for world %d exiting ref_count=%d my comm_word_size=%d\n", world_id, ref_count, world_size);
   if(ref_count == 0) {
@@ -300,7 +299,7 @@ int _mpi_world_exiting_handler_wrapper(pami_context_t context, void *cookie)
         MPIDI_OpState_reset(taskid_list[i]);
 	MPIDI_IpState_reset(taskid_list[i]);
 	TRACE_ERR("PAMI_Purge on taskid_list[%d]=%d\n", i,taskid_list[i]);
-        PAMI_Purge(context, &dest, 1);
+        PAMI_Purge(MPIDI_Context[0], &dest, 1);
       }
       MPIDI_delete_conn_record(world_id);
     }
@@ -313,6 +312,7 @@ int _mpi_world_exiting_handler_wrapper(pami_context_t context, void *cookie)
 
   TRACE_ERR("_mpi_world_exiting_handler: Out of _mpi_reduce_for_dyntask for exiting world %d reduce_state=%d\n",world_id, reduce_state);
 
+  MPIU_THREAD_CS_EXIT(ALLFUNC,);
   if(comm->rank == 0) {
     MPIU_Snprintf(world_id_str, sizeof(world_id_str), "%d", world_id);
     PMI2_Abort(0, world_id_str);
@@ -329,28 +329,11 @@ int _mpi_world_exiting_handler_wrapper(pami_context_t context, void *cookie)
     rc = -2;
   }
 
-  if(cookie) MPIU_Free(cookie);
+/*  if(cookie) MPIU_Free(cookie);*/
   return PAMI_SUCCESS;
 }
 
 
-int _mpi_world_exiting_handler(int world_id)
-{
-    struct worldExitReq *req;
-    req = MPIU_Malloc(sizeof(struct worldExitReq));
-    req->world_id = world_id;
-
-    if(MPIDI_Context[0]) {
-      if(!mpidi_finalized)
-        PAMI_Context_post(MPIDI_Context[0], &(req->work), _mpi_world_exiting_handler_wrapper, req);
-      else
-        _mpi_world_exiting_handler_wrapper(MPIDI_Context[0], req);
-    }
-
-    return MPI_SUCCESS;
-}
-
-
 int getchildren(int iam, double alpha,int gsize, int *children,
                 int *blocks, int *numchildren, int *parent)
 {
@@ -418,14 +401,17 @@ int _mpi_reduce_for_dyntask(int *sendbuf, int *recvbuf)
   {
     remaining_child_count = i;
     child_rank = (children[i])% TASKS;
-    mpi_errno = MPIC_Recv(recvbuf, sizeof(int),MPI_BYTE, pg_world->vct[child_rank].taskid, tag, comm_ptr->handle, MPI_STATUS_IGNORE);
+    TRACE_ERR("_mpi_reduce_for_dyntask - recv from child_rank%d child_taskid=%d\n", child_rank, pg_world->vct[child_rank].taskid);
+    mpi_errno = MPIC_Recv(recvbuf, sizeof(int),MPI_BYTE, child_rank, tag, comm_ptr->handle, MPI_STATUS_IGNORE);
+    TRACE_ERR("_mpi_reduce_for_dyntask - recv DONE from child_rank%d child_taskid=%d\n", child_rank, pg_world->vct[child_rank].taskid);
 
     if(world_rank != parent)
     {
       if(remaining_child_count == 0) {
         parent_rank = (parent) % TASKS;
         result += *recvbuf;
-        MPIC_Send(&result, sizeof(int), MPI_BYTE, pg_world->vct[parent_rank].taskid, tag, comm_ptr->handle);
+        TRACE_ERR("_mpi_reduce_for_dyntask - send to parent_rank=%d parent taskid=%d \n", parent_rank, pg_world->vct[parent_rank].taskid);
+        MPIC_Send(&result, sizeof(int), MPI_BYTE, parent_rank, tag, comm_ptr->handle);
       }
       else
       {
@@ -440,7 +426,8 @@ int _mpi_reduce_for_dyntask(int *sendbuf, int *recvbuf)
 
   if(world_rank != parent && numchildren == 0) {
     parent_rank = (parent) % TASKS;
-    MPIC_Send(sendbuf, sizeof(int), MPI_BYTE, pg_world->vct[parent_rank].taskid, tag, comm_ptr->handle);
+    TRACE_ERR("_mpi_reduce_for_dyntask - send to parent_rank=%d parent_task_id=%d\n", parent_rank, pg_world->vct[parent_rank].taskid);
+    MPIC_Send(sendbuf, sizeof(int), MPI_BYTE, parent_rank, tag, comm_ptr->handle);
   }
 
   if(world_rank == 0) {

-----------------------------------------------------------------------

Summary of changes:
 src/mpid/pamid/src/dyntask/mpid_comm_disconnect.c |   10 +----
 src/mpid/pamid/src/dyntask/mpidi_pg.c             |    7 ++-
 src/mpid/pamid/src/mpid_init.c                    |    5 +-
 src/pmi/pmi2/poe/poe2pmi.c                        |   43 +++++++-------------
 4 files changed, 25 insertions(+), 40 deletions(-)


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list