[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.0.4-165-g5c4f5de
mysql vizuser
noreply at mpich.org
Wed May 8 09:59:40 CDT 2013
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".
The branch, master, has been updated
via 5c4f5de7563db2bee452a102614c53b8ed9583ed (commit)
from 07de139fa7b3cba2894ddf61cc73854384f3e50f (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/5c4f5de7563db2bee452a102614c53b8ed9583ed
commit 5c4f5de7563db2bee452a102614c53b8ed9583ed
Author: Haizhu Liu <haizhu at us.ibm.com>
Date: Fri Mar 22 21:09:05 2013 -0400
Fix seg fault in _mpi_world_exiting_handler, fix hang at MPI_Finalize, and remove the call to PAMI_Context_advance in dispatch.
(ibm) D189340
(ibm) 7c57f7ed7a8aa1f0156ba5112664a7c2cd35f227
Signed-off-by: Bob Cernohous <bobc at us.ibm.com>
diff --git a/src/mpid/pamid/src/dyntask/mpid_comm_disconnect.c b/src/mpid/pamid/src/dyntask/mpid_comm_disconnect.c
index 094ca0f..3e5378e 100644
--- a/src/mpid/pamid/src/dyntask/mpid_comm_disconnect.c
+++ b/src/mpid/pamid/src/dyntask/mpid_comm_disconnect.c
@@ -137,29 +137,23 @@ void MPIDI_wait_for_AM(long long tranid, int expected_AM, int whichAM)
double starttime, currtime, elapsetime;
int rc, curr_AMcntr;
- MPIU_THREAD_CS_EXIT(ALLFUNC,);
rc = PAMI_Context_advance(MPIDI_Context[0], (size_t)100);
- MPIU_THREAD_CS_ENTER(ALLFUNC,);
if(whichAM == LAST_AM) {
CURTIME(starttime)
do {
CURTIME(currtime)
elapsetime = currtime - starttime;
- MPIU_THREAD_CS_EXIT(ALLFUNC,);
rc = PAMI_Context_advance(MPIDI_Context[0], (size_t)100);
- MPIU_THREAD_CS_ENTER(ALLFUNC,);
curr_AMcntr = MPIDI_get_AM_cntr_for_tranid(tranid, whichAM);
- TRACE_ERR("_try_to_disconnect: Looping in timer for TranID %lld, whichAM %d expected_AM = %d, Current AM = %d\n",tranid,whichAM,expected_AM,curr_AMcntr);
+ /*TRACE_ERR("_try_to_disconnect: Looping in timer for TranID %lld, whichAM %d expected_AM = %d, Current AM = %d\n",tranid,whichAM,expected_AM,curr_AMcntr); */
}while(curr_AMcntr != expected_AM && elapsetime < DISCONNECT_LAPI_XFER_TIMEOUT);
}
else {
do {
- MPIU_THREAD_CS_EXIT(ALLFUNC,);
rc = PAMI_Context_advance(MPIDI_Context[0], (size_t)100);
- MPIU_THREAD_CS_ENTER(ALLFUNC,);
curr_AMcntr = MPIDI_get_AM_cntr_for_tranid(tranid, whichAM);
- TRACE_ERR("_try_to_disconnect: Looping in timer for TranID %lld, whichAM %d expected_AM = %d, Current AM = %d\n",tranid,whichAM,expected_AM,curr_AMcntr);
+ /*TRACE_ERR("_try_to_disconnect: Looping in timer for TranID %lld, whichAM %d expected_AM = %d, Current AM = %d\n",tranid,whichAM,expected_AM,curr_AMcntr);*/
}while(curr_AMcntr != expected_AM);
}
}
diff --git a/src/mpid/pamid/src/dyntask/mpidi_pg.c b/src/mpid/pamid/src/dyntask/mpidi_pg.c
index d588882..f59890d 100644
--- a/src/mpid/pamid/src/dyntask/mpidi_pg.c
+++ b/src/mpid/pamid/src/dyntask/mpidi_pg.c
@@ -170,9 +170,9 @@ int MPIDI_PG_Finalize(void)
MPIU_Free(root_wid_barray); /* root_wid_barray is now NULL for non-root */
-
+#if 0
pthread_create(&finalize_req_thread, NULL, mpidi_finalize_req, NULL);
- MPIU_THREAD_CS_EXIT(ALLFUNC,);
+ /*MPIU_THREAD_CS_EXIT(ALLFUNC,); */
while (mpidi_sync_done !=1) {
mpi_errno=PAMI_Context_advance(MPIDI_Context[0], 1000);
if (mpi_errno == PAMI_EAGAIN) {
@@ -183,6 +183,9 @@ int MPIDI_PG_Finalize(void)
if (mpi_errno = pthread_join(finalize_req_thread, NULL) ) {
TRACE_ERR("error returned from pthread_join() mpi_errno=%d\n",mpi_errno);
}
+#endif
+ MPIU_THREAD_CS_EXIT(ALLFUNC,);
+ PMI2_Finalize();
MPIU_THREAD_CS_ENTER(ALLFUNC,);
if(_conn_info_list) {
diff --git a/src/mpid/pamid/src/mpid_init.c b/src/mpid/pamid/src/mpid_init.c
index 022a2fd..4f85bd6 100644
--- a/src/mpid/pamid/src/mpid_init.c
+++ b/src/mpid/pamid/src/mpid_init.c
@@ -33,6 +33,8 @@
#define MAX_JOBID_LEN 1024
int world_rank;
int world_size;
+extern int (*mp_world_exiting_handler)(int);
+extern int _mpi_world_exiting_handler(int);
#endif
int mpidi_dynamic_tasking = 0;
@@ -1009,10 +1011,8 @@ MPIDI_VCRT_init(int rank, int size, char *world_tasks, MPIDI_PG_t *pg)
world_tasks_save = MPIU_Strdup(world_tasks);
if(world_tasks != NULL) {
comm->vcr[0]->taskid = atoi(strtok(world_tasks, ":"));
- TRACE_ERR("comm->vcr[0]->taskid =%d\n", comm->vcr[0]->taskid);
while( (cp=strtok(NULL, ":")) != NULL) {
comm->vcr[++i]->taskid= atoi(cp);
- TRACE_ERR("comm->vcr[i]->taskid =%d\n", comm->vcr[i]->taskid);
}
}
MPIU_Free(world_tasks_save);
@@ -1228,6 +1228,7 @@ int MPID_Init(int * argc,
/* FIXME: Check that this intercommunicator gets freed in MPI_Finalize
if not already freed. */
}
+ mp_world_exiting_handler = &(_mpi_world_exiting_handler);
#endif
/* ------------------------------- */
/* Initialize timer data */
diff --git a/src/pmi/pmi2/poe/poe2pmi.c b/src/pmi/pmi2/poe/poe2pmi.c
index e37563e..4fa1fa0 100644
--- a/src/pmi/pmi2/poe/poe2pmi.c
+++ b/src/pmi/pmi2/poe/poe2pmi.c
@@ -69,8 +69,7 @@ static MPID_Thread_cond_t cond;
#endif
extern int mpidi_finalized;
-extern int (*mp_world_exiting_handler)(int);
-extern int _mpi_world_exiting_handler(int);
+int _mpi_world_exiting_handler(int);
void *poeptr = NULL;
@@ -98,7 +97,7 @@ int PMI2_Init(int *spawned, int *size, int *rank, int *appnum)
}
ret = (*pmi2_init)(spawned, size, rank, appnum);
- mp_world_exiting_handler = &(_mpi_world_exiting_handler);
+ /*mp_world_exiting_handler = &(_mpi_world_exiting_handler); */
return ret;
}
@@ -271,7 +270,7 @@ int PMI2_Info_GetJobAttr(const char name[], char value[], int valuelen, int *fla
* This is the mpi level of callback that get invoked when a task get notified
* of a world's exiting
*/
-int _mpi_world_exiting_handler_wrapper(pami_context_t context, void *cookie)
+int _mpi_world_exiting_handler(int world_id)
{
/* check the reference count associated with that remote world
if the reference count is zero, the task will call LAPI_Purge_totask on
@@ -286,10 +285,10 @@ int _mpi_world_exiting_handler_wrapper(pami_context_t context, void *cookie)
char world_id_str[32];
int mpi_errno = MPI_SUCCESS;
pami_endpoint_t dest;
- struct worldExitReq *req = (struct worldExitReq *)cookie;
- int world_id = req->world_id;
+/* struct worldExitReq *req = (struct worldExitReq *)cookie; */
MPID_Comm *comm = MPIR_Process.comm_world;
+ MPIU_THREAD_CS_ENTER(ALLFUNC,);
ref_count = MPIDI_get_refcnt_of_world(world_id);
TRACE_ERR("_mpi_world_exiting_handler: invoked for world %d exiting ref_count=%d my comm_word_size=%d\n", world_id, ref_count, world_size);
if(ref_count == 0) {
@@ -300,7 +299,7 @@ int _mpi_world_exiting_handler_wrapper(pami_context_t context, void *cookie)
MPIDI_OpState_reset(taskid_list[i]);
MPIDI_IpState_reset(taskid_list[i]);
TRACE_ERR("PAMI_Purge on taskid_list[%d]=%d\n", i,taskid_list[i]);
- PAMI_Purge(context, &dest, 1);
+ PAMI_Purge(MPIDI_Context[0], &dest, 1);
}
MPIDI_delete_conn_record(world_id);
}
@@ -313,6 +312,7 @@ int _mpi_world_exiting_handler_wrapper(pami_context_t context, void *cookie)
TRACE_ERR("_mpi_world_exiting_handler: Out of _mpi_reduce_for_dyntask for exiting world %d reduce_state=%d\n",world_id, reduce_state);
+ MPIU_THREAD_CS_EXIT(ALLFUNC,);
if(comm->rank == 0) {
MPIU_Snprintf(world_id_str, sizeof(world_id_str), "%d", world_id);
PMI2_Abort(0, world_id_str);
@@ -329,28 +329,11 @@ int _mpi_world_exiting_handler_wrapper(pami_context_t context, void *cookie)
rc = -2;
}
- if(cookie) MPIU_Free(cookie);
+/* if(cookie) MPIU_Free(cookie);*/
return PAMI_SUCCESS;
}
-int _mpi_world_exiting_handler(int world_id)
-{
- struct worldExitReq *req;
- req = MPIU_Malloc(sizeof(struct worldExitReq));
- req->world_id = world_id;
-
- if(MPIDI_Context[0]) {
- if(!mpidi_finalized)
- PAMI_Context_post(MPIDI_Context[0], &(req->work), _mpi_world_exiting_handler_wrapper, req);
- else
- _mpi_world_exiting_handler_wrapper(MPIDI_Context[0], req);
- }
-
- return MPI_SUCCESS;
-}
-
-
int getchildren(int iam, double alpha,int gsize, int *children,
int *blocks, int *numchildren, int *parent)
{
@@ -418,14 +401,17 @@ int _mpi_reduce_for_dyntask(int *sendbuf, int *recvbuf)
{
remaining_child_count = i;
child_rank = (children[i])% TASKS;
- mpi_errno = MPIC_Recv(recvbuf, sizeof(int),MPI_BYTE, pg_world->vct[child_rank].taskid, tag, comm_ptr->handle, MPI_STATUS_IGNORE);
+ TRACE_ERR("_mpi_reduce_for_dyntask - recv from child_rank%d child_taskid=%d\n", child_rank, pg_world->vct[child_rank].taskid);
+ mpi_errno = MPIC_Recv(recvbuf, sizeof(int),MPI_BYTE, child_rank, tag, comm_ptr->handle, MPI_STATUS_IGNORE);
+ TRACE_ERR("_mpi_reduce_for_dyntask - recv DONE from child_rank%d child_taskid=%d\n", child_rank, pg_world->vct[child_rank].taskid);
if(world_rank != parent)
{
if(remaining_child_count == 0) {
parent_rank = (parent) % TASKS;
result += *recvbuf;
- MPIC_Send(&result, sizeof(int), MPI_BYTE, pg_world->vct[parent_rank].taskid, tag, comm_ptr->handle);
+ TRACE_ERR("_mpi_reduce_for_dyntask - send to parent_rank=%d parent taskid=%d \n", parent_rank, pg_world->vct[parent_rank].taskid);
+ MPIC_Send(&result, sizeof(int), MPI_BYTE, parent_rank, tag, comm_ptr->handle);
}
else
{
@@ -440,7 +426,8 @@ int _mpi_reduce_for_dyntask(int *sendbuf, int *recvbuf)
if(world_rank != parent && numchildren == 0) {
parent_rank = (parent) % TASKS;
- MPIC_Send(sendbuf, sizeof(int), MPI_BYTE, pg_world->vct[parent_rank].taskid, tag, comm_ptr->handle);
+ TRACE_ERR("_mpi_reduce_for_dyntask - send to parent_rank=%d parent_task_id=%d\n", parent_rank, pg_world->vct[parent_rank].taskid);
+ MPIC_Send(sendbuf, sizeof(int), MPI_BYTE, parent_rank, tag, comm_ptr->handle);
}
if(world_rank == 0) {
-----------------------------------------------------------------------
Summary of changes:
src/mpid/pamid/src/dyntask/mpid_comm_disconnect.c | 10 +----
src/mpid/pamid/src/dyntask/mpidi_pg.c | 7 ++-
src/mpid/pamid/src/mpid_init.c | 5 +-
src/pmi/pmi2/poe/poe2pmi.c | 43 +++++++-------------
4 files changed, 25 insertions(+), 40 deletions(-)
hooks/post-receive
--
MPICH primary repository
More information about the commits
mailing list