[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.2-449-gaeb915d

Service Account noreply at mpich.org
Thu Sep 15 13:43:36 CDT 2016


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  aeb915d93e8f55c83b9a426e538bebc32ff6b682 (commit)
       via  320b25fddd6783ffe4acf447608a7bd48bad572b (commit)
      from  80a0ceb46d8bf80d9c2d7e5dc4ed73a3dddf852d (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/aeb915d93e8f55c83b9a426e538bebc32ff6b682

commit aeb915d93e8f55c83b9a426e538bebc32ff6b682
Author: Lena Oden <loden at anl.gov>
Date:   Wed Aug 24 21:13:01 2016 +0000

    CH4/UCX: split Key to avoid KVS overflow
    
    This patch makes sure that even large UCX addresses are transferred
    using KVS. In this case, the address is split up and saved in pieces.
    
    Signed-off-by: Tomislav Janjusic <tomislavj at mellanox.com>

diff --git a/src/mpid/ch4/netmod/ucx/ucx_init.h b/src/mpid/ch4/netmod/ucx/ucx_init.h
index 343d867..3222b86 100644
--- a/src/mpid/ch4/netmod/ucx/ucx_init.h
+++ b/src/mpid/ch4/netmod/ucx/ucx_init.h
@@ -31,13 +31,23 @@ static inline int MPIDI_NM_mpi_init_hook(int rank,
     ucp_config_t *config;
     ucs_status_t ucx_status;
     uint64_t features = 0;
-    char valS[MPIDI_UCX_KVSAPPSTRLEN], *val;
-    char keyS[MPIDI_UCX_KVSAPPSTRLEN];
-    char remote_addr[MPIDI_UCX_KVSAPPSTRLEN];
-    size_t maxlen = MPIDI_UCX_KVSAPPSTRLEN;
-    //   char *table = NULL;
+    int status;
+    int val_max_sz, key_max_sz;
+    char *valS, *val;
+    char *keyS;
+    char *remote_addr;
+    size_t maxlen;
+    int string_addr_len;
+    int max_string;
     int i;
     ucp_params_t ucp_params;
+    int avtid = 0, max_n_avts;
+
+    int p;
+    int addr_size = 0;
+    char *string_addr;
+    MPIR_CHKLMEM_DECL(4);
+
 
     MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_INIT);
     MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_INIT);
@@ -62,54 +72,140 @@ static inline int MPIDI_NM_mpi_init_hook(int rank,
         ucp_worker_get_address(MPIDI_UCX_global.worker, &MPIDI_UCX_global.if_address,
                                &MPIDI_UCX_global.addrname_len);
     MPIDI_UCX_CHK_STATUS(ucx_status);
+#ifdef USE_PMI2_API
+        val_max_sz = PMI2_MAX_VALLEN;
+        key_max_sz = PMI2_MAX_KEYLEN;
+#else
+        pmi_errno = PMI_KVS_Get_value_length_max(&val_max_sz);
+        MPIDI_UCX_PMI_ERROR(pmi_errno);
 
+        pmi_errno = PMI_KVS_Get_key_length_max(&key_max_sz);
+        MPIDI_UCX_PMI_ERROR(pmi_errno);
 
-    val = valS;
-    str_errno =
-        MPL_str_add_binary_arg(&val, (int *) &maxlen, "UCX", (char *) MPIDI_UCX_global.if_address,
-                               (int) MPIDI_UCX_global.addrname_len);
-    MPIDI_UCX_global.max_addr_len = MPIDI_UCX_global.addrname_len;
-    /* MPIDI_CH4_UCX_STR_ERRCHK(str_errno, buscard_len); */
-    pmi_errno = PMI_KVS_Get_my_name(MPIDI_UCX_global.kvsname, MPIDI_UCX_KVSAPPSTRLEN);
+#endif
+
+ /*we have to reduce the value - the total size of an PMI string is 1024, so command+value+key
+  * assume 100 characters for the command to be save */
+    val_max_sz = val_max_sz - key_max_sz -100;
+
+
+    MPIR_CHKLMEM_MALLOC(valS, char *, val_max_sz, mpi_errno, "valS");
+/* In UCX we have the problem that the key size (as a string) van be larger than val_max_sz.
+ * We create a string from the key - but we don't know the size that this string will have
+ * So we guess the size - based on the worker address size. The decoding uses the hex-representation
+ * of the binary. So we need 2 bytes per byte. Add some extra bytes for the "key".
+ */
+
+    max_string =  MPIDI_UCX_global.addrname_len * 2 + 128;
+    MPIR_CHKLMEM_MALLOC(keyS, char *, key_max_sz, mpi_errno, "keyS");
+    MPIR_CHKLMEM_MALLOC(string_addr, char *, max_string, mpi_errno, "string_addr");
+    MPIR_CHKLMEM_MALLOC(remote_addr, char *, max_string, mpi_errno, "remote_addr");
+
+    maxlen = max_string;
+    val = string_addr;
 
+    str_errno = MPL_str_add_binary_arg(&val, (int *) &maxlen, "U", (char *) MPIDI_UCX_global.if_address,
+                              (int) MPIDI_UCX_global.addrname_len);
+
+   /*todo: fallback if buffer is to small */
+    MPIDI_UCX_STR_ERRCHK(str_errno);
+
+    string_addr_len =  max_string - maxlen;
+    pmi_errno = PMI_KVS_Get_my_name(MPIDI_UCX_global.kvsname, val_max_sz);
+    val = valS;
+   /* I first commit my worker-address size */
+    maxlen = val_max_sz;
+    sprintf(keyS, "Ksize-%d", rank);
+    MPL_str_add_int_arg(&val, (int*) &maxlen, "K", string_addr_len); 
     val = valS;
-    sprintf(keyS, "UCX-%d", rank);
     pmi_errno = PMI_KVS_Put(MPIDI_UCX_global.kvsname, keyS, val);
     MPIDI_UCX_PMI_ERROR(pmi_errno);
     pmi_errno = PMI_KVS_Commit(MPIDI_UCX_global.kvsname);
     MPIDI_UCX_PMI_ERROR(pmi_errno);
+/* now we have to commit the key. However, if the size is larger than the val_max_sz,
+ * we have tho spilt it up That's ugly, but badluck */
+
+    if(string_addr_len  < val_max_sz) {
+        val = string_addr;
+        sprintf(keyS, "UCX-%d", rank);
+        pmi_errno = PMI_KVS_Put(MPIDI_UCX_global.kvsname, keyS, val);
+        MPIDI_UCX_PMI_ERROR(pmi_errno);
+        pmi_errno = PMI_KVS_Commit(MPIDI_UCX_global.kvsname);
+    }
+    else{
+        p = 0;
+        while (p<string_addr_len) {
+            val=valS;
+            MPL_snprintf(val,val_max_sz ,"%s",string_addr+p);
+            val=valS;
+            sprintf(keyS, "UCX-%d-%d", rank,p);
+            pmi_errno = PMI_KVS_Put(MPIDI_UCX_global.kvsname, keyS, val);
+            MPIDI_UCX_PMI_ERROR(pmi_errno);
+            pmi_errno = PMI_KVS_Commit(MPIDI_UCX_global.kvsname);
+            MPIDI_UCX_PMI_ERROR(pmi_errno);
+            p += val_max_sz-1 ;
+        }
+    }
+
+    val = valS;
+    MPIDI_UCX_global.max_addr_len = MPIDI_UCX_global.addrname_len;
+
     pmi_errno = PMI_Barrier();
     MPIDI_UCX_PMI_ERROR(pmi_errno);
 
-    ///table = MPL_malloc(size * MPIDI_UCX_NAME_LEN);
-    MPIDI_UCX_global.pmi_addr_table = NULL;
-//    memset(table,0x0, MPIDI_UCX_NAME_LEN*size);
+    /* Set to NULL now, only created if required in MPI_Intercomm_create*/
 
-    maxlen = MPIDI_UCX_KVSAPPSTRLEN;
+    MPIDI_UCX_global.pmi_addr_table = NULL;
+    maxlen = val_max_sz -1;
 
     for (i = 0; i < size; i++) {
-        sprintf(keyS, "UCX-%d", i);
-        pmi_errno = PMI_KVS_Get(MPIDI_UCX_global.kvsname, keyS, valS, MPIDI_UCX_KVSAPPSTRLEN);
-        MPIDI_UCX_PMI_ERROR(pmi_errno);
-        str_errno = MPL_str_get_binary_arg(valS, "UCX", remote_addr,
-                                           (int) MPIDI_UCX_KVSAPPSTRLEN, (int *) &maxlen);
-        if (maxlen > MPIDI_UCX_global.max_addr_len)
-            MPIDI_UCX_global.max_addr_len = maxlen;
+        /*first get the size */
+        sprintf(keyS, "Ksize-%d", i);
+        pmi_errno = PMI_KVS_Get(MPIDI_UCX_global.kvsname, keyS, val, val_max_sz);
+        str_errno = MPL_str_get_int_arg(val, "K", &string_addr_len);
+
+        if(string_addr_len< val_max_sz) {
+            val = string_addr;
+            sprintf(keyS, "UCX-%d", i);
+            pmi_errno = PMI_KVS_Get(MPIDI_UCX_global.kvsname, keyS, val, val_max_sz);
+            MPIDI_UCX_PMI_ERROR(pmi_errno);
+            str_errno = MPL_str_get_binary_arg(string_addr, "U", remote_addr,
+                    (int)max_string, (int *) &addr_size);
+
+            MPIDI_UCX_STR_ERRCHK(str_errno);
+        }
+        else{
+            /* first catch the string together*/
+            p = 0;
+            while(p < string_addr_len) {
+                val = string_addr+p;
+                sprintf(keyS, "UCX-%d-%d", i,p);
+                pmi_errno = PMI_KVS_Get(MPIDI_UCX_global.kvsname, keyS, val, val_max_sz);
+                p+=val_max_sz-1;
+            }
+            str_errno = MPL_str_get_binary_arg(string_addr, "U", remote_addr,
+                    (int)max_string, (int *) &addr_size);
+            MPIDI_UCX_STR_ERRCHK(str_errno);
+
+        }
+
+        if(addr_size > MPIDI_UCX_global.max_addr_len)
+            MPIDI_UCX_global.max_addr_len = addr_size;
 
         ucx_status = ucp_ep_create(MPIDI_UCX_global.worker,
-                                   (ucp_address_t *) remote_addr,
-                                   &MPIDI_UCX_AV(&MPIDIU_get_av(0, i)).dest);
+                (ucp_address_t *) remote_addr,
+                &MPIDI_UCX_AV(&MPIDIU_get_av(0, i)).dest);
 
         MPIDI_UCX_CHK_STATUS(ucx_status);
-        memset(remote_addr, 0x0, maxlen);
     }
 
     MPIDI_CH4U_mpi_init(comm_world, comm_self, num_contexts, netmod_contexts);
 
     mpi_errno = MPIR_Datatype_init_names();
-    MPIDI_CH4_UCX_MPI_ERROR(mpi_errno);
+    MPIDI_UCX_MPI_ERROR(mpi_errno);
 
   fn_exit:
+    MPIR_CHKLMEM_FREEALL();
     MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_EXIT);
     return mpi_errno;
   fn_fail:
@@ -181,28 +277,80 @@ static inline int MPIDI_NM_comm_get_lpid(MPIR_Comm * comm_ptr,
 
 }
 
-static inline int allocate_address_table()
+static inline int MPIDI_NMI_allocate_address_table()
 {
 
-    char keyS[MPIDI_UCX_KVSAPPSTRLEN];
-    char valS[MPIDI_UCX_KVSAPPSTRLEN];
+    char *keyS;
+    char *valS, *val;
+    int mpi_errno = MPI_SUCCESS, pmi_errno, str_errno;
     int len = MPIDI_UCX_global.max_addr_len;
     int i;
     int size, maxlen = 1;
+    int key_max_sz, val_max_sz;
+    int string_addr_len;
+    int max_string;
+    char *string_addr;
+    int addr_size;
+    int p;
+    MPIR_CHKLMEM_DECL(3);
+#ifdef USE_PMI2_API
+    val_max_sz = PMI2_MAX_VALLEN;
+    key_max_sz = PMI2_MAX_KEYLEN;
+#else
+    pmi_errno = PMI_KVS_Get_value_length_max(&val_max_sz);
+    MPIDI_UCX_PMI_ERROR(pmi_errno);
+    pmi_errno = PMI_KVS_Get_key_length_max(&key_max_sz);
+    MPIDI_UCX_PMI_ERROR(pmi_errno);
+#endif
+
+    val_max_sz = val_max_sz - 100 - key_max_sz; /*see comment at init */
+
+    max_string = len * 2 + 128;
+    MPIR_CHKLMEM_MALLOC(valS, char *, val_max_sz, mpi_errno, "valS");
+    MPIR_CHKLMEM_MALLOC(keyS, char *, key_max_sz, mpi_errno, "keyS");
+    MPIR_CHKLMEM_MALLOC(string_addr, char *, max_string, mpi_errno, "string_addr");
+    val  = valS;
+
     size = MPIR_Process.comm_world->local_size;
     MPIDI_UCX_global.pmi_addr_table = MPL_malloc(size * len);
     memset(MPIDI_UCX_global.pmi_addr_table, 0x0, len * size);
 
-
     for (i = 0; i < size; i++) {
-        sprintf(keyS, "UCX-%d", i);
-        pmi_errno = PMI_KVS_Get(MPIDI_UCX_global.kvsname, keyS, valS, MPIDI_UCX_KVSAPPSTRLEN);
+        /*first get the size */
+        sprintf(keyS, "Ksize-%d", i);
+        pmi_errno = PMI_KVS_Get(MPIDI_UCX_global.kvsname, keyS, val, val_max_sz);
         MPIDI_UCX_PMI_ERROR(pmi_errno);
-        MPL_str_get_binary_arg(valS, "UCX", &MPIDI_UCX_global.pmi_addr_table[len * i],
-                               (int) len, (int *) &maxlen);
-    }
+                str_errno = MPL_str_get_int_arg(val, "K", &string_addr_len);
+                if(string_addr_len < val_max_sz) {
+                val = string_addr;
+                sprintf(keyS, "UCX-%d", i);
+                pmi_errno = PMI_KVS_Get(MPIDI_UCX_global.kvsname, keyS, val, val_max_sz);
+                MPIDI_UCX_PMI_ERROR(pmi_errno);
+                str_errno = MPL_str_get_binary_arg(string_addr, "U", &MPIDI_UCX_global.pmi_addr_table[i * len]  ,
+                    (int)max_string, (int *) &addr_size);
+                MPIDI_UCX_STR_ERRCHK(str_errno);
+                }
+                else{
+                /* first catch the string together*/
+                p = 0;
+                while(p < string_addr_len) {
+                val = string_addr+p;
+                sprintf(keyS, "UCX-%d-%d", i,p);
+                pmi_errno = PMI_KVS_Get(MPIDI_UCX_global.kvsname, keyS, val, val_max_sz);
+                MPIDI_UCX_PMI_ERROR(pmi_errno);
+                p+=val_max_sz-1;
+                }
+                str_errno = MPL_str_get_binary_arg(string_addr, "U", &MPIDI_UCX_global.pmi_addr_table[i * len],
+                        (int)max_string, (int *) &addr_size);
+                }
 
+    }
 
+fn_exit:
+    MPIR_CHKLMEM_FREEALL();
+    return mpi_errno;
+fn_fail:
+    goto fn_exit;
 }
 
 static inline int MPIDI_NM_gpid_get(MPIR_Comm * comm_ptr, int rank, MPIR_Gpid * gpid)
@@ -217,7 +365,7 @@ static inline int MPIDI_NM_gpid_get(MPIR_Comm * comm_ptr, int rank, MPIR_Gpid *
     MPIR_Assert(rank < comm_ptr->local_size);
 
     if (MPIDI_UCX_global.pmi_addr_table == NULL) {
-        allocate_address_table();
+        MPIDI_NMI_allocate_address_table();
     }
     memset(MPIDI_UCX_GPID(gpid).addr, 0, len);
     memcpy(MPIDI_UCX_GPID(gpid).addr, &MPIDI_UCX_global.pmi_addr_table[lpid * len], len);
@@ -251,7 +399,7 @@ static inline int MPIDI_NM_gpid_tolpidarray(int size, MPIR_Gpid gpid[], int lpid
     new_avt_procs = (int *) MPL_malloc(size * sizeof(int));
     max_n_avts = MPIDIU_get_max_n_avts();
     if (MPIDI_UCX_global.pmi_addr_table == NULL) {
-        allocate_address_table();
+        MPIDI_NMI_allocate_address_table();
     }
 
     for (i = 0; i < size; i++) {

http://git.mpich.org/mpich.git/commitdiff/320b25fddd6783ffe4acf447608a7bd48bad572b

commit 320b25fddd6783ffe4acf447608a7bd48bad572b
Author: Lena Oden <loden at anl.gov>
Date:   Wed Aug 17 22:44:24 2016 +0000

    CH4/UCX: Simplified but working error handling
    
    The error handling in CH4/UCX was not working so far.
    This patch simplifies it (reduces the number of error names),
    and it now works with the autogen scripts.

diff --git a/src/mpid/ch4/netmod/ucx/errnames.txt b/src/mpid/ch4/netmod/ucx/errnames.txt
index e6058f2..d7e4ba6 100644
--- a/src/mpid/ch4/netmod/ucx/errnames.txt
+++ b/src/mpid/ch4/netmod/ucx/errnames.txt
@@ -1,16 +1,10 @@
-**ucx_nm_read_config:ucx_read_config faild
-**ucx_nm_read_config %s %d %s %s: ucx_read_config faild(%s %d %s %s)
-**ucx_nm_init:ucx_init failed
-**ucx_nm_init %s %d %s %s: ucx_init failed (%s %d %s %s)
-**ucx_nm_worker_create:ucx_worker_create failed
-**ucx_nm_worker_create %s %d %s %s: ucx_worker_create failed (%s %d %s %s)
-**ucx_nm_ep_create:failed to create ucp_endpoint
-**ucx_nm_ep_create %s %d %s %s: failed to create ucp_endpoint (%s %d %s %s)
-**ucx_nm_get_worker_address:ucx failed to get worker address
-**ucx_nm_get_worker_address %s %d %s %s: ucx failed to get worker address (%s %s %s %s)
-**ucx_nm_tag_nb_send:Failed to start tag send in ucx
-**ucx_nm_tag_nb_send %s %d %s %s: Failed to start tag send in ucx (%s %d %s %s)
-**ucx_nm_tag_nb_recv:Failed to start tag recv in ucx
-**ucx_nm_tag_nb_recv %s %d %s %s: Failed to start tag recv in ucx (%s %d %s %s)
-**ucx_nm_other:Other error
-**ucx_nm_other %s %d %s %s: Other error (%s %d %s %s)
+**ucx_nm_status: ucx function returned with failed status
+**ucx_nm_status %s %d %s %s: ucx function returned with failed status(%s %d %s %s)
+**ucx_nm_pmi_error: pmi error in UCX netmod
+**ucx_nm_pmi_error %s %d %s %s: string error in UCX netmod(%s %d %s %s)
+**ucx_nm_str_error: string error in UCX netmod
+**ucx_nm_str_error %s %d %s %s: string error in UCX netmod(%s %d %s %s)
+**ucx_nm_rq_error: return failed request in UCX netmod
+**ucx_nm_rq_error %s %d %s %s: returned failed request in UCX netmod(%s %d %s %s)
+**ucx_nm_other:Other UCX error
+**ucx_nm_other %s %d %s %s: Other UCX error (%s %d %s %s)
diff --git a/src/mpid/ch4/netmod/ucx/ucx_am.h b/src/mpid/ch4/netmod/ucx/ucx_am.h
index d071891..2cd7315 100644
--- a/src/mpid/ch4/netmod/ucx/ucx_am.h
+++ b/src/mpid/ch4/netmod/ucx/ucx_am.h
@@ -79,6 +79,7 @@ static inline void MPIDI_UCX_inject_am_callback(void *request, ucs_status_t stat
 #define FUNCNAME MPIDI_NM_am_isend
 #undef FCNAME
 #define FCNAME MPL_QUOTE(FUNCNAME)
+
 static inline int MPIDI_NM_am_isend(int rank,
                                     MPIR_Comm * comm,
                                     int handler_id,
@@ -88,6 +89,7 @@ static inline int MPIDI_NM_am_isend(int rank,
                                     MPI_Count count,
                                     MPI_Datatype datatype, MPIR_Request * sreq,
                                     void *netmod_context)
+
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_UCX_ucp_request_t *ucp_request;
@@ -160,7 +162,8 @@ static inline int MPIDI_NM_am_isend(int rank,
                                                               data_sz + am_hdr_sz + sizeof(ucx_hdr),
                                                               ucp_dt_make_contig(1), ucx_tag,
                                                               &MPIDI_UCX_am_isend_callback);
-    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+    MPIDI_CH4_UCX_REQUEST(ucp_request);
+
     /* send is done. free all resources and complete the request */
     if (ucp_request == NULL) {
         MPL_free(send_buf);
@@ -283,7 +286,8 @@ static inline int MPIDI_NM_am_isend_reply(MPIR_Context_id_t context_id,
                                                                   sizeof(ucx_hdr),
                                                                   ucp_dt_make_contig(1), ucx_tag,
                                                                   &MPIDI_UCX_am_isend_callback);
-        MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+        MPIDI_CH4_UCX_REQUEST(ucp_request);
+
     }
 
     /* send is done. free all resources and complete the request */
@@ -353,7 +357,7 @@ static inline int MPIDI_NM_am_send_hdr(int rank,
                                                               am_hdr_sz + sizeof(ucx_hdr),
                                                               ucp_dt_make_contig(1), ucx_tag,
                                                               &MPIDI_UCX_inject_am_callback);
-    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+    MPIDI_CH4_UCX_REQUEST(ucp_request);
 
     if (ucp_request == NULL) {
         /* inject is done */
@@ -405,7 +409,7 @@ static inline int MPIDI_NM_am_send_hdr_reply(MPIR_Context_id_t context_id,
                                                               am_hdr_sz + sizeof(ucx_hdr),
                                                               ucp_dt_make_contig(1), ucx_tag,
                                                               &MPIDI_UCX_inject_am_callback);
-    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+    MPIDI_CH4_UCX_REQUEST(ucp_request);
 
     if (ucp_request == NULL) {
         /* inject is done */
diff --git a/src/mpid/ch4/netmod/ucx/ucx_impl.h b/src/mpid/ch4/netmod/ucx/ucx_impl.h
index 6f7200d..71bb82f 100644
--- a/src/mpid/ch4/netmod/ucx/ucx_impl.h
+++ b/src/mpid/ch4/netmod/ucx/ucx_impl.h
@@ -80,15 +80,14 @@ static inline int MPIDI_UCX_get_source(uint64_t match_bits)
 }
 
 
-#define MPIDI_UCX_ERR  MPIR_ERR_CHKANDJUMP4
 
-#define MPIDI_UCX_CHK_STATUS(STATUS,STR)                \
+#define MPIDI_UCX_CHK_STATUS(STATUS)                \
   do {								\
-    MPIDI_UCX_ERR((STATUS!=UCS_OK && STATUS!=UCS_INPROGRESS),\
+    MPIR_ERR_CHKANDJUMP4((STATUS!=UCS_OK && STATUS!=UCS_INPROGRESS),\
 			  mpi_errno,				\
 			  MPI_ERR_OTHER,			\
-			  "**ch4_ucx_nm_"#STR,                  \
-			  "**ch4_ucx_nm_"#STR" %s %d %s %s",    \
+			  "**ucx_nm_status",                  \
+			  "**ucx_nm_status %s %d %s %s",    \
 			  __SHORT_FILE__,			\
 			  __LINE__,				\
 			  FCNAME,				\
@@ -96,50 +95,49 @@ static inline int MPIDI_UCX_get_source(uint64_t match_bits)
     } while (0)
 
 
-
-#define MPIDI_UCX_PMI_ERROR(_errno,STR)				\
+#define MPIDI_UCX_PMI_ERROR(_errno)				\
   do									\
     {									\
-      MPIDI_UCX_ERR(_errno!=PMI_SUCCESS,			\
+       MPIR_ERR_CHKANDJUMP4(_errno!=PMI_SUCCESS,			\
 			    mpi_errno,					\
 			    MPI_ERR_OTHER,				\
-			    "**ch4_ucx_nm_pmi"#STR,			\
-			    "**ch4_ucx_nm_mpi"#STR" %s %d %s %s",	\
+			    "**ucx_nm_pmi_error",			\
+			    "**ucx_nm_pmi_error %s %d %s %s",	\
 			    __SHORT_FILE__,				\
 			    __LINE__,					\
 			    FCNAME,					\
-			    #STR);					\
+			    "pmi_error");					\
     } while (0)
 
-#define MPIDI_CH4_UCX_MPI_ERROR(_errno)				     \
+#define MPIDI_UCX_MPI_ERROR(_errno)				     \
   do								     \
     {								     \
       if (unlikely(_errno!=MPI_SUCCESS)) MPIR_ERR_POP(mpi_errno);    \
     } while (0)
 
-#define MPIDI_CH4_UCX_STR_ERRCHK(_errno,STR)				\
+#define MPIDI_UCX_STR_ERRCHK(_errno)				\
   do									\
     {									\
-      MPIDI_UCX_ERR(_errno!=MPL_STR_SUCCESS,			\
+       MPIR_ERR_CHKANDJUMP4(_errno!=MPL_STR_SUCCESS,			\
 			    mpi_errno,					\
 			    MPI_ERR_OTHER,				\
-			    "**ch4_ucx_nm_"#STR,			\
-			    "**ch4_ucx_nm_"#STR" %s %d %s %s",		\
+			    "**ucx_nm_str_error",			\
+			    "**ucx_nm_str_error %s %d %s %s",		\
 			    __SHORT_FILE__,				\
 			    __LINE__,					\
 			    FCNAME,					\
-			    #STR);					\
+			    "strng_error");					\
     } while (0)
 
 
 
-#define MPIDI_CH4_UCX_REQUEST(_req, STR)				\
+#define MPIDI_CH4_UCX_REQUEST(_req)				\
   do {									\
-    MPIDI_UCX_ERR(UCS_PTR_IS_ERR(_req),				\
+   MPIR_ERR_CHKANDJUMP4(UCS_PTR_IS_ERR(_req),				\
 			  mpi_errno,					\
 			  MPI_ERR_OTHER,				\
-			  "**ch4_ucx_nm_"#STR,				\
-			  "**ch4_ucx_nm_"#STR" %s %d %s %s",		\
+			  "**ucx_nm_rq_error",				\
+			  "**ucx_nm_rq_error %s %d %s %s",		\
 			  __SHORT_FILE__,				\
 			  __LINE__,					\
 			  FCNAME,					\
diff --git a/src/mpid/ch4/netmod/ucx/ucx_init.h b/src/mpid/ch4/netmod/ucx/ucx_init.h
index 2e523ea..343d867 100644
--- a/src/mpid/ch4/netmod/ucx/ucx_init.h
+++ b/src/mpid/ch4/netmod/ucx/ucx_init.h
@@ -43,7 +43,7 @@ static inline int MPIDI_NM_mpi_init_hook(int rank,
     MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_INIT);
 
     ucx_status = ucp_config_read(NULL, NULL, &config);
-    MPIDI_UCX_CHK_STATUS(ucx_status, read_config);
+    MPIDI_UCX_CHK_STATUS(ucx_status);
 
     /* For now use only the tag feature */
     features = UCP_FEATURE_TAG | UCP_FEATURE_RMA;
@@ -52,16 +52,16 @@ static inline int MPIDI_NM_mpi_init_hook(int rank,
     ucp_params.request_init = MPIDI_UCX_Request_init_callback;
     ucp_params.request_cleanup = NULL;
     ucx_status = ucp_init(&ucp_params, config, &MPIDI_UCX_global.context);
-    MPIDI_UCX_CHK_STATUS(ucx_status, init);
+    MPIDI_UCX_CHK_STATUS(ucx_status);
     ucp_config_release(config);
 
     ucx_status = ucp_worker_create(MPIDI_UCX_global.context, UCS_THREAD_MODE_SERIALIZED,
                                    &MPIDI_UCX_global.worker);
-    MPIDI_UCX_CHK_STATUS(ucx_status, worker_create);
+    MPIDI_UCX_CHK_STATUS(ucx_status);
     ucx_status =
         ucp_worker_get_address(MPIDI_UCX_global.worker, &MPIDI_UCX_global.if_address,
                                &MPIDI_UCX_global.addrname_len);
-    MPIDI_UCX_CHK_STATUS(ucx_status, get_worker_address);
+    MPIDI_UCX_CHK_STATUS(ucx_status);
 
 
     val = valS;
@@ -75,11 +75,11 @@ static inline int MPIDI_NM_mpi_init_hook(int rank,
     val = valS;
     sprintf(keyS, "UCX-%d", rank);
     pmi_errno = PMI_KVS_Put(MPIDI_UCX_global.kvsname, keyS, val);
-    MPIDI_UCX_PMI_ERROR(pmi_errno, pmi_put_name);
+    MPIDI_UCX_PMI_ERROR(pmi_errno);
     pmi_errno = PMI_KVS_Commit(MPIDI_UCX_global.kvsname);
-    MPIDI_UCX_PMI_ERROR(pmi_errno, pmi_commit);
+    MPIDI_UCX_PMI_ERROR(pmi_errno);
     pmi_errno = PMI_Barrier();
-    MPIDI_UCX_PMI_ERROR(pmi_errno, pmi_barrier);
+    MPIDI_UCX_PMI_ERROR(pmi_errno);
 
     ///table = MPL_malloc(size * MPIDI_UCX_NAME_LEN);
     MPIDI_UCX_global.pmi_addr_table = NULL;
@@ -90,17 +90,17 @@ static inline int MPIDI_NM_mpi_init_hook(int rank,
     for (i = 0; i < size; i++) {
         sprintf(keyS, "UCX-%d", i);
         pmi_errno = PMI_KVS_Get(MPIDI_UCX_global.kvsname, keyS, valS, MPIDI_UCX_KVSAPPSTRLEN);
-        MPIDI_UCX_PMI_ERROR(pmi_errno, pmi_commit);
+        MPIDI_UCX_PMI_ERROR(pmi_errno);
         str_errno = MPL_str_get_binary_arg(valS, "UCX", remote_addr,
                                            (int) MPIDI_UCX_KVSAPPSTRLEN, (int *) &maxlen);
         if (maxlen > MPIDI_UCX_global.max_addr_len)
             MPIDI_UCX_global.max_addr_len = maxlen;
-        /* MPIDI_UCX_STR_ERRCHK(str_errno, buscard_len); */
+
         ucx_status = ucp_ep_create(MPIDI_UCX_global.worker,
                                    (ucp_address_t *) remote_addr,
                                    &MPIDI_UCX_AV(&MPIDIU_get_av(0, i)).dest);
 
-        MPIDI_UCX_CHK_STATUS(ucx_status, ep_create);
+        MPIDI_UCX_CHK_STATUS(ucx_status);
         memset(remote_addr, 0x0, maxlen);
     }
 
@@ -135,7 +135,7 @@ static inline int MPIDI_NM_mpi_finalize_hook(void)
             ucp_ep_destroy(MPIDI_UCX_AV(&MPIDIU_get_av(i, j)).dest);
     }
     pmi_errno = PMI_Barrier();
-    MPIDI_UCX_PMI_ERROR(pmi_errno, pmi_barrier);
+    MPIDI_UCX_PMI_ERROR(pmi_errno);
 
 
     if (MPIDI_UCX_global.worker != NULL)
@@ -196,8 +196,8 @@ static inline int allocate_address_table()
 
     for (i = 0; i < size; i++) {
         sprintf(keyS, "UCX-%d", i);
-        PMI_KVS_Get(MPIDI_UCX_global.kvsname, keyS, valS, MPIDI_UCX_KVSAPPSTRLEN);
-        // MPIDI_UCX_PMI_ERROR(pmi_errno, pmi_commit);
+        pmi_errno = PMI_KVS_Get(MPIDI_UCX_global.kvsname, keyS, valS, MPIDI_UCX_KVSAPPSTRLEN);
+        MPIDI_UCX_PMI_ERROR(pmi_errno);
         MPL_str_get_binary_arg(valS, "UCX", &MPIDI_UCX_global.pmi_addr_table[len * i],
                                (int) len, (int *) &maxlen);
     }
diff --git a/src/mpid/ch4/netmod/ucx/ucx_recv.h b/src/mpid/ch4/netmod/ucx/ucx_recv.h
index 6dd5ea6..e63a34c 100644
--- a/src/mpid/ch4/netmod/ucx/ucx_recv.h
+++ b/src/mpid/ch4/netmod/ucx/ucx_recv.h
@@ -33,7 +33,7 @@ MPL_STATIC_INLINE_PREFIX int ucx_irecv_continous(void *buf,
                                                               &MPIDI_UCX_Handle_recv_callback);
 
 
-    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+    MPIDI_CH4_UCX_REQUEST(ucp_request);
 
 
     if (ucp_request->req == NULL) {
@@ -80,7 +80,7 @@ MPL_STATIC_INLINE_PREFIX int ucx_irecv_non_continous(void *buf,
                                                               &MPIDI_UCX_Handle_recv_callback);
 
 
-    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+    MPIDI_CH4_UCX_REQUEST(ucp_request);
 
 
     if (ucp_request->req == NULL) {
@@ -195,7 +195,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_imrecv(void *buf,
                                                                       &MPIDI_UCX_Handle_recv_callback);
 
 
-    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+    MPIDI_CH4_UCX_REQUEST(ucp_request);
 
     if (ucp_request->req == NULL) {
         req = MPIR_Request_create(MPIR_REQUEST_KIND__RECV);
diff --git a/src/mpid/ch4/netmod/ucx/ucx_rma.h b/src/mpid/ch4/netmod/ucx/ucx_rma.h
index 53aa08e..f023797 100644
--- a/src/mpid/ch4/netmod/ucx/ucx_rma.h
+++ b/src/mpid/ch4/netmod/ucx/ucx_rma.h
@@ -33,7 +33,7 @@ static inline int MPIDI_UCX_contig_put(const void *origin_addr,
     if (status == UCS_INPROGRESS)
         MPIDI_UCX_WIN(win).need_local_flush = 1;
     else
-        MPIDI_UCX_CHK_STATUS(status, ucp_mem_map);
+        MPIDI_UCX_CHK_STATUS(status);
 
   fn_exit:
     return mpi_errno;
@@ -67,7 +67,7 @@ static inline int MPIDI_UCX_contig_get(void *origin_addr,
     if (status == UCS_INPROGRESS)
         MPIDI_UCX_WIN(win).need_local_flush = 1;
     else
-        MPIDI_UCX_CHK_STATUS(status, ucp_mem_map);
+        MPIDI_UCX_CHK_STATUS(status);
 
   fn_exit:
     return mpi_errno;
diff --git a/src/mpid/ch4/netmod/ucx/ucx_send.h b/src/mpid/ch4/netmod/ucx/ucx_send.h
index c90972b..4ba6817 100644
--- a/src/mpid/ch4/netmod/ucx/ucx_send.h
+++ b/src/mpid/ch4/netmod/ucx/ucx_send.h
@@ -39,7 +39,7 @@ MPL_STATIC_INLINE_PREFIX int ucx_send_continous(const void *buf,
         (MPIDI_UCX_ucp_request_t *) ucp_tag_send_nb(ep, buf, data_sz, ucp_dt_make_contig(1),
                                                     ucx_tag, &MPIDI_UCX_Handle_send_callback);
 
-    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+    MPIDI_CH4_UCX_REQUEST(ucp_request);
 
     if (ucp_request == NULL) {
         req = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
@@ -96,7 +96,7 @@ MPL_STATIC_INLINE_PREFIX int ucx_sync_send_continous(const void *buf,
         (MPIDI_UCX_ucp_request_t *) ucp_tag_send_sync_nb(ep, buf, data_sz, ucp_dt_make_contig(1),
                                                          ucx_tag, &MPIDI_UCX_Handle_send_callback);
 
-    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+    MPIDI_CH4_UCX_REQUEST(ucp_request);
     if (ucp_request->req) {
         req = ucp_request->req;
         ucp_request->req = NULL;
@@ -147,7 +147,7 @@ MPL_STATIC_INLINE_PREFIX int ucx_sync_send_non_continous(const void *buf,
                                                          datatype->dev.netmod.ucx.ucp_datatype,
                                                          ucx_tag, &MPIDI_UCX_Handle_send_callback);
 
-    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+    MPIDI_CH4_UCX_REQUEST(ucp_request);
 
     if (ucp_request->req) {
         req = ucp_request->req;
@@ -206,7 +206,7 @@ MPL_STATIC_INLINE_PREFIX int ucx_send_non_continous(const void *buf,
                                                     datatype->dev.netmod.ucx.ucp_datatype, ucx_tag,
                                                     &MPIDI_UCX_Handle_send_callback);
 
-    MPIDI_CH4_UCX_REQUEST(ucp_request, tag_send_nb);
+    MPIDI_CH4_UCX_REQUEST(ucp_request);
 
     if (ucp_request == NULL) {
         req = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
diff --git a/src/mpid/ch4/netmod/ucx/ucx_win.h b/src/mpid/ch4/netmod/ucx/ucx_win.h
index 16ccbdd..535035b 100644
--- a/src/mpid/ch4/netmod/ucx/ucx_win.h
+++ b/src/mpid/ch4/netmod/ucx/ucx_win.h
@@ -47,7 +47,7 @@ static inline int MPIDI_UCX_Win_allgather(MPIR_Win * win, size_t length,
         base = *base_ptr;
 
     status = ucp_mem_map(MPIDI_UCX_global.context, &base, size, 0, &mem_h);
-    MPIDI_UCX_CHK_STATUS(status, ucp_mem_map);
+    MPIDI_UCX_CHK_STATUS(status);
     if (length > 0)
         *base_ptr = base;
 
@@ -56,7 +56,7 @@ static inline int MPIDI_UCX_Win_allgather(MPIR_Win * win, size_t length,
     /* pack the key */
     status = ucp_rkey_pack(ucp_context, mem_h, (void **) &rkey_buffer, &rkey_size);
 
-    MPIDI_UCX_CHK_STATUS(status, ucp_mem_map);
+    MPIDI_UCX_CHK_STATUS(status);
 
     rkey_sizes = (int *) MPL_malloc(sizeof(int) * comm_ptr->local_size);
     rkey_sizes[comm_ptr->rank] = (int) rkey_size;
@@ -96,7 +96,7 @@ static inline int MPIDI_UCX_Win_allgather(MPIR_Win * win, size_t length,
             MPIDI_UCX_WIN_INFO(win, i).rkey = NULL;
         }
         else
-            MPIDI_UCX_CHK_STATUS(status, ucp_mem_map);
+            MPIDI_UCX_CHK_STATUS(status);
     }
     share_data = MPL_malloc(comm_ptr->local_size * sizeof(struct _UCX_share));
 
@@ -211,8 +211,10 @@ static inline int MPIDI_NM_mpi_win_unlock(int rank, MPIR_Win * win)
     ucp_ep_h ep = MPIDI_UCX_COMM_TO_EP(win->comm_ptr, rank);
     /* make sure all operations are completed  */
     ucp_status = ucp_ep_flush(ep);
-    MPIDI_UCX_CHK_STATUS(ucp_status, ucp_worker_fence);
+
+    MPIDI_UCX_CHK_STATUS(ucp_status);
     mpi_errno = MPIDI_CH4R_mpi_win_unlock(rank, win);
+
   fn_exit:
     return mpi_errno;
   fn_fail:
@@ -263,7 +265,7 @@ static inline int MPIDI_NM_mpi_win_fence(int assert, MPIR_Win * win)
     if (mpi_errno)
         MPIR_ERR_POP(mpi_errno);
 
-    MPIDI_UCX_CHK_STATUS(ucp_status, ucp_worker_fence);
+    MPIDI_UCX_CHK_STATUS(ucp_status);
   fn_exit:
     return mpi_errno;
   fn_fail:
@@ -394,7 +396,7 @@ static inline int MPIDI_NM_mpi_win_flush(int rank, MPIR_Win * win)
 /* only flush the endpoint */
     ucp_status = ucp_ep_flush(ep);
 
-    MPIDI_UCX_CHK_STATUS(ucp_status, ucp_worker_fence);
+    MPIDI_UCX_CHK_STATUS(ucp_status);
 
   fn_exit:
     return mpi_errno;
@@ -415,7 +417,7 @@ static inline int MPIDI_NM_mpi_win_flush_local_all(MPIR_Win * win)
      * a global flush. This is not good for performance - but OK for now */
     if (MPIDI_UCX_WIN(win).need_local_flush == 1) {
         ucp_status = ucp_worker_flush(MPIDI_UCX_global.worker);
-        MPIDI_UCX_CHK_STATUS(ucp_status, ucp_worker_fence);
+        MPIDI_UCX_CHK_STATUS(ucp_status);
         MPIDI_UCX_WIN(win).need_local_flush = 0;
     }
 
@@ -433,7 +435,7 @@ static inline int MPIDI_NM_mpi_win_unlock_all(MPIR_Win * win)
 
     /*first we have to make sure that all operations are completed */
     ucp_status = ucp_worker_flush(MPIDI_UCX_global.worker);
-    MPIDI_UCX_CHK_STATUS(ucp_status, ucp_worker_fence);
+    MPIDI_UCX_CHK_STATUS(ucp_status);
     mpi_errno = MPIDI_CH4R_mpi_win_unlock_all(win);
   fn_exit:
     return mpi_errno;
@@ -461,7 +463,7 @@ static inline int MPIDI_NM_mpi_win_flush_local(int rank, MPIR_Win * win)
 
     if (MPIDI_UCX_WIN(win).need_local_flush == 1) {
         ucp_status = ucp_ep_flush(ep);
-        MPIDI_UCX_CHK_STATUS(ucp_status, ucp_worker_fence);
+        MPIDI_UCX_CHK_STATUS(ucp_status);
         MPIDI_UCX_WIN(win).need_local_flush = 0;
     }
 
@@ -489,7 +491,7 @@ static inline int MPIDI_NM_mpi_win_flush_all(MPIR_Win * win)
 
     ucp_status = ucp_worker_flush(MPIDI_UCX_global.worker);
 
-    MPIDI_UCX_CHK_STATUS(ucp_status, ucp_worker_fence);
+    MPIDI_UCX_CHK_STATUS(ucp_status);
 
   fn_exit:
     return mpi_errno;

-----------------------------------------------------------------------

Summary of changes:
 src/mpid/ch4/netmod/ucx/errnames.txt |   26 ++---
 src/mpid/ch4/netmod/ucx/ucx_am.h     |   12 +-
 src/mpid/ch4/netmod/ucx/ucx_impl.h   |   40 +++---
 src/mpid/ch4/netmod/ucx/ucx_init.h   |  248 +++++++++++++++++++++++++++-------
 src/mpid/ch4/netmod/ucx/ucx_recv.h   |    6 +-
 src/mpid/ch4/netmod/ucx/ucx_rma.h    |    4 +-
 src/mpid/ch4/netmod/ucx/ucx_send.h   |    8 +-
 src/mpid/ch4/netmod/ucx/ucx_win.h    |   22 ++--
 8 files changed, 256 insertions(+), 110 deletions(-)


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list