Skip to content

Commit

Permalink
Merge pull request openucx#904 from Sergei-Lebedev/topic/fix_rank_reordering
Browse files Browse the repository at this point in the history

TOPO: fix rank reordering for host ordered sbgp
  • Loading branch information
artemry-nv authored Jan 25, 2024
2 parents c8385d1 + 21c0404 commit 878414a
Showing 1 changed file with 61 additions and 13 deletions.
74 changes: 61 additions & 13 deletions src/components/topo/ucc_sbgp.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -439,26 +439,82 @@ static int ucc_compare_proc_info_id(const void *a, const void *b)
} else if (d1->numa_id != d2->numa_id) {
return d1->numa_id - d2->numa_id;
} else {
return d1->pid - d2->pid;
return 0;
}
}

static ucc_status_t sbgp_create_full_ordered(ucc_topo_t *topo, ucc_sbgp_t *sbgp)
{
ucc_rank_t gsize = ucc_subset_size(&topo->set);
proc_info_id_t *sorted;
ucc_rank_t i;
ucc_rank_t gsize = ucc_subset_size(&topo->set);
ucc_proc_info_t *pinfo = topo->topo->procs;
ucc_host_id_t *visited;
proc_info_id_t *sorted;
ucc_rank_t i, j, num_visited;
int is_sorted, d;

ucc_assert(gsize > 0);
sbgp->status = UCC_SBGP_ENABLED;
sbgp->group_size = gsize;
sbgp->group_rank = topo->set.myrank;
sbgp->rank_map = ucc_malloc(sizeof(ucc_rank_t) * gsize, "rank_map");
if (ucc_unlikely(!sbgp->rank_map)) {
ucc_error("failed to allocate %zd bytes for rank_map",
gsize * sizeof(ucc_rank_t));
return UCC_ERR_NO_MEMORY;
}

visited = (ucc_host_id_t *)ucc_malloc(gsize * sizeof(ucc_host_id_t),
"visited host");
if (ucc_unlikely(!visited)) {
ucc_error("failed to allocate %zd bytes for list of visited nodes",
gsize * sizeof(ucc_host_id_t));
ucc_free(sbgp->rank_map);
return UCC_ERR_NO_MEMORY;
}

is_sorted = 1;
num_visited = 1;
visited[0] = pinfo[0].host_hash;
for (i = 1; i < gsize; i++) {
if (pinfo[i].host_hash != pinfo[i-1].host_hash) {
/* check if we saw that host_hash before */
for (j = 0; j < num_visited; j++) {
if (visited[j] == pinfo[i].host_hash) {
break;
}
}
if (j < num_visited) {
/* this host was present already, ranks are not ordered */
is_sorted = 0;
break;
}
/* add new host to the list of visited */
visited[num_visited++] = pinfo[i].host_hash;
} else {
d = ucc_compare_proc_info_id(&pinfo[i - 1].host_hash,
&pinfo[i].host_hash);

if (d > 0) {
is_sorted = 0;
break;
}
}
}
ucc_free(visited);

if (is_sorted) {
for (i = 0; i < gsize; i++) {
sbgp->rank_map[i] = i;
}
return UCC_OK;
}

sorted = (proc_info_id_t *)ucc_malloc(gsize * sizeof(proc_info_id_t),
"proc_sorted");
if (ucc_unlikely(!sorted)) {
ucc_error("failed to allocate %zd bytes for sorted proc info",
gsize * sizeof(proc_info_id_t));
ucc_free(sbgp->rank_map);
return UCC_ERR_NO_MEMORY;
}

Expand All @@ -467,14 +523,6 @@ static ucc_status_t sbgp_create_full_ordered(ucc_topo_t *topo, ucc_sbgp_t *sbgp)
sorted[i].id = i;
}

sbgp->rank_map = ucc_malloc(sizeof(ucc_rank_t) * gsize, "rank_map");
if (ucc_unlikely(!sbgp->rank_map)) {
ucc_error("failed to allocate %zd bytes for rank_map",
gsize * sizeof(ucc_rank_t));
ucc_free(sorted);
return UCC_ERR_NO_MEMORY;
}

qsort(sorted, gsize, sizeof(proc_info_id_t), ucc_compare_proc_info_id);
for (i = 0; i < gsize; i++) {
if (sorted[i].id == topo->set.myrank) {
Expand Down

0 comments on commit 878414a

Please sign in to comment.