Skip to content

Commit

Permalink
[Distributed] Fix ps address list sort by index. (#945)
Browse files Browse the repository at this point in the history
Signed-off-by: 泊霆 <hujunqi.hjq@alibaba-inc.com>
  • Loading branch information
Mesilenceki authored Nov 8, 2023
1 parent c2e664a commit fc4f9f5
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 8 deletions.
17 changes: 11 additions & 6 deletions tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
#include <memory>
#include <vector>

#include <google/protobuf/map.h>
#include "include/json/json.h"
#include "grpc/support/alloc.h"
#include "grpcpp/grpcpp.h"
Expand Down Expand Up @@ -89,7 +90,7 @@ Status ElasticGrpcServer::UpdateServerDef(const string& cluster_def_str, int& be
return errors::Internal("PARSE TF_CONFIG/cluster ERROR");
}

std::unordered_set<string> ps_addrs_vec;
std::set<string> ps_addrs_vec; //ordered
after_part_num = cluster_json["cluster"]["ps"].size();
for (auto& value: cluster_json["cluster"]["ps"]) {
ps_addrs_vec.emplace(value.asString());
Expand All @@ -111,21 +112,25 @@ Status ElasticGrpcServer::UpdateServerDef(const string& cluster_def_str, int& be
}
for (auto ps_addr: ps_addrs_vec) {
if (target_string_set.find(ps_addr) == target_string_set.end()) {
job->mutable_tasks()->insert({idx, ps_addr});
job->mutable_tasks()->insert({idx++, ps_addr});
tf_config_json["cluster"]["ps"].append(ps_addr);
}
}
break;
} else {
LOG(INFO) << "SCALING DOWN, partition_num is: " << after_part_num;
google::protobuf::Map< google::protobuf::int32, std::string > tasks;
Json::Value arr_value(Json::arrayValue);
int idx = 0;
for (int i = 0; i < before_part_num; ++i) {
string tmp_string = tf_config_json["cluster"]["ps"][i].asString();
if (ps_addrs_vec.find(tmp_string) == ps_addrs_vec.end()) {
Json::Value ps_addr;
tf_config_json["cluster"]["ps"].removeIndex(i, &ps_addr);
job->mutable_tasks()->erase(i);
if (ps_addrs_vec.find(tmp_string) != ps_addrs_vec.end()) {
arr_value.append(tf_config_json["cluster"]["ps"][i]);
tasks[idx++] = tmp_string;
}
}
tf_config_json["cluster"]["ps"].swap(arr_value);
job->mutable_tasks()->swap(tasks);
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion tensorflow/contrib/elastic_grpc_server/elastic_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ limitations under the License.
#include <grpcpp/server.h>
#include "grpcpp/server_builder.h"

using namespace des;
using namespace deeprec;

using grpc::Server;
using grpc::ServerAsyncResponseWriter;
Expand Down
2 changes: 1 addition & 1 deletion tensorflow/core/protobuf/elastic_training.proto
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
syntax = "proto3";

package des;
package deeprec;

enum Code {
OK = 0;
Expand Down
1 change: 1 addition & 0 deletions tensorflow/python/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -4747,6 +4747,7 @@ py_library(
":platform",
":protos_all_py",
":session_run_hook",
"//tensorflow/core:elastic_service_pb_py",
":training_util",
":util",
],
Expand Down

0 comments on commit fc4f9f5

Please sign in to comment.