-
Notifications
You must be signed in to change notification settings - Fork 45
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
nsys-jax: all-to-all and repeated thunk support
Also run notebook in CI
- Loading branch information
Showing
153 changed files
with
23,778 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file added
BIN
+68 Bytes
...xtext_fsdp4_test_data/dump/module_0000.jit_convert_element_type.autotune_results.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+196 Bytes
..._fsdp4_test_data/dump/module_0000.jit_convert_element_type.before_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+412 Bytes
...text_fsdp4_test_data/dump/module_0000.jit_convert_element_type.gpu_target_config.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+396 Bytes
...t_data/dump/module_0000.jit_convert_element_type.sm_9.0_gpu_after_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+68 Bytes
...jax/maxtext_fsdp4_test_data/dump/module_0001.jit__threefry_seed.autotune_results.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+672 Bytes
...axtext_fsdp4_test_data/dump/module_0001.jit__threefry_seed.before_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+412 Bytes
...ax/maxtext_fsdp4_test_data/dump/module_0001.jit__threefry_seed.gpu_target_config.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+820 Bytes
...p4_test_data/dump/module_0001.jit__threefry_seed.sm_9.0_gpu_after_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+68 Bytes
...ys-jax/maxtext_fsdp4_test_data/dump/module_0002.jit_concatenate.autotune_results.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+472 Bytes
...x/maxtext_fsdp4_test_data/dump/module_0002.jit_concatenate.before_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+412 Bytes
...s-jax/maxtext_fsdp4_test_data/dump/module_0002.jit_concatenate.gpu_target_config.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+672 Bytes
...fsdp4_test_data/dump/module_0002.jit_concatenate.sm_9.0_gpu_after_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+68 Bytes
...fsdp4_test_data/dump/module_0003.jit__unnamed_wrapped_function_.autotune_results.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+16.7 KB
..._test_data/dump/module_0003.jit__unnamed_wrapped_function_.before_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+412 Bytes
...sdp4_test_data/dump/module_0003.jit__unnamed_wrapped_function_.gpu_target_config.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+72.3 KB
.../dump/module_0003.jit__unnamed_wrapped_function_.sm_9.0_gpu_after_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+68 Bytes
...sdp4_test_data/dump/module_0012.jit_raw_generate_synthetic_data.autotune_results.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+380 Bytes
...test_data/dump/module_0012.jit_raw_generate_synthetic_data.before_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+412 Bytes
...dp4_test_data/dump/module_0012.jit_raw_generate_synthetic_data.gpu_target_config.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+876 Bytes
...dump/module_0012.jit_raw_generate_synthetic_data.sm_9.0_gpu_after_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+68 Bytes
...s/nsys-jax/maxtext_fsdp4_test_data/dump/module_0013.jit_fold_in.autotune_results.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+960 Bytes
...s-jax/maxtext_fsdp4_test_data/dump/module_0013.jit_fold_in.before_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+412 Bytes
.../nsys-jax/maxtext_fsdp4_test_data/dump/module_0013.jit_fold_in.gpu_target_config.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+3.6 KB
...ext_fsdp4_test_data/dump/module_0013.jit_fold_in.sm_9.0_gpu_after_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+1.1 KB
...sys-jax/maxtext_fsdp4_test_data/dump/module_0016.jit_train_step.autotune_results.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+30.2 KB
...ax/maxtext_fsdp4_test_data/dump/module_0016.jit_train_step.before_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+412 Bytes
...ys-jax/maxtext_fsdp4_test_data/dump/module_0016.jit_train_step.gpu_target_config.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+129 KB
..._fsdp4_test_data/dump/module_0016.jit_train_step.sm_9.0_gpu_after_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+1.1 KB
...flows/nsys-jax/maxtext_fsdp4_test_data/dump/module_0033.jit_cos.autotune_results.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+440 Bytes
.../nsys-jax/maxtext_fsdp4_test_data/dump/module_0033.jit_cos.before_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+412 Bytes
...lows/nsys-jax/maxtext_fsdp4_test_data/dump/module_0033.jit_cos.gpu_target_config.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+612 Bytes
...maxtext_fsdp4_test_data/dump/module_0033.jit_cos.sm_9.0_gpu_after_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+1.1 KB
...flows/nsys-jax/maxtext_fsdp4_test_data/dump/module_0034.jit_add.autotune_results.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+528 Bytes
.../nsys-jax/maxtext_fsdp4_test_data/dump/module_0034.jit_add.before_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+412 Bytes
...lows/nsys-jax/maxtext_fsdp4_test_data/dump/module_0034.jit_add.gpu_target_config.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+720 Bytes
...maxtext_fsdp4_test_data/dump/module_0034.jit_add.sm_9.0_gpu_after_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+1.1 KB
.../nsys-jax/maxtext_fsdp4_test_data/dump/module_0035.jit_multiply.autotune_results.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+460 Bytes
...-jax/maxtext_fsdp4_test_data/dump/module_0035.jit_multiply.before_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+412 Bytes
...nsys-jax/maxtext_fsdp4_test_data/dump/module_0035.jit_multiply.gpu_target_config.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+660 Bytes
...xt_fsdp4_test_data/dump/module_0035.jit_multiply.sm_9.0_gpu_after_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+1.1 KB
.../nsys-jax/maxtext_fsdp4_test_data/dump/module_0036.jit_subtract.autotune_results.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+532 Bytes
...-jax/maxtext_fsdp4_test_data/dump/module_0036.jit_subtract.before_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+412 Bytes
...nsys-jax/maxtext_fsdp4_test_data/dump/module_0036.jit_subtract.gpu_target_config.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+728 Bytes
...xt_fsdp4_test_data/dump/module_0036.jit_subtract.sm_9.0_gpu_after_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+1.1 KB
...flows/nsys-jax/maxtext_fsdp4_test_data/dump/module_0037.jit_add.autotune_results.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+456 Bytes
.../nsys-jax/maxtext_fsdp4_test_data/dump/module_0037.jit_add.before_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+412 Bytes
...lows/nsys-jax/maxtext_fsdp4_test_data/dump/module_0037.jit_add.gpu_target_config.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+652 Bytes
...maxtext_fsdp4_test_data/dump/module_0037.jit_add.sm_9.0_gpu_after_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+1.1 KB
...ws/nsys-jax/maxtext_fsdp4_test_data/dump/module_0038.jit__where.autotune_results.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+460 Bytes
...ys-jax/maxtext_fsdp4_test_data/dump/module_0038.jit__where.before_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+412 Bytes
...s/nsys-jax/maxtext_fsdp4_test_data/dump/module_0038.jit__where.gpu_target_config.pbtxt.xz
Binary file not shown.
Binary file added
BIN
+684 Bytes
...text_fsdp4_test_data/dump/module_0038.jit__where.sm_9.0_gpu_after_optimizations.hlo.pb.xz
Binary file not shown.
Binary file added
BIN
+782 KB
.github/workflows/nsys-jax/maxtext_fsdp4_test_data/nvtx_gpu_proj_trace/trace.parquet
Binary file not shown.
71 changes: 71 additions & 0 deletions
71
...hub/workflows/nsys-jax/maxtext_fsdp4_test_data/protos/tsl/profiler/protobuf/profile.proto
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
// This proto is intended to match the format expected by the pprof tool. | ||
syntax = "proto3"; | ||
|
||
package tensorflow.tfprof.pprof; | ||
|
||
// Top-level profile message. Per this file's header comment, the layout is | ||
// meant to match the format expected by the pprof tool. | ||
// NOTE(review): in pprof's own profile.proto the int64 fields drop_frames, | ||
// keep_frames, comment and default_sample_type are indices into | ||
// string_table -- presumably the same holds here; confirm against upstream. | ||
message Profile { | ||
repeated ValueType sample_type = 1; | ||
repeated Sample sample = 2; | ||
repeated Mapping mapping = 3; | ||
repeated Location location = 4; | ||
repeated Function function = 5; | ||
repeated string string_table = 6; | ||
int64 drop_frames = 7; | ||
int64 keep_frames = 8; | ||
// NOTE(review): the _nanos suffix suggests nanoseconds; the reference point | ||
// for time_nanos is not stated in this file. | ||
int64 time_nanos = 9; | ||
int64 duration_nanos = 10; | ||
ValueType period_type = 11; | ||
int64 period = 12; | ||
repeated int64 comment = 13; | ||
int64 default_sample_type = 14; | ||
} | ||
|
||
// A (type, unit) pair. Used by Profile.sample_type and Profile.period_type. | ||
// NOTE(review): both fields are presumably indices into | ||
// Profile.string_table, as in pprof's profile.proto -- confirm. | ||
message ValueType { | ||
int64 type = 1; | ||
int64 unit = 2; | ||
} | ||
|
||
// One entry of Profile.sample: location ids, values and labels. | ||
// NOTE(review): location_id presumably refers to Location.id, and value is | ||
// presumably parallel to Profile.sample_type -- confirm against pprof. | ||
message Sample { | ||
repeated uint64 location_id = 1; | ||
repeated int64 value = 2; | ||
repeated Label label = 3; | ||
} | ||
|
||
// Annotation carried by a Sample (see Sample.label). | ||
// NOTE(review): key and str are presumably Profile.string_table indices and | ||
// num a plain numeric value, as in pprof -- confirm. | ||
message Label { | ||
int64 key = 1; | ||
int64 str = 2; | ||
int64 num = 3; | ||
} | ||
|
||
// One entry of Profile.mapping. | ||
// NOTE(review): presumably describes a mapped binary/address range that | ||
// Location.mapping_id refers to by id -- confirm against pprof. | ||
message Mapping { | ||
uint64 id = 1; | ||
uint64 memory_start = 2; | ||
uint64 memory_limit = 3; | ||
uint64 file_offset = 4; | ||
// NOTE(review): filename and build_id are int64, not string; presumably | ||
// Profile.string_table indices -- confirm. | ||
int64 filename = 5; | ||
int64 build_id = 6; | ||
bool has_functions = 7; | ||
bool has_filenames = 8; | ||
bool has_line_numbers = 9; | ||
bool has_inline_frames = 10; | ||
} | ||
|
||
// One entry of Profile.location: an id, a mapping id, an address and a | ||
// list of Line records. | ||
// NOTE(review): mapping_id presumably refers to Mapping.id -- confirm. | ||
message Location { | ||
uint64 id = 1; | ||
uint64 mapping_id = 2; | ||
uint64 address = 3; | ||
repeated Line line = 4; | ||
} | ||
|
||
// One entry of Location.line: a function id plus a line number. | ||
// NOTE(review): function_id presumably refers to Function.id -- confirm. | ||
message Line { | ||
uint64 function_id = 1; | ||
int64 line = 2; | ||
} | ||
|
||
// One entry of Profile.function. | ||
// NOTE(review): name, system_name and filename are int64, not string; | ||
// presumably Profile.string_table indices, as in pprof -- confirm. | ||
message Function { | ||
uint64 id = 1; | ||
int64 name = 2; | ||
int64 system_name = 3; | ||
int64 filename = 4; | ||
int64 start_line = 5; | ||
} |
33 changes: 33 additions & 0 deletions
33
...nsys-jax/maxtext_fsdp4_test_data/protos/tsl/profiler/protobuf/profiled_instructions.proto
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
==============================================================================*/ | ||
|
||
syntax = "proto3"; | ||
|
||
package tensorflow.profiler; | ||
|
||
// Holds a list of per-instruction costs and a list of source/target | ||
// latencies. | ||
// Next ID: 3 | ||
message ProfiledInstructionsProto { | ||
// A named instruction and its cost. | ||
// NOTE(review): the _us suffix suggests microseconds -- confirm. | ||
message InstructionCost { | ||
string name = 1; | ||
double cost_us = 2; | ||
} | ||
// A latency value associated with a (source, target) pair of names. | ||
// NOTE(review): the _us suffix suggests microseconds -- confirm. | ||
message Latency { | ||
string source = 1; | ||
string target = 2; | ||
double latency_us = 3; | ||
} | ||
repeated InstructionCost costs = 1; | ||
repeated Latency latencies = 2; | ||
} |
81 changes: 81 additions & 0 deletions
81
...ows/nsys-jax/maxtext_fsdp4_test_data/protos/tsl/profiler/protobuf/profiler_analysis.proto
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
syntax = "proto3"; | ||
|
||
package tensorflow; | ||
|
||
import "tsl/profiler/protobuf/profiler_service.proto"; | ||
|
||
// Request message for the ProfileAnalysis.NewSession RPC. | ||
message NewProfileSessionRequest { | ||
// NOTE(review): ProfileRequest is not defined in this file; presumably it | ||
// comes from the imported profiler_service.proto -- confirm. | ||
ProfileRequest request = 1; | ||
// The place where we will dump profile data. We will normally use | ||
// MODEL_DIR/plugins/profile as the repository root. | ||
string repository_root = 2; | ||
repeated string hosts = 3; // host or host:port; the port will be ignored. | ||
string session_id = 4; | ||
} | ||
|
||
// Response message for the ProfileAnalysis.NewSession RPC. | ||
message NewProfileSessionResponse { | ||
// Auxiliary error message. | ||
string error_message = 1; | ||
|
||
// Whether all hosts returned an empty trace. | ||
bool empty_trace = 2; | ||
} | ||
|
||
// Request message for the ProfileAnalysis.EnumSessions RPC. | ||
message EnumProfileSessionsAndToolsRequest { | ||
// NOTE(review): presumably the same kind of repository root described on | ||
// NewProfileSessionRequest.repository_root -- confirm. | ||
string repository_root = 1; | ||
} | ||
|
||
// Describes one session; element type of | ||
// EnumProfileSessionsAndToolsResponse.sessions. | ||
message ProfileSessionInfo { | ||
string session_id = 1; | ||
// The tools whose data is available for consumption. | ||
repeated string available_tools = 2; | ||
} | ||
|
||
// Response message for the ProfileAnalysis.EnumSessions RPC. | ||
message EnumProfileSessionsAndToolsResponse { | ||
// Auxiliary error message. | ||
string error_message = 1; | ||
// On success, the returned session information is stored here. | ||
repeated ProfileSessionInfo sessions = 2; | ||
} | ||
|
||
// Request message for the ProfileAnalysis.GetSessionToolData RPC. | ||
message ProfileSessionDataRequest { | ||
// The place where we will read profile data. We will normally use | ||
// MODEL_DIR/plugins/profile as the repository root. | ||
string repository_root = 1; | ||
string session_id = 2; | ||
// Which host the data is associated with. If empty, data from all hosts is | ||
// aggregated. | ||
// Note: this field uses number 5 although it is declared before fields 3 | ||
// and 4. Field numbers identify fields on the wire, so do not renumber. | ||
string host_name = 5; | ||
// Which tool. | ||
string tool_name = 3; | ||
// Tool-specific parameters, e.g. TraceViewer's viewport. | ||
map<string, string> parameters = 4; | ||
} | ||
|
||
// Response message for the ProfileAnalysis.GetSessionToolData RPC. | ||
message ProfileSessionDataResponse { | ||
// Auxiliary error message. | ||
string error_message = 1; | ||
|
||
// Output format, e.g. "json", "proto" or "blob". | ||
string output_format = 2; | ||
|
||
// TODO(jiesun): figure out whether to put bytes or oneof tool specific proto. | ||
bytes output = 3; | ||
} | ||
//////////////////////////////////////////////////////////////////////////////// | ||
// The ProfileAnalysis service provides an entry point for profiling TPU and | ||
// for serving profiled data to TensorBoard through gRPC. | ||
//////////////////////////////////////////////////////////////////////////////// | ||
service ProfileAnalysis { | ||
// Starts a profiling session and blocks until it completes. | ||
// The TPUProfileAnalysis service delegates this to the TPUProfiler service, | ||
// populates the profiled data in the repository, then returns the status to | ||
// the caller. | ||
rpc NewSession(NewProfileSessionRequest) returns (NewProfileSessionResponse) { | ||
} | ||
// Enumerates existing sessions and returns the available profile tools. | ||
rpc EnumSessions(EnumProfileSessionsAndToolsRequest) | ||
returns (EnumProfileSessionsAndToolsResponse) {} | ||
// Retrieves a specific tool's data for a specific session. | ||
rpc GetSessionToolData(ProfileSessionDataRequest) | ||
returns (ProfileSessionDataResponse) {} | ||
} |
Oops, something went wrong.