-
Notifications
You must be signed in to change notification settings - Fork 501
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[k8s] On-demand single-host TPU support on GKE #3947
base: master
Are you sure you want to change the base?
Changes from 39 commits
a929474
80e1877
70a07ab
0cba9a5
17bcbd8
9233bf5
12e62c0
c795fe7
1c895f0
a8f5b6b
bdb3469
1d2d243
92f4f38
2662ec8
58f8ad6
1cf82b6
7b551c9
81a05ee
e8764f1
3497aee
724806a
e8d73fe
7ac5036
4470dbe
0860e45
35f3c80
2b56a9e
305705c
c2b5bfc
2ba5537
92cd77d
d085a5b
7860679
96924a7
1bbac21
4f7ea03
16b6c29
ad5089f
aa8efc3
e390843
884f0a2
ee28466
11142e5
de55663
a500555
bce8731
0e8366c
f67ad0f
05c37aa
06d3879
9a2046c
62b235f
4db1e63
5923f10
fa2e670
c1ee117
cbce4d5
241efc0
61b01d1
2fbb4eb
5dc92f3
9e8d53d
932e073
0a0eac2
3bc95b9
9dbaa72
f5e1d37
688c0b4
2dec7f9
dc23e88
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -79,12 +79,12 @@ def list_accelerators_realtime( | |
if not has_gpu: | ||
return {}, {}, {} | ||
|
||
label_formatter, _ = kubernetes_utils.detect_gpu_label_formatter() | ||
if not label_formatter: | ||
lf, _ = kubernetes_utils.detect_gpu_label_formatter() | ||
if not lf: | ||
return {}, {}, {} | ||
|
||
accelerators_qtys: Set[Tuple[str, int]] = set() | ||
key = label_formatter.get_label_key() | ||
keys = lf.get_label_keys() | ||
nodes = kubernetes_utils.get_kubernetes_nodes() | ||
# Get the pods to get the real-time GPU usage | ||
pods = kubernetes_utils.get_kubernetes_pods() | ||
|
@@ -95,56 +95,67 @@ def list_accelerators_realtime( | |
min_quantity_filter = quantity_filter if quantity_filter else 1 | ||
|
||
for node in nodes: | ||
if key in node.metadata.labels: | ||
allocated_qty = 0 | ||
accelerator_name = label_formatter.get_accelerator_from_label_value( | ||
node.metadata.labels.get(key)) | ||
|
||
# Check if name_filter regex matches the accelerator_name | ||
regex_flags = 0 if case_sensitive else re.IGNORECASE | ||
if name_filter and not re.match( | ||
name_filter, accelerator_name, flags=regex_flags): | ||
continue | ||
|
||
accelerator_count = int( | ||
node.status.allocatable.get('nvidia.com/gpu', 0)) | ||
|
||
# Generate the GPU quantities for the accelerators | ||
if accelerator_name and accelerator_count > 0: | ||
for count in range(1, accelerator_count + 1): | ||
accelerators_qtys.add((accelerator_name, count)) | ||
|
||
for pod in pods: | ||
# Get all the pods running on the node | ||
if (pod.spec.node_name == node.metadata.name and | ||
pod.status.phase in ['Running', 'Pending']): | ||
# Iterate over all the containers in the pod and sum the | ||
# GPU requests | ||
for container in pod.spec.containers: | ||
if container.resources.requests: | ||
allocated_qty += int( | ||
container.resources.requests.get( | ||
'nvidia.com/gpu', 0)) | ||
|
||
accelerators_available = accelerator_count - allocated_qty | ||
|
||
if accelerator_count >= min_quantity_filter: | ||
quantized_count = (min_quantity_filter * | ||
(accelerator_count // min_quantity_filter)) | ||
if accelerator_name not in total_accelerators_capacity: | ||
total_accelerators_capacity[ | ||
accelerator_name] = quantized_count | ||
else: | ||
total_accelerators_capacity[ | ||
accelerator_name] += quantized_count | ||
|
||
if accelerator_name not in total_accelerators_available: | ||
total_accelerators_available[accelerator_name] = 0 | ||
if accelerators_available >= min_quantity_filter: | ||
quantized_availability = min_quantity_filter * ( | ||
accelerators_available // min_quantity_filter) | ||
total_accelerators_available[ | ||
accelerator_name] += quantized_availability | ||
for key in keys: | ||
if key in node.metadata.labels: | ||
allocated_qty = 0 | ||
accelerator_name = lf.get_accelerator_from_label_value( | ||
node.metadata.labels.get(key)) | ||
|
||
# Check if name_filter regex matches the accelerator_name | ||
regex_flags = 0 if case_sensitive else re.IGNORECASE | ||
if name_filter and not re.match( | ||
name_filter, accelerator_name, flags=regex_flags): | ||
continue | ||
|
||
accelerator_count = 0 | ||
if kubernetes_utils.GPU_RESOURCE_KEY in node.status.allocatable: | ||
accelerator_count = int(node.status.allocatable[ | ||
kubernetes_utils.GPU_RESOURCE_KEY]) | ||
elif (kubernetes_utils.TPU_RESOURCE_KEY | ||
in node.status.allocatable): | ||
accelerator_count = int(node.status.allocatable[ | ||
kubernetes_utils.TPU_RESOURCE_KEY]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: can we have a function to get value from a dictionary with default value? sth like this: def get_node_attribute(attribute_dict: dict, default_value = None) -> Any:
assert not (kubernetes_utils.GPU_RESOURCE_KEY in attribute_dict and kubernetes_utils.TPU_RESOURCE_KEY in attribute_dict), 'Cannot have both GPU and TPU resources on the same node.'
if kubernetes_utils.GPU_RESOURCE_KEY in attribute_dict:
return attribute_dict[kubernetes_utils.GPU_RESOURCE_KEY]
if kubernetes_utils.TPU_RESOURCE_KEY in attribute_dict:
return attribute_dict[kubernetes_utils.TPU_RESOURCE_KEY]
return default_value There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Implemented at 06d3879 |
||
|
||
# Generate the GPU quantities for the accelerators | ||
if accelerator_name and accelerator_count > 0: | ||
for count in range(1, accelerator_count + 1): | ||
accelerators_qtys.add((accelerator_name, count)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe a quick way of addressing the show-gpus issue is to change this logic to show only the exact count, not the range if it is type TPU:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cblmemo @romilbhardwaj fixed at cbce4d5
|
||
|
||
for pod in pods: | ||
# Get all the pods running on the node | ||
if (pod.spec.node_name == node.metadata.name and | ||
pod.status.phase in ['Running', 'Pending']): | ||
# Iterate over all the containers in the pod and sum the | ||
# GPU requests | ||
for container in pod.spec.containers: | ||
if container.resources.requests: | ||
allocated_qty += int( | ||
container.resources.requests.get( | ||
kubernetes_utils.GPU_RESOURCE_KEY, 0)) | ||
allocated_qty += int( | ||
container.resources.requests.get( | ||
kubernetes_utils.TPU_RESOURCE_KEY, 0)) | ||
landscapepainter marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
accelerators_available = accelerator_count - allocated_qty | ||
|
||
if accelerator_count >= min_quantity_filter: | ||
quantized_count = ( | ||
min_quantity_filter * | ||
(accelerator_count // min_quantity_filter)) | ||
if accelerator_name not in total_accelerators_capacity: | ||
total_accelerators_capacity[ | ||
accelerator_name] = quantized_count | ||
else: | ||
total_accelerators_capacity[ | ||
accelerator_name] += quantized_count | ||
|
||
if accelerator_name not in total_accelerators_available: | ||
total_accelerators_available[accelerator_name] = 0 | ||
if accelerators_available >= min_quantity_filter: | ||
quantized_availability = min_quantity_filter * ( | ||
accelerators_available // min_quantity_filter) | ||
total_accelerators_available[ | ||
accelerator_name] += quantized_availability | ||
|
||
result = [] | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lets change the
get_gpu_label_key_value
function to include ability forget_tpu_topology_label_key_value
? i.e. return 4 values or a dict of valuesThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Resolved at a500555