From 49e5bf0bcfffcf69474005afe1cee43bfddeb62e Mon Sep 17 00:00:00 2001 From: Josiah Lee Date: Fri, 20 Sep 2024 15:12:00 -0700 Subject: [PATCH 1/4] fix: unrevert private WANDB_BASE_URL for weave (#220) --- charts/operator-wandb/Chart.yaml | 2 +- charts/operator-wandb/charts/weave/templates/deployment.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/operator-wandb/Chart.yaml b/charts/operator-wandb/Chart.yaml index 4d019f59..6a4adefb 100644 --- a/charts/operator-wandb/Chart.yaml +++ b/charts/operator-wandb/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: operator-wandb description: A Helm chart for deploying W&B to Kubernetes type: application -version: 0.17.8 +version: 0.17.9 appVersion: 1.0.0 icon: https://wandb.ai/logo.svg diff --git a/charts/operator-wandb/charts/weave/templates/deployment.yaml b/charts/operator-wandb/charts/weave/templates/deployment.yaml index af062b2d..b3b7b9a4 100644 --- a/charts/operator-wandb/charts/weave/templates/deployment.yaml +++ b/charts/operator-wandb/charts/weave/templates/deployment.yaml @@ -60,7 +60,7 @@ spec: - name: WEAVE_LOCAL_ARTIFACT_DIR value: /vol/weave/cache - name: WANDB_BASE_URL - value: {{ .Values.global.host }} + value: http://{{ .Release.Name }}-app:8080/ - name: WEAVE_SERVER_NUM_WORKERS value: "4" From 23c0b3f3be02ad06af1693ce26e4621650322f24 Mon Sep 17 00:00:00 2001 From: Zachary Blasczyk <77289967+zacharyblasczyk@users.noreply.github.com> Date: Mon, 23 Sep 2024 16:33:53 -0500 Subject: [PATCH 2/4] feat: Init HPA workaround (#221) --- .gitignore | 2 + charts/operator-wandb/Chart.yaml | 2 +- .../charts/app/templates/_deployment.tpl | 284 +++++++++++++++ .../charts/app/templates/_helpers.tpl | 4 +- .../charts/app/templates/deployment.yaml | 326 ++---------------- .../charts/app/templates/hpa.yaml | 6 +- charts/operator-wandb/charts/app/values.yaml | 9 + charts/operator-wandb/local-development.md | 19 +- charts/operator-wandb/templates/_helpers.tpl | 2 +- 9 files changed, 349 insertions(+), 305 deletions(-) create mode 100644 charts/operator-wandb/charts/app/templates/_deployment.tpl diff --git a/.gitignore b/.gitignore index cb3b675e..d10680a5 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ dryrun.yaml license.txt test-values.yaml +.DS_Store +secret.*.yaml diff --git a/charts/operator-wandb/Chart.yaml b/charts/operator-wandb/Chart.yaml index 6a4adefb..03691558 100644 --- a/charts/operator-wandb/Chart.yaml +++ b/charts/operator-wandb/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: operator-wandb description: A Helm chart for deploying W&B to Kubernetes type: application -version: 0.17.9 +version: 0.18.0 appVersion: 1.0.0 icon: https://wandb.ai/logo.svg diff --git a/charts/operator-wandb/charts/app/templates/_deployment.tpl b/charts/operator-wandb/charts/app/templates/_deployment.tpl new file mode 100644 index 00000000..2779e661 --- /dev/null +++ b/charts/operator-wandb/charts/app/templates/_deployment.tpl @@ -0,0 +1,284 @@ +{{/* +This template is used to generate the deployment for the app, and is used for both the non-glue and glue deployments. +*/}} +{{- define "app.deployment" -}} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "app.fullname" . }}{{ .suffix }} + labels: + {{- include "wandb.commonLabels" . | nindent 4 }} + {{- include "app.commonLabels" . | nindent 4 }} + {{- include "app.labels" . | nindent 4 }} + {{- if .Values.deployment.labels }} + {{- toYaml .Values.deployment.labels | nindent 4 }} + {{- end }} + annotations: + {{- include "wandb.deploymentAnnotations" . | nindent 4 }} + {{- if .Values.deployment.annotations }} + {{- toYaml .Values.deployment.annotations | nindent 4 }} + {{- end }} +spec: + replicas: 1 + selector: + matchLabels: + {{- include "wandb.selectorLabels" . | nindent 6 }} + {{- include "app.labels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "wandb.podLabels" . | nindent 8 }} + {{- include "app.commonLabels" . | nindent 8 }} + {{- include "app.podLabels" . | nindent 8 }} + {{- include "app.labels" . | nindent 8 }} + annotations: + checksum/secret: {{ include (print $.Template.BasePath "/secrets.yaml") . | sha256sum }} + {{- if .Values.pod.annotations }} + {{- toYaml .Values.pod.annotations | nindent 8 }} + {{- end }} + spec: + serviceAccountName: {{ include "app.serviceAccountName" . }} + {{- if .tolerations }} + tolerations: + {{- toYaml .tolerations | nindent 8 }} + {{- end }} + {{- include "wandb.nodeSelector" . | nindent 6 }} + {{- include "wandb.priorityClassName" . | nindent 6 }} + {{- include "wandb.podSecurityContext" .Values.pod.securityContext | nindent 6 }} + terminationGracePeriodSeconds: 60 + initContainers: + - name: init-db + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + env: + - name: MYSQL_PORT + value: "{{ include "wandb.mysql.port" . }}" + - name: MYSQL_HOST + value: "{{ include "wandb.mysql.host" . }}" + - name: MYSQL_DATABASE + value: "{{ include "wandb.mysql.database" . }}" + - name: MYSQL_USER + value: "{{ include "wandb.mysql.user" . }}" + - name: MYSQL_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "wandb.mysql.passwordSecret" . }} + key: MYSQL_PASSWORD + command: ['bash', '-c', "until mysql -h$MYSQL_HOST -u$MYSQL_USER -p$MYSQL_PASSWORD -D$MYSQL_DATABASE -P$MYSQL_PORT --execute=\"SELECT 1\"; do echo waiting for db; sleep 2; done"] + containers: + - name: {{ .Chart.Name }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + volumeMounts: + {{- if ne (include "wandb.redis.caCert" .) "" }} + - name: {{ include "app.fullname" . }}-redis-ca + mountPath: /etc/ssl/certs/redis_ca.pem + subPath: redis_ca.pem + {{- end }} + {{- range $index, $v := .Values.global.customCACerts }} + - name: wandb-ca-certs + mountPath: /usr/local/share/ca-certificates/customCA{{$index}}.crt + subPath: customCA{{$index}}.crt + {{- end }} + ports: + - name: http + containerPort: 8080 + protocol: TCP + - name: prometheus + containerPort: 8181 + protocol: TCP + - name: gorilla-statsd + containerPort: 8125 + protocol: TCP + env: + - name: GLUE_ENABLED + value: "{{ .glueSingletonEnabled }}" + {{- if .onlyService }} + - name: ONLY_SERVICE + value: {{ .onlyService }} + {{- end }} + - name: HOST + value: "{{ .Values.global.host }}" + {{- if .Values.extraCors }} + - name: GORILLA_CORS_ORIGINS + value: "{{ join "," .Values.extraCors }}" + {{- end }} + - name: MYSQL_PORT + value: "{{ include "wandb.mysql.port" . }}" + - name: MYSQL_HOST + value: "{{ include "wandb.mysql.host" . }}" + - name: MYSQL_DATABASE + value: "{{ include "wandb.mysql.database" . }}" + - name: MYSQL_USER + value: "{{ include "wandb.mysql.user" . }}" + - name: MYSQL_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "wandb.mysql.passwordSecret" . }} + key: MYSQL_PASSWORD + - name: MYSQL + value: "mysql://$(MYSQL_USER):$(MYSQL_PASSWORD)@$(MYSQL_HOST):$(MYSQL_PORT)/$(MYSQL_DATABASE)" + - name: WEAVE_SERVICE + value: "{{ .Release.Name }}-weave:9994" + - name: PARQUET_HOST + value: "http://{{ .Release.Name }}-parquet:8087" + - name: PARQUET_ENABLED + value: "true" + {{- if index .Values.global "weave-trace" "enabled" }} + - name: WEAVE_TRACES_ENABLED + value: "true" + {{- end }} + {{- if ne (include "wandb.redis.password" .) "" }} + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "wandb.redis.passwordSecret" . }} + key: REDIS_PASSWORD + {{- end }} + - name: REDIS_PORT + value: "{{ include "wandb.redis.port" . }}" + - name: REDIS_HOST + value: "{{ include "wandb.redis.host" . }}" + - name: REDIS + value: "{{ include "app.redis" . | trim }}" + - name: SLACK_CLIENT_ID + value: {{ .Values.global.slack.clientId | quote }} + - name: SLACK_SECRET + valueFrom: + secretKeyRef: + name: {{ include "app.fullname" . }}-config + key: SLACK_SECRET + optional: true + {{- if ne .Values.global.email.smtp.host "" }} + - name: GORILLA_EMAIL_SINK + value: "smtp://{{ .Values.global.email.smtp.user }}:{{ .Values.global.email.smtp.password }}@{{ .Values.global.email.smtp.host }}:{{ .Values.global.email.smtp.port }}" + {{- end }} + - name: LICENSE + valueFrom: + secretKeyRef: + name: {{ include "app.fullname" . }}-config + key: LICENSE + optional: true + - name: GORILLA_LICENSE + valueFrom: + secretKeyRef: + name: {{ include "app.fullname" . }}-config + key: LICENSE + optional: true + {{- if ne .Values.global.auth.oidc.clientId "" }} + - name: OIDC_CLIENT_ID + value: {{ .Values.global.auth.oidc.clientId }} + - name: OIDC_AUTH_METHOD + value: {{ .Values.global.auth.oidc.authMethod }} + - name: OIDC_ISSUER + value: {{ .Values.global.auth.oidc.issuer }} + - name: OIDC_CLIENT_SECRET + value: {{ .Values.global.auth.oidc.secret }} + {{- end }} + - name: GORILLA_SESSION_LENGTH + value: "{{ .Values.global.auth.sessionLengthHours }}h" + {{- if and .Values.global .Values.global.observability }} + {{- if eq (default "custom" .Values.global.observability.mode) "otel" }} + - name: GORILLA_STATSD_PORT + value: "8125" + - name: GORILLA_STATSD_HOST + value: "0.0.0.0" + {{- end }} + {{- end }} + - name: BUCKET + value: "{{ include "app.bucket" . }}" + - name: AWS_REGION + value: {{ .Values.global.bucket.region | default .Values.global.defaultBucket.region }} + - name: AWS_S3_KMS_ID + value: "{{ .Values.global.bucket.kmsKey | default .Values.global.defaultBucket.kmsKey }}" + - name: OPERATOR_ENABLED + value: 'true' + - name: LOGGING_ENABLED + value: 'true' + - name: AZURE_STORAGE_KEY + valueFrom: + secretKeyRef: + name: "{{ include "wandb.bucket.secret" . }}" + key: ACCESS_KEY + optional: true + - name: GORILLA_CUSTOMER_SECRET_STORE_K8S_CONFIG_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: G_HOST_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: BANNERS + value: {{ toJson .Values.global.banners | quote }} + {{- if ne .Values.traceRatio 0.0 }} + - name: GORILLA_TRACER + value: "otlp+grpc://{{ .Release.Name }}-otel-daemonset:4317?trace_ratio={{ .Values.traceRatio }}" + {{- end }} + - name: KAFKA_BROKER_HOST + value: "{{ include "wandb.kafka.brokerHost" . }}" + - name: KAFKA_BROKER_PORT + value: "{{ include "wandb.kafka.brokerPort" . }}" + - name: KAFKA_CLIENT_USER + value: "{{ include "wandb.kafka.user" . }}" + - name: KAFKA_CLIENT_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "wandb.kafka.passwordSecret" . }} + key: KAFKA_CLIENT_PASSWORD + - name: KAFKA_TOPIC_RUN_UPDATE_SHADOW_QUEUE + value: {{ include "wandb.kafka.runUpdatesShadowTopic" .}} + - name: KAFKA_RUN_UPDATE_SHADOW_QUEUE_NUM_PARTITIONS + value: "{{ include "wandb.kafka.runUpdatesShadowNumPartitions" .}}" + - name: OVERFLOW_BUCKET_ADDR + value: "{{ include "app.bucket" .}}" + - name: GORILLA_RUN_UPDATE_SHADOW_QUEUE + value: > + { + "overflow-bucket": { + "store": "$(OVERFLOW_BUCKET_ADDR)", + "name": "wandb", + "prefix": "wandb-overflow" + }, + "addr": "kafka://$(KAFKA_CLIENT_USER):$(KAFKA_CLIENT_PASSWORD)@$(KAFKA_BROKER_HOST):$(KAFKA_BROKER_PORT)/$(KAFKA_TOPIC_RUN_UPDATE_SHADOW_QUEUE)?producer_batch_bytes=1048576&num_partitions=$(KAFKA_RUN_UPDATE_SHADOW_QUEUE_NUM_PARTITIONS)" + } + {{- include "app.extraEnv" (dict "global" $.Values.global "local" .Values) | nindent 12 }} + {{- include "wandb.extraEnvFrom" (dict "root" $ "local" .) | nindent 12 }} + {{- if .healthCheckEnabled }} + livenessProbe: + httpGet: + path: /healthz + port: http + readinessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 20 + periodSeconds: 5 + startupProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 20 + periodSeconds: 5 + failureThreshold: 120 + lifecycle: + preStop: + exec: + command: ["sleep", "25"] + {{- end }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumes: + {{- if ne (include "wandb.redis.caCert" .) "" }} + - name: {{ include "app.fullname" . }}-redis-ca + secret: + secretName: "{{ include "wandb.redis.passwordSecret" . }}" + items: + - key: REDIS_CA_CERT + path: redis_ca.pem + {{- end }} + {{- if .Values.global.customCACerts }} + - name: wandb-ca-certs + configMap: + name: {{ include "wandb.fullname" . }}-ca-certs + {{- end }} +{{- end }} diff --git a/charts/operator-wandb/charts/app/templates/_helpers.tpl b/charts/operator-wandb/charts/app/templates/_helpers.tpl index 48c4e071..b088edc6 100644 --- a/charts/operator-wandb/charts/app/templates/_helpers.tpl +++ b/charts/operator-wandb/charts/app/templates/_helpers.tpl @@ -49,8 +49,8 @@ app.kubernetes.io/managed-by: {{ .Release.Service }} Selector labels */}} {{- define "app.selectorLabels" -}} -app.kubernetes.io/name: {{ include "app.name" . }} -app.kubernetes.io/instance: {{ .Release.Name }} +app.kubernetes.io/name: {{ include "app.name" . }}{{ .suffix }} +app.kubernetes.io/instance: {{ .Release.Name }}{{ .suffix }} {{- end }} {{/* diff --git a/charts/operator-wandb/charts/app/templates/deployment.yaml b/charts/operator-wandb/charts/app/templates/deployment.yaml index 26eead41..364c910e 100644 --- a/charts/operator-wandb/charts/app/templates/deployment.yaml +++ b/charts/operator-wandb/charts/app/templates/deployment.yaml @@ -1,294 +1,36 @@ {{- if .Values.enabled }} -{{- $imageCfg := dict "global" $.Values.global.image "local" $.Values.image -}} -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "app.fullname" . }} - labels: - {{- include "wandb.commonLabels" . | nindent 4 }} - {{- include "app.commonLabels" . | nindent 4 }} - {{- include "app.labels" . | nindent 4 }} - {{- if .Values.deployment.labels -}} - {{- toYaml .Values.deployment.labels | nindent 4 }} - {{- end }} - annotations: - {{- include "wandb.deploymentAnnotations" $ | nindent 4 }} - {{- if .Values.deployment.annotations -}} - {{- toYaml .Values.deployment.annotations | nindent 4 }} - {{- end }} -spec: - replicas: 1 - selector: - matchLabels: - {{- include "wandb.selectorLabels" $ | nindent 6 }} - {{- include "app.labels" . | nindent 6 }} - template: - metadata: - labels: - {{- include "wandb.podLabels" . | nindent 8 }} - {{- include "app.commonLabels" . | nindent 8 }} - {{- include "app.podLabels" . | nindent 8 }} - {{- include "app.labels" . | nindent 8 }} - annotations: - checksum/secret: {{ include (print $.Template.BasePath "/secrets.yaml") . | sha256sum }} - {{- if .Values.pod.annotations -}} - {{- toYaml .Values.pod.annotations | nindent 8 }} - {{- end }} - spec: - serviceAccountName: {{ include "app.serviceAccountName" . }} - {{- if .tolerations }} - tolerations: - {{- toYaml .tolerations | nindent 8 }} - {{- end }} - {{- include "wandb.nodeSelector" . | nindent 6 }} - {{- include "wandb.priorityClassName" . | nindent 6 }} - {{- include "wandb.podSecurityContext" .Values.pod.securityContext | nindent 6 }} - # Extend the pods shutdown grace period from the default of 30s to 60s. - # This goes in the pod template spec. - terminationGracePeriodSeconds: 60 - initContainers: - - name: init-db - image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" - env: - - name: MYSQL_PORT - value: "{{ include "wandb.mysql.port" . }}" - - name: MYSQL_HOST - value: "{{ include "wandb.mysql.host" . }}" - - name: MYSQL_DATABASE - value: "{{ include "wandb.mysql.database" . }}" - - name: MYSQL_USER - value: "{{ include "wandb.mysql.user" . }}" - - name: MYSQL_PASSWORD - valueFrom: - secretKeyRef: - name: {{ include "wandb.mysql.passwordSecret" . }} - key: MYSQL_PASSWORD - command: ['bash', '-c', "until mysql -h$MYSQL_HOST -u$MYSQL_USER -p$MYSQL_PASSWORD -D$MYSQL_DATABASE -P$MYSQL_PORT --execute=\"SELECT 1\"; do echo waiting for db; sleep 2; done"] - containers: - - name: {{ .Chart.Name }} - image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" - volumeMounts: - {{- if ne (include "wandb.redis.caCert" .) "" }} - - name: {{ include "app.fullname" . }}-redis-ca - mountPath: /etc/ssl/certs/redis_ca.pem - subPath: redis_ca.pem - {{- end }} - {{- range $index, $v := .Values.global.customCACerts }} - - name: wandb-ca-certs - mountPath: /usr/local/share/ca-certificates/customCA{{$index}}.crt - subPath: customCA{{$index}}.crt - {{- end }} - ports: - - name: http - containerPort: 8080 - protocol: TCP - - name: prometheus - containerPort: 8181 - protocol: TCP - - name: gorilla-statsd - containerPort: 8125 - protocol: TCP - env: - - name: HOST - value: "{{ .Values.global.host }}" - - {{- if .Values.extraCors }} - - name: GORILLA_CORS_ORIGINS - value: "{{ join "," .Values.extraCors }}" - {{- end }} - - - name: MYSQL_PORT - value: "{{ include "wandb.mysql.port" . }}" - - name: MYSQL_HOST - value: "{{ include "wandb.mysql.host" . }}" - - name: MYSQL_DATABASE - value: "{{ include "wandb.mysql.database" . }}" - - name: MYSQL_USER - value: "{{ include "wandb.mysql.user" . }}" - - name: MYSQL_PASSWORD - valueFrom: - secretKeyRef: - name: {{ include "wandb.mysql.passwordSecret" . }} - key: MYSQL_PASSWORD - - name: MYSQL - value: "mysql://$(MYSQL_USER):$(MYSQL_PASSWORD)@$(MYSQL_HOST):$(MYSQL_PORT)/$(MYSQL_DATABASE)" - - - name: WEAVE_SERVICE - value: "{{ .Release.Name }}-weave:9994" - - name: PARQUET_HOST - value: "http://{{ .Release.Name }}-parquet:8087" - - name: PARQUET_ENABLED - value: "true" - {{- if index .Values.global "weave-trace" "enabled" }} - - name: WEAVE_TRACES_ENABLED - value: "true" - {{- end }} - - {{- if ne (include "wandb.redis.password" .) "" }} - - name: REDIS_PASSWORD - valueFrom: - secretKeyRef: - name: {{ include "wandb.redis.passwordSecret" . }} - key: REDIS_PASSWORD - {{- end }} - - name: REDIS_PORT - value: "{{ include "wandb.redis.port" . }}" - - name: REDIS_HOST - value: "{{ include "wandb.redis.host" . }}" - - name: REDIS - value: "{{ include "app.redis" . | trim }}" - - - name: SLACK_CLIENT_ID - value: {{ .Values.global.slack.clientId | quote }} - - name: SLACK_SECRET - valueFrom: - secretKeyRef: - name: {{ include "app.fullname" . }}-config - key: SLACK_SECRET - optional: true - {{- if ne .Values.global.email.smtp.host "" }} - - name: GORILLA_EMAIL_SINK - value: "smtp://{{ .Values.global.email.smtp.user }}:{{ .Values.global.email.smtp.password }}@{{ .Values.global.email.smtp.host }}:{{ .Values.global.email.smtp.port }}" - {{- end }} - - - name: LICENSE - valueFrom: - secretKeyRef: - name: {{ include "app.fullname" . }}-config - key: LICENSE - optional: true - - name: GORILLA_LICENSE - valueFrom: - secretKeyRef: - name: {{ include "app.fullname" . }}-config - key: LICENSE - optional: true - {{- if ne .Values.global.auth.oidc.clientId "" }} - - name: OIDC_CLIENT_ID - value: {{ .Values.global.auth.oidc.clientId }} - - name: OIDC_AUTH_METHOD - value: {{ .Values.global.auth.oidc.authMethod }} - - name: OIDC_ISSUER - value: {{ .Values.global.auth.oidc.issuer }} - - name: OIDC_CLIENT_SECRET - value: {{ .Values.global.auth.oidc.secret }} - {{- end }} - - - name: GORILLA_SESSION_LENGTH - value: "{{ .Values.global.auth.sessionLengthHours }}h" - - {{- if and .Values.global .Values.global.observability }} - {{- if eq (default "custom" .Values.global.observability.mode) "otel" }} - - name: GORILLA_STATSD_PORT - value: "8125" - - name: GORILLA_STATSD_HOST - value: "0.0.0.0" - {{- end }} - {{- end }} - - - name: BUCKET - value: "{{ include "app.bucket" . }}" - - name: AWS_REGION - value: {{ .Values.global.bucket.region | default .Values.global.defaultBucket.region }} - - name: AWS_S3_KMS_ID - value: "{{ .Values.global.bucket.kmsKey | default .Values.global.defaultBucket.kmsKey }}" - - - name: OPERATOR_ENABLED - value: 'true' - - - name: LOGGING_ENABLED - value: 'true' - - - name: AZURE_STORAGE_KEY - valueFrom: - secretKeyRef: - name: "{{ include "wandb.bucket.secret" . }}" - key: ACCESS_KEY - optional: true - - - name: GORILLA_CUSTOMER_SECRET_STORE_K8S_CONFIG_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - - name: G_HOST_IP - valueFrom: - fieldRef: - fieldPath: status.hostIP - - - name: BANNERS - value: {{ toJson .Values.global.banners | quote }} - - {{- if ne .Values.traceRatio 0.0 }} - - name: GORILLA_TRACER - value: "otlp+grpc://{{ .Release.Name }}-otel-daemonset:4317?trace_ratio={{ .Values.traceRatio }}" - {{- end }} - - name: KAFKA_BROKER_HOST - value: "{{ include "wandb.kafka.brokerHost" . }}" - - name: KAFKA_BROKER_PORT - value: "{{ include "wandb.kafka.brokerPort" . }}" - - name: KAFKA_CLIENT_USER - value: "{{ include "wandb.kafka.user" . }}" - - name: KAFKA_CLIENT_PASSWORD - valueFrom: - secretKeyRef: - name: {{ include "wandb.kafka.passwordSecret" . }} - key: KAFKA_CLIENT_PASSWORD - - name: KAFKA_TOPIC_RUN_UPDATE_SHADOW_QUEUE - value: {{ include "wandb.kafka.runUpdatesShadowTopic" .}} - - name: KAFKA_RUN_UPDATE_SHADOW_QUEUE_NUM_PARTITIONS - value: "{{ include "wandb.kafka.runUpdatesShadowNumPartitions" .}}" - - name: OVERFLOW_BUCKET_ADDR - value: "{{ include "app.bucket" .}}" - - name: GORILLA_RUN_UPDATE_SHADOW_QUEUE - value: > - { - "overflow-bucket": { - "store": "$(OVERFLOW_BUCKET_ADDR)", - "name": "wandb", - "prefix": "wandb-overflow" - }, - "addr": "kafka://$(KAFKA_CLIENT_USER):$(KAFKA_CLIENT_PASSWORD)@$(KAFKA_BROKER_HOST):$(KAFKA_BROKER_PORT)/$(KAFKA_TOPIC_RUN_UPDATE_SHADOW_QUEUE)?producer_batch_bytes=1048576&num_partitions=$(KAFKA_RUN_UPDATE_SHADOW_QUEUE_NUM_PARTITIONS)" - } - {{- include "app.extraEnv" (dict "global" $.Values.global "local" .Values) | nindent 12 }} - {{- include "wandb.extraEnvFrom" (dict "root" $ "local" .) | nindent 12 }} - livenessProbe: - httpGet: - path: /healthz - port: http - readinessProbe: - httpGet: - path: /ready - port: http - initialDelaySeconds: 20 - periodSeconds: 5 - startupProbe: - httpGet: - path: /ready - port: http - initialDelaySeconds: 20 - periodSeconds: 5 - failureThreshold: 120 - # Increase the sleep before SIGTERM to 25s. I had this as 5s previously and it wasn't enough. - lifecycle: - preStop: - exec: - command: ["sleep", "25"] - - resources: - {{- toYaml .Values.resources | nindent 12 }} - volumes: - {{- if ne (include "wandb.redis.caCert" .) "" }} - - name: {{ include "app.fullname" . }}-redis-ca - secret: - secretName: "{{ include "wandb.redis.passwordSecret" . }}" - items: - - key: REDIS_CA_CERT - path: redis_ca.pem - {{- end }} - {{- if .Values.global.customCACerts }} - - name: wandb-ca-certs - configMap: - name: {{ include "wandb.fullname" . }}-ca-certs - {{- end }} +{{- $glueSingletonEnabled := .Values.glueSingleton.enabled }} +{{- $notGlueSingletonEnabled := not $glueSingletonEnabled }} + +{{/* +App deployment +*/}} +{{ include "app.deployment" (dict + "glueSingletonEnabled" $notGlueSingletonEnabled + "onlyService" nil + "suffix" "" + "healthCheckEnabled" true + "Values" .Values + "Chart" .Chart + "Release" .Release + "Template" .Template + "Capabilities" .Capabilities +) | indent 0 }} +--- +{{- if $glueSingletonEnabled }} +{{/* +Glue deployment +*/}} +{{ include "app.deployment" (dict + "glueSingletonEnabled" $glueSingletonEnabled + "onlyService" "gorilla-glue" + "suffix" "-glue" + "healthCheckEnabled" false + "Values" .Values + "Chart" .Chart + "Release" .Release + "Template" .Template + "Capabilities" .Capabilities +) | indent 0 }} +{{- end }} {{- end }} diff --git a/charts/operator-wandb/charts/app/templates/hpa.yaml b/charts/operator-wandb/charts/app/templates/hpa.yaml index f192d217..bf2406c4 100644 --- a/charts/operator-wandb/charts/app/templates/hpa.yaml +++ b/charts/operator-wandb/charts/app/templates/hpa.yaml @@ -1,3 +1,4 @@ +{{- if and .Values.autoscaling.hpa.enabled .Values.glueSingleton.enabled }} apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: @@ -12,8 +13,8 @@ spec: apiVersion: apps/v1 kind: Deployment name: {{ include "app.fullname" . }} - minReplicas: 1 - maxReplicas: 1 + minReplicas: {{ .Values.autoscaling.hpa.minReplicas }} + maxReplicas: {{ .Values.autoscaling.hpa.maxReplicas }} metrics: - type: Resource resource: @@ -21,3 +22,4 @@ spec: target: type: Utilization averageUtilization: 70 +{{- end }} diff --git a/charts/operator-wandb/charts/app/values.yaml b/charts/operator-wandb/charts/app/values.yaml index ffc8d8fb..f93db6c3 100644 --- a/charts/operator-wandb/charts/app/values.yaml +++ b/charts/operator-wandb/charts/app/values.yaml @@ -9,6 +9,15 @@ image: pullPolicy: Always # pullSecrets: [] +glueSingleton: + enabled: false + +autoscaling: + hpa: + enabled: false + minReplicas: 1 + maxReplicas: 1 + # Tolerations for pod scheduling tolerations: [] diff --git a/charts/operator-wandb/local-development.md b/charts/operator-wandb/local-development.md index 38f7d840..852bef1d 100644 --- a/charts/operator-wandb/local-development.md +++ b/charts/operator-wandb/local-development.md @@ -32,7 +32,6 @@ az account set --subscription az aks get-credentials --resource-group --name ``` - #### GCP Authenticate with the Google Cloud SDK: @@ -56,13 +55,13 @@ cd helm-charts Extract the current values from the deployed Helm chart and scale down the `wandb-controller-manager` deployment: ```bash -helm get values wandb > operator-spec.yaml +helm get values wandb > secret.operator-spec.yaml kubectl scale --replicas=0 deployment -n wandb wandb-controller-manager ``` ### 4. Develop and Test Your Changes -After extracting the current chart values into `operator-spec.yaml`, you can start making your changes to the chart or the operator specifications. +After extracting the current chart values into `secret.operator-spec.yaml`, you can start making your changes to the chart or the operator specifications. #### Building Dependencies @@ -77,9 +76,15 @@ helm dependency build ./charts/operator-wandb To apply your changes, upgrade the Helm release with your modified specifications: ```bash +# Helm template command +helm template wandb \ + ./charts/operator-wandb -f ./secret.operator-spec.yaml > secret.template.yaml + +# Helm upgrade command helm upgrade \ --install wandb \ - ./charts/operator-wandb -f ./operator-spec.yaml + ./charts/operator-wandb -f ./secret.operator-spec.yaml + ``` ### 5. Finalizing Development @@ -89,9 +94,9 @@ After completing your development work: 1. Ensure to increment the version in `Chart.yaml` of your Helm chart, e.g., `0.10.43`. 2. Scale the `wandb-controller-manager` deployment back up: - ```bash - kubectl scale --replicas=1 deployment -n wandb wandb-controller-manager - ``` + ```bash + kubectl scale --replicas=1 deployment -n wandb wandb-controller-manager + ``` ## Contributing diff --git a/charts/operator-wandb/templates/_helpers.tpl b/charts/operator-wandb/templates/_helpers.tpl index 200c75ce..7e82ea06 100644 --- a/charts/operator-wandb/templates/_helpers.tpl +++ b/charts/operator-wandb/templates/_helpers.tpl @@ -46,7 +46,7 @@ app.kubernetes.io/managed-by: {{ .Release.Service }} Selector labels */}} {{- define "wandb.selectorLabels" -}} -app.kubernetes.io/name: {{ include "wandb.name" . }} +app.kubernetes.io/name: {{ include "wandb.name" . }}{{ .suffix }} app.kubernetes.io/instance: {{ .Release.Name }} {{- end }} From 43982fb6e0f90a673cc77e50ce895c0c9b117d01 Mon Sep 17 00:00:00 2001 From: Aman Pruthi Date: Fri, 27 Sep 2024 12:07:43 +0530 Subject: [PATCH 3/4] feat: Disable permissions for node-level metrics and logs (#219) Disable permissions for node-level metrics and logs --- charts/operator-wandb/Chart.lock | 2 +- charts/operator-wandb/Chart.yaml | 2 +- charts/operator-wandb/charts/console/templates/role.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/charts/operator-wandb/Chart.lock b/charts/operator-wandb/Chart.lock index 13ec8ee7..11351ed7 100644 --- a/charts/operator-wandb/Chart.lock +++ b/charts/operator-wandb/Chart.lock @@ -42,4 +42,4 @@ dependencies: repository: file://charts/yace version: 0.1.0 digest: sha256:bca2b6781737da6806e4485605cf9ce87b1428944b14cb88f082024cc3500bbd -generated: "2024-07-18T01:17:04.532871-04:00" +generated: "2024-09-23T18:18:08.220787+05:30" diff --git a/charts/operator-wandb/Chart.yaml b/charts/operator-wandb/Chart.yaml index 03691558..f0fb66b1 100644 --- a/charts/operator-wandb/Chart.yaml +++ b/charts/operator-wandb/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: operator-wandb description: A Helm chart for deploying W&B to Kubernetes type: application -version: 0.18.0 +version: 0.18.1 appVersion: 1.0.0 icon: https://wandb.ai/logo.svg diff --git a/charts/operator-wandb/charts/console/templates/role.yaml b/charts/operator-wandb/charts/console/templates/role.yaml index 805ad0ff..0b45d3f9 100644 --- a/charts/operator-wandb/charts/console/templates/role.yaml +++ b/charts/operator-wandb/charts/console/templates/role.yaml @@ -24,7 +24,7 @@ rules: resources: [ "secrets" ] verbs: [ "get", "list", "watch", "patch", "create" ] - apiGroups: [ "" ] - resources: [ "nodes", "namespaces", "pods", "pods/log", "configmaps", "services", "serviceaccounts", "events" ] + resources: [ "namespaces", "pods", "pods/log", "configmaps", "services", "serviceaccounts", "events" ] verbs: [ "get", "list" ] - apiGroups: [ "apps" ] resources: [ "deployments", "statefulsets", "daemonsets", "replicasets", "controllerrevisions" ] From dbd397307a2951ee023ea306371aff82f8991f16 Mon Sep 17 00:00:00 2001 From: Aastha Gupta <71313011+velotioaastha@users.noreply.github.com> Date: Mon, 30 Sep 2024 11:20:34 +0530 Subject: [PATCH 4/4] feat: Support to pull bucket configurations from secrets (#224) Support to pull bucket configurations from secrets --- charts/operator-wandb/Chart.yaml | 2 +- charts/operator-wandb/templates/_bucket.tpl | 10 ++++++++-- charts/operator-wandb/templates/bucket.yaml | 4 +++- charts/operator-wandb/values.yaml | 4 +++- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/charts/operator-wandb/Chart.yaml b/charts/operator-wandb/Chart.yaml index f0fb66b1..f5fe1602 100644 --- a/charts/operator-wandb/Chart.yaml +++ b/charts/operator-wandb/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: operator-wandb description: A Helm chart for deploying W&B to Kubernetes type: application -version: 0.18.1 +version: 0.18.2 appVersion: 1.0.0 icon: https://wandb.ai/logo.svg diff --git a/charts/operator-wandb/templates/_bucket.tpl b/charts/operator-wandb/templates/_bucket.tpl index 5bad369d..b93320f1 100644 --- a/charts/operator-wandb/templates/_bucket.tpl +++ b/charts/operator-wandb/templates/_bucket.tpl @@ -1,6 +1,12 @@ {{/* -Return name of secret where bucket information is stored +Return the bucket credentials secret name */}} {{- define "wandb.bucket.secret" -}} -{{- print .Release.Name "-bucket" -}} +{{- if .Values.global.bucket.secretName -}} + {{ .Values.global.bucket.secretName }} +{{- else if .Values.global.defaultBucket.secretName -}} + {{ .Values.global.defaultBucket.secretName }} +{{- else }} + {{- print .Release.Name "-bucket" -}} {{- end -}} +{{- end }} diff --git a/charts/operator-wandb/templates/bucket.yaml b/charts/operator-wandb/templates/bucket.yaml index 4dc51148..08ff39ac 100644 --- a/charts/operator-wandb/templates/bucket.yaml +++ b/charts/operator-wandb/templates/bucket.yaml @@ -1,3 +1,4 @@ +{{- if not .Values.global.bucket.secretName }} apiVersion: v1 kind: Secret metadata: @@ -6,4 +7,5 @@ metadata: {{- include "wandb.commonLabels" . | nindent 4 }} data: ACCESS_KEY: {{ .Values.global.bucket.accessKey | default .Values.global.defaultBucket.accessKey | b64enc }} - SECRET_KEY: {{ .Values.global.bucket.secretKey | default .Values.global.defaultBucket.secretKey | b64enc }} \ No newline at end of file + SECRET_KEY: {{ .Values.global.bucket.secretKey | default .Values.global.defaultBucket.secretKey | b64enc }} +{{- end }} \ No newline at end of file diff --git a/charts/operator-wandb/values.yaml b/charts/operator-wandb/values.yaml index ad775dc5..3bf7e653 100644 --- a/charts/operator-wandb/values.yaml +++ b/charts/operator-wandb/values.yaml @@ -96,9 +96,11 @@ global: kmsKey: "" secretKey: "" accessKey: "" + secretName: "" # If specified the application will use this bucket for all storage operations, and will not be overridable by the user. - bucket: {} + bucket: + secretName: "" redis: host: ""