diff --git a/charts/operator-wandb/Chart.lock b/charts/operator-wandb/Chart.lock index 3a086c68..5a32264a 100644 --- a/charts/operator-wandb/Chart.lock +++ b/charts/operator-wandb/Chart.lock @@ -41,5 +41,8 @@ dependencies: - name: yace repository: file://charts/yace version: 0.1.0 -digest: sha256:bca2b6781737da6806e4485605cf9ce87b1428944b14cb88f082024cc3500bbd -generated: "2024-10-09T17:53:33.992+05:30" +- name: glue + repository: file://charts/glue + version: 0.1.0 +digest: sha256:e1cc6a9a4d904d5a54fb104579a72b917678b94b2ff7999b2721791e8ef04377 +generated: "2024-10-23T16:56:27.529427-07:00" diff --git a/charts/operator-wandb/Chart.yaml b/charts/operator-wandb/Chart.yaml index bd7e679a..139bf0f5 100644 --- a/charts/operator-wandb/Chart.yaml +++ b/charts/operator-wandb/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: operator-wandb description: A Helm chart for deploying W&B to Kubernetes type: application -version: 0.18.11 +version: 0.19.0 appVersion: 1.0.0 icon: https://wandb.ai/logo.svg @@ -68,3 +68,7 @@ dependencies: version: "*.*.*" repository: file://charts/yace condition: yace.install + - name: glue + version: "*.*.*" + repository: file://charts/glue + condition: global.beta.glue.enabled diff --git a/charts/operator-wandb/charts/app/templates/_deployment.tpl b/charts/operator-wandb/charts/app/templates/_deployment.tpl deleted file mode 100644 index bd91a74f..00000000 --- a/charts/operator-wandb/charts/app/templates/_deployment.tpl +++ /dev/null @@ -1,298 +0,0 @@ -{{/* -This template is used to generate the deployment for the app, and is used for both the non-glue and glue deployments. -*/}} -{{- define "app.deployment" -}} -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "app.fullname" . }}{{ .suffix }} - labels: - {{- include "wandb.commonLabels" . | nindent 4 }} - {{- include "app.commonLabels" . | nindent 4 }} - {{- include "app.labels" . 
| nindent 4 }} - {{- if .Values.deployment.labels }} - {{- toYaml .Values.deployment.labels | nindent 4 }} - {{- end }} - annotations: - {{- include "wandb.deploymentAnnotations" . | nindent 4 }} - {{- if .Values.deployment.annotations }} - {{- toYaml .Values.deployment.annotations | nindent 4 }} - {{- end }} -spec: - replicas: 1 - selector: - matchLabels: - {{- include "wandb.selectorLabels" . | nindent 6 }} - {{- include "app.labels" . | nindent 6 }} - template: - metadata: - labels: - {{- include "wandb.podLabels" . | nindent 8 }} - {{- include "app.commonLabels" . | nindent 8 }} - {{- include "app.podLabels" . | nindent 8 }} - {{- include "app.labels" . | nindent 8 }} - annotations: - checksum/secret: {{ include (print $.Template.BasePath "/secrets.yaml") . | sha256sum }} - {{- if .Values.pod.annotations }} - {{- toYaml .Values.pod.annotations | nindent 8 }} - {{- end }} - spec: - serviceAccountName: {{ include "app.serviceAccountName" . }} - {{- if .tolerations }} - tolerations: - {{- toYaml .tolerations | nindent 8 }} - {{- end }} - {{- include "wandb.nodeSelector" . | nindent 6 }} - {{- include "wandb.priorityClassName" . | nindent 6 }} - {{- include "wandb.podSecurityContext" .Values.pod.securityContext | nindent 6 }} - terminationGracePeriodSeconds: 60 - initContainers: - - name: init-db - image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" - env: - - name: MYSQL_PORT - value: "{{ include "wandb.mysql.port" . }}" - - name: MYSQL_HOST - value: "{{ include "wandb.mysql.host" . }}" - - name: MYSQL_DATABASE - value: "{{ include "wandb.mysql.database" . }}" - - name: MYSQL_USER - value: "{{ include "wandb.mysql.user" . }}" - - name: MYSQL_PASSWORD - valueFrom: - secretKeyRef: - name: {{ include "wandb.mysql.passwordSecret" . 
}} - key: MYSQL_PASSWORD - command: ['bash', '-c', "until mysql -h$MYSQL_HOST -u$MYSQL_USER -p$MYSQL_PASSWORD -D$MYSQL_DATABASE -P$MYSQL_PORT --execute=\"SELECT 1\"; do echo waiting for db; sleep 2; done"] - containers: - - name: {{ .Chart.Name }} - image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" - volumeMounts: - {{- if ne (include "wandb.redis.caCert" .) "" }} - - name: {{ include "app.fullname" . }}-redis-ca - mountPath: /etc/ssl/certs/redis_ca.pem - subPath: redis_ca.pem - {{- end }} - {{- range $index, $v := .Values.global.customCACerts }} - - name: wandb-ca-certs - mountPath: /usr/local/share/ca-certificates/customCA{{$index}}.crt - subPath: customCA{{$index}}.crt - {{- end }} - ports: - - name: http - containerPort: 8080 - protocol: TCP - - name: prometheus - containerPort: 8181 - protocol: TCP - - name: gorilla-statsd - containerPort: 8125 - protocol: TCP - env: - - name: GLUE_ENABLED - value: "{{ .glueSingletonEnabled }}" - {{- if .onlyService }} - - name: ONLY_SERVICE - value: {{ .onlyService }} - {{- end }} - - name: HOST - value: "{{ .Values.global.host }}" - {{- if .Values.extraCors }} - - name: GORILLA_CORS_ORIGINS - value: "{{ join "," .Values.extraCors }}" - {{- end }} - - name: MYSQL_PORT - value: "{{ include "wandb.mysql.port" . }}" - - name: MYSQL_HOST - value: "{{ include "wandb.mysql.host" . }}" - - name: MYSQL_DATABASE - value: "{{ include "wandb.mysql.database" . }}" - - name: MYSQL_USER - value: "{{ include "wandb.mysql.user" . }}" - - name: MYSQL_PASSWORD - valueFrom: - secretKeyRef: - name: {{ include "wandb.mysql.passwordSecret" . 
}} - key: MYSQL_PASSWORD - - name: MYSQL - value: "mysql://$(MYSQL_USER):$(MYSQL_PASSWORD)@$(MYSQL_HOST):$(MYSQL_PORT)/$(MYSQL_DATABASE)" - - name: WEAVE_SERVICE - value: "{{ .Release.Name }}-weave:9994" - - name: PARQUET_HOST - value: "http://{{ .Release.Name }}-parquet:8087" - - name: PARQUET_ENABLED - value: "true" - {{- if index .Values.global "weave-trace" "enabled" }} - - name: WEAVE_TRACES_ENABLED - value: "true" - {{- end }} - - name: REDIS_PASSWORD - valueFrom: - secretKeyRef: - name: {{ include "wandb.redis.passwordSecret" . }} - optional: true - key: REDIS_PASSWORD - - name: REDIS_PORT - value: "{{ include "wandb.redis.port" . }}" - - name: REDIS_HOST - value: "{{ include "wandb.redis.host" . }}" - - name: REDIS - value: "{{ include "app.redis" . | trim }}" - - name: SLACK_CLIENT_ID - value: {{ .Values.global.slack.clientId | quote }} - - name: SLACK_SECRET - valueFrom: - secretKeyRef: - name: {{ include "app.fullname" . }}-config - key: SLACK_SECRET - optional: true - {{- if ne .Values.global.email.smtp.host "" }} - - name: GORILLA_EMAIL_SINK - value: "smtp://{{ .Values.global.email.smtp.user }}:{{ .Values.global.email.smtp.password }}@{{ .Values.global.email.smtp.host }}:{{ .Values.global.email.smtp.port }}" - {{- end }} - {{- if and .Values.global.licenseSecret.name .Values.global.licenseSecret.key }} - - name: LICENSE - valueFrom: - secretKeyRef: - name: {{ .Values.global.licenseSecret.name }} - key: {{ .Values.global.licenseSecret.key }} - optional: true - - name: GORILLA_LICENSE - valueFrom: - secretKeyRef: - name: {{ .Values.global.licenseSecret.name }} - key: {{ .Values.global.licenseSecret.key }} - optional: true - {{- else }} - - name: LICENSE - valueFrom: - secretKeyRef: - name: {{ include "app.fullname" . }}-config - key: LICENSE - optional: true - - name: GORILLA_LICENSE - valueFrom: - secretKeyRef: - name: {{ include "app.fullname" . 
}}-config - key: LICENSE - optional: true - {{- end }} - {{- if ne .Values.global.auth.oidc.clientId "" }} - - name: OIDC_CLIENT_ID - value: {{ .Values.global.auth.oidc.clientId }} - - name: OIDC_AUTH_METHOD - value: {{ .Values.global.auth.oidc.authMethod }} - - name: OIDC_ISSUER - value: {{ .Values.global.auth.oidc.issuer }} - - name: OIDC_CLIENT_SECRET - value: {{ .Values.global.auth.oidc.secret }} - {{- end }} - - name: GORILLA_SESSION_LENGTH - value: "{{ .Values.global.auth.sessionLengthHours }}h" - {{- if and .Values.global .Values.global.observability }} - {{- if eq (default "custom" .Values.global.observability.mode) "otel" }} - - name: GORILLA_STATSD_PORT - value: "8125" - - name: GORILLA_STATSD_HOST - value: "0.0.0.0" - {{- end }} - {{- end }} - - name: BUCKET - value: "{{ include "app.bucket" . }}" - - name: AWS_REGION - value: {{ .Values.global.bucket.region | default .Values.global.defaultBucket.region }} - - name: AWS_S3_KMS_ID - value: "{{ .Values.global.bucket.kmsKey | default .Values.global.defaultBucket.kmsKey }}" - - name: OPERATOR_ENABLED - value: 'true' - - name: LOGGING_ENABLED - value: 'true' - - name: AZURE_STORAGE_KEY - valueFrom: - secretKeyRef: - name: "{{ include "wandb.bucket.secret" . }}" - key: ACCESS_KEY - optional: true - - name: GORILLA_CUSTOMER_SECRET_STORE_K8S_CONFIG_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: G_HOST_IP - valueFrom: - fieldRef: - fieldPath: status.hostIP - - name: BANNERS - value: {{ toJson .Values.global.banners | quote }} - {{- if ne .Values.traceRatio 0.0 }} - - name: GORILLA_TRACER - value: "otlp+grpc://{{ .Release.Name }}-otel-daemonset:4317?trace_ratio={{ .Values.traceRatio }}" - {{- end }} - - name: KAFKA_BROKER_HOST - value: "{{ include "wandb.kafka.brokerHost" . }}" - - name: KAFKA_BROKER_PORT - value: "{{ include "wandb.kafka.brokerPort" . }}" - - name: KAFKA_CLIENT_USER - value: "{{ include "wandb.kafka.user" . 
}}" - - name: KAFKA_CLIENT_PASSWORD - valueFrom: - secretKeyRef: - name: {{ include "wandb.kafka.passwordSecret" . }} - key: KAFKA_CLIENT_PASSWORD - - name: KAFKA_TOPIC_RUN_UPDATE_SHADOW_QUEUE - value: {{ include "wandb.kafka.runUpdatesShadowTopic" .}} - - name: KAFKA_RUN_UPDATE_SHADOW_QUEUE_NUM_PARTITIONS - value: "{{ include "wandb.kafka.runUpdatesShadowNumPartitions" .}}" - - name: OVERFLOW_BUCKET_ADDR - value: "{{ include "app.bucket" .}}" - - name: GORILLA_RUN_UPDATE_SHADOW_QUEUE - value: > - { - "overflow-bucket": { - "store": "$(OVERFLOW_BUCKET_ADDR)", - "name": "wandb", - "prefix": "wandb-overflow" - }, - "addr": "kafka://$(KAFKA_CLIENT_USER):$(KAFKA_CLIENT_PASSWORD)@$(KAFKA_BROKER_HOST):$(KAFKA_BROKER_PORT)/$(KAFKA_TOPIC_RUN_UPDATE_SHADOW_QUEUE)?producer_batch_bytes=1048576&num_partitions=$(KAFKA_RUN_UPDATE_SHADOW_QUEUE_NUM_PARTITIONS)&replication_factor=3" - } - {{- include "app.extraEnv" (dict "global" $.Values.global "local" .Values) | nindent 12 }} - {{- include "wandb.extraEnvFrom" (dict "root" $ "local" .) | nindent 12 }} - {{- if .healthCheckEnabled }} - livenessProbe: - httpGet: - path: /healthz - port: http - readinessProbe: - httpGet: - path: /ready - port: http - initialDelaySeconds: 20 - periodSeconds: 5 - startupProbe: - httpGet: - path: /ready - port: http - initialDelaySeconds: 20 - periodSeconds: 5 - failureThreshold: 120 - lifecycle: - preStop: - exec: - command: ["sleep", "25"] - {{- end }} - resources: - {{- toYaml .Values.resources | nindent 12 }} - volumes: - {{- if ne (include "wandb.redis.caCert" .) "" }} - - name: {{ include "app.fullname" . }}-redis-ca - secret: - secretName: "{{ .Release.Name }}-redis" - items: - - key: REDIS_CA_CERT - path: redis_ca.pem - {{- end }} - {{- if .Values.global.customCACerts }} - - name: wandb-ca-certs - configMap: - name: {{ include "wandb.fullname" . 
}}-ca-certs - {{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/operator-wandb/charts/app/templates/deployment.yaml b/charts/operator-wandb/charts/app/templates/deployment.yaml index c7d97c51..09126d76 100644 --- a/charts/operator-wandb/charts/app/templates/deployment.yaml +++ b/charts/operator-wandb/charts/app/templates/deployment.yaml @@ -1,36 +1,280 @@ {{- if .Values.enabled }} -{{- $glueSingletonEnabled := .Values.glueSingleton.enabled }} -{{- $notGlueSingletonEnabled := not $glueSingletonEnabled }} - -{{/* -App deployment -*/}} -{{ include "app.deployment" (dict - "glueSingletonEnabled" $notGlueSingletonEnabled - "onlyService" nil - "suffix" "" - "healthCheckEnabled" true - "Values" .Values - "Chart" .Chart - "Release" .Release - "Template" .Template - "Capabilities" .Capabilities -) | indent 0 }} ---- -{{- if $glueSingletonEnabled }} -{{/* -Glue deployment -*/}} -{{ include "app.deployment" (dict - "glueSingletonEnabled" $glueSingletonEnabled - "onlyService" "gorilla-glue" - "suffix" "-glue" - "healthCheckEnabled" false - "Values" .Values - "Chart" .Chart - "Release" .Release - "Template" .Template - "Capabilities" .Capabilities -) | indent 0 }} -{{- end }} +{{- $imageCfg := dict "global" $.Values.global.image "local" $.Values.image -}} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "app.fullname" . }} + labels: + {{- include "wandb.commonLabels" . | nindent 4 }} + {{- include "app.commonLabels" . | nindent 4 }} + {{- include "app.labels" . | nindent 4 }} + {{- if .Values.deployment.labels -}} + {{- toYaml .Values.deployment.labels | nindent 4 }} + {{- end }} + annotations: + {{- include "wandb.deploymentAnnotations" $ | nindent 4 }} + {{- if .Values.deployment.annotations -}} + {{- toYaml .Values.deployment.annotations | nindent 4 }} + {{- end }} +spec: + replicas: 1 + selector: + matchLabels: + {{- include "wandb.selectorLabels" $ | nindent 6 }} + {{- include "app.labels" . 
| nindent 6 }} + template: + metadata: + labels: + {{- include "wandb.podLabels" . | nindent 8 }} + {{- include "app.commonLabels" . | nindent 8 }} + {{- include "app.podLabels" . | nindent 8 }} + {{- include "app.labels" . | nindent 8 }} + annotations: + checksum/secret: {{ include (print $.Template.BasePath "/secrets.yaml") . | sha256sum }} + {{- if .Values.pod.annotations -}} + {{- toYaml .Values.pod.annotations | nindent 8 }} + {{- end }} + spec: + serviceAccountName: {{ include "app.serviceAccountName" . }} + {{- if .Values.tolerations }}{{/* tolerations are chart values; the root context has no .tolerations key */}} + tolerations: + {{- toYaml .Values.tolerations | nindent 8 }} + {{- end }} + {{- include "wandb.nodeSelector" . | nindent 6 }} + {{- include "wandb.priorityClassName" . | nindent 6 }} + {{- include "wandb.podSecurityContext" .Values.pod.securityContext | nindent 6 }} + # Extend the pod's shutdown grace period from the default of 30s to 60s. + # This goes in the pod template spec. + terminationGracePeriodSeconds: 60 + initContainers: + - name: init-db + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + env: + - name: MYSQL_PORT + value: "{{ include "wandb.mysql.port" . }}" + - name: MYSQL_HOST + value: "{{ include "wandb.mysql.host" . }}" + - name: MYSQL_DATABASE + value: "{{ include "wandb.mysql.database" . }}" + - name: MYSQL_USER + value: "{{ include "wandb.mysql.user" . }}" + - name: MYSQL_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "wandb.mysql.passwordSecret" . }} + key: MYSQL_PASSWORD + command: ['bash', '-c', "until mysql -h$MYSQL_HOST -u$MYSQL_USER -p$MYSQL_PASSWORD -D$MYSQL_DATABASE -P$MYSQL_PORT --execute=\"SELECT 1\"; do echo waiting for db; sleep 2; done"] + containers: + - name: {{ .Chart.Name }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + volumeMounts: + {{- if ne (include "wandb.redis.caCert" .) "" }} + - name: {{ include "app.fullname" . 
}}-redis-ca + mountPath: /etc/ssl/certs/redis_ca.pem + subPath: redis_ca.pem + {{- end }} + {{- range $index, $v := .Values.global.customCACerts }} + - name: wandb-ca-certs + mountPath: /usr/local/share/ca-certificates/customCA{{$index}}.crt + subPath: customCA{{$index}}.crt + {{- end }} + ports: + - name: http + containerPort: 8080 + protocol: TCP + - name: prometheus + containerPort: 8181 + protocol: TCP + - name: gorilla-statsd + containerPort: 8125 + protocol: TCP + env: + - name: GLUE_ENABLED + value: "{{ not .Values.global.beta.glue.enabled }}" + - name: HOST + value: "{{ .Values.global.host }}" + {{- if .Values.extraCors }} + - name: GORILLA_CORS_ORIGINS + value: "{{ join "," .Values.extraCors }}" + {{- end }} + - name: MYSQL_PORT + value: "{{ include "wandb.mysql.port" . }}" + - name: MYSQL_HOST + value: "{{ include "wandb.mysql.host" . }}" + - name: MYSQL_DATABASE + value: "{{ include "wandb.mysql.database" . }}" + - name: MYSQL_USER + value: "{{ include "wandb.mysql.user" . }}" + - name: MYSQL_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "wandb.mysql.passwordSecret" . }} + key: MYSQL_PASSWORD + - name: MYSQL + value: "mysql://$(MYSQL_USER):$(MYSQL_PASSWORD)@$(MYSQL_HOST):$(MYSQL_PORT)/$(MYSQL_DATABASE)" + - name: WEAVE_SERVICE + value: "{{ .Release.Name }}-weave:9994" + - name: PARQUET_HOST + value: "http://{{ .Release.Name }}-parquet:8087" + - name: PARQUET_ENABLED + value: "true" + {{- if index .Values.global "weave-trace" "enabled" }} + - name: WEAVE_TRACES_ENABLED + value: "true" + {{- end }} + {{- if ne (include "wandb.redis.password" .) "" }} + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "wandb.redis.passwordSecret" . }} + key: REDIS_PASSWORD + {{- end }} + - name: REDIS_PORT + value: "{{ include "wandb.redis.port" . }}" + - name: REDIS_HOST + value: "{{ include "wandb.redis.host" . }}" + - name: REDIS + value: "{{ include "app.redis" . 
| trim }}" + - name: SLACK_CLIENT_ID + value: {{ .Values.global.slack.clientId | quote }} + - name: SLACK_SECRET + valueFrom: + secretKeyRef: + name: {{ include "app.fullname" . }}-config + key: SLACK_SECRET + optional: true + {{- if ne .Values.global.email.smtp.host "" }} + - name: GORILLA_EMAIL_SINK + value: "smtp://{{ .Values.global.email.smtp.user }}:{{ .Values.global.email.smtp.password }}@{{ .Values.global.email.smtp.host }}:{{ .Values.global.email.smtp.port }}" + {{- end }} + - name: LICENSE + valueFrom: + secretKeyRef: + name: {{ include "app.fullname" . }}-config + key: LICENSE + optional: true + - name: GORILLA_LICENSE + valueFrom: + secretKeyRef: + name: {{ include "app.fullname" . }}-config + key: LICENSE + optional: true + {{- if ne .Values.global.auth.oidc.clientId "" }}{{/* env values must render as YAML strings, hence quote */}} + - name: OIDC_CLIENT_ID + value: {{ .Values.global.auth.oidc.clientId | quote }} + - name: OIDC_AUTH_METHOD + value: {{ .Values.global.auth.oidc.authMethod | quote }} + - name: OIDC_ISSUER + value: {{ .Values.global.auth.oidc.issuer | quote }} + - name: OIDC_CLIENT_SECRET + value: {{ .Values.global.auth.oidc.secret | quote }} + {{- end }} + - name: GORILLA_SESSION_LENGTH + value: "{{ .Values.global.auth.sessionLengthHours }}h" + {{- if and .Values.global .Values.global.observability }} + {{- if eq (default "custom" .Values.global.observability.mode) "otel" }} + - name: GORILLA_STATSD_PORT + value: "8125" + - name: GORILLA_STATSD_HOST + value: "0.0.0.0" + {{- end }} + {{- end }} + - name: BUCKET + value: "{{ include "app.bucket" . }}" + - name: AWS_REGION + value: {{ .Values.global.bucket.region | default .Values.global.defaultBucket.region | quote }} + - name: AWS_S3_KMS_ID + value: "{{ .Values.global.bucket.kmsKey | default .Values.global.defaultBucket.kmsKey }}" + - name: OPERATOR_ENABLED + value: 'true' + - name: LOGGING_ENABLED + value: 'true' + - name: AZURE_STORAGE_KEY + valueFrom: + secretKeyRef: + name: "{{ include "wandb.bucket.secret" . 
}}" + key: ACCESS_KEY + optional: true + + - name: GORILLA_CUSTOMER_SECRET_STORE_K8S_CONFIG_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: G_HOST_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: BANNERS + value: {{ toJson .Values.global.banners | quote }} + {{- if ne .Values.traceRatio 0.0 }} + - name: GORILLA_TRACER + value: "otlp+grpc://{{ .Release.Name }}-otel-daemonset:4317?trace_ratio={{ .Values.traceRatio }}" + {{- end }} + - name: KAFKA_BROKER_HOST + value: "{{ include "wandb.kafka.brokerHost" . }}" + - name: KAFKA_BROKER_PORT + value: "{{ include "wandb.kafka.brokerPort" . }}" + - name: KAFKA_CLIENT_USER + value: "{{ include "wandb.kafka.user" . }}" + - name: KAFKA_CLIENT_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "wandb.kafka.passwordSecret" . }} + key: KAFKA_CLIENT_PASSWORD + - name: KAFKA_TOPIC_RUN_UPDATE_SHADOW_QUEUE + value: {{ include "wandb.kafka.runUpdatesShadowTopic" .}} + - name: KAFKA_RUN_UPDATE_SHADOW_QUEUE_NUM_PARTITIONS + value: "{{ include "wandb.kafka.runUpdatesShadowNumPartitions" .}}" + - name: OVERFLOW_BUCKET_ADDR + value: "{{ include "app.bucket" .}}" + - name: GORILLA_RUN_UPDATE_SHADOW_QUEUE + value: > + { + "overflow-bucket": { + "store": "$(OVERFLOW_BUCKET_ADDR)", + "name": "wandb", + "prefix": "wandb-overflow" + }, + "addr": "kafka://$(KAFKA_CLIENT_USER):$(KAFKA_CLIENT_PASSWORD)@$(KAFKA_BROKER_HOST):$(KAFKA_BROKER_PORT)/$(KAFKA_TOPIC_RUN_UPDATE_SHADOW_QUEUE)?producer_batch_bytes=1048576&num_partitions=$(KAFKA_RUN_UPDATE_SHADOW_QUEUE_NUM_PARTITIONS)" + } + {{- include "app.extraEnv" (dict "global" $.Values.global "local" .Values) | nindent 12 }} + {{- include "wandb.extraEnvFrom" (dict "root" $ "local" .) 
| nindent 12 }} + livenessProbe: + httpGet: + path: /healthz + port: http + readinessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 20 + periodSeconds: 5 + startupProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 20 + periodSeconds: 5 + failureThreshold: 120 + # Increase the sleep before SIGTERM to 25s. I had this as 5s previously and it wasn't enough. + lifecycle: + preStop: + exec: + command: ["sleep", "25"] + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumes: + {{- if ne (include "wandb.redis.caCert" .) "" }} + - name: {{ include "app.fullname" . }}-redis-ca + secret: + secretName: "{{ include "wandb.redis.passwordSecret" . }}" + items: + - key: REDIS_CA_CERT + path: redis_ca.pem + {{- end }} + {{- if .Values.global.customCACerts }} + - name: wandb-ca-certs + configMap: + name: {{ include "wandb.fullname" . }}-ca-certs + {{- end }} {{- end }} \ No newline at end of file diff --git a/charts/operator-wandb/charts/app/templates/hpa.yaml b/charts/operator-wandb/charts/app/templates/hpa.yaml index bf2406c4..c164f213 100644 --- a/charts/operator-wandb/charts/app/templates/hpa.yaml +++ b/charts/operator-wandb/charts/app/templates/hpa.yaml @@ -1,4 +1,4 @@ -{{- if and .Values.autoscaling.hpa.enabled .Values.glueSingleton.enabled }} +{{- if and .Values.autoscaling.hpa.enabled .Values.global.beta.glue.enabled }} apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: diff --git a/charts/operator-wandb/charts/app/values.yaml b/charts/operator-wandb/charts/app/values.yaml index f93db6c3..06db9ced 100644 --- a/charts/operator-wandb/charts/app/values.yaml +++ b/charts/operator-wandb/charts/app/values.yaml @@ -9,9 +9,6 @@ image: pullPolicy: Always # pullSecrets: [] -glueSingleton: - enabled: false - autoscaling: hpa: enabled: false diff --git a/charts/operator-wandb/charts/glue/.helmignore b/charts/operator-wandb/charts/glue/.helmignore new file mode 100644 index 00000000..0e8a0eb3 --- /dev/null +++ 
b/charts/operator-wandb/charts/glue/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/operator-wandb/charts/glue/Chart.yaml b/charts/operator-wandb/charts/glue/Chart.yaml new file mode 100644 index 00000000..da5c7192 --- /dev/null +++ b/charts/operator-wandb/charts/glue/Chart.yaml @@ -0,0 +1,13 @@ +--- +apiVersion: v2 +name: glue +description: Chart for deploying the W&B glue service +type: application +version: 0.1.0 +appVersion: "0.33.0" +home: https://wandb.ai +icon: https://wandb.ai/logo.svg +maintainers: + - name: wandb + email: support@wandb.com + url: https://wandb.com diff --git a/charts/operator-wandb/charts/glue/templates/_helpers.tpl b/charts/operator-wandb/charts/glue/templates/_helpers.tpl new file mode 100644 index 00000000..0a71c94a --- /dev/null +++ b/charts/operator-wandb/charts/glue/templates/_helpers.tpl @@ -0,0 +1,183 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "glue.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "glue.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "glue.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "glue.labels" -}} +helm.sh/chart: {{ include "glue.chart" . }} +{{ include "glue.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Returns the extraEnv keys and values to inject into containers. + +Global values will override any chart-specific values. +*/}} +{{- define "glue.extraEnv" -}} +{{- $allExtraEnv := merge (default (dict) .local.extraEnv) .global.extraEnv -}} +{{- range $key, $value := $allExtraEnv }} +- name: {{ $key }} + value: {{ $value | quote }} +{{- end -}} +{{- end -}} + +{{/* +Returns a list of _common_ labels to be shared across all +glue deployments and other shared objects. +*/}} +{{- define "glue.commonLabels" -}} +{{- $commonLabels := default (dict) .Values.common.labels -}} +{{- if $commonLabels }} +{{- range $key, $value := $commonLabels }} +{{ $key }}: {{ $value | quote }} +{{- end }} +{{- end -}} +{{- end -}} + +{{/* +Returns a list of _pod_ labels to be shared across all +glue deployments. 
+*/}} +{{- define "glue.podLabels" -}} +{{- range $key, $value := .Values.pod.labels }} +{{ $key }}: {{ $value | quote }} +{{- end }} +{{- end -}} + +{{/* +Selector labels +*/}} +{{- define "glue.selectorLabels" -}} +app.kubernetes.io/name: {{ include "glue.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "glue.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "glue.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} + +{{- define "glue.redis" -}} +{{- $cs := include "wandb.redis.connectionString" . }} +{{- $ca := include "wandb.redis.caCert" . }} +{{- if $ca }} +{{- printf "%s?tls=true&caCertPath=/etc/ssl/certs/redis_ca.pem&ttlInSeconds=604800" $cs -}} +{{- else }} +{{- print $cs -}} +{{- end }} +{{- end }} + +{{- define "glue.bucket" -}} +{{- $bucketValues := .Values.global.defaultBucket }} +{{- if .Values.global.bucket.provider }} +{{- $bucketValues = .Values.global.bucket }} +{{- end }} +{{- $bucket := "" -}} +{{- if eq $bucketValues.provider "az" -}} +{{- $bucket = printf "az://%s/%s" $bucketValues.name (default "" $bucketValues.path) -}} +{{- end -}} +{{- if eq $bucketValues.provider "gcs" -}} +{{- $bucket = printf "gs://%s/%s" $bucketValues.name (default "" $bucketValues.path) -}} +{{- end -}} +{{- if eq $bucketValues.provider "s3" -}} +{{- if and $bucketValues.accessKey $bucketValues.secretKey -}} +{{- $bucket = printf "s3://%s:%s@%s/%s" $bucketValues.accessKey $bucketValues.secretKey $bucketValues.name (default "" $bucketValues.path) -}} +{{- else -}} +{{- $bucket = printf "s3://%s/%s" $bucketValues.name (default "" $bucketValues.path) -}} +{{- end -}} +{{- end -}} +{{- trimSuffix "/" $bucket -}} +{{- end }} + +{{/* +MySQL Port +*/}} +{{- define "glue.mysql.port" -}} +{{- .Values.mysql.port | default "3306" }} +{{- end }} + +{{/* +MySQL Host 
+*/}} +{{- define "glue.mysql.host" -}} +{{- .Values.mysql.host | default (printf "%s-mysql" .Release.Name) }} +{{- end }} + +{{/* +MySQL Database +*/}} +{{- define "glue.mysql.database" -}} +{{- .Values.mysql.database | default "wandb" }} +{{- end }} + +{{/* +MySQL User +*/}} +{{- define "glue.mysql.user" -}} +{{- .Values.mysql.user | default "wandb" }} +{{- end }} + +{{/* +MySQL Password Secret +*/}} +{{- define "glue.mysql.passwordSecret" -}} +{{- .Values.mysql.passwordSecret | default (printf "%s-mysql" .Release.Name) }} +{{- end }} + +{{- define "glue.cloud" -}} +{{- $bucketValues := .Values.global.defaultBucket }} +{{- if .Values.global.bucket.provider }} +{{- $bucketValues = .Values.global.bucket }} +{{- end }} +{{- $cloud := "minio-local" -}} +{{- if eq $bucketValues.provider "az" -}} +{{- $cloud = "azure" -}} +{{- end -}} +{{- if eq $bucketValues.provider "gcs" -}} +{{- $cloud = "google" -}} +{{- end -}} +{{- if eq $bucketValues.provider "s3" -}} +{{- $cloud = "aws" -}} +{{- end -}} +{{- $cloud -}} +{{- end }} diff --git a/charts/operator-wandb/charts/glue/templates/deployment.yaml b/charts/operator-wandb/charts/glue/templates/deployment.yaml new file mode 100644 index 00000000..07807029 --- /dev/null +++ b/charts/operator-wandb/charts/glue/templates/deployment.yaml @@ -0,0 +1,196 @@ +{{- if .Values.global.beta.glue.enabled }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "glue.fullname" . }} + labels: + {{- include "glue.labels" . | nindent 4 }} + app: {{ include "glue.fullname" . }} + chart: {{ .Chart.Name }}-{{ .Chart.Version }} + heritage: {{ .Release.Service }} + release: {{ .Release.Name }} + annotations: + {{- toYaml .Values.deployment.annotations | nindent 4 }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + app: {{ include "glue.fullname" . }} + release: {{ .Release.Name }} + template: + metadata: + labels: + app: {{ include "glue.fullname" . 
}} + release: {{ .Release.Name }} + spec: + serviceAccountName: {{ include "glue.serviceAccountName" . }} + containers: + - name: {{ .Chart.Name }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + args: ["glue"] + volumeMounts: + {{- if ne (include "wandb.redis.caCert" .) "" }} + - name: {{ include "glue.fullname" . }}-redis-ca + mountPath: /etc/ssl/certs/redis_ca.pem + subPath: redis_ca.pem + {{- end }} + env: + - name: GORILLA_RUN_STORE_ONPREM_MIGRATE_CREATE_RUN_TABLES + value: "true" + {{- if ne .Values.global.email.smtp.host "" }} + - name: GORILLA_EMAIL_SINK + value: "smtp://{{ .Values.global.email.smtp.user }}:{{ .Values.global.email.smtp.password }}@{{ .Values.global.email.smtp.host }}:{{ .Values.global.email.smtp.port }}" + {{- else }} + - name: GORILLA_EMAIL_SINK + value: "https://api.wandb.ai/email/dispatch" + {{- end }} + - name: GORILLA_SWEEP_PROVIDER + value: "{{ .Values.global.sweepProvider | default (printf "http://127.0.0.1:8082") }}" + - name: GORILLA_VIEW_SPEC_UPDATER_EXECUTABLE + value: "/usr/local/bin/view-spec-updater-linux" + - name: GORILLA_LIMITER + value: "noop://" + - name: GORILLA_PARQUET_RPC_PATH + value: "/_goRPC_" + - name: GORILLA_SCHEMA_FILE + value: "/schema.graphql" + - name: GORILLA_PORT + value: "8081" + - name: GORILLA_ACTIVITY_STORE_ENABLE + value: "true" + - name: GORILLA_RUN_STORE_ONPREM_MIGRATE_DISABLE_READS + value: "false" + - name: GORILLA_GLUE_TASK_STORE + value: "memory://" + - name: GORILLA_TASK_QUEUE + value: "noop://" + {{- if ne .Values.traceRatio 0.0 }} + - name: GORILLA_TRACER + value: "otlp+grpc://{{ .Release.Name }}-otel-daemonset:4317?trace_ratio={{ .Values.traceRatio }}" + {{- end }} + - name: GORILLA_COLLECT_AUDIT_LOGS + value: "true" + - name: GORILLA_USE_PARQUET_HISTORY_STORE + value: "true" + - name: GORILLA_PARQUET_PORT + value: "8087" + - name: GORILLA_RUN_UPDATE_QUEUE_ADDR + value: "internal://" + - name: 
GORILLA_RUN_STORE_ONPREM_MIGRATE_CREATE_RUN_STORE + value: "true" + - name: GORILLA_RUN_STORE_ONPREM_MIGRATE_FLAT_RUNS_MIGRATOR + value: "true" + - name: GORILLA_FILE_STORE_IS_PROXIED + value: "false" + - name: GORILLA_ACTIVITY_STORE_SERVE + value: "true" + - name: GORILLA_GLUE_TASK_CONFIG_PATH + value: /gorilla_glue_tasks_local.yaml + - name: GORILLA_ONPREM + value: "true" + - name: GORILLA_STATSD_PORT + value: "8125" + - name: GORILLA_ACTIVITY_STORE_BACKFILL_ENABLE + value: "true" + - name: GORILLA_ARTIFACT_GC_ENABLED + value: "false" + - name: GORILLA_RUN_STORE_ONPREM_MIGRATE_SHADOW_RUN_UPDATES + value: "true" + - name: GORILLA_GLUE_TASK_PROVIDER + value: "memory://" + - name: MYSQL_PORT + value: "{{ include "wandb.mysql.port" . }}" + - name: MYSQL_HOST + value: "{{ include "wandb.mysql.host" . }}" + - name: MYSQL_DATABASE + value: "{{ include "wandb.mysql.database" . }}" + - name: MYSQL_USER + value: "{{ include "wandb.mysql.user" . }}" + - name: MYSQL_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "wandb.mysql.passwordSecret" . 
}} + key: MYSQL_PASSWORD + - name: GORILLA_GLUE_TASK_STRATEGY_STORE + value: "mysql://$(MYSQL_USER):$(MYSQL_PASSWORD)@$(MYSQL_HOST):$(MYSQL_PORT)/$(MYSQL_DATABASE)?tls=preferred" + - name: GORILLA_GLUE_TASK_METADATA_STORE + value: "mysql://$(MYSQL_USER):$(MYSQL_PASSWORD)@$(MYSQL_HOST):$(MYSQL_PORT)/$(MYSQL_DATABASE)?tls=preferred" + - name: GORILLA_USAGE_STORE + value: "mysql://$(MYSQL_USER):$(MYSQL_PASSWORD)@$(MYSQL_HOST):$(MYSQL_PORT)/$(MYSQL_DATABASE)?tls=preferred" + - name: GORILLA_METADATA_STORE + value: "mysql://$(MYSQL_USER):$(MYSQL_PASSWORD)@$(MYSQL_HOST):$(MYSQL_PORT)/$(MYSQL_DATABASE)?tls=preferred" + - name: GORILLA_PARQUET_LIVE_HISTORY_STORE + value: "mysql://$(MYSQL_USER):$(MYSQL_PASSWORD)@$(MYSQL_HOST):$(MYSQL_PORT)/$(MYSQL_DATABASE)?tls=preferred" + - name: GORILLA_ANALYTICS_SINK + value: "mysql://$(MYSQL_USER):$(MYSQL_PASSWORD)@$(MYSQL_HOST):$(MYSQL_PORT)/$(MYSQL_DATABASE)?tls=preferred" + - name: GORILLA_CASBIN_ADDRESS + value: "mysql://$(MYSQL_USER):$(MYSQL_PASSWORD)@$(MYSQL_HOST):$(MYSQL_PORT)/$(MYSQL_DATABASE)?tls=preferred" + - name: GORILLA_RUN_STORE + value: "mysql://$(MYSQL_USER):$(MYSQL_PASSWORD)@$(MYSQL_HOST):$(MYSQL_PORT)/$(MYSQL_DATABASE)?tls=preferred" + - name: GORILLA_FILE_STREAM_STORE_ADDRESS + value: "mysql://$(MYSQL_USER):$(MYSQL_PASSWORD)@$(MYSQL_HOST):$(MYSQL_PORT)/$(MYSQL_DATABASE)?tls=preferred" + - name: GORILLA_HISTORY_STORE + value: "http://wandb-parquet:8087/_goRPC_,mysql://$(MYSQL_USER):$(MYSQL_PASSWORD)@$(MYSQL_HOST):$(MYSQL_PORT)/$(MYSQL_DATABASE)?tls=preferred" + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "wandb.redis.passwordSecret" . }} + optional: true + key: REDIS_PASSWORD + - name: REDIS_PORT + value: "{{ include "wandb.redis.port" . }}" + - name: REDIS_HOST + value: "{{ include "wandb.redis.host" . }}" + - name: REDIS + value: "{{ include "glue.redis" . | trim }}" + - name: GORILLA_AUDITOR_CACHE + value: "{{ include "glue.redis" . 
| trim }}" + - name: GORILLA_SETTINGS_CACHE + value: "{{ include "glue.redis" . | trim }}" + - name: GORILLA_LOCKER + value: "{{ include "glue.redis" . | trim }}" + - name: GORILLA_ACTIVITY_STORE_CACHE_ADDRESS + value: "{{ include "glue.redis" . | trim }}" + - name: GORILLA_CACHE + value: "{{ include "glue.redis" . | trim }}" + - name: GORILLA_FILE_METADATA_SOURCE + value: "{{ include "glue.redis" . | trim }}" + - name: BUCKET + value: "{{ include "glue.bucket" . }}" + - name: AWS_REGION + value: "{{ .Values.global.bucket.region | default .Values.global.defaultBucket.region }}" + - name: GORILLA_DEFAULT_REGION + value: "{{ include "glue.cloud" . }}-{{ .Values.global.bucket.region | default .Values.global.defaultBucket.region }}" + - name: AWS_S3_KMS_ID + value: "{{ .Values.global.bucket.kmsKey | default .Values.global.defaultBucket.kmsKey }}" + - name: GORILLA_FILE_STORE + value: "{{ include "glue.bucket" . }}" + - name: GORILLA_STORAGE_BUCKET + value: "{{ include "glue.bucket" . }}" + ports: + - name: http + containerPort: 8080 + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 30 + periodSeconds: 1 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 3 + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumes: + {{- if ne (include "wandb.redis.caCert" .) "" }} + - name: {{ include "glue.fullname" . }}-redis-ca + secret: + secretName: "{{ .Release.Name }}-redis" + items: + - key: REDIS_CA_CERT + path: redis_ca.pem + {{- end }} +{{- end }} diff --git a/charts/operator-wandb/charts/glue/templates/serviceaccount.yaml b/charts/operator-wandb/charts/glue/templates/serviceaccount.yaml new file mode 100644 index 00000000..def08caa --- /dev/null +++ b/charts/operator-wandb/charts/glue/templates/serviceaccount.yaml @@ -0,0 +1,18 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "glue.serviceAccountName" . 
}}
+  namespace: {{ $.Release.Namespace | quote }}  # quoted so numeric-looking namespaces stay strings
+  labels:
+    {{- include "wandb.commonLabels" . | nindent 4 }}
+    {{- include "glue.commonLabels" . | nindent 4 }}
+    {{- include "glue.labels" . | nindent 4 }}
+    {{- with .Values.serviceAccount.labels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+  {{- with .Values.serviceAccount.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+{{- end }}
diff --git a/charts/operator-wandb/charts/glue/values.yaml b/charts/operator-wandb/charts/glue/values.yaml
new file mode 100644
index 00000000..94dc00d9
--- /dev/null
+++ b/charts/operator-wandb/charts/glue/values.yaml
@@ -0,0 +1,57 @@
+---
+# Default values for glue.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+replicaCount: 1
+
+image:
+  repository: wandb/megabinary
+  pullPolicy: Always
+  # Image tag to deploy. The deployment template falls back to the chart
+  # appVersion only when this value is empty.
+  tag: latest
+
+imagePullSecrets: []
+nameOverride: ""
+fullnameOverride: ""
+# Trace sampling ratio. The deployment template sets GORILLA_TRACER only
+# when this is non-zero.
+traceRatio: 0
+
+serviceAccount:
+  # When true, templates/serviceaccount.yaml renders a ServiceAccount.
+  create: true
+  annotations: {}
+  name: "glue-sa"
+  labels: {}
+
+pod:
+  securityContext:
+    fsGroup: 0
+    fsGroupChangePolicy: "OnRootMismatch"
+  labels: {}
+  annotations: {}
+
+common:
+  labels: {}
+
+deployment:
+  labels: {}
+  annotations: {}
+
+service:
+  type: ClusterIP
+  annotations: {}
+  labels: {}
+
+# Copied as-is into the container's resources block.
+resources:
+  requests:
+    cpu: 1
+    memory: 8G
+
+nodeSelector: {}
+
+tolerations: []
+
+affinity: {}
diff --git a/charts/operator-wandb/values.yaml b/charts/operator-wandb/values.yaml index eb59387d..42e292b9 100644 --- a/charts/operator-wandb/values.yaml +++ b/charts/operator-wandb/values.yaml @@ -22,6 +22,8 @@ global: storageClass: "" + sweepProvider: "" + banners: {} # banner1: @@ -134,6 +136,19 @@ global: weave-trace: enabled: false + beta: + glue: + enabled: false + +glue: + image: + repository: wandb/megabinary + tag: latest + mysql: + max-idle-conns: 10 + max-open-conns: 10 + read-timeout: "60s" + ingress: install: true create: 
true