diff --git a/CHANGELOG.md b/CHANGELOG.md
index df212ea5..9734eb7e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,27 @@
All notable changes to this project will be documented in this file.
+### [4.12.2](https://github.com/wandb/terraform-aws-wandb/compare/v4.12.1...v4.12.2) (2024-06-17)
+
+
+### Bug Fixes
+
+* Revert resolve conflicts var ([#233](https://github.com/wandb/terraform-aws-wandb/issues/233)) ([778f147](https://github.com/wandb/terraform-aws-wandb/commit/778f147aa9962fde6a74b7d35501ec7dd7abf2a9))
+
+### [4.12.1](https://github.com/wandb/terraform-aws-wandb/compare/v4.12.0...v4.12.1) (2024-06-17)
+
+
+### Bug Fixes
+
+* Remove white space ([#231](https://github.com/wandb/terraform-aws-wandb/issues/231)) ([974b4f3](https://github.com/wandb/terraform-aws-wandb/commit/974b4f3ec0d01b34cf6d83008c9fe2a0d3d8ee7a))
+
+## [4.12.0](https://github.com/wandb/terraform-aws-wandb/compare/v4.11.0...v4.12.0) (2024-06-17)
+
+
+### Features
+
+* Added support yace ([#218](https://github.com/wandb/terraform-aws-wandb/issues/218)) ([12e053d](https://github.com/wandb/terraform-aws-wandb/commit/12e053d520f6998689d3bec0352b320a9105ba9e))
+
## [4.11.0](https://github.com/wandb/terraform-aws-wandb/compare/v4.10.2...v4.11.0) (2024-05-18)
diff --git a/README.md b/README.md
index 0bc592c9..3f9de70e 100644
--- a/README.md
+++ b/README.md
@@ -102,13 +102,14 @@ resources that lack official modules.
Users can update the EKS cluster version to the latest version offered by AWS. This can be done using the environment variable `eks_cluster_version`. Note that, cluster and nodegroup version updates can only be done in increments of one version at a time. For example, if your current cluster version is `1.21` and the latest version available is `1.25` - you'd need to:
1. update the cluster version in the app_eks module from `1.21` to `1.22`
-2. run `terraform apply`
+2. run `terraform apply`
3. update the cluster version to `1.23`
4. run `terraform apply`
5. update the cluster version to `1.24`
-...and so on and so forth.
+ ...and so on and so forth.
Upgrades must be executed in step-wise fashion from one version to the next. You cannot skip versions when upgrading EKS.
+
### Notes on EKS Add-ons
@@ -252,7 +253,11 @@ CLI and re-run the apply. Running pods will not be impacted.
## Migrations
-#### Upgrading from 3.x -> 4.x
+### Upgrading to Operator
+
+See our upgrade guide [here](./docs/operator-migration/readme.md)
+
+### Upgrading from 3.x -> 4.x
- If egress access for retrieving the wandb/controller image is not available, Terraform apply may experience failures.
- It's necessary to supply a license variable within the module, as shown:
diff --git a/docs/operator-migration/images/post-operator-apply.png b/docs/operator-migration/images/post-operator-apply.png
new file mode 100644
index 00000000..9ac05b96
Binary files /dev/null and b/docs/operator-migration/images/post-operator-apply.png differ
diff --git a/docs/operator-migration/images/post-operator-k8s.svg b/docs/operator-migration/images/post-operator-k8s.svg
new file mode 100644
index 00000000..20ebd449
--- /dev/null
+++ b/docs/operator-migration/images/post-operator-k8s.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/operator-migration/images/pre-operator-infra.svg b/docs/operator-migration/images/pre-operator-infra.svg
new file mode 100644
index 00000000..c1b474ea
--- /dev/null
+++ b/docs/operator-migration/images/pre-operator-infra.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/operator-migration/images/pre-operator-k8s.svg b/docs/operator-migration/images/pre-operator-k8s.svg
new file mode 100644
index 00000000..93719397
--- /dev/null
+++ b/docs/operator-migration/images/pre-operator-k8s.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/operator-migration/post-operator.tf.disabled b/docs/operator-migration/post-operator.tf.disabled
new file mode 100644
index 00000000..62ae09e3
--- /dev/null
+++ b/docs/operator-migration/post-operator.tf.disabled
@@ -0,0 +1,113 @@
+provider "aws" {
+ region = "us-west-2"
+
+ default_tags {
+ tags = {
+      GithubRepo  = "terraform-aws-wandb"
+      GithubOrg   = "wandb"
+      Environment = "Example"
+      Example     = "PublicDnsExternal"
+ }
+ }
+}
+
+terraform {
+ required_version = "~> 1.0"
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = "~> 4.0" # Post-Operator
+ }
+ kubernetes = {
+ source = "hashicorp/kubernetes"
+ version = "~> 2.23"
+ }
+ }
+}
+
+module "wandb_infra" {
+ source = "wandb/wandb/aws"
+ version = "4.7.2"
+
+ namespace = var.namespace
+ public_access = true
+ external_dns = true
+
+ enable_dummy_dns = var.enable_dummy_dns # Post-Operator
+ enable_operator_alb = var.enable_operator_alb # Post-Operator
+ deletion_protection = false
+
+ database_instance_class = var.database_instance_class
+ database_engine_version = var.database_engine_version
+ database_snapshot_identifier = var.database_snapshot_identifier
+ database_sort_buffer_size = var.database_sort_buffer_size
+
+ database_performance_insights_kms_key_arn = null
+
+ allowed_inbound_cidr = var.allowed_inbound_cidr
+ allowed_inbound_ipv6_cidr = ["::/0"]
+
+ eks_cluster_version = "1.25"
+ kubernetes_public_access = true
+ kubernetes_public_access_cidrs = ["0.0.0.0/0"]
+
+ domain_name = var.domain_name
+ zone_id = var.zone_id
+ subdomain = var.subdomain
+
+ # Add License Post-Operator
+ license = var.wandb_license
+
+ # Use standard sizing Post-Operator
+ size = var.size
+
+ # Set the External DNS Custom Domain Filter Post-Operator
+ custom_domain_filter = var.custom_domain_filter
+
+ bucket_name = var.bucket_name
+ bucket_kms_key_arn = var.bucket_kms_key_arn
+ use_internal_queue = true
+
+ aws_loadbalancer_controller_tags = var.aws_loadbalancer_controller_tags
+}
+
+data "aws_eks_cluster" "app_cluster" {
+ name = module.wandb_infra.cluster_id
+}
+
+data "aws_eks_cluster_auth" "app_cluster" {
+ name = module.wandb_infra.cluster_id
+}
+
+provider "kubernetes" {
+ host = data.aws_eks_cluster.app_cluster.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.app_cluster.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.app_cluster.token
+ exec {
+ api_version = "client.authentication.k8s.io/v1beta1"
+ args = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.app_cluster.name]
+ command = "aws"
+ }
+}
+
+# Enable the Helm provider
+provider "helm" {
+ kubernetes {
+ host = data.aws_eks_cluster.app_cluster.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.app_cluster.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.app_cluster.token
+ exec {
+ api_version = "client.authentication.k8s.io/v1beta1"
+ args = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.app_cluster.name]
+ command = "aws"
+ }
+ }
+}
+
+output "bucket_name" {
+ value = module.wandb_infra.bucket_name
+}
+
+output "bucket_queue_name" {
+ value = module.wandb_infra.bucket_queue_name
+}
diff --git a/docs/operator-migration/post-operator.tfvars b/docs/operator-migration/post-operator.tfvars
new file mode 100644
index 00000000..9c3ae571
--- /dev/null
+++ b/docs/operator-migration/post-operator.tfvars
@@ -0,0 +1,12 @@
+namespace = "operator-upgrade"
+domain_name = "sandbox-aws.wandb.ml"
+zone_id = "Z032246913CW32RVRY0WU"
+subdomain = "operator-upgrade"
+wandb_license = "eyJh"
+# wandb_version = "0.51.2" Is now coming from the Release Channel or set in the User Spec.
+
+# Needed Operator Variables for Upgrade
+size = "small"
+enable_dummy_dns = true
+enable_operator_alb = true
+custom_domain_filter = "sandbox-aws.wandb.ml"
\ No newline at end of file
diff --git a/docs/operator-migration/pre-operator.tf b/docs/operator-migration/pre-operator.tf
new file mode 100644
index 00000000..40bf5a2b
--- /dev/null
+++ b/docs/operator-migration/pre-operator.tf
@@ -0,0 +1,112 @@
+provider "aws" {
+ region = "us-west-2"
+
+ default_tags {
+ tags = {
+      GithubRepo  = "terraform-aws-wandb"
+      GithubOrg   = "wandb"
+      Environment = "Example"
+      Example     = "PublicDnsExternal"
+ }
+ }
+}
+
+terraform {
+ required_version = "~> 1.0"
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = "~> 3.6" # Pre-Operator
+ }
+ kubernetes = {
+ source = "hashicorp/kubernetes"
+ version = "~> 2.23"
+ }
+ }
+}
+
+module "wandb_infra" {
+ source = "wandb/wandb/aws"
+ version = "1.16.10"
+
+ namespace = var.namespace
+ public_access = true
+ external_dns = true
+
+ deletion_protection = false
+
+ database_instance_class = var.database_instance_class
+ database_engine_version = var.database_engine_version
+ database_snapshot_identifier = var.database_snapshot_identifier
+ database_sort_buffer_size = var.database_sort_buffer_size
+
+ database_performance_insights_kms_key_arn = null
+
+ allowed_inbound_cidr = var.allowed_inbound_cidr
+ allowed_inbound_ipv6_cidr = ["::/0"]
+
+ eks_cluster_version = "1.25"
+ kubernetes_public_access = true
+ kubernetes_public_access_cidrs = ["0.0.0.0/0"]
+
+ domain_name = var.domain_name
+ zone_id = var.zone_id
+ subdomain = var.subdomain
+
+ bucket_name = var.bucket_name
+ bucket_kms_key_arn = var.bucket_kms_key_arn
+ use_internal_queue = true
+}
+
+data "aws_eks_cluster" "app_cluster" {
+ name = module.wandb_infra.cluster_id
+}
+
+data "aws_eks_cluster_auth" "app_cluster" {
+ name = module.wandb_infra.cluster_id
+}
+
+provider "kubernetes" {
+ host = data.aws_eks_cluster.app_cluster.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.app_cluster.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.app_cluster.token
+ exec {
+ api_version = "client.authentication.k8s.io/v1beta1"
+ args = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.app_cluster.name]
+ command = "aws"
+ }
+}
+
+module "wandb_app" {
+ source = "wandb/wandb/kubernetes"
+ version = "1.12.0"
+
+ license = var.wandb_license
+
+ host = module.wandb_infra.url
+ bucket = "s3://${module.wandb_infra.bucket_name}"
+ bucket_aws_region = module.wandb_infra.bucket_region
+ bucket_queue = "internal://"
+ bucket_kms_key_arn = module.wandb_infra.kms_key_arn
+ database_connection_string = "mysql://${module.wandb_infra.database_connection_string}"
+ redis_connection_string = "redis://${module.wandb_infra.elasticache_connection_string}?tls=true&ttlInSeconds=604800"
+
+ wandb_image = var.wandb_image
+ wandb_version = var.wandb_version
+
+ service_port = module.wandb_infra.internal_app_port
+
+ depends_on = [module.wandb_infra]
+
+ other_wandb_env = merge({
+ "GORILLA_CUSTOMER_SECRET_STORE_SOURCE" = "aws-secretmanager://${var.namespace}?namespace=${var.namespace}"
+ }, var.other_wandb_env)
+}
+
+output "bucket_name" {
+ value = module.wandb_infra.bucket_name
+}
+
+output "bucket_queue_name" {
+ value = module.wandb_infra.bucket_queue_name
+}
diff --git a/docs/operator-migration/pre-operator.tfvars b/docs/operator-migration/pre-operator.tfvars
new file mode 100644
index 00000000..7cfb15a1
--- /dev/null
+++ b/docs/operator-migration/pre-operator.tfvars
@@ -0,0 +1,7 @@
+namespace = "operator-upgrade"
+domain_name = "sandbox-aws.wandb.ml"
+zone_id = "Z032246913CW32RVRY0WU"
+subdomain = "operator-upgrade"
+wandb_license = "eyJh"
+wandb_version = "0.51.2"
+# size = "small"
\ No newline at end of file
diff --git a/docs/operator-migration/readme.md b/docs/operator-migration/readme.md
new file mode 100644
index 00000000..10179d04
--- /dev/null
+++ b/docs/operator-migration/readme.md
@@ -0,0 +1,267 @@
+# Operator Migration
+
+This guide details the steps required to upgrade from **_pre-operator_** to **_post-operator_** environments using the [terraform-aws-wandb](https://registry.terraform.io/modules/wandb/wandb/aws/latest) module.
+
+## Introduction to Operator Shift
+
+The transition to a Kubernetes [operator](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/) pattern is crucial for our architecture. This section explains the transition from **_pre_** to **_post_** architectures.
+
+### Reasons for the Architecture Shift
+
+Historically, the W&B application was deployed as a single Deployment and pod within a Kubernetes Cluster or Docker container. We have always recommended externalizing the Metadata Store and Object Store to decouple state from the application, especially in production environments.
+
+As the application grew, the need to evolve from a monolithic container to a distributed system became apparent. This change facilitates backend logic handling and seamlessly introduces **_in-kubernetes_** infrastructure capabilities. It also supports deploying new services essential for additional features that W&B relies on.
+
+Previously, any Kubernetes-related changes required updating the [terraform-kubernetes-wandb](https://github.com/wandb/terraform-kubernetes-wandb) module, ensuring compatibility across cloud providers, configuring necessary Terraform variables, and executing a `terraform apply` for each backend or Kubernetes-level change. This process was not scalable and placed a significant burden on our support staff to assist customers with upgrades.
+
+The solution was to implement an **_Operator_** that connects to a central [deploy.wandb.ai](https://deploy.wandb.ai) server with its `license` to request the latest specification changes for a given **_Release Channel_** and apply them. Helm was chosen as both the deployment mechanism for our operator and the means for the operator to handle all configuration templating of the W&B Kubernetes stack; Helmception.
+
+You can install the operator from [charts/operator](https://github.com/wandb/helm-charts/tree/main/charts/operator). This installation creates a deployment called `controller-manager` and utilizes a **_Custom Resource_** definition named `weightsandbiases.apps.wandb.com` (shortName: `wandb`), which takes a single `spec` and applies it to the cluster:
+
+```yaml
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+ name: weightsandbiases.apps.wandb.com
+```
+
+The `controller-manager` installs [charts/operator-wandb](https://github.com/wandb/helm-charts/tree/main/charts/operator-wandb) based on the spec of the **_Custom Resource_**, **_Release Channel_**, and a **_User Defined Config_** in the new **_System Console_**. This hierarchy allows for maximum configuration flexibility at the user end and enables W&B to release new images, configurations, features, and Helm updates without requiring Terraform reruns.
+
+## Before and After Architecture
+
+Previously, our architecture used:
+
+```hcl
+module "wandb_infra" {
+ source = "wandb/wandb/aws"
+ version = "1.16.10"
+ ...
+}
+```
+
+to control the infrastructure:
+
+![pre-operator-infra](./images/pre-operator-infra.svg)
+
+and this module to deploy the W&B application:
+
+```hcl
+module "wandb_app" {
+ source = "wandb/wandb/kubernetes"
+ version = "1.12.0"
+}
+```
+
+![pre-operator-k8s](./images/pre-operator-k8s.svg)
+
+Post-transition, the architecture uses:
+
+```hcl
+module "wandb_infra" {
+ source = "wandb/wandb/aws"
+ version = "4.7.2"
+ ...
+}
+```
+
+to manage both the installation of infrastructure and the W&B application to the Kubernetes cluster, thus eliminating the need for the `module "wandb_app"` in `post-operator.tf`.
+
+![post-operator-k8s](./images/post-operator-k8s.svg)
+
+This architectural shift facilitates the introduction of additional customer features (like OpenTelemetry, Prometheus, HPA's, Kafka, and image updates) without requiring manual Terraform operations by SRE/Infrastructure teams.
+
+### Specification Hierarchy
+
+In our operator model, configuration specifications follow a hierarchical model where higher-level specifications override lower-level ones. Here’s how it works:
+
+- **Release Channel Spec**: This base level configuration sets default values and configurations based on the **_Release Channel_** set by W&B for the deployment.
+- **User Input Spec**: Users can override the default settings provided by the Release Channel Spec through the System Console.
+- **Custom Resource Spec**: The highest level of specification, which comes from the Terraform configuration. Any values specified here will override both the User Input and Release Channel specifications.
+
+This hierarchical model ensures that configurations are flexible and customizable to meet varying needs while maintaining a manageable and systematic approach to upgrades and changes.
+
+## Migration
+
+To commence with a base installation of the W&B Pre-Operator, ensure that `post-operator.tf` has a `.disabled` file extension and `pre-operator.tf` is active (i.e., does not have a `.disabled` extension).
+
+### Prerequisites
+
+Before initiating the migration process, ensure the following prerequisites are met:
+
+- **Egress**: The deployment can't be airgapped. It needs access to [deploy.wandb.ai](https://deploy.wandb.ai) to get the latest spec for the **_Release Channel_**.
+- **AWS Credentials**: Proper AWS credentials configured to interact with your AWS resources.
+- **Terraform Installed**: The latest version of Terraform should be installed on your system.
+- **Route53 Hosted Zone**: An existing Route53 hosted zone corresponding to the domain under which the application will be served.
+- **Pre-Operator Terraform Files**: Ensure `pre-operator.tf` and associated variable files like `pre-operator.tfvars` are correctly set up.
+
+### Pre-Operator Setup
+
+Execute the following Terraform commands to initialize and apply the configuration for the Pre-Operator setup:
+
+```bash
+terraform init -upgrade
+terraform apply -var-file=./pre-operator.tfvars
+```
+
+`pre-operator.tfvars` should look something like this:
+
+```ini
+namespace = "operator-upgrade"
+domain_name = "sandbox-aws.wandb.ml"
+zone_id = "Z032246913CW32RVRY0WU"
+subdomain = "operator-upgrade"
+wandb_license = "ey..."
+wandb_version = "0.51.2"
+```
+
+The `pre-operator.tf` configuration calls two modules:
+
+```hcl
+module "wandb_infra" {
+ source = "wandb/wandb/aws"
+ version = "1.16.10"
+ ...
+}
+```
+
+This module spins up the infrastructure.
+
+```hcl
+module "wandb_app" {
+ source = "wandb/wandb/kubernetes"
+ version = "1.12.0"
+}
+```
+
+This module deploys the application.
+
+### Post-Operator Setup
+
+Make sure that `pre-operator.tf` has a `.disabled` extension, and `post-operator.tf` is active.
+
+The `post-operator.tfvars` includes additional variables:
+
+```ini
+...
+# wandb_version = "0.51.2" is now managed via the Release Channel or set in the User Spec.
+
+# Required Operator Variables for Upgrade:
+size = "small"
+enable_dummy_dns = true
+enable_operator_alb = true
+custom_domain_filter = "sandbox-aws.wandb.ml"
+```
+
+Run the following commands to initialize and apply the Post-Operator configuration:
+
+```bash
+terraform init -upgrade
+terraform apply -var-file=./post-operator.tfvars
+```
+
+The plan and apply steps will update the following resources:
+
+```yaml
+actions:
+ create:
+ - aws_efs_backup_policy.storage_class
+ - aws_efs_file_system.storage_class
+ - aws_efs_mount_target.storage_class["0"]
+ - aws_efs_mount_target.storage_class["1"]
+ - aws_eks_addon.efs
+ - aws_iam_openid_connect_provider.eks
+ - aws_iam_policy.secrets_manager
+ - aws_iam_role_policy_attachment.ebs_csi
+ - aws_iam_role_policy_attachment.eks_efs
+ - aws_iam_role_policy_attachment.node_secrets_manager
+ - aws_security_group.storage_class_nfs
+ - aws_security_group_rule.nfs_ingress
+ - random_pet.efs
+ - aws_s3_bucket_acl.file_storage
+ - aws_s3_bucket_cors_configuration.file_storage
+ - aws_s3_bucket_ownership_controls.file_storage
+ - aws_s3_bucket_server_side_encryption_configuration.file_storage
+ - helm_release.operator
+ - helm_release.wandb
+ - aws_cloudwatch_log_group.this[0]
+ - aws_iam_policy.default
+ - aws_iam_role.default
+ - aws_iam_role_policy_attachment.default
+ - helm_release.external_dns
+ - aws_default_network_acl.this[0]
+ - aws_default_route_table.default[0]
+ - aws_iam_policy.default
+ - aws_iam_role.default
+ - aws_iam_role_policy_attachment.default
+ - helm_release.aws_load_balancer_controller
+
+ update_in_place:
+ - aws_iam_policy.node_IMDSv2
+ - aws_iam_policy.node_cloudwatch
+ - aws_iam_policy.node_kms
+ - aws_iam_policy.node_s3
+ - aws_iam_policy.node_sqs
+ - aws_eks_cluster.this[0]
+ - aws_elasticache_replication_group.default
+ - aws_rds_cluster.this[0]
+ - aws_rds_cluster_instance.this["1"]
+ - aws_default_security_group.this[0]
+ - aws_subnet.private[0]
+ - aws_subnet.private[1]
+ - aws_subnet.public[0]
+ - aws_subnet.public[1]
+ - aws_launch_template.workers["primary"]
+
+ destroy:
+ - kubernetes_config_map.config_map
+ - kubernetes_deployment.wandb
+ - kubernetes_priority_class.priority
+ - kubernetes_secret.secret
+ - kubernetes_service.prometheus
+ - kubernetes_service.service
+ - random_id.snapshot_identifier[0]
+
+ replace:
+ - aws_autoscaling_attachment.autoscaling_attachment["primary"]
+ - aws_route53_record.alb
+ - aws_eks_node_group.workers["primary"]
+```
+
+You should see something like this:
+
+![post-operator-apply](./images/post-operator-apply.png)
+
+Note that in `post-operator.tf`, there is a single:
+
+```hcl
+module "wandb_infra" {
+ source = "wandb/wandb/aws"
+ version = "4.7.2"
+ ...
+}
+```
+
+#### Changes in the Post-Operator Configuration:
+
+1. **Update Required Providers**: Change `required_providers.aws.version` from `3.6` to `4.0` for provider compatibility.
+2. **DNS and Load Balancer Configuration**: Integrate `enable_dummy_dns` and `enable_operator_alb` to manage DNS records and AWS Load Balancer setup through an Ingress.
+3. **License and Size Configuration**: Transfer the `license` and `size` parameters directly to the `wandb_infra` module to match new operational requirements.
+4. **Custom Domain Handling**: If necessary, use `custom_domain_filter` to troubleshoot DNS issues by checking the External DNS pod logs within the `kube-system` namespace.
+5. **Helm Provider Configuration**: Enable and configure the Helm provider to manage Kubernetes resources effectively:
+
+```hcl
+provider "helm" {
+ kubernetes {
+ host = data.aws_eks_cluster.app_cluster.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.app_cluster.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.app_cluster.token
+ exec {
+ api_version = "client.authentication.k8s.io/v1beta1"
+ args = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.app_cluster.name]
+ command = "aws"
+ }
+ }
+}
+```
+
+This comprehensive setup ensures a smooth transition from the Pre-Operator to the Post-Operator configuration, leveraging new efficiencies and capabilities enabled by the operator model.
diff --git a/docs/operator-migration/variables.tf b/docs/operator-migration/variables.tf
new file mode 100644
index 00000000..f7c3bd7e
--- /dev/null
+++ b/docs/operator-migration/variables.tf
@@ -0,0 +1,145 @@
+variable "namespace" {
+ type = string
+ description = "Name prefix used for resources"
+}
+
+variable "domain_name" {
+ type = string
+ description = "Domain name used to access instance."
+}
+
+variable "zone_id" {
+ type = string
+ description = "Id of Route53 zone"
+}
+
+variable "size" {
+ default = "small"
+ description = "Deployment size"
+ nullable = true
+ type = string
+}
+
+variable "subdomain" {
+ type = string
+ default = null
+ description = "Subdomain for accessing the Weights & Biases UI."
+}
+
+variable "wandb_license" {
+ type = string
+}
+
+variable "database_engine_version" {
+  description = "Version for MySQL Aurora"
+ type = string
+ default = "8.0.mysql_aurora.3.03.0"
+}
+
+variable "database_instance_class" {
+ description = "Instance type to use by database master instance."
+ type = string
+ default = "db.r5.large"
+}
+
+variable "database_snapshot_identifier" {
+ description = "Specifies whether or not to create this cluster from a snapshot. You can use either the name or ARN when specifying a DB cluster snapshot, or the ARN when specifying a DB snapshot"
+ type = string
+ default = null
+}
+
+variable "database_sort_buffer_size" {
+ description = "Specifies the sort_buffer_size value to set for the database"
+ type = number
+ default = 262144
+}
+
+variable "wandb_version" {
+ description = "The version of Weights & Biases local to deploy."
+ type = string
+ default = "latest"
+}
+
+variable "wandb_image" {
+  description = "Docker repository to pull the wandb image from."
+ type = string
+ default = "wandb/local"
+}
+
+variable "bucket_name" {
+ type = string
+ default = ""
+}
+
+variable "bucket_kms_key_arn" {
+ type = string
+ description = "The Amazon Resource Name of the KMS key with which S3 storage bucket objects will be encrypted."
+ default = ""
+}
+
+variable "enable_dummy_dns" {
+ type = bool
+ default = false
+  description = "Boolean indicating whether or not to enable dummy DNS for the old ALB"
+}
+
+variable "enable_operator_alb" {
+ type = bool
+ default = false
+  description = "Boolean indicating whether to use the operator ALB (true) or not (false)."
+}
+
+variable "custom_domain_filter" {
+ description = "A custom domain filter to be used by external-dns instead of the default FQDN. If not set, the local FQDN is used."
+ type = string
+ default = null
+}
+
+variable "allowed_inbound_cidr" {
+ default = ["0.0.0.0/0"]
+ nullable = false
+ type = list(string)
+}
+
+
+variable "allowed_inbound_ipv6_cidr" {
+ default = ["::/0"]
+ nullable = false
+ type = list(string)
+}
+
+variable "other_wandb_env" {
+ type = map(string)
+ description = "Extra environment variables for W&B"
+ default = {}
+}
+
+variable "system_reserved_cpu_millicores" {
+ description = "(Optional) The amount of 'system-reserved' CPU millicores to pass to the kubelet. For example: 100. A value of -1 disables the flag."
+ type = number
+ default = -1
+}
+
+variable "system_reserved_memory_megabytes" {
+ description = "(Optional) The amount of 'system-reserved' memory in megabytes to pass to the kubelet. For example: 100. A value of -1 disables the flag."
+ type = number
+ default = -1
+}
+
+variable "system_reserved_ephemeral_megabytes" {
+ description = "(Optional) The amount of 'system-reserved' ephemeral storage in megabytes to pass to the kubelet. For example: 1000. A value of -1 disables the flag."
+ type = number
+ default = -1
+}
+
+variable "system_reserved_pid" {
+ description = "(Optional) The amount of 'system-reserved' process ids [pid] to pass to the kubelet. For example: 1000. A value of -1 disables the flag."
+ type = number
+ default = -1
+}
+
+variable "aws_loadbalancer_controller_tags" {
+ description = "(Optional) A map of AWS tags to apply to all resources managed by the load balancer controller"
+ type = map(string)
+ default = {}
+}
diff --git a/main.tf b/main.tf
index 3b6fc78d..e551263e 100644
--- a/main.tf
+++ b/main.tf
@@ -222,6 +222,15 @@ locals {
lb_name_truncated = "${substr(var.namespace, 0, local.max_lb_name_length)}-alb-k8s"
}
+data "aws_region" "current" {}
+
+module "iam_role" {
+ count = var.enable_yace ? 1 : 0
+ source = "./modules/iam_role"
+ namespace = var.namespace
+ aws_iam_openid_connect_provider_url = module.app_eks.aws_iam_openid_connect_provider
+}
+
module "wandb" {
source = "wandb/wandb/helm"
version = "1.2.0"
@@ -300,6 +309,53 @@ module "wandb" {
}, var.app_wandb_env)
}
+ # To support otel rds and redis metrics need operator-wandb chart minimum version 0.13.8 ( yace subchart)
+ yace = var.enable_yace ? {
+ install = true
+ regions = [data.aws_region.current.name]
+ serviceAccount = { annotations = { "eks.amazonaws.com/role-arn" = module.iam_role[0].role_arn} }
+ } : {
+ install = false
+ regions = []
+ serviceAccount = {}
+ }
+
+ otel = {
+ daemonset = var.enable_yace ? {
+ config = {
+ receivers = {
+ prometheus = {
+ config = {
+ scrape_configs = [
+ { job_name = "yace"
+ scheme = "http"
+ metrics_path = "/metrics"
+ dns_sd_configs = [
+ { names = ["yace"]
+ type = "A"
+ port = 5000
+ }
+ ]
+ }
+ ]
+ }
+ }
+ }
+ service = {
+ pipelines = {
+ metrics = {
+ receivers = ["hostmetrics", "k8s_cluster", "kubeletstats", "prometheus"]
+ }
+ }
+ }
+ }
+ } : { config = {
+ receivers = {}
+ service = {}
+ }
+ }
+ }
+
mysql = { install = false }
redis = { install = false }
diff --git a/modules/app_eks/add-ons.tf b/modules/app_eks/add-ons.tf
index 4f77765b..cc43043d 100644
--- a/modules/app_eks/add-ons.tf
+++ b/modules/app_eks/add-ons.tf
@@ -1,4 +1,3 @@
-
### IAM policy and role for vpc-cni
data "aws_iam_policy_document" "oidc_assume_role" {
statement {
@@ -39,8 +38,7 @@ resource "aws_eks_addon" "aws_efs_csi_driver" {
addon_name = "aws-efs-csi-driver"
addon_version = "v2.0.3-eksbuild.1"
resolve_conflicts = "OVERWRITE"
-
- }
+}
resource "aws_eks_addon" "aws_ebs_csi_driver" {
depends_on = [
@@ -50,7 +48,6 @@ resource "aws_eks_addon" "aws_ebs_csi_driver" {
addon_name = "aws-ebs-csi-driver"
addon_version = "v1.31.0-eksbuild.1"
resolve_conflicts = "OVERWRITE"
-
}
resource "aws_eks_addon" "coredns" {
@@ -74,9 +71,9 @@ resource "aws_eks_addon" "kube_proxy" {
}
resource "aws_eks_addon" "vpc_cni" {
- cluster_name = var.namespace
- addon_name = "vpc-cni"
- addon_version = "v1.18.0-eksbuild.1"
- resolve_conflicts = "OVERWRITE"
+ cluster_name = var.namespace
+ addon_name = "vpc-cni"
+ addon_version = "v1.18.0-eksbuild.1"
+ resolve_conflicts = "OVERWRITE"
service_account_role_arn = aws_iam_role.oidc.arn
-}
\ No newline at end of file
+}
diff --git a/modules/app_eks/outputs.tf b/modules/app_eks/outputs.tf
index 304b51db..cc791455 100644
--- a/modules/app_eks/outputs.tf
+++ b/modules/app_eks/outputs.tf
@@ -17,3 +17,7 @@ output "node_role" {
output "primary_workers_security_group_id" {
value = aws_security_group.primary_workers.id
}
+
+output "aws_iam_openid_connect_provider" {
+ value = aws_iam_openid_connect_provider.eks.url
+}
\ No newline at end of file
diff --git a/modules/iam_role/main.tf b/modules/iam_role/main.tf
new file mode 100644
index 00000000..5d982e56
--- /dev/null
+++ b/modules/iam_role/main.tf
@@ -0,0 +1,51 @@
+data "aws_caller_identity" "current" {}
+
+resource "aws_iam_role" "irsa" {
+ name = "${var.namespace}-yace-irsa-role"
+ assume_role_policy = jsonencode({
+ Version = "2012-10-17"
+ Statement = [
+ {
+ Sid = ""
+ Effect = "Allow"
+ Principal = {
+ Federated = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/${var.aws_iam_openid_connect_provider_url}"
+ }
+ Action = ["sts:AssumeRoleWithWebIdentity"]
+ Condition = {
+ StringLike = {
+ "${var.aws_iam_openid_connect_provider_url}:sub" = "system:serviceaccount:*:yace"
+ "${var.aws_iam_openid_connect_provider_url}:aud" = "sts.amazonaws.com"
+ }
+ }
+ }
+ ]
+ })
+}
+
+
+resource "aws_iam_policy" "irsa" {
+ name = "${var.namespace}-yace-irsa-policy"
+ description = "IRSA IAM Policy"
+
+ policy = jsonencode({
+ Version = "2012-10-17"
+ Statement = [
+ {
+ Effect = "Allow"
+ Action = [
+ "tag:GetResources",
+ "cloudwatch:GetMetricData",
+ "cloudwatch:GetMetricStatistics",
+ "cloudwatch:ListMetrics"
+ ]
+ Resource = "*"
+ }
+ ]
+ })
+}
+
+resource "aws_iam_role_policy_attachment" "default" {
+ role = aws_iam_role.irsa.name
+ policy_arn = aws_iam_policy.irsa.arn
+}
\ No newline at end of file
diff --git a/modules/iam_role/outputs.tf b/modules/iam_role/outputs.tf
new file mode 100644
index 00000000..989f8c0c
--- /dev/null
+++ b/modules/iam_role/outputs.tf
@@ -0,0 +1,3 @@
+output "role_arn" {
+ value = aws_iam_role.irsa.arn
+}
\ No newline at end of file
diff --git a/modules/iam_role/variables.tf b/modules/iam_role/variables.tf
new file mode 100644
index 00000000..4cd10397
--- /dev/null
+++ b/modules/iam_role/variables.tf
@@ -0,0 +1,8 @@
+variable "namespace" {
+ type = string
+ description = "The name prefix for all resources created."
+}
+
+variable "aws_iam_openid_connect_provider_url" {
+ type = string
+}
\ No newline at end of file
diff --git a/variables.tf b/variables.tf
index 067fd9f5..bf75219d 100644
--- a/variables.tf
+++ b/variables.tf
@@ -441,3 +441,9 @@ variable "parquet_wandb_env" {
description = "Extra environment variables for W&B"
default = {}
}
+
+variable "enable_yace" {
+ type = bool
+  description = "Deploy YACE (Yet Another CloudWatch Exporter) to fetch AWS resource metrics"
+ default = true
+}
\ No newline at end of file