diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9555cdcb..9734eb7e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,41 @@
All notable changes to this project will be documented in this file.
+### [4.12.2](https://github.com/wandb/terraform-aws-wandb/compare/v4.12.1...v4.12.2) (2024-06-17)
+
+
+### Bug Fixes
+
+* Revert resolve conflicts var ([#233](https://github.com/wandb/terraform-aws-wandb/issues/233)) ([778f147](https://github.com/wandb/terraform-aws-wandb/commit/778f147aa9962fde6a74b7d35501ec7dd7abf2a9))
+
+### [4.12.1](https://github.com/wandb/terraform-aws-wandb/compare/v4.12.0...v4.12.1) (2024-06-17)
+
+
+### Bug Fixes
+
+* Remove white space ([#231](https://github.com/wandb/terraform-aws-wandb/issues/231)) ([974b4f3](https://github.com/wandb/terraform-aws-wandb/commit/974b4f3ec0d01b34cf6d83008c9fe2a0d3d8ee7a))
+
+## [4.12.0](https://github.com/wandb/terraform-aws-wandb/compare/v4.11.0...v4.12.0) (2024-06-17)
+
+
+### Features
+
+* Added support yace ([#218](https://github.com/wandb/terraform-aws-wandb/issues/218)) ([12e053d](https://github.com/wandb/terraform-aws-wandb/commit/12e053d520f6998689d3bec0352b320a9105ba9e))
+
+## [4.11.0](https://github.com/wandb/terraform-aws-wandb/compare/v4.10.2...v4.11.0) (2024-05-18)
+
+
+### Features
+
+* Changes to Connect to AWS S3 and KMS using IAM role for EKS service account ([#186](https://github.com/wandb/terraform-aws-wandb/issues/186)) ([a07a45e](https://github.com/wandb/terraform-aws-wandb/commit/a07a45e6d5b979ec2ef8fbb79b63a5d15867da08))
+
+### [4.10.2](https://github.com/wandb/terraform-aws-wandb/compare/v4.10.1...v4.10.2) (2024-05-13)
+
+
+### Bug Fixes
+
+* Amend standard sizes ([#214](https://github.com/wandb/terraform-aws-wandb/issues/214)) ([a1763f9](https://github.com/wandb/terraform-aws-wandb/commit/a1763f93ef507a99e76940fc8c7a0223b5498ff3))
+
### [4.10.1](https://github.com/wandb/terraform-aws-wandb/compare/v4.10.0...v4.10.1) (2024-05-08)
diff --git a/NOTICE b/NOTICE
index cfcc8b0e..4b964c25 100644
--- a/NOTICE
+++ b/NOTICE
@@ -10,4 +10,6 @@
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
- limitations under the License.
\ No newline at end of file
+ limitations under the License.
+
+ .
\ No newline at end of file
diff --git a/README.md b/README.md
index 0bc592c9..3f9de70e 100644
--- a/README.md
+++ b/README.md
@@ -102,13 +102,14 @@ resources that lack official modules.
Users can update the EKS cluster version to the latest version offered by AWS. This can be done using the Terraform variable `eks_cluster_version`. Note that cluster and node group version updates can only be done in increments of one version at a time. For example, if your current cluster version is `1.21` and the latest version available is `1.25`, you'd need to:
1. update the cluster version in the app_eks module from `1.21` to `1.22`
-2. run `terraform apply`
+2. run `terraform apply`
3. update the cluster version to `1.23`
4. run `terraform apply`
5. update the cluster version to `1.24`
-...and so on and so forth.
+ ...and so on and so forth.
Upgrades must be executed in step-wise fashion from one version to the next. You cannot skip versions when upgrading EKS.
+
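+A single step of that loop looks like this (the module name and version constraint are illustrative):
+
+```hcl
+module "wandb_infra" {
+  source  = "wandb/wandb/aws"
+  version = "~> 4.0"
+
+  # bump exactly one minor version per `terraform apply`
+  eks_cluster_version = "1.22"
+}
+```
+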
### Notes on EKS Add-ons
@@ -252,7 +253,11 @@ CLI and re-run the apply. Running pods will not be impacted.
## Migrations
-#### Upgrading from 3.x -> 4.x
+### Upgrading to Operator
+
+See our upgrade guide [here](./docs/operator-migration/readme.md).
+
+### Upgrading from 3.x -> 4.x
- If egress access for retrieving the wandb/controller image is not available, Terraform apply may experience failures.
- It's necessary to supply a license variable within the module, as shown:
diff --git a/deployment-size.tf b/deployment-size.tf
index 60a93f2a..f6aedbe6 100644
--- a/deployment-size.tf
+++ b/deployment-size.tf
@@ -7,19 +7,19 @@ locals {
deployment_size = {
small = {
db = "db.r6g.large",
- node_count = 3,
+ node_count = 2,
node_instance = "r6i.xlarge"
cache = "cache.m6g.large"
},
medium = {
db = "db.r6g.xlarge",
- node_count = 3,
+ node_count = 2,
node_instance = "r6i.xlarge"
cache = "cache.m6g.large"
},
large = {
db = "db.r6g.2xlarge",
- node_count = 3,
+ node_count = 2,
node_instance = "r6i.2xlarge"
cache = "cache.m6g.xlarge"
},
diff --git a/docs/operator-migration/images/post-operator-apply.png b/docs/operator-migration/images/post-operator-apply.png
new file mode 100644
index 00000000..9ac05b96
Binary files /dev/null and b/docs/operator-migration/images/post-operator-apply.png differ
diff --git a/docs/operator-migration/images/post-operator-k8s.svg b/docs/operator-migration/images/post-operator-k8s.svg
new file mode 100644
index 00000000..20ebd449
--- /dev/null
+++ b/docs/operator-migration/images/post-operator-k8s.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/operator-migration/images/pre-operator-infra.svg b/docs/operator-migration/images/pre-operator-infra.svg
new file mode 100644
index 00000000..c1b474ea
--- /dev/null
+++ b/docs/operator-migration/images/pre-operator-infra.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/operator-migration/images/pre-operator-k8s.svg b/docs/operator-migration/images/pre-operator-k8s.svg
new file mode 100644
index 00000000..93719397
--- /dev/null
+++ b/docs/operator-migration/images/pre-operator-k8s.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/operator-migration/post-operator.tf.disabled b/docs/operator-migration/post-operator.tf.disabled
new file mode 100644
index 00000000..62ae09e3
--- /dev/null
+++ b/docs/operator-migration/post-operator.tf.disabled
@@ -0,0 +1,113 @@
+provider "aws" {
+ region = "us-west-2"
+
+ default_tags {
+ tags = {
+      GithubRepo  = "terraform-aws-wandb"
+      GithubOrg   = "wandb"
+      Environment = "Example"
+      Example     = "PublicDnsExternal"
+ }
+ }
+}
+
+terraform {
+ required_version = "~> 1.0"
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = "~> 4.0" # Post-Operator
+ }
+ kubernetes = {
+ source = "hashicorp/kubernetes"
+ version = "~> 2.23"
+ }
+ }
+}
+
+module "wandb_infra" {
+ source = "wandb/wandb/aws"
+ version = "4.7.2"
+
+ namespace = var.namespace
+ public_access = true
+ external_dns = true
+
+ enable_dummy_dns = var.enable_dummy_dns # Post-Operator
+ enable_operator_alb = var.enable_operator_alb # Post-Operator
+ deletion_protection = false
+
+ database_instance_class = var.database_instance_class
+ database_engine_version = var.database_engine_version
+ database_snapshot_identifier = var.database_snapshot_identifier
+ database_sort_buffer_size = var.database_sort_buffer_size
+
+ database_performance_insights_kms_key_arn = null
+
+ allowed_inbound_cidr = var.allowed_inbound_cidr
+ allowed_inbound_ipv6_cidr = ["::/0"]
+
+ eks_cluster_version = "1.25"
+ kubernetes_public_access = true
+ kubernetes_public_access_cidrs = ["0.0.0.0/0"]
+
+ domain_name = var.domain_name
+ zone_id = var.zone_id
+ subdomain = var.subdomain
+
+ # Add License Post-Operator
+ license = var.wandb_license
+
+ # Use standard sizing Post-Operator
+ size = var.size
+
+ # Set the External DNS Custom Domain Filter Post-Operator
+ custom_domain_filter = var.custom_domain_filter
+
+ bucket_name = var.bucket_name
+ bucket_kms_key_arn = var.bucket_kms_key_arn
+ use_internal_queue = true
+
+ aws_loadbalancer_controller_tags = var.aws_loadbalancer_controller_tags
+}
+
+data "aws_eks_cluster" "app_cluster" {
+ name = module.wandb_infra.cluster_id
+}
+
+data "aws_eks_cluster_auth" "app_cluster" {
+ name = module.wandb_infra.cluster_id
+}
+
+provider "kubernetes" {
+ host = data.aws_eks_cluster.app_cluster.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.app_cluster.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.app_cluster.token
+ exec {
+ api_version = "client.authentication.k8s.io/v1beta1"
+ args = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.app_cluster.name]
+ command = "aws"
+ }
+}
+
+# Enable the Helm provider
+provider "helm" {
+ kubernetes {
+ host = data.aws_eks_cluster.app_cluster.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.app_cluster.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.app_cluster.token
+ exec {
+ api_version = "client.authentication.k8s.io/v1beta1"
+ args = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.app_cluster.name]
+ command = "aws"
+ }
+ }
+}
+
+output "bucket_name" {
+ value = module.wandb_infra.bucket_name
+}
+
+output "bucket_queue_name" {
+ value = module.wandb_infra.bucket_queue_name
+}
diff --git a/docs/operator-migration/post-operator.tfvars b/docs/operator-migration/post-operator.tfvars
new file mode 100644
index 00000000..9c3ae571
--- /dev/null
+++ b/docs/operator-migration/post-operator.tfvars
@@ -0,0 +1,12 @@
+namespace = "operator-upgrade"
+domain_name = "sandbox-aws.wandb.ml"
+zone_id = "Z032246913CW32RVRY0WU"
+subdomain = "operator-upgrade"
+wandb_license = "eyJh"
+# wandb_version = "0.51.2" is now managed via the Release Channel or set in the User Spec.
+
+# Needed Operator Variables for Upgrade
+size = "small"
+enable_dummy_dns = true
+enable_operator_alb = true
+custom_domain_filter = "sandbox-aws.wandb.ml"
\ No newline at end of file
diff --git a/docs/operator-migration/pre-operator.tf b/docs/operator-migration/pre-operator.tf
new file mode 100644
index 00000000..40bf5a2b
--- /dev/null
+++ b/docs/operator-migration/pre-operator.tf
@@ -0,0 +1,112 @@
+provider "aws" {
+ region = "us-west-2"
+
+ default_tags {
+ tags = {
+      GithubRepo  = "terraform-aws-wandb"
+      GithubOrg   = "wandb"
+      Environment = "Example"
+      Example     = "PublicDnsExternal"
+ }
+ }
+}
+
+terraform {
+ required_version = "~> 1.0"
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = "~> 3.6" # Pre-Operator
+ }
+ kubernetes = {
+ source = "hashicorp/kubernetes"
+ version = "~> 2.23"
+ }
+ }
+}
+
+module "wandb_infra" {
+ source = "wandb/wandb/aws"
+ version = "1.16.10"
+
+ namespace = var.namespace
+ public_access = true
+ external_dns = true
+
+ deletion_protection = false
+
+ database_instance_class = var.database_instance_class
+ database_engine_version = var.database_engine_version
+ database_snapshot_identifier = var.database_snapshot_identifier
+ database_sort_buffer_size = var.database_sort_buffer_size
+
+ database_performance_insights_kms_key_arn = null
+
+ allowed_inbound_cidr = var.allowed_inbound_cidr
+ allowed_inbound_ipv6_cidr = ["::/0"]
+
+ eks_cluster_version = "1.25"
+ kubernetes_public_access = true
+ kubernetes_public_access_cidrs = ["0.0.0.0/0"]
+
+ domain_name = var.domain_name
+ zone_id = var.zone_id
+ subdomain = var.subdomain
+
+ bucket_name = var.bucket_name
+ bucket_kms_key_arn = var.bucket_kms_key_arn
+ use_internal_queue = true
+}
+
+data "aws_eks_cluster" "app_cluster" {
+ name = module.wandb_infra.cluster_id
+}
+
+data "aws_eks_cluster_auth" "app_cluster" {
+ name = module.wandb_infra.cluster_id
+}
+
+provider "kubernetes" {
+ host = data.aws_eks_cluster.app_cluster.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.app_cluster.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.app_cluster.token
+ exec {
+ api_version = "client.authentication.k8s.io/v1beta1"
+ args = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.app_cluster.name]
+ command = "aws"
+ }
+}
+
+module "wandb_app" {
+ source = "wandb/wandb/kubernetes"
+ version = "1.12.0"
+
+ license = var.wandb_license
+
+ host = module.wandb_infra.url
+ bucket = "s3://${module.wandb_infra.bucket_name}"
+ bucket_aws_region = module.wandb_infra.bucket_region
+ bucket_queue = "internal://"
+ bucket_kms_key_arn = module.wandb_infra.kms_key_arn
+ database_connection_string = "mysql://${module.wandb_infra.database_connection_string}"
+ redis_connection_string = "redis://${module.wandb_infra.elasticache_connection_string}?tls=true&ttlInSeconds=604800"
+
+ wandb_image = var.wandb_image
+ wandb_version = var.wandb_version
+
+ service_port = module.wandb_infra.internal_app_port
+
+ depends_on = [module.wandb_infra]
+
+ other_wandb_env = merge({
+ "GORILLA_CUSTOMER_SECRET_STORE_SOURCE" = "aws-secretmanager://${var.namespace}?namespace=${var.namespace}"
+ }, var.other_wandb_env)
+}
+
+output "bucket_name" {
+ value = module.wandb_infra.bucket_name
+}
+
+output "bucket_queue_name" {
+ value = module.wandb_infra.bucket_queue_name
+}
diff --git a/docs/operator-migration/pre-operator.tfvars b/docs/operator-migration/pre-operator.tfvars
new file mode 100644
index 00000000..7cfb15a1
--- /dev/null
+++ b/docs/operator-migration/pre-operator.tfvars
@@ -0,0 +1,7 @@
+namespace = "operator-upgrade"
+domain_name = "sandbox-aws.wandb.ml"
+zone_id = "Z032246913CW32RVRY0WU"
+subdomain = "operator-upgrade"
+wandb_license = "eyJh"
+wandb_version = "0.51.2"
+# size = "small"
\ No newline at end of file
diff --git a/docs/operator-migration/readme.md b/docs/operator-migration/readme.md
new file mode 100644
index 00000000..10179d04
--- /dev/null
+++ b/docs/operator-migration/readme.md
@@ -0,0 +1,267 @@
+# Operator Migration
+
+This guide details the steps required to upgrade from **_pre-operator_** to **_post-operator_** environments using the [terraform-aws-wandb](https://registry.terraform.io/modules/wandb/wandb/aws/latest) module.
+
+## Introduction to Operator Shift
+
+The transition to a Kubernetes [operator](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/) pattern is crucial for our architecture. This section explains the transition from **_pre_** to **_post_** architectures.
+
+### Reasons for the Architecture Shift
+
+Historically, the W&B application was deployed as a single Deployment and pod within a Kubernetes Cluster or Docker container. We have always recommended externalizing the Metadata Store and Object Store to decouple state from the application, especially in production environments.
+
+As the application grew, the need to evolve from a monolithic container to a distributed system became apparent. This change makes backend logic easier to manage and seamlessly introduces **_in-kubernetes_** infrastructure capabilities. It also supports deploying new services essential for additional features that W&B relies on.
+
+Previously, any Kubernetes-related changes required updating the [terraform-kubernetes-wandb](https://github.com/wandb/terraform-kubernetes-wandb) module, ensuring compatibility across cloud providers, configuring the necessary Terraform variables, and executing a `terraform apply` for each backend or Kubernetes-level change. This process was not scalable and placed a significant burden on our support staff to assist customers with upgrades.
+
+The solution was to implement an **_Operator_** that connects to a central [deploy.wandb.ai](https://deploy.wandb.ai) server with its `license` to request the latest specification changes for a given **_Release Channel_** and apply them. Helm was chosen as both the deployment mechanism for our operator and the means for the operator to handle all configuration templating of the W&B Kubernetes stack; Helmception.
+
+You can install the operator from [charts/operator](https://github.com/wandb/helm-charts/tree/main/charts/operator). This installation creates a deployment called `controller-manager` and utilizes a **_Custom Resource_** definition named `weightsandbiases.apps.wandb.com` (shortName: `wandb`), which takes a single `spec` and applies it to the cluster:
+
+```yaml
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+ name: weightsandbiases.apps.wandb.com
+```
+
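+A minimal install sketch with Helm (repo URL and chart name as published in the helm-charts repository; the namespace is illustrative):
+
+```bash
+helm repo add wandb https://charts.wandb.ai
+helm repo update
+helm install operator wandb/operator --namespace wandb --create-namespace
+```
+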
+The `controller-manager` installs [charts/operator-wandb](https://github.com/wandb/helm-charts/tree/main/charts/operator-wandb) based on the spec of the **_Custom Resource_**, **_Release Channel_**, and a **_User Defined Config_** in the new **_System Console_**. This hierarchy allows for maximum configuration flexibility at the user end and enables W&B to release new images, configurations, features, and Helm updates without requiring Terraform reruns.
+
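+A hedged sketch of such a Custom Resource (the `apiVersion` and the `spec.values` passthrough are assumptions based on the CRD above, not a schema reference):
+
+```yaml
+apiVersion: apps.wandb.com/v1
+kind: WeightsAndBiases
+metadata:
+  name: wandb
+spec:
+  values: {} # chart values supplied by Terraform or the System Console
+```
+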
+## Before and After Architecture
+
+Previously, our architecture used:
+
+```hcl
+module "wandb_infra" {
+ source = "wandb/wandb/aws"
+ version = "1.16.10"
+ ...
+}
+```
+
+to control the infrastructure:
+
+![pre-operator-infra](./images/pre-operator-infra.svg)
+
+and this module to deploy the W&B application:
+
+```hcl
+module "wandb_app" {
+ source = "wandb/wandb/kubernetes"
+ version = "1.12.0"
+}
+```
+
+![pre-operator-k8s](./images/pre-operator-k8s.svg)
+
+Post-transition, the architecture uses:
+
+```hcl
+module "wandb_infra" {
+ source = "wandb/wandb/aws"
+ version = "4.7.2"
+ ...
+}
+```
+
+to manage both the installation of infrastructure and the W&B application to the Kubernetes cluster, thus eliminating the need for the `module "wandb_app"` in `post-operator.tf`.
+
+![post-operator-k8s](./images/post-operator-k8s.svg)
+
+This architectural shift facilitates the introduction of additional customer features (like OpenTelemetry, Prometheus, HPAs, Kafka, and image updates) without requiring manual Terraform operations by SRE/Infrastructure teams.
+
+### Specification Hierarchy
+
+In our operator model, configuration specifications follow a hierarchical model where higher-level specifications override lower-level ones. Here’s how it works:
+
+- **Release Channel Spec**: This base level configuration sets default values and configurations based on the **_Release Channel_** set by W&B for the deployment.
+- **User Input Spec**: Users can override the default settings provided by the Release Channel Spec through the System Console.
+- **Custom Resource Spec**: The highest level of specification, which comes from the Terraform configuration. Any values specified here will override both the User Input and Release Channel specifications.
+
+This hierarchical model ensures that configurations are flexible and customizable to meet varying needs while maintaining a manageable and systematic approach to upgrades and changes.
+
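+As a hypothetical illustration (key names invented for clarity, not taken from the chart schema), the same setting defined at all three levels resolves in favor of the Custom Resource:
+
+```yaml
+# Release Channel Spec (W&B default):  image.tag = 0.51.0
+# User Input Spec (System Console):    image.tag = 0.51.1  # overrides the channel
+# Custom Resource Spec (Terraform):    image.tag = 0.51.2  # wins
+```
+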
+## Migration
+
+To start from a base **_pre-operator_** installation of W&B, ensure that `post-operator.tf` carries a `.disabled` file extension and that `pre-operator.tf` is active (i.e., does not have a `.disabled` extension).
+
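+For example, from this directory:
+
+```bash
+# make pre-operator the active configuration (no-ops if already in that state)
+mv post-operator.tf post-operator.tf.disabled 2>/dev/null || true
+mv pre-operator.tf.disabled pre-operator.tf   2>/dev/null || true
+```
+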
+### Prerequisites
+
+Before initiating the migration process, ensure the following prerequisites are met:
+
+- **Egress**: The deployment cannot be airgapped. It needs access to [deploy.wandb.ai](https://deploy.wandb.ai) to fetch the latest spec for the **_Release Channel_**.
+- **AWS Credentials**: Proper AWS credentials configured to interact with your AWS resources.
+- **Terraform Installed**: The latest version of Terraform should be installed on your system.
+- **Route53 Hosted Zone**: An existing Route53 hosted zone corresponding to the domain under which the application will be served.
+- **Pre-Operator Terraform Files**: Ensure `pre-operator.tf` and associated variable files like `pre-operator.tfvars` are correctly set up.
+
+### Pre-Operator Setup
+
+Execute the following Terraform commands to initialize and apply the configuration for the Pre-Operator setup:
+
+```bash
+terraform init -upgrade
+terraform apply -var-file=./pre-operator.tfvars
+```
+
+`pre-operator.tfvars` should look something like this:
+
+```ini
+namespace = "operator-upgrade"
+domain_name = "sandbox-aws.wandb.ml"
+zone_id = "Z032246913CW32RVRY0WU"
+subdomain = "operator-upgrade"
+wandb_license = "ey..."
+wandb_version = "0.51.2"
+```
+
+The `pre-operator.tf` configuration calls two modules:
+
+```hcl
+module "wandb_infra" {
+ source = "wandb/wandb/aws"
+ version = "1.16.10"
+ ...
+}
+```
+
+This module spins up the infrastructure.
+
+```hcl
+module "wandb_app" {
+ source = "wandb/wandb/kubernetes"
+ version = "1.12.0"
+}
+```
+
+This module deploys the application.
+
+### Post-Operator Setup
+
+Make sure that `pre-operator.tf` has a `.disabled` extension, and `post-operator.tf` is active.
+
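+For example:
+
+```bash
+mv pre-operator.tf pre-operator.tf.disabled
+mv post-operator.tf.disabled post-operator.tf
+```
+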
+The `post-operator.tfvars` includes additional variables:
+
+```ini
+...
+# wandb_version = "0.51.2" is now managed via the Release Channel or set in the User Spec.
+
+# Required Operator Variables for Upgrade:
+size = "small"
+enable_dummy_dns = true
+enable_operator_alb = true
+custom_domain_filter = "sandbox-aws.wandb.ml"
+```
+
+Run the following commands to initialize and apply the Post-Operator configuration:
+
+```bash
+terraform init -upgrade
+terraform apply -var-file=./post-operator.tfvars
+```
+
+The plan and apply steps will update the following resources:
+
+```yaml
+actions:
+ create:
+ - aws_efs_backup_policy.storage_class
+ - aws_efs_file_system.storage_class
+ - aws_efs_mount_target.storage_class["0"]
+ - aws_efs_mount_target.storage_class["1"]
+ - aws_eks_addon.efs
+ - aws_iam_openid_connect_provider.eks
+ - aws_iam_policy.secrets_manager
+ - aws_iam_role_policy_attachment.ebs_csi
+ - aws_iam_role_policy_attachment.eks_efs
+ - aws_iam_role_policy_attachment.node_secrets_manager
+ - aws_security_group.storage_class_nfs
+ - aws_security_group_rule.nfs_ingress
+ - random_pet.efs
+ - aws_s3_bucket_acl.file_storage
+ - aws_s3_bucket_cors_configuration.file_storage
+ - aws_s3_bucket_ownership_controls.file_storage
+ - aws_s3_bucket_server_side_encryption_configuration.file_storage
+ - helm_release.operator
+ - helm_release.wandb
+ - aws_cloudwatch_log_group.this[0]
+ - aws_iam_policy.default
+ - aws_iam_role.default
+ - aws_iam_role_policy_attachment.default
+ - helm_release.external_dns
+ - aws_default_network_acl.this[0]
+ - aws_default_route_table.default[0]
+ - aws_iam_policy.default
+ - aws_iam_role.default
+ - aws_iam_role_policy_attachment.default
+ - helm_release.aws_load_balancer_controller
+
+ update_in_place:
+ - aws_iam_policy.node_IMDSv2
+ - aws_iam_policy.node_cloudwatch
+ - aws_iam_policy.node_kms
+ - aws_iam_policy.node_s3
+ - aws_iam_policy.node_sqs
+ - aws_eks_cluster.this[0]
+ - aws_elasticache_replication_group.default
+ - aws_rds_cluster.this[0]
+ - aws_rds_cluster_instance.this["1"]
+ - aws_default_security_group.this[0]
+ - aws_subnet.private[0]
+ - aws_subnet.private[1]
+ - aws_subnet.public[0]
+ - aws_subnet.public[1]
+ - aws_launch_template.workers["primary"]
+
+ destroy:
+ - kubernetes_config_map.config_map
+ - kubernetes_deployment.wandb
+ - kubernetes_priority_class.priority
+ - kubernetes_secret.secret
+ - kubernetes_service.prometheus
+ - kubernetes_service.service
+ - random_id.snapshot_identifier[0]
+
+ replace:
+ - aws_autoscaling_attachment.autoscaling_attachment["primary"]
+ - aws_route53_record.alb
+ - aws_eks_node_group.workers["primary"]
+```
+
+You should see something like this:
+
+![post-operator-apply](./images/post-operator-apply.png)
+
+Note that `post-operator.tf` now contains a single module declaration:
+
+```hcl
+module "wandb_infra" {
+ source = "wandb/wandb/aws"
+ version = "4.7.2"
+ ...
+}
+```
+
+#### Changes in the Post-Operator Configuration:
+
+1. **Update Required Providers**: Change `required_providers.aws.version` from `3.6` to `4.0` for provider compatibility.
+2. **DNS and Load Balancer Configuration**: Integrate `enable_dummy_dns` and `enable_operator_alb` to manage DNS records and AWS Load Balancer setup through an Ingress.
+3. **License and Size Configuration**: Transfer the `license` and `size` parameters directly to the `wandb_infra` module to match new operational requirements.
+4. **Custom Domain Handling**: If necessary, use `custom_domain_filter` to troubleshoot DNS issues by checking the External DNS pod logs within the `kube-system` namespace (see the example command below).
+5. **Helm Provider Configuration**: Enable and configure the Helm provider to manage Kubernetes resources effectively:
+
+```hcl
+provider "helm" {
+ kubernetes {
+ host = data.aws_eks_cluster.app_cluster.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.app_cluster.certificate_authority[0].data)
+ token = data.aws_eks_cluster_auth.app_cluster.token
+ exec {
+ api_version = "client.authentication.k8s.io/v1beta1"
+ args = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.app_cluster.name]
+ command = "aws"
+ }
+ }
+}
+```
+
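+For the External DNS check in step 4, a command like the following works (the label selector is an assumption based on the standard external-dns chart):
+
+```bash
+kubectl logs -n kube-system -l app.kubernetes.io/name=external-dns --tail=100
+```
+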
+This comprehensive setup ensures a smooth transition from the Pre-Operator to the Post-Operator configuration, leveraging new efficiencies and capabilities enabled by the operator model.
diff --git a/docs/operator-migration/variables.tf b/docs/operator-migration/variables.tf
new file mode 100644
index 00000000..f7c3bd7e
--- /dev/null
+++ b/docs/operator-migration/variables.tf
@@ -0,0 +1,145 @@
+variable "namespace" {
+ type = string
+ description = "Name prefix used for resources"
+}
+
+variable "domain_name" {
+ type = string
+ description = "Domain name used to access instance."
+}
+
+variable "zone_id" {
+ type = string
+ description = "Id of Route53 zone"
+}
+
+variable "size" {
+ default = "small"
+ description = "Deployment size"
+ nullable = true
+ type = string
+}
+
+variable "subdomain" {
+ type = string
+ default = null
+ description = "Subdomain for accessing the Weights & Biases UI."
+}
+
+variable "wandb_license" {
+ type = string
+}
+
+variable "database_engine_version" {
+ description = "Version for MySQL Auora"
+ type = string
+ default = "8.0.mysql_aurora.3.03.0"
+}
+
+variable "database_instance_class" {
+ description = "Instance type to use by database master instance."
+ type = string
+ default = "db.r5.large"
+}
+
+variable "database_snapshot_identifier" {
+ description = "Specifies whether or not to create this cluster from a snapshot. You can use either the name or ARN when specifying a DB cluster snapshot, or the ARN when specifying a DB snapshot"
+ type = string
+ default = null
+}
+
+variable "database_sort_buffer_size" {
+ description = "Specifies the sort_buffer_size value to set for the database"
+ type = number
+ default = 262144
+}
+
+variable "wandb_version" {
+ description = "The version of Weights & Biases local to deploy."
+ type = string
+ default = "latest"
+}
+
+variable "wandb_image" {
+ description = "Docker repository of to pull the wandb image from."
+ type = string
+ default = "wandb/local"
+}
+
+variable "bucket_name" {
+ type = string
+ default = ""
+}
+
+variable "bucket_kms_key_arn" {
+ type = string
+ description = "The Amazon Resource Name of the KMS key with which S3 storage bucket objects will be encrypted."
+ default = ""
+}
+
+variable "enable_dummy_dns" {
+ type = bool
+ default = false
+ description = "Boolean indicating whether or not to enable dummy DNS for the old alb"
+}
+
+variable "enable_operator_alb" {
+ type = bool
+ default = false
+ description = "Boolean indicating whether to use operatore ALB (true) or not (false)."
+}
+
+variable "custom_domain_filter" {
+ description = "A custom domain filter to be used by external-dns instead of the default FQDN. If not set, the local FQDN is used."
+ type = string
+ default = null
+}
+
+variable "allowed_inbound_cidr" {
+ default = ["0.0.0.0/0"]
+ nullable = false
+ type = list(string)
+}
+
+
+variable "allowed_inbound_ipv6_cidr" {
+ default = ["::/0"]
+ nullable = false
+ type = list(string)
+}
+
+variable "other_wandb_env" {
+ type = map(string)
+ description = "Extra environment variables for W&B"
+ default = {}
+}
+
+variable "system_reserved_cpu_millicores" {
+ description = "(Optional) The amount of 'system-reserved' CPU millicores to pass to the kubelet. For example: 100. A value of -1 disables the flag."
+ type = number
+ default = -1
+}
+
+variable "system_reserved_memory_megabytes" {
+ description = "(Optional) The amount of 'system-reserved' memory in megabytes to pass to the kubelet. For example: 100. A value of -1 disables the flag."
+ type = number
+ default = -1
+}
+
+variable "system_reserved_ephemeral_megabytes" {
+ description = "(Optional) The amount of 'system-reserved' ephemeral storage in megabytes to pass to the kubelet. For example: 1000. A value of -1 disables the flag."
+ type = number
+ default = -1
+}
+
+variable "system_reserved_pid" {
+ description = "(Optional) The amount of 'system-reserved' process ids [pid] to pass to the kubelet. For example: 1000. A value of -1 disables the flag."
+ type = number
+ default = -1
+}
+
+variable "aws_loadbalancer_controller_tags" {
+ description = "(Optional) A map of AWS tags to apply to all resources managed by the load balancer controller"
+ type = map(string)
+ default = {}
+}
diff --git a/main.tf b/main.tf
index 74002f07..97122925 100644
--- a/main.tf
+++ b/main.tf
@@ -231,6 +231,15 @@ locals {
lb_name_truncated = "${substr(var.namespace, 0, local.max_lb_name_length)}-alb-k8s"
}
+data "aws_region" "current" {}
+
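+# IAM role for the yace service account (IRSA); created only when enable_yace is true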
+module "iam_role" {
+ count = var.enable_yace ? 1 : 0
+ source = "./modules/iam_role"
+ namespace = var.namespace
+ aws_iam_openid_connect_provider_url = module.app_eks.aws_iam_openid_connect_provider
+}
+
module "wandb" {
source = "wandb/wandb/helm"
version = "1.2.0"
@@ -309,6 +318,53 @@ module "wandb" {
}, var.app_wandb_env)
}
+  # OTel RDS and Redis metrics require operator-wandb chart version >= 0.13.8 (yace subchart)
+ yace = var.enable_yace ? {
+ install = true
+ regions = [data.aws_region.current.name]
+    serviceAccount = { annotations = { "eks.amazonaws.com/role-arn" = module.iam_role[0].role_arn } }
+ } : {
+ install = false
+ regions = []
+ serviceAccount = {}
+ }
+
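+  # When yace is enabled, an OTel Prometheus receiver scrapes the yace service
+  # via DNS service discovery (port 5000) and joins the daemonset metrics pipeline.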
+ otel = {
+ daemonset = var.enable_yace ? {
+ config = {
+ receivers = {
+ prometheus = {
+ config = {
+ scrape_configs = [
+ { job_name = "yace"
+ scheme = "http"
+ metrics_path = "/metrics"
+ dns_sd_configs = [
+ { names = ["yace"]
+ type = "A"
+ port = 5000
+ }
+ ]
+ }
+ ]
+ }
+ }
+ }
+ service = {
+ pipelines = {
+ metrics = {
+ receivers = ["hostmetrics", "k8s_cluster", "kubeletstats", "prometheus"]
+ }
+ }
+ }
+ }
+    } : {
+      config = {
+        receivers = {}
+        service   = {}
+      }
+    }
+  }
+
mysql = { install = false }
redis = { install = false }
diff --git a/modules/app_eks/add-ons.tf b/modules/app_eks/add-ons.tf
index 52d50e4a..e3a5ae81 100644
--- a/modules/app_eks/add-ons.tf
+++ b/modules/app_eks/add-ons.tf
@@ -1,4 +1,3 @@
-
### IAM policy and role for vpc-cni
data "aws_iam_policy_document" "oidc_assume_role" {
statement {
@@ -37,9 +36,9 @@ resource "aws_eks_addon" "aws_efs_csi_driver" {
]
cluster_name = var.namespace
addon_name = "aws-efs-csi-driver"
- addon_version = "v1.7.7-eksbuild.1"
+ addon_version = "v2.0.4-eksbuild.1"
resolve_conflicts = "OVERWRITE"
- }
+}
resource "aws_eks_addon" "aws_ebs_csi_driver" {
depends_on = [
@@ -47,7 +46,7 @@ resource "aws_eks_addon" "aws_ebs_csi_driver" {
]
cluster_name = var.namespace
addon_name = "aws-ebs-csi-driver"
- addon_version = "v1.25.0-eksbuild.1"
+ addon_version = "v1.31.0-eksbuild.1"
resolve_conflicts = "OVERWRITE"
}
@@ -57,7 +56,7 @@ resource "aws_eks_addon" "coredns" {
]
cluster_name = var.namespace
addon_name = "coredns"
- addon_version = "v1.9.3-eksbuild.11"
+ addon_version = "v1.10.1-eksbuild.11"
resolve_conflicts = "OVERWRITE"
}
@@ -67,14 +66,14 @@ resource "aws_eks_addon" "kube_proxy" {
]
cluster_name = var.namespace
addon_name = "kube-proxy"
- addon_version = "v1.25.14-eksbuild.2"
+ addon_version = "v1.27.12-eksbuild.5"
resolve_conflicts = "OVERWRITE"
}
resource "aws_eks_addon" "vpc_cni" {
- cluster_name = var.namespace
- addon_name = "vpc-cni"
- addon_version = "v1.18.0-eksbuild.1"
- resolve_conflicts = "OVERWRITE"
+ cluster_name = var.namespace
+ addon_name = "vpc-cni"
+ addon_version = "v1.18.2-eksbuild.1"
+ resolve_conflicts = "OVERWRITE"
service_account_role_arn = aws_iam_role.oidc.arn
-}
\ No newline at end of file
+}
diff --git a/modules/app_eks/iam-policies.tf b/modules/app_eks/iam-policies.tf
index 6b0b11c6..6ce0528a 100644
--- a/modules/app_eks/iam-policies.tf
+++ b/modules/app_eks/iam-policies.tf
@@ -43,3 +43,23 @@ resource "aws_iam_policy" "secrets_manager" {
name = "${var.namespace}-secrets-manager"
policy = data.aws_iam_policy_document.secrets_manager.json
}
+
+# IAM Policy for IRSA
+resource "aws_iam_policy" "irsa" {
+ name = "${var.namespace}-irsa-policy"
+ description = "IRSA IAM Policy"
+
+ policy = jsonencode({
+ Version = "2012-10-17"
+ Statement = [
+ {
+ Effect = "Allow"
+ Action = [
+ "s3:*",
+ "kms:*",
+ ]
+ Resource = "*"
+ }
+ ]
+ })
+}
diff --git a/modules/app_eks/iam-role-attachments.tf b/modules/app_eks/iam-role-attachments.tf
index e82fe63b..92f0ff09 100644
--- a/modules/app_eks/iam-role-attachments.tf
+++ b/modules/app_eks/iam-role-attachments.tf
@@ -52,3 +52,10 @@ resource "aws_iam_role_policy_attachment" "node_secrets_manager" {
role = aws_iam_role.node.name
policy_arn = aws_iam_policy.secrets_manager.arn
}
+
+# Attach IRSA Policy to the IRSA Role
+resource "aws_iam_policy_attachment" "irsa" {
+ name = "irsa-policy-attachment"
+ roles = [aws_iam_role.irsa.name]
+ policy_arn = aws_iam_policy.irsa.arn
+}
diff --git a/modules/app_eks/iam-roles.tf b/modules/app_eks/iam-roles.tf
index dc70d132..9654b4ce 100644
--- a/modules/app_eks/iam-roles.tf
+++ b/modules/app_eks/iam-roles.tf
@@ -1,4 +1,29 @@
resource "aws_iam_role" "node" {
name = "${var.namespace}-node"
assume_role_policy = data.aws_iam_policy_document.node_assume.json
+
+}
+
+# IAM Role for IRSA
+resource "aws_iam_role" "irsa" {
+ name = "${var.namespace}-irsa-role"
+ assume_role_policy = jsonencode({
+ Version = "2012-10-17"
+ Statement = [
+ {
+ Sid = ""
+ Effect = "Allow"
+ Principal = {
+ Federated = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/${aws_iam_openid_connect_provider.eks.url}"
+ }
+ Action = "sts:AssumeRoleWithWebIdentity"
+ Condition = {
+ StringLike = {
+ "${aws_iam_openid_connect_provider.eks.url}:sub" = "system:serviceaccount:${var.namespace}:*"
+ "${aws_iam_openid_connect_provider.eks.url}:aud" = "sts.amazonaws.com"
+ }
+ }
+ }
+ ]
+ })
}
diff --git a/modules/app_eks/outputs.tf b/modules/app_eks/outputs.tf
index 304b51db..cc791455 100644
--- a/modules/app_eks/outputs.tf
+++ b/modules/app_eks/outputs.tf
@@ -17,3 +17,7 @@ output "node_role" {
output "primary_workers_security_group_id" {
value = aws_security_group.primary_workers.id
}
+
+output "aws_iam_openid_connect_provider" {
+ value = aws_iam_openid_connect_provider.eks.url
+}
\ No newline at end of file
diff --git a/modules/iam_role/main.tf b/modules/iam_role/main.tf
new file mode 100644
index 00000000..5d982e56
--- /dev/null
+++ b/modules/iam_role/main.tf
@@ -0,0 +1,51 @@
+data "aws_caller_identity" "current" {}
+
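+# IRSA: this role is assumed by the yace Kubernetes service account through the
+# cluster's OIDC provider via sts:AssumeRoleWithWebIdentity.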
+resource "aws_iam_role" "irsa" {
+ name = "${var.namespace}-yace-irsa-role"
+ assume_role_policy = jsonencode({
+ Version = "2012-10-17"
+ Statement = [
+ {
+ Sid = ""
+ Effect = "Allow"
+ Principal = {
+ Federated = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/${var.aws_iam_openid_connect_provider_url}"
+ }
+ Action = ["sts:AssumeRoleWithWebIdentity"]
+ Condition = {
+ StringLike = {
+ "${var.aws_iam_openid_connect_provider_url}:sub" = "system:serviceaccount:*:yace"
+ "${var.aws_iam_openid_connect_provider_url}:aud" = "sts.amazonaws.com"
+ }
+ }
+ }
+ ]
+ })
+}
+
+
+resource "aws_iam_policy" "irsa" {
+ name = "${var.namespace}-yace-irsa-policy"
+ description = "IRSA IAM Policy"
+
+ policy = jsonencode({
+ Version = "2012-10-17"
+ Statement = [
+ {
+ Effect = "Allow"
+ Action = [
+ "tag:GetResources",
+ "cloudwatch:GetMetricData",
+ "cloudwatch:GetMetricStatistics",
+ "cloudwatch:ListMetrics"
+ ]
+ Resource = "*"
+ }
+ ]
+ })
+}
+
+resource "aws_iam_role_policy_attachment" "default" {
+ role = aws_iam_role.irsa.name
+ policy_arn = aws_iam_policy.irsa.arn
+}
\ No newline at end of file
diff --git a/modules/iam_role/outputs.tf b/modules/iam_role/outputs.tf
new file mode 100644
index 00000000..989f8c0c
--- /dev/null
+++ b/modules/iam_role/outputs.tf
@@ -0,0 +1,3 @@
+output "role_arn" {
+ value = aws_iam_role.irsa.arn
+}
\ No newline at end of file
diff --git a/modules/iam_role/variables.tf b/modules/iam_role/variables.tf
new file mode 100644
index 00000000..4cd10397
--- /dev/null
+++ b/modules/iam_role/variables.tf
@@ -0,0 +1,8 @@
+variable "namespace" {
+ type = string
+ description = "The name prefix for all resources created."
+}
+
+variable "aws_iam_openid_connect_provider_url" {
+ type = string
+}
\ No newline at end of file
diff --git a/variables.tf b/variables.tf
index 9f5f475f..bf75219d 100644
--- a/variables.tf
+++ b/variables.tf
@@ -440,4 +440,10 @@ variable "parquet_wandb_env" {
type = map(string)
description = "Extra environment variables for W&B"
default = {}
+}
+
+variable "enable_yace" {
+ type = bool
+ description = "deploy yet another cloudwatch exporter to fetch aws resources metrics"
+ default = true
}
\ No newline at end of file