diff --git a/CHANGELOG.md b/CHANGELOG.md index df212ea5..9734eb7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,27 @@ All notable changes to this project will be documented in this file. +### [4.12.2](https://github.com/wandb/terraform-aws-wandb/compare/v4.12.1...v4.12.2) (2024-06-17) + + +### Bug Fixes + +* Revert resolve conflicts var ([#233](https://github.com/wandb/terraform-aws-wandb/issues/233)) ([778f147](https://github.com/wandb/terraform-aws-wandb/commit/778f147aa9962fde6a74b7d35501ec7dd7abf2a9)) + +### [4.12.1](https://github.com/wandb/terraform-aws-wandb/compare/v4.12.0...v4.12.1) (2024-06-17) + + +### Bug Fixes + +* Remove white space ([#231](https://github.com/wandb/terraform-aws-wandb/issues/231)) ([974b4f3](https://github.com/wandb/terraform-aws-wandb/commit/974b4f3ec0d01b34cf6d83008c9fe2a0d3d8ee7a)) + +## [4.12.0](https://github.com/wandb/terraform-aws-wandb/compare/v4.11.0...v4.12.0) (2024-06-17) + + +### Features + +* Added support yace ([#218](https://github.com/wandb/terraform-aws-wandb/issues/218)) ([12e053d](https://github.com/wandb/terraform-aws-wandb/commit/12e053d520f6998689d3bec0352b320a9105ba9e)) + ## [4.11.0](https://github.com/wandb/terraform-aws-wandb/compare/v4.10.2...v4.11.0) (2024-05-18) diff --git a/README.md b/README.md index 0bc592c9..3f9de70e 100644 --- a/README.md +++ b/README.md @@ -102,13 +102,14 @@ resources that lack official modules. Users can update the EKS cluster version to the latest version offered by AWS. This can be done using the environment variable `eks_cluster_version`. Note that, cluster and nodegroup version updates can only be done in increments of one version at a time. For example, if your current cluster version is `1.21` and the latest version available is `1.25` - you'd need to: 1. update the cluster version in the app_eks module from `1.21` to `1.22` -2. run `terraform apply` +2. run `terraform apply` 3. update the cluster version to `1.23` 4. run `terraform apply` 5. 
update the cluster version to `1.24` -...and so on and so forth. + ...and so on and so forth. Upgrades must be executed in step-wise fashion from one version to the next. You cannot skip versions when upgrading EKS. + ### Notes on EKS Add-ons @@ -252,7 +253,11 @@ CLI and re-run the apply. Running pods will not be impacted. ## Migrations -#### Upgrading from 3.x -> 4.x +### Upgrading to Operator + +See our upgrade guide [here](./docs/operator-migration/readme.md) + +### Upgrading from 3.x -> 4.x - If egress access for retrieving the wandb/controller image is not available, Terraform apply may experience failures. - It's necessary to supply a license variable within the module, as shown: diff --git a/docs/operator-migration/images/post-operator-apply.png b/docs/operator-migration/images/post-operator-apply.png new file mode 100644 index 00000000..9ac05b96 Binary files /dev/null and b/docs/operator-migration/images/post-operator-apply.png differ diff --git a/docs/operator-migration/images/post-operator-k8s.svg b/docs/operator-migration/images/post-operator-k8s.svg new file mode 100644 index 00000000..20ebd449 --- /dev/null +++ b/docs/operator-migration/images/post-operator-k8s.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/operator-migration/images/pre-operator-infra.svg b/docs/operator-migration/images/pre-operator-infra.svg new file mode 100644 index 00000000..c1b474ea --- /dev/null +++ b/docs/operator-migration/images/pre-operator-infra.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/operator-migration/images/pre-operator-k8s.svg b/docs/operator-migration/images/pre-operator-k8s.svg new file mode 100644 index 00000000..93719397 --- /dev/null +++ b/docs/operator-migration/images/pre-operator-k8s.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/operator-migration/post-operator.tf.disabled b/docs/operator-migration/post-operator.tf.disabled new file mode 100644 index 00000000..62ae09e3 --- /dev/null +++ 
b/docs/operator-migration/post-operator.tf.disabled @@ -0,0 +1,113 @@ +provider "aws" { + region = "us-west-2" + + default_tags { + tags = { + GithubRepo = "terraform-aws-wandb" + GithubOrg = "wandb" + Enviroment = "Example" + Example = "PublicDnsExternal" + } + } +} + +terraform { + required_version = "~> 1.0" + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 4.0" # Post-Operator + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.23" + } + } +} + +module "wandb_infra" { + source = "wandb/wandb/aws" + version = "4.7.2" + + namespace = var.namespace + public_access = true + external_dns = true + + enable_dummy_dns = var.enable_dummy_dns # Post-Operator + enable_operator_alb = var.enable_operator_alb # Post-Operator + deletion_protection = false + + database_instance_class = var.database_instance_class + database_engine_version = var.database_engine_version + database_snapshot_identifier = var.database_snapshot_identifier + database_sort_buffer_size = var.database_sort_buffer_size + + database_performance_insights_kms_key_arn = null + + allowed_inbound_cidr = var.allowed_inbound_cidr + allowed_inbound_ipv6_cidr = ["::/0"] + + eks_cluster_version = "1.25" + kubernetes_public_access = true + kubernetes_public_access_cidrs = ["0.0.0.0/0"] + + domain_name = var.domain_name + zone_id = var.zone_id + subdomain = var.subdomain + + # Add License Post-Operator + license = var.wandb_license + + # Use standard sizing Post-Operator + size = var.size + + # Set the External DNS Custom Domain Filter Post-Operator + custom_domain_filter = var.custom_domain_filter + + bucket_name = var.bucket_name + bucket_kms_key_arn = var.bucket_kms_key_arn + use_internal_queue = true + + aws_loadbalancer_controller_tags = var.aws_loadbalancer_controller_tags +} + +data "aws_eks_cluster" "app_cluster" { + name = module.wandb_infra.cluster_id +} + +data "aws_eks_cluster_auth" "app_cluster" { + name = module.wandb_infra.cluster_id +} + +provider 
"kubernetes" { + host = data.aws_eks_cluster.app_cluster.endpoint + cluster_ca_certificate = base64decode(data.aws_eks_cluster.app_cluster.certificate_authority[0].data) + token = data.aws_eks_cluster_auth.app_cluster.token + exec { + api_version = "client.authentication.k8s.io/v1beta1" + args = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.app_cluster.name] + command = "aws" + } +} + +# Enable the Helm provider +provider "helm" { + kubernetes { + host = data.aws_eks_cluster.app_cluster.endpoint + cluster_ca_certificate = base64decode(data.aws_eks_cluster.app_cluster.certificate_authority[0].data) + token = data.aws_eks_cluster_auth.app_cluster.token + exec { + api_version = "client.authentication.k8s.io/v1beta1" + args = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.app_cluster.name] + command = "aws" + } + } +} + +output "bucket_name" { + value = module.wandb_infra.bucket_name +} + +output "bucket_queue_name" { + value = module.wandb_infra.bucket_queue_name +} diff --git a/docs/operator-migration/post-operator.tfvars b/docs/operator-migration/post-operator.tfvars new file mode 100644 index 00000000..9c3ae571 --- /dev/null +++ b/docs/operator-migration/post-operator.tfvars @@ -0,0 +1,12 @@ +namespace = "operator-upgrade" +domain_name = "sandbox-aws.wandb.ml" +zone_id = "Z032246913CW32RVRY0WU" +subdomain = "operator-upgrade" +wandb_license = "eyJh" +# wandb_version = "0.51.2" Is now coming from the Release Channel or set in the User Spec. 
+ +# Needed Operator Variables for Upgrade +size = "small" +enable_dummy_dns = true +enable_operator_alb = true +custom_domain_filter = "sandbox-aws.wandb.ml" \ No newline at end of file diff --git a/docs/operator-migration/pre-operator.tf b/docs/operator-migration/pre-operator.tf new file mode 100644 index 00000000..40bf5a2b --- /dev/null +++ b/docs/operator-migration/pre-operator.tf @@ -0,0 +1,112 @@ +provider "aws" { + region = "us-west-2" + + default_tags { + tags = { + GithubRepo = "terraform-aws-wandb" + GithubOrg = "wandb" + Enviroment = "Example" + Example = "PublicDnsExternal" + } + } +} + +terraform { + required_version = "~> 1.0" + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 3.6" # Pre-Operator + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.23" + } + } +} + +module "wandb_infra" { + source = "wandb/wandb/aws" + version = "1.16.10" + + namespace = var.namespace + public_access = true + external_dns = true + + deletion_protection = false + + database_instance_class = var.database_instance_class + database_engine_version = var.database_engine_version + database_snapshot_identifier = var.database_snapshot_identifier + database_sort_buffer_size = var.database_sort_buffer_size + + database_performance_insights_kms_key_arn = null + + allowed_inbound_cidr = var.allowed_inbound_cidr + allowed_inbound_ipv6_cidr = ["::/0"] + + eks_cluster_version = "1.25" + kubernetes_public_access = true + kubernetes_public_access_cidrs = ["0.0.0.0/0"] + + domain_name = var.domain_name + zone_id = var.zone_id + subdomain = var.subdomain + + bucket_name = var.bucket_name + bucket_kms_key_arn = var.bucket_kms_key_arn + use_internal_queue = true +} + +data "aws_eks_cluster" "app_cluster" { + name = module.wandb_infra.cluster_id +} + +data "aws_eks_cluster_auth" "app_cluster" { + name = module.wandb_infra.cluster_id +} + +provider "kubernetes" { + host = data.aws_eks_cluster.app_cluster.endpoint + cluster_ca_certificate = 
base64decode(data.aws_eks_cluster.app_cluster.certificate_authority[0].data) + token = data.aws_eks_cluster_auth.app_cluster.token + exec { + api_version = "client.authentication.k8s.io/v1beta1" + args = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.app_cluster.name] + command = "aws" + } +} + +module "wandb_app" { + source = "wandb/wandb/kubernetes" + version = "1.12.0" + + license = var.wandb_license + + host = module.wandb_infra.url + bucket = "s3://${module.wandb_infra.bucket_name}" + bucket_aws_region = module.wandb_infra.bucket_region + bucket_queue = "internal://" + bucket_kms_key_arn = module.wandb_infra.kms_key_arn + database_connection_string = "mysql://${module.wandb_infra.database_connection_string}" + redis_connection_string = "redis://${module.wandb_infra.elasticache_connection_string}?tls=true&ttlInSeconds=604800" + + wandb_image = var.wandb_image + wandb_version = var.wandb_version + + service_port = module.wandb_infra.internal_app_port + + depends_on = [module.wandb_infra] + + other_wandb_env = merge({ + "GORILLA_CUSTOMER_SECRET_STORE_SOURCE" = "aws-secretmanager://${var.namespace}?namespace=${var.namespace}" + }, var.other_wandb_env) +} + +output "bucket_name" { + value = module.wandb_infra.bucket_name +} + +output "bucket_queue_name" { + value = module.wandb_infra.bucket_queue_name +} diff --git a/docs/operator-migration/pre-operator.tfvars b/docs/operator-migration/pre-operator.tfvars new file mode 100644 index 00000000..7cfb15a1 --- /dev/null +++ b/docs/operator-migration/pre-operator.tfvars @@ -0,0 +1,7 @@ +namespace = "operator-upgrade" +domain_name = "sandbox-aws.wandb.ml" +zone_id = "Z032246913CW32RVRY0WU" +subdomain = "operator-upgrade" +wandb_license = "eyJh" +wandb_version = "0.51.2" +# size = "small" \ No newline at end of file diff --git a/docs/operator-migration/readme.md b/docs/operator-migration/readme.md new file mode 100644 index 00000000..10179d04 --- /dev/null +++ b/docs/operator-migration/readme.md @@ -0,0 +1,267 
@@ +# Operator Migration + +This guide details the steps required to upgrade from **_pre-operator_** to **_post-operator_** environments using the [terraform-aws-wandb](https://registry.terraform.io/modules/wandb/wandb/aws/latest) module. + +## Introduction to Operator Shift + +The transition to a Kubernetes [operator](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/) pattern is crucial for our architecture. This section explains the transition from **_pre_** to **_post_** architectures. + +### Reasons for the Architecture Shift + +Historically, the W&B application was deployed as a single Deployment and pod within a Kubernetes Cluster or Docker container. We have always recommended externalizing the Metadata Store and Object Store to decouple state from the application, especially in production environments. + +As the application grew, the need to evolve from a monolithic container to a distributed system became apparent. This change facilitates backend logic handling and seamlessly introduces **_in-kubernetes_** infrastructure capabilities. It also supports deploying new services essential for additional features that W&B relies on. + +Previously, any Kubernetes-related changes required updating the [terraform-kubernetes-wandb](https://github.com/wandb/terraform-kubernetes-wandb), ensuring compatibility across cloud providers, configuring necessary Terraform variables, and executing a terraform apply for each backend or Kubernetes-level change. This process was not scalable and placed a significant burden on our support staff to assist customers with upgrades. + +The solution was to implement an **_Operator_** that connects to a central [deploy.wandb.ai](https://deploy.wandb.ai) server with its `license` to request the latest specification changes for a given **_Release Channel_** and apply them. 
Helm was chosen as both the deployment mechanism for our operator and the means for the operator to handle all configuration templating of the W&B Kubernetes stack; Helmception. + +You can install the operator from [charts/operator](https://github.com/wandb/helm-charts/tree/main/charts/operator). This installation creates a deployment called `controller-manager` and utilizes a **_Custom Resource_** definition named `weightsandbiases.apps.wandb.com` (shortName: `wandb`), which takes a single `spec` and applies it to the cluster: + +```yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: weightsandbiases.apps.wandb.com +``` + +The `controller-manager` installs [charts/operator-wandb](https://github.com/wandb/helm-charts/tree/main/charts/operator-wandb) based on the spec of the **_Custom Resource_**, **_Release Channel_**, and a **_User Defined Config_** in the new **_System Console_**. This hierarchy allows for maximum configuration flexibility at the user end and enables W&B to release new images, configurations, features, and Helm updates without requiring Terraform reruns. + +## Before and After Architecture + +Previously, our architecture used: + +```hcl +module "wandb_infra" { + source = "wandb/wandb/aws" + version = "1.16.10" + ... +} +``` + +to control the infrastructure: + +![pre-operator-infra](./images/pre-operator-infra.svg) + +and this module to deploy the W&B application: + +```hcl +module "wandb_app" { + source = "wandb/wandb/kubernetes" + version = "1.12.0" +} +``` + +![pre-operator-k8s](./images/pre-operator-k8s.svg) + +Post-transition, the architecture uses: + +```hcl +module "wandb_infra" { + source = "wandb/wandb/aws" + version = "4.7.2" + ... +} +``` + +to manage both the installation of infrastructure and the W&B application to the Kubernetes cluster, thus eliminating the need for the `module "wandb_app"` in `post-operator.tf`. 
+ +![post-operator-k8s](./images/post-operator-k8s.svg) + +This architectural shift facilitates the introduction of additional customer features (like OpenTelemetry, Prometheus, HPA's, Kafka, and image updates) without requiring manual Terraform operations by SRE/Infrastructure teams. + +### Specification Hierarchy + +In our operator model, configuration specifications follow a hierarchical model where higher-level specifications override lower-level ones. Here’s how it works: + +- **Release Channel Spec**: This base level configuration sets default values and configurations based on the **_Release Channel_** set by W&B for the deployment. +- **User Input Spec**: Users can override the default settings provided by the Release Channel Spec through the System Console. +- **Custom Resource Spec**: The highest level of specification, which comes from the Terraform configuration. Any values specified here will override both the User Input and Release Channel specifications. + +This hierarchical model ensures that configurations are flexible and customizable to meet varying needs while maintaining a manageable and systematic approach to upgrades and changes. + +## Migration + +To commence with a base installation of the W&B Pre-Operator, ensure that `post-operator.tf` has a `.disabled` file extension and `pre-operator.tf` is active (i.e., does not have a `.disabled` extension). + +### Prerequisites + +Before initiating the migration process, ensure the following prerequisites are met: + +- **Egress**: The deployment can't be airgapped. It needs access to [deploy.wandb.ai](deploy.wandb.ai) to get the latest spec for the **_Release Channel_**. +- **AWS Credentials**: Proper AWS credentials configured to interact with your AWS resources. +- **Terraform Installed**: The latest version of Terraform should be installed on your system. +- **Route53 Hosted Zone**: An existing Route53 hosted zone corresponding to the domain under which the application will be served. 
+- **Pre-Operator Terraform Files**: Ensure `pre-operator.tf` and associated variable files like `pre-operator.tfvars` are correctly set up. + +### Pre-Operator Setup + +Execute the following Terraform commands to initialize and apply the configuration for the Pre-Operator setup: + +```bash +terraform init -upgrade +terraform apply -var-file=./pre-operator.tfvars +``` + +`pre-operator.tfvars` should look something like this: + +```ini +namespace = "operator-upgrade" +domain_name = "sandbox-aws.wandb.ml" +zone_id = "Z032246913CW32RVRY0WU" +subdomain = "operator-upgrade" +wandb_license = "ey..." +wandb_version = "0.51.2" +``` + +The `pre-operator.tf` configuration calls two modules: + +```hcl +module "wandb_infra" { + source = "wandb/wandb/aws" + version = "1.16.10" + ... +} +``` + +This module spins up the infrastructure. + +```hcl +module "wandb_app" { + source = "wandb/wandb/kubernetes" + version = "1.12.0" +} +``` + +This module deploys the application. + +### Post-Operator Setup + +Make sure that `pre-operator.tf` has a `.disabled` extension, and `post-operator.tf` is active. + +The `post-operator.tfvars` includes additional variables: + +```ini +... +# wandb_version = "0.51.2" is now managed via the Release Channel or set in the User Spec. 
+ +# Required Operator Variables for Upgrade: +size = "small" +enable_dummy_dns = true +enable_operator_alb = true +custom_domain_filter = "sandbox-aws.wandb.ml" +``` + +Run the following commands to initialize and apply the Post-Operator configuration: + +```bash +terraform init -upgrade +terraform apply -var-file=./post-operator.tfvars +``` + +The plan and apply steps will update the following resources: + +```yaml +actions: + create: + - aws_efs_backup_policy.storage_class + - aws_efs_file_system.storage_class + - aws_efs_mount_target.storage_class["0"] + - aws_efs_mount_target.storage_class["1"] + - aws_eks_addon.efs + - aws_iam_openid_connect_provider.eks + - aws_iam_policy.secrets_manager + - aws_iam_role_policy_attachment.ebs_csi + - aws_iam_role_policy_attachment.eks_efs + - aws_iam_role_policy_attachment.node_secrets_manager + - aws_security_group.storage_class_nfs + - aws_security_group_rule.nfs_ingress + - random_pet.efs + - aws_s3_bucket_acl.file_storage + - aws_s3_bucket_cors_configuration.file_storage + - aws_s3_bucket_ownership_controls.file_storage + - aws_s3_bucket_server_side_encryption_configuration.file_storage + - helm_release.operator + - helm_release.wandb + - aws_cloudwatch_log_group.this[0] + - aws_iam_policy.default + - aws_iam_role.default + - aws_iam_role_policy_attachment.default + - helm_release.external_dns + - aws_default_network_acl.this[0] + - aws_default_route_table.default[0] + - aws_iam_policy.default + - aws_iam_role.default + - aws_iam_role_policy_attachment.default + - helm_release.aws_load_balancer_controller + + update_in_place: + - aws_iam_policy.node_IMDSv2 + - aws_iam_policy.node_cloudwatch + - aws_iam_policy.node_kms + - aws_iam_policy.node_s3 + - aws_iam_policy.node_sqs + - aws_eks_cluster.this[0] + - aws_elasticache_replication_group.default + - aws_rds_cluster.this[0] + - aws_rds_cluster_instance.this["1"] + - aws_default_security_group.this[0] + - aws_subnet.private[0] + - aws_subnet.private[1] + - 
aws_subnet.public[0] + - aws_subnet.public[1] + - aws_launch_template.workers["primary"] + + destroy: + - kubernetes_config_map.config_map + - kubernetes_deployment.wandb + - kubernetes_priority_class.priority + - kubernetes_secret.secret + - kubernetes_service.prometheus + - kubernetes_service.service + - random_id.snapshot_identifier[0] + + replace: + - aws_autoscaling_attachment.autoscaling_attachment["primary"] + - aws_route53_record.alb + - aws_eks_node_group.workers["primary"] +``` + +You should see something like this: + +![post-operator-apply](./images/post-operator-apply.png) + +Note that in `post-operator.tf`, there is a single: + +```hcl +module "wandb_infra" { + source = "wandb/wandb/aws" + version = "4.7.2" + ... +} +``` + +#### Changes in the Post-Operator Configuration: + +1. **Update Required Providers**: Change `required_providers.aws.version` from `3.6` to `4.0` for provider compatibility. +2. **DNS and Load Balancer Configuration**: Integrate `enable_dummy_dns` and `enable_operator_alb` to manage DNS records and AWS Load Balancer setup through an Ingress. +3. **License and Size Configuration**: Transfer the `license` and `size` parameters directly to the `wandb_infra` module to match new operational requirements. +4. **Custom Domain Handling**: If necessary, use `custom_domain_filter` to troubleshoot DNS issues by checking the External DNS pod logs within the `kube-system` namespace. +5. 
**Helm Provider Configuration**: Enable and configure the Helm provider to manage Kubernetes resources effectively: + +```hcl +provider "helm" { + kubernetes { + host = data.aws_eks_cluster.app_cluster.endpoint + cluster_ca_certificate = base64decode(data.aws_eks_cluster.app_cluster.certificate_authority[0].data) + token = data.aws_eks_cluster_auth.app_cluster.token + exec { + api_version = "client.authentication.k8s.io/v1beta1" + args = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.app_cluster.name] + command = "aws" + } + } +} +``` + +This comprehensive setup ensures a smooth transition from the Pre-Operator to the Post-Operator configuration, leveraging new efficiencies and capabilities enabled by the operator model. diff --git a/docs/operator-migration/variables.tf b/docs/operator-migration/variables.tf new file mode 100644 index 00000000..f7c3bd7e --- /dev/null +++ b/docs/operator-migration/variables.tf @@ -0,0 +1,145 @@ +variable "namespace" { + type = string + description = "Name prefix used for resources" +} + +variable "domain_name" { + type = string + description = "Domain name used to access instance." +} + +variable "zone_id" { + type = string + description = "Id of Route53 zone" +} + +variable "size" { + default = "small" + description = "Deployment size" + nullable = true + type = string +} + +variable "subdomain" { + type = string + default = null + description = "Subdomain for accessing the Weights & Biases UI." +} + +variable "wandb_license" { + type = string +} + +variable "database_engine_version" { + description = "Version for MySQL Auora" + type = string + default = "8.0.mysql_aurora.3.03.0" +} + +variable "database_instance_class" { + description = "Instance type to use by database master instance." + type = string + default = "db.r5.large" +} + +variable "database_snapshot_identifier" { + description = "Specifies whether or not to create this cluster from a snapshot. 
You can use either the name or ARN when specifying a DB cluster snapshot, or the ARN when specifying a DB snapshot" + type = string + default = null +} + +variable "database_sort_buffer_size" { + description = "Specifies the sort_buffer_size value to set for the database" + type = number + default = 262144 +} + +variable "wandb_version" { + description = "The version of Weights & Biases local to deploy." + type = string + default = "latest" +} + +variable "wandb_image" { + description = "Docker repository to pull the wandb image from." + type = string + default = "wandb/local" +} + +variable "bucket_name" { + type = string + default = "" +} + +variable "bucket_kms_key_arn" { + type = string + description = "The Amazon Resource Name of the KMS key with which S3 storage bucket objects will be encrypted." + default = "" +} + +variable "enable_dummy_dns" { + type = bool + default = false + description = "Boolean indicating whether or not to enable dummy DNS for the old alb" +} + +variable "enable_operator_alb" { + type = bool + default = false + description = "Boolean indicating whether to use operator ALB (true) or not (false)." +} + +variable "custom_domain_filter" { + description = "A custom domain filter to be used by external-dns instead of the default FQDN. If not set, the local FQDN is used." + type = string + default = null +} + +variable "allowed_inbound_cidr" { + default = ["0.0.0.0/0"] + nullable = false + type = list(string) +} + + +variable "allowed_inbound_ipv6_cidr" { + default = ["::/0"] + nullable = false + type = list(string) +} + +variable "other_wandb_env" { + type = map(string) + description = "Extra environment variables for W&B" + default = {} +} + +variable "system_reserved_cpu_millicores" { + description = "(Optional) The amount of 'system-reserved' CPU millicores to pass to the kubelet. For example: 100. A value of -1 disables the flag." 
+ type = number + default = -1 +} + +variable "system_reserved_memory_megabytes" { + description = "(Optional) The amount of 'system-reserved' memory in megabytes to pass to the kubelet. For example: 100. A value of -1 disables the flag." + type = number + default = -1 +} + +variable "system_reserved_ephemeral_megabytes" { + description = "(Optional) The amount of 'system-reserved' ephemeral storage in megabytes to pass to the kubelet. For example: 1000. A value of -1 disables the flag." + type = number + default = -1 +} + +variable "system_reserved_pid" { + description = "(Optional) The amount of 'system-reserved' process ids [pid] to pass to the kubelet. For example: 1000. A value of -1 disables the flag." + type = number + default = -1 +} + +variable "aws_loadbalancer_controller_tags" { + description = "(Optional) A map of AWS tags to apply to all resources managed by the load balancer controller" + type = map(string) + default = {} +} diff --git a/main.tf b/main.tf index 3b6fc78d..e551263e 100644 --- a/main.tf +++ b/main.tf @@ -222,6 +222,15 @@ locals { lb_name_truncated = "${substr(var.namespace, 0, local.max_lb_name_length)}-alb-k8s" } +data "aws_region" "current" {} + +module "iam_role" { + count = var.enable_yace ? 1 : 0 + source = "./modules/iam_role" + namespace = var.namespace + aws_iam_openid_connect_provider_url = module.app_eks.aws_iam_openid_connect_provider +} + module "wandb" { source = "wandb/wandb/helm" version = "1.2.0" @@ -300,6 +309,53 @@ module "wandb" { }, var.app_wandb_env) } + # To support otel rds and redis metrics need operator-wandb chart minimum version 0.13.8 ( yace subchart) + yace = var.enable_yace ? { + install = true + regions = [data.aws_region.current.name] + serviceAccount = { annotations = { "eks.amazonaws.com/role-arn" = module.iam_role[0].role_arn} } + } : { + install = false + regions = [] + serviceAccount = {} + } + + otel = { + daemonset = var.enable_yace ? 
{ + config = { + receivers = { + prometheus = { + config = { + scrape_configs = [ + { job_name = "yace" + scheme = "http" + metrics_path = "/metrics" + dns_sd_configs = [ + { names = ["yace"] + type = "A" + port = 5000 + } + ] + } + ] + } + } + } + service = { + pipelines = { + metrics = { + receivers = ["hostmetrics", "k8s_cluster", "kubeletstats", "prometheus"] + } + } + } + } + } : { config = { + receivers = {} + service = {} + } + } + } + mysql = { install = false } redis = { install = false } diff --git a/modules/app_eks/add-ons.tf b/modules/app_eks/add-ons.tf index 4f77765b..cc43043d 100644 --- a/modules/app_eks/add-ons.tf +++ b/modules/app_eks/add-ons.tf @@ -1,4 +1,3 @@ - ### IAM policy and role for vpc-cni data "aws_iam_policy_document" "oidc_assume_role" { statement { @@ -39,8 +38,7 @@ resource "aws_eks_addon" "aws_efs_csi_driver" { addon_name = "aws-efs-csi-driver" addon_version = "v2.0.3-eksbuild.1" resolve_conflicts = "OVERWRITE" - - } +} resource "aws_eks_addon" "aws_ebs_csi_driver" { depends_on = [ @@ -50,7 +48,6 @@ resource "aws_eks_addon" "aws_ebs_csi_driver" { addon_name = "aws-ebs-csi-driver" addon_version = "v1.31.0-eksbuild.1" resolve_conflicts = "OVERWRITE" - } resource "aws_eks_addon" "coredns" { @@ -74,9 +71,9 @@ resource "aws_eks_addon" "kube_proxy" { } resource "aws_eks_addon" "vpc_cni" { - cluster_name = var.namespace - addon_name = "vpc-cni" - addon_version = "v1.18.0-eksbuild.1" - resolve_conflicts = "OVERWRITE" + cluster_name = var.namespace + addon_name = "vpc-cni" + addon_version = "v1.18.0-eksbuild.1" + resolve_conflicts = "OVERWRITE" service_account_role_arn = aws_iam_role.oidc.arn -} \ No newline at end of file +} diff --git a/modules/app_eks/outputs.tf b/modules/app_eks/outputs.tf index 304b51db..cc791455 100644 --- a/modules/app_eks/outputs.tf +++ b/modules/app_eks/outputs.tf @@ -17,3 +17,7 @@ output "node_role" { output "primary_workers_security_group_id" { value = aws_security_group.primary_workers.id } + +output 
"aws_iam_openid_connect_provider" { + value = aws_iam_openid_connect_provider.eks.url +} \ No newline at end of file diff --git a/modules/iam_role/main.tf b/modules/iam_role/main.tf new file mode 100644 index 00000000..5d982e56 --- /dev/null +++ b/modules/iam_role/main.tf @@ -0,0 +1,51 @@ +data "aws_caller_identity" "current" {} + +resource "aws_iam_role" "irsa" { + name = "${var.namespace}-yace-irsa-role" + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "" + Effect = "Allow" + Principal = { + Federated = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/${var.aws_iam_openid_connect_provider_url}" + } + Action = ["sts:AssumeRoleWithWebIdentity"] + Condition = { + StringLike = { + "${var.aws_iam_openid_connect_provider_url}:sub" = "system:serviceaccount:*:yace" + "${var.aws_iam_openid_connect_provider_url}:aud" = "sts.amazonaws.com" + } + } + } + ] + }) +} + + +resource "aws_iam_policy" "irsa" { + name = "${var.namespace}-yace-irsa-policy" + description = "IRSA IAM Policy" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "tag:GetResources", + "cloudwatch:GetMetricData", + "cloudwatch:GetMetricStatistics", + "cloudwatch:ListMetrics" + ] + Resource = "*" + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "default" { + role = aws_iam_role.irsa.name + policy_arn = aws_iam_policy.irsa.arn +} \ No newline at end of file diff --git a/modules/iam_role/outputs.tf b/modules/iam_role/outputs.tf new file mode 100644 index 00000000..989f8c0c --- /dev/null +++ b/modules/iam_role/outputs.tf @@ -0,0 +1,3 @@ +output "role_arn" { + value = aws_iam_role.irsa.arn +} \ No newline at end of file diff --git a/modules/iam_role/variables.tf b/modules/iam_role/variables.tf new file mode 100644 index 00000000..4cd10397 --- /dev/null +++ b/modules/iam_role/variables.tf @@ -0,0 +1,8 @@ +variable "namespace" { + type = string + description = "The name prefix for all 
resources created." +} + +variable "aws_iam_openid_connect_provider_url" { + type = string +} \ No newline at end of file diff --git a/variables.tf b/variables.tf index 067fd9f5..bf75219d 100644 --- a/variables.tf +++ b/variables.tf @@ -441,3 +441,9 @@ variable "parquet_wandb_env" { description = "Extra environment variables for W&B" default = {} } + +variable "enable_yace" { + type = bool + description = "Deploy YACE (Yet Another CloudWatch Exporter) to fetch AWS resource metrics" + default = true +} \ No newline at end of file