Commit

Merge pull request #1 from eric-golinko-db/add-ml-code
Add ML Code
eric-golinko-db authored Feb 26, 2024
2 parents d91ba00 + 21de314 commit cedb7a0
Showing 49 changed files with 2,023 additions and 2 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/README.md
@@ -0,0 +1,7 @@
# CI/CD Workflow Definitions
This directory contains CI/CD workflow definitions using [GitHub Actions](https://docs.github.com/en/actions),
under ``workflows``. These workflows cover testing and deployment of both ML code (for model training, batch inference, etc.) and the
Databricks ML asset definitions under ``mlops_stacks_gcp_fs/assets``.

To set up CI/CD for a new project,
please refer to [ML asset config - set up CI CD](../../mlops_stacks_gcp_fs/assets/README.md#set-up-ci-and-cd).
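
The workflows below read Databricks personal access tokens from the repository secrets ``STAGING_WORKSPACE_TOKEN`` and ``PROD_WORKSPACE_TOKEN``. A minimal sketch of registering those secrets with the GitHub CLI, assuming ``gh`` is installed and authenticated against this repository (the token values are placeholders, not real credentials):

```bash
# Hypothetical token values; substitute real Databricks PATs for each workspace.
gh secret set STAGING_WORKSPACE_TOKEN --body "dapi-staging-xxxxxxxx"
gh secret set PROD_WORKSPACE_TOKEN --body "dapi-prod-xxxxxxxx"
```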
34 changes: 34 additions & 0 deletions .github/workflows/mlops_stacks_gcp_fs-bundle-cd-prod.yml
@@ -0,0 +1,34 @@
# This GitHub workflow deploys Bundle assets (ML asset config and more)
# defined under mlops_stacks_gcp_fs/assets/*
# and mlops_stacks_gcp_fs/databricks.yml with prod deployment target configs,
# when PRs are merged into the release branch
name: Bundle Deployment for mlops_stacks_gcp_fs Prod

on:
push:
branches:
- 'release'
workflow_dispatch:

defaults:
run:
working-directory: ./mlops_stacks_gcp_fs

env:
DATABRICKS_TOKEN: ${{ secrets.PROD_WORKSPACE_TOKEN }}

jobs:
prod:
concurrency: mlops_stacks_gcp_fs-prod-bundle-job
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- uses: databricks/setup-cli@v0.211.0
- name: Validate Bundle For Prod
id: validate
run: |
databricks bundle validate -t prod
- name: Deploy Bundle to Prod
id: deploy
run: |
databricks bundle deploy -t prod
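
The deployment steps above map directly onto Databricks CLI commands, so a rough local equivalent looks like the sketch below — assuming the Databricks CLI (v0.211+) is installed, the prod target's workspace is defined in ``mlops_stacks_gcp_fs/databricks.yml``, and a workspace token is available. The staging workflow that follows is identical apart from ``-t staging``:

```bash
# Run from the bundle root, mirroring the workflow's working-directory.
cd mlops_stacks_gcp_fs

# Placeholder token; in CI this comes from the PROD_WORKSPACE_TOKEN secret.
export DATABRICKS_TOKEN="dapi-prod-xxxxxxxx"

databricks bundle validate -t prod   # same check as the "Validate Bundle For Prod" step
databricks bundle deploy -t prod     # same command as the "Deploy Bundle to Prod" step
```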
34 changes: 34 additions & 0 deletions .github/workflows/mlops_stacks_gcp_fs-bundle-cd-staging.yml
@@ -0,0 +1,34 @@
# This GitHub workflow deploys Bundle assets (ML asset config and more)
# defined under mlops_stacks_gcp_fs/assets/*
# and mlops_stacks_gcp_fs/databricks.yml with staging deployment target configs,
# when PRs are merged into the default branch
name: Bundle Deployment for mlops_stacks_gcp_fs Staging

on:
push:
branches:
- 'main'
workflow_dispatch:

defaults:
run:
working-directory: ./mlops_stacks_gcp_fs

env:
DATABRICKS_TOKEN: ${{ secrets.STAGING_WORKSPACE_TOKEN }}

jobs:
staging:
concurrency: mlops_stacks_gcp_fs-staging-bundle-job
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- uses: databricks/setup-cli@v0.211.0
- name: Validate Bundle For Staging
id: validate
run: |
databricks bundle validate -t staging
- name: Deploy Bundle to Staging
id: deploy
run: |
databricks bundle deploy -t staging
93 changes: 93 additions & 0 deletions .github/workflows/mlops_stacks_gcp_fs-bundle-ci.yml
@@ -0,0 +1,93 @@
# This GitHub workflow validates Bundle config (ML asset config and more)
# defined under mlops_stacks_gcp_fs/assets/*
# and mlops_stacks_gcp_fs/databricks.yml, when PRs are opened or updated against the main branch
name: Bundle validation for mlops_stacks_gcp_fs

on:
workflow_dispatch:
pull_request_target:

defaults:
run:
working-directory: ./mlops_stacks_gcp_fs/

env:
STAGING_WORKSPACE_TOKEN: ${{ secrets.STAGING_WORKSPACE_TOKEN }}
PROD_WORKSPACE_TOKEN: ${{ secrets.PROD_WORKSPACE_TOKEN }}

jobs:
staging:
concurrency: mlops_stacks_gcp_fs-staging-bundle-job
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha || github.sha }}
- uses: databricks/setup-cli@v0.211.0
- name: Validate Bundle For Staging
id: validate
env:
DATABRICKS_TOKEN: ${{ env.STAGING_WORKSPACE_TOKEN }}
run: |
databricks bundle validate -t staging > ../validate_output.txt
- name: Create Comment with Bundle Configuration
uses: actions/github-script@v6
id: comment
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const fs = require('fs');
const fileContents = fs.readFileSync('validate_output.txt', 'utf8');
const output = `#### Bundle Staging Config Validated 🖌
<details><summary>Staging Validation Output</summary>
\`\`\`\n
${fileContents}
\`\`\`
</details>`
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: output
})
prod:
concurrency: mlops_stacks_gcp_fs-prod-bundle-job
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha || github.sha }}
- uses: databricks/setup-cli@v0.211.0
- name: Validate Bundle For Prod
id: validate
env:
DATABRICKS_TOKEN: ${{ env.PROD_WORKSPACE_TOKEN }}
run: |
databricks bundle validate -t prod > ../validate_output.txt
- name: Create Comment with Bundle Configuration
uses: actions/github-script@v6
id: comment
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const fs = require('fs');
const fileContents = fs.readFileSync('validate_output.txt', 'utf8');
const output = `#### Bundle Prod Config Validated 🖌
<details><summary>Prod Validation Output</summary>
\`\`\`\n
${fileContents}
\`\`\`
</details>`
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: output
})
19 changes: 19 additions & 0 deletions .github/workflows/mlops_stacks_gcp_fs-lint-cicd-workflow-files.yml
@@ -0,0 +1,19 @@
name: Lint CI/CD workflow files
on:
pull_request:
paths:
- '.github/workflows/**'
workflow_dispatch:

jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Download actionlint
id: get_actionlint
run: bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash)
shell: bash
- name: Check workflow files
run: ${{ steps.get_actionlint.outputs.executable }} -color
shell: bash
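
The same check can be run locally before pushing workflow changes. A minimal sketch, assuming the download script drops the ``actionlint`` binary into the current directory (the workflow consumes the path via the step output):

```bash
# Download a pinned actionlint binary into the working directory.
bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash)

# Lint all workflow files under .github/workflows, with colored output.
./actionlint -color
```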
59 changes: 59 additions & 0 deletions .github/workflows/mlops_stacks_gcp_fs-run-tests-fs.yml
@@ -0,0 +1,59 @@
name: Feature and Training Integration Tests for mlops_stacks_gcp_fs
on:
workflow_dispatch:
pull_request:

defaults:
run:
working-directory: ./mlops_stacks_gcp_fs/

env:
DATABRICKS_TOKEN: ${{ secrets.STAGING_WORKSPACE_TOKEN }}

concurrency: mlops_stacks_gcp_fs-feature-training-integration-test-staging

jobs:
unit_tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.8
# Feature store tests bring up a local Spark session, so Java is required.
- uses: actions/setup-java@v2
with:
distribution: 'temurin'
java-version: '11'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r ../test-requirements.txt
- name: Run tests with pytest
run: |
pytest
integration_test:
needs: unit_tests
runs-on: ubuntu-22.04
steps:
- name: Checkout repo
uses: actions/checkout@v3
- uses: databricks/setup-cli@v0.211.0
- name: Validate Bundle For Test Deployment Target in Staging Workspace
id: validate
run: |
databricks bundle validate -t test
- name: Deploy Bundle to Test Deployment Target in Staging Workspace
id: deploy
run: |
databricks bundle deploy -t test
- name: Run Feature Engineering Workflow for Test Deployment Target in Staging Workspace
id: feature_engineering
run: |
databricks bundle run write_feature_table_job -t test
- name: Run Training Workflow for Test Deployment Target in Staging Workspace
id: training
run: |
databricks bundle run model_training_job -t test
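
The unit-test job can be reproduced locally with the same dependency and test commands; a sketch assuming Python 3.8 and a local JDK 11 (the feature-store tests bring up a local Spark session, as the workflow comment notes):

```bash
cd mlops_stacks_gcp_fs

python -m pip install --upgrade pip
pip install -r requirements.txt          # project dependencies
pip install -r ../test-requirements.txt  # test-only dependencies at the repo root

pytest   # runs the same unit tests as the unit_tests job
```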
5 changes: 5 additions & 0 deletions mlops_stacks_gcp_fs/README.md
@@ -0,0 +1,5 @@
# mlops_stacks_gcp_fs

This directory contains Python code, notebooks, and ML asset configs related to one ML project.

See the [Project overview](../docs/project-overview.md) for details on the code structure of the project directory.
Empty file added mlops_stacks_gcp_fs/__init__.py
Empty file.
4 changes: 2 additions & 2 deletions mlops_stacks_gcp_fs/assets/README.md
@@ -134,7 +134,7 @@ new_cluster: &new_cluster
spark_version: 13.3.x-cpu-ml-scala2.12
node_type_id: n2-highmem-4
custom_tags:
- clusterSource: mlops-stack/0.2
+ clusterSource: mlops-stack
resources:
jobs:
@@ -189,7 +189,7 @@ new_cluster: &new_cluster
spark_version: 13.3.x-cpu-ml-scala2.12
node_type_id: n2-highmem-4
custom_tags:
- clusterSource: mlops-stack/0.2
+ clusterSource: mlops-stack
resources:
jobs:
41 changes: 41 additions & 0 deletions mlops_stacks_gcp_fs/assets/batch-inference-workflow-asset.yml
@@ -0,0 +1,41 @@
new_cluster: &new_cluster
new_cluster:
num_workers: 3
spark_version: 13.3.x-cpu-ml-scala2.12
node_type_id: n2-highmem-4
custom_tags:
clusterSource: mlops-stack

common_permissions: &permissions
permissions:
- level: CAN_VIEW
group_name: users

resources:
jobs:
batch_inference_job:
name: ${bundle.target}-mlops_stacks_gcp_fs-batch-inference-job
tasks:
- task_key: batch_inference_job
<<: *new_cluster
notebook_task:
notebook_path: ../deployment/batch_inference/notebooks/BatchInference.py
base_parameters:
env: ${bundle.target}
input_table_name: hive_metastore.default.taxi_scoring_sample_feature_store_inference_input
output_table_name: ${bundle.target}_mlops_stacks_gcp_fs_predictions
model_name: ${var.model_name}
# git source information of current ML asset deployment. It will be persisted as part of the workflow run
git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}

schedule:
quartz_cron_expression: "0 0 11 * * ?" # daily at 11am
timezone_id: UTC
<<: *permissions
# If you want to turn on notifications for this job, please uncomment the below code,
# and provide a list of emails to the on_failure argument.
#
# email_notifications:
# on_failure:
# - first@company.com
# - second@company.com
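
Besides the daily 11am UTC schedule, the batch inference job can be deployed and triggered on demand with the bundle CLI, in the same way the integration-test workflow runs its jobs. A sketch, assuming a staging target defined in ``databricks.yml`` and a valid ``DATABRICKS_TOKEN``:

```bash
cd mlops_stacks_gcp_fs

# Deploy the asset definitions for the chosen target, then trigger one run.
databricks bundle deploy -t staging
databricks bundle run batch_inference_job -t staging
```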
64 changes: 64 additions & 0 deletions mlops_stacks_gcp_fs/assets/feature-engineering-workflow-asset.yml
@@ -0,0 +1,64 @@
new_cluster: &new_cluster
new_cluster:
num_workers: 3
spark_version: 13.3.x-cpu-ml-scala2.12
node_type_id: n2-highmem-4
custom_tags:
clusterSource: mlops-stack

common_permissions: &permissions
permissions:
- level: CAN_VIEW
group_name: users

resources:
jobs:
write_feature_table_job:
name: ${bundle.target}-mlops_stacks_gcp_fs-write-feature-table-job
job_clusters:
- job_cluster_key: write_feature_table_job_cluster
<<: *new_cluster
tasks:
- task_key: PickupFeatures
job_cluster_key: write_feature_table_job_cluster
notebook_task:
notebook_path: ../feature_engineering/notebooks/GenerateAndWriteFeatures.py
base_parameters:
# TODO: modify these arguments to reflect your setup.
input_table_path: /databricks-datasets/nyctaxi-with-zipcodes/subsampled
# TODO: Empty start/end dates will process the whole range. Update this as needed to process recent data.
input_start_date: ""
input_end_date: ""
timestamp_column: tpep_pickup_datetime
output_table_name: feature_store_taxi_example.${bundle.target}_mlops_stacks_gcp_fs_trip_pickup_features
features_transform_module: pickup_features
primary_keys: zip
# git source information of current ML asset deployment. It will be persisted as part of the workflow run
git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}
- task_key: DropoffFeatures
job_cluster_key: write_feature_table_job_cluster
notebook_task:
notebook_path: ../feature_engineering/notebooks/GenerateAndWriteFeatures.py
base_parameters:
# TODO: modify these arguments to reflect your setup.
input_table_path: /databricks-datasets/nyctaxi-with-zipcodes/subsampled
# TODO: Empty start/end dates will process the whole range. Update this as needed to process recent data.
input_start_date: ""
input_end_date: ""
timestamp_column: tpep_dropoff_datetime
output_table_name: feature_store_taxi_example.${bundle.target}_mlops_stacks_gcp_fs_trip_dropoff_features
features_transform_module: dropoff_features
primary_keys: zip
# git source information of current ML asset deployment. It will be persisted as part of the workflow run
git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}
schedule:
quartz_cron_expression: "0 0 7 * * ?" # daily at 7am
timezone_id: UTC
<<: *permissions
# If you want to turn on notifications for this job, please uncomment the below code,
# and provide a list of emails to the on_failure argument.
#
# email_notifications:
# on_failure:
# - first@company.com
# - second@company.com
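
This is the job the integration-test workflow exercises against the ``test`` target; an ad-hoc run outside the 7am UTC schedule can use the same commands. A sketch, assuming the bundle has already been deployed to that target:

```bash
cd mlops_stacks_gcp_fs

# One-off run of the feature table job, as in the integration test,
# optionally followed by the training job that consumes the features.
databricks bundle run write_feature_table_job -t test
databricks bundle run model_training_job -t test
```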