Commit

Merge pull request #1 from eric-golinko-db/add-ml-code
Add ML Code
eric-golinko-db authored Feb 26, 2024
2 parents d91ba00 + 21de314 commit cedb7a0
Showing 49 changed files with 2,023 additions and 2 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/README.md
@@ -0,0 +1,7 @@
# CI/CD Workflow Definitions
This directory contains CI/CD workflow definitions using [GitHub Actions](https://docs.github.com/en/actions),
under ``workflows``. These workflows cover testing and deployment of both ML code (for model training, batch inference, etc.) and the
Databricks ML asset definitions under ``mlops_stacks_gcp_fs/assets``.

To set up CI/CD for a new project,
please refer to [ML asset config - set up CI CD](../../mlops_stacks_gcp_fs/assets/README.md#set-up-ci-and-cd).
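
The workflows below read Databricks personal access tokens from the repository secrets ``STAGING_WORKSPACE_TOKEN`` and ``PROD_WORKSPACE_TOKEN``. A minimal sketch of registering those secrets with the GitHub CLI, assuming ``gh`` is installed and authenticated against this repository (the token values are placeholders, not real credentials):

```bash
# Hypothetical token values; substitute real Databricks PATs for each workspace.
gh secret set STAGING_WORKSPACE_TOKEN --body "dapi-staging-xxxxxxxx"
gh secret set PROD_WORKSPACE_TOKEN --body "dapi-prod-xxxxxxxx"
```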
34 changes: 34 additions & 0 deletions .github/workflows/mlops_stacks_gcp_fs-bundle-cd-prod.yml
@@ -0,0 +1,34 @@
# This GitHub workflow deploys Bundle assets (ML asset config and more)
# defined under mlops_stacks_gcp_fs/assets/*
# and mlops_stacks_gcp_fs/databricks.yml with prod deployment target configs,
# when PRs are merged into the release branch
name: Bundle Deployment for mlops_stacks_gcp_fs Prod

on:
push:
branches:
- 'release'
workflow_dispatch:

defaults:
run:
working-directory: ./mlops_stacks_gcp_fs

env:
DATABRICKS_TOKEN: ${{ secrets.PROD_WORKSPACE_TOKEN }}

jobs:
prod:
concurrency: mlops_stacks_gcp_fs-prod-bundle-job
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- uses: databricks/setup-cli@v0.211.0
- name: Validate Bundle For Prod
id: validate
run: |
databricks bundle validate -t prod
- name: Deploy Bundle to Prod
id: deploy
run: |
databricks bundle deploy -t prod
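
The deployment steps above map directly onto Databricks CLI commands, so a rough local equivalent looks like the sketch below — assuming the Databricks CLI (v0.211+) is installed, the prod target's workspace is defined in ``mlops_stacks_gcp_fs/databricks.yml``, and a workspace token is available. The staging workflow that follows is identical apart from ``-t staging``:

```bash
# Run from the bundle root, mirroring the workflow's working-directory.
cd mlops_stacks_gcp_fs

# Placeholder token; in CI this comes from the PROD_WORKSPACE_TOKEN secret.
export DATABRICKS_TOKEN="dapi-prod-xxxxxxxx"

databricks bundle validate -t prod   # same check as the "Validate Bundle For Prod" step
databricks bundle deploy -t prod     # same command as the "Deploy Bundle to Prod" step
```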
34 changes: 34 additions & 0 deletions .github/workflows/mlops_stacks_gcp_fs-bundle-cd-staging.yml
@@ -0,0 +1,34 @@
# This GitHub workflow deploys Bundle assets (ML asset config and more)
# defined under mlops_stacks_gcp_fs/assets/*
# and mlops_stacks_gcp_fs/databricks.yml with staging deployment target configs,
# when PRs are merged into the default branch
name: Bundle Deployment for mlops_stacks_gcp_fs Staging

on:
push:
branches:
- 'main'
workflow_dispatch:

defaults:
run:
working-directory: ./mlops_stacks_gcp_fs

env:
DATABRICKS_TOKEN: ${{ secrets.STAGING_WORKSPACE_TOKEN }}

jobs:
staging:
concurrency: mlops_stacks_gcp_fs-staging-bundle-job
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- uses: databricks/setup-cli@v0.211.0
- name: Validate Bundle For Staging
id: validate
run: |
databricks bundle validate -t staging
- name: Deploy Bundle to Staging
id: deploy
run: |
databricks bundle deploy -t staging
93 changes: 93 additions & 0 deletions .github/workflows/mlops_stacks_gcp_fs-bundle-ci.yml
@@ -0,0 +1,93 @@
# This GitHub workflow validates Bundle config (ML asset config and more)
# defined under mlops_stacks_gcp_fs/assets/*
# and mlops_stacks_gcp_fs/databricks.yml, when PRs are opened or updated against the main branch
name: Bundle validation for mlops_stacks_gcp_fs

on:
workflow_dispatch:
pull_request_target:

defaults:
run:
working-directory: ./mlops_stacks_gcp_fs/

env:
STAGING_WORKSPACE_TOKEN: ${{ secrets.STAGING_WORKSPACE_TOKEN }}
PROD_WORKSPACE_TOKEN: ${{ secrets.PROD_WORKSPACE_TOKEN }}

jobs:
staging:
concurrency: mlops_stacks_gcp_fs-staging-bundle-job
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha || github.sha }}
- uses: databricks/setup-cli@v0.211.0
- name: Validate Bundle For Staging
id: validate
env:
DATABRICKS_TOKEN: ${{ env.STAGING_WORKSPACE_TOKEN }}
run: |
databricks bundle validate -t staging > ../validate_output.txt
- name: Create Comment with Bundle Configuration
uses: actions/github-script@v6
id: comment
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const fs = require('fs');
const fileContents = fs.readFileSync('validate_output.txt', 'utf8');
const output = `#### Bundle Staging Config Validated 🖌
<details><summary>Staging Validation Output</summary>
\`\`\`\n
${fileContents}
\`\`\`
</details>`
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: output
})
prod:
concurrency: mlops_stacks_gcp_fs-prod-bundle-job
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha || github.sha }}
- uses: databricks/setup-cli@v0.211.0
- name: Validate Bundle For Prod
id: validate
env:
DATABRICKS_TOKEN: ${{ env.PROD_WORKSPACE_TOKEN }}
run: |
databricks bundle validate -t prod > ../validate_output.txt
- name: Create Comment with Bundle Configuration
uses: actions/github-script@v6
id: comment
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const fs = require('fs');
const fileContents = fs.readFileSync('validate_output.txt', 'utf8');
const output = `#### Bundle Prod Config Validated 🖌
<details><summary>Prod Validation Output</summary>
\`\`\`\n
${fileContents}
\`\`\`
</details>`
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: output
})
19 changes: 19 additions & 0 deletions .github/workflows/mlops_stacks_gcp_fs-lint-cicd-workflow-files.yml
@@ -0,0 +1,19 @@
name: Lint CI/CD workflow files
on:
pull_request:
paths:
- '.github/workflows/**'
workflow_dispatch:

jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Download actionlint
id: get_actionlint
run: bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash)
shell: bash
- name: Check workflow files
run: ${{ steps.get_actionlint.outputs.executable }} -color
shell: bash
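
The same check can be run locally before pushing workflow changes. A minimal sketch, assuming the download script drops the ``actionlint`` binary into the current directory (the workflow consumes the path via the step output):

```bash
# Download a pinned actionlint binary into the working directory.
bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash)

# Lint all workflow files under .github/workflows, with colored output.
./actionlint -color
```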
59 changes: 59 additions & 0 deletions .github/workflows/mlops_stacks_gcp_fs-run-tests-fs.yml
@@ -0,0 +1,59 @@
name: Feature and Training Integration Tests for mlops_stacks_gcp_fs
on:
workflow_dispatch:
pull_request:

defaults:
run:
working-directory: ./mlops_stacks_gcp_fs/

env:
DATABRICKS_TOKEN: ${{ secrets.STAGING_WORKSPACE_TOKEN }}

concurrency: mlops_stacks_gcp_fs-feature-training-integration-test-staging

jobs:
unit_tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.8
# Feature store tests bring up a local Spark session, so Java is required.
- uses: actions/setup-java@v2
with:
distribution: 'temurin'
java-version: '11'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r ../test-requirements.txt
- name: Run tests with pytest
run: |
pytest
integration_test:
needs: unit_tests
runs-on: ubuntu-22.04
steps:
- name: Checkout repo
uses: actions/checkout@v3
- uses: databricks/setup-cli@v0.211.0
- name: Validate Bundle For Test Deployment Target in Staging Workspace
id: validate
run: |
databricks bundle validate -t test
- name: Deploy Bundle to Test Deployment Target in Staging Workspace
id: deploy
run: |
databricks bundle deploy -t test
- name: Run Feature Engineering Workflow for Test Deployment Target in Staging Workspace
id: feature_engineering
run: |
databricks bundle run write_feature_table_job -t test
- name: Run Training Workflow for Test Deployment Target in Staging Workspace
id: training
run: |
databricks bundle run model_training_job -t test
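
The unit-test job can be reproduced locally with the same dependency and test commands; a sketch assuming Python 3.8 and a local JDK 11 (the feature-store tests bring up a local Spark session, as the workflow comment notes):

```bash
cd mlops_stacks_gcp_fs

python -m pip install --upgrade pip
pip install -r requirements.txt          # project dependencies
pip install -r ../test-requirements.txt  # test-only dependencies at the repo root

pytest   # runs the same unit tests as the unit_tests job
```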
5 changes: 5 additions & 0 deletions mlops_stacks_gcp_fs/README.md
@@ -0,0 +1,5 @@
# mlops_stacks_gcp_fs

This directory contains Python code, notebooks, and ML asset configs related to one ML project.

See the [Project overview](../docs/project-overview.md) for details on the code structure of the project directory.
Empty file added mlops_stacks_gcp_fs/__init__.py
Empty file.
4 changes: 2 additions & 2 deletions mlops_stacks_gcp_fs/assets/README.md
@@ -134,7 +134,7 @@ new_cluster: &new_cluster
spark_version: 13.3.x-cpu-ml-scala2.12
node_type_id: n2-highmem-4
custom_tags:
- clusterSource: mlops-stack/0.2
+ clusterSource: mlops-stack
resources:
jobs:
@@ -189,7 +189,7 @@ new_cluster: &new_cluster
spark_version: 13.3.x-cpu-ml-scala2.12
node_type_id: n2-highmem-4
custom_tags:
- clusterSource: mlops-stack/0.2
+ clusterSource: mlops-stack
resources:
jobs:
41 changes: 41 additions & 0 deletions mlops_stacks_gcp_fs/assets/batch-inference-workflow-asset.yml
@@ -0,0 +1,41 @@
new_cluster: &new_cluster
new_cluster:
num_workers: 3
spark_version: 13.3.x-cpu-ml-scala2.12
node_type_id: n2-highmem-4
custom_tags:
clusterSource: mlops-stack

common_permissions: &permissions
permissions:
- level: CAN_VIEW
group_name: users

resources:
jobs:
batch_inference_job:
name: ${bundle.target}-mlops_stacks_gcp_fs-batch-inference-job
tasks:
- task_key: batch_inference_job
<<: *new_cluster
notebook_task:
notebook_path: ../deployment/batch_inference/notebooks/BatchInference.py
base_parameters:
env: ${bundle.target}
input_table_name: hive_metastore.default.taxi_scoring_sample_feature_store_inference_input
output_table_name: ${bundle.target}_mlops_stacks_gcp_fs_predictions
model_name: ${var.model_name}
# git source information of current ML asset deployment. It will be persisted as part of the workflow run
git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}

schedule:
quartz_cron_expression: "0 0 11 * * ?" # daily at 11am
timezone_id: UTC
<<: *permissions
# If you want to turn on notifications for this job, please uncomment the below code,
# and provide a list of emails to the on_failure argument.
#
# email_notifications:
# on_failure:
# - first@company.com
# - second@company.com
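
Besides the daily 11am UTC schedule, the batch inference job can be deployed and triggered on demand with the bundle CLI, in the same way the integration-test workflow runs its jobs. A sketch, assuming a staging target defined in ``databricks.yml`` and a valid ``DATABRICKS_TOKEN``:

```bash
cd mlops_stacks_gcp_fs

# Deploy the asset definitions for the chosen target, then trigger one run.
databricks bundle deploy -t staging
databricks bundle run batch_inference_job -t staging
```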
64 changes: 64 additions & 0 deletions mlops_stacks_gcp_fs/assets/feature-engineering-workflow-asset.yml
@@ -0,0 +1,64 @@
new_cluster: &new_cluster
new_cluster:
num_workers: 3
spark_version: 13.3.x-cpu-ml-scala2.12
node_type_id: n2-highmem-4
custom_tags:
clusterSource: mlops-stack

common_permissions: &permissions
permissions:
- level: CAN_VIEW
group_name: users

resources:
jobs:
write_feature_table_job:
name: ${bundle.target}-mlops_stacks_gcp_fs-write-feature-table-job
job_clusters:
- job_cluster_key: write_feature_table_job_cluster
<<: *new_cluster
tasks:
- task_key: PickupFeatures
job_cluster_key: write_feature_table_job_cluster
notebook_task:
notebook_path: ../feature_engineering/notebooks/GenerateAndWriteFeatures.py
base_parameters:
# TODO: modify these arguments to reflect your setup.
input_table_path: /databricks-datasets/nyctaxi-with-zipcodes/subsampled
# TODO: Empty start/end dates will process the whole range. Update this as needed to process recent data.
input_start_date: ""
input_end_date: ""
timestamp_column: tpep_pickup_datetime
output_table_name: feature_store_taxi_example.${bundle.target}_mlops_stacks_gcp_fs_trip_pickup_features
features_transform_module: pickup_features
primary_keys: zip
# git source information of current ML asset deployment. It will be persisted as part of the workflow run
git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}
- task_key: DropoffFeatures
job_cluster_key: write_feature_table_job_cluster
notebook_task:
notebook_path: ../feature_engineering/notebooks/GenerateAndWriteFeatures.py
base_parameters:
# TODO: modify these arguments to reflect your setup.
input_table_path: /databricks-datasets/nyctaxi-with-zipcodes/subsampled
# TODO: Empty start/end dates will process the whole range. Update this as needed to process recent data.
input_start_date: ""
input_end_date: ""
timestamp_column: tpep_dropoff_datetime
output_table_name: feature_store_taxi_example.${bundle.target}_mlops_stacks_gcp_fs_trip_dropoff_features
features_transform_module: dropoff_features
primary_keys: zip
# git source information of current ML asset deployment. It will be persisted as part of the workflow run
git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}
schedule:
quartz_cron_expression: "0 0 7 * * ?" # daily at 7am
timezone_id: UTC
<<: *permissions
# If you want to turn on notifications for this job, please uncomment the below code,
# and provide a list of emails to the on_failure argument.
#
# email_notifications:
# on_failure:
# - first@company.com
# - second@company.com
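
This is the job the integration-test workflow exercises against the ``test`` target; an ad-hoc run outside the 7am UTC schedule can use the same commands. A sketch, assuming the bundle has already been deployed to that target:

```bash
cd mlops_stacks_gcp_fs

# One-off run of the feature table job, as in the integration test,
# optionally followed by the training job that consumes the features.
databricks bundle run write_feature_table_job -t test
databricks bundle run model_training_job -t test
```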