-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from eric-golinko-db/add-ml-code
Add ML Code
- Loading branch information
Showing
49 changed files
with
2,023 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# CI/CD Workflow Definitions
This directory contains CI/CD workflow definitions using [GitHub Actions](https://docs.github.com/en/actions),
under ``workflows``. These workflows cover testing and deployment of both ML code (for model training, batch inference, etc.) and the
Databricks ML asset definitions under ``mlops_stacks_gcp_fs/assets``.

To set up CI/CD for a new project,
please refer to [ML asset config - set up CI CD](../../mlops_stacks_gcp_fs/assets/README.md#set-up-ci-and-cd).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# This GitHub workflow deploys Bundle assets (ML asset config and more)
# defined under mlops_stacks_gcp_fs/assets/*
# and mlops_stacks_gcp_fs/databricks.yml with prod deployment target configs,
# when PRs are merged into the release branch
name: Bundle Deployment for mlops_stacks_gcp_fs Prod

on:
  push:
    branches:
      - 'release'
  # Allow manual re-deploys from the Actions tab.
  workflow_dispatch:

defaults:
  run:
    working-directory: ./mlops_stacks_gcp_fs

env:
  DATABRICKS_TOKEN: ${{ secrets.PROD_WORKSPACE_TOKEN }}

jobs:
  prod:
    # Serialize prod deployments so two merges cannot deploy concurrently.
    concurrency: mlops_stacks_gcp_fs-prod-bundle-job
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v3
      - uses: databricks/setup-cli@v0.211.0
      # Validate first so a config error fails fast before any deploy side effects.
      - name: Validate Bundle For Prod
        id: validate
        run: |
          databricks bundle validate -t prod
      - name: Deploy Bundle to Prod
        id: deploy
        run: |
          databricks bundle deploy -t prod
34 changes: 34 additions & 0 deletions
34
.github/workflows/mlops_stacks_gcp_fs-bundle-cd-staging.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# This GitHub workflow deploys Bundle assets (ML asset config and more)
# defined under mlops_stacks_gcp_fs/assets/*
# and mlops_stacks_gcp_fs/databricks.yml with staging deployment target configs,
# when PRs are merged into the default branch
name: Bundle Deployment for mlops_stacks_gcp_fs Staging

on:
  push:
    branches:
      - 'main'
  # Allow manual re-deploys from the Actions tab.
  workflow_dispatch:

defaults:
  run:
    working-directory: ./mlops_stacks_gcp_fs

env:
  DATABRICKS_TOKEN: ${{ secrets.STAGING_WORKSPACE_TOKEN }}

jobs:
  staging:
    # Serialize staging deployments so two merges cannot deploy concurrently.
    concurrency: mlops_stacks_gcp_fs-staging-bundle-job
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v3
      - uses: databricks/setup-cli@v0.211.0
      # Validate first so a config error fails fast before any deploy side effects.
      - name: Validate Bundle For Staging
        id: validate
        run: |
          databricks bundle validate -t staging
      - name: Deploy Bundle to Staging
        id: deploy
        run: |
          databricks bundle deploy -t staging
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
# This GitHub workflow validates Bundle config (ML asset config and more)
# defined under mlops_stacks_gcp_fs/assets/*
# and mlops_stacks_gcp_fs/databricks.yml, when PRs are merged into the main branch
name: Bundle validation for mlops_stacks_gcp_fs

on:
  workflow_dispatch:
  # NOTE(review): pull_request_target runs in the base-repo context (with access
  # to repository secrets) while the checkout steps below check out the PR head
  # SHA. A malicious fork PR could therefore attempt to exfiltrate the workspace
  # tokens. This mirrors the upstream mlops-stacks template, but confirm the
  # trade-off is intended — consider `pull_request` plus an environment gate.
  pull_request_target:

defaults:
  run:
    working-directory: ./mlops_stacks_gcp_fs/

env:
  STAGING_WORKSPACE_TOKEN: ${{ secrets.STAGING_WORKSPACE_TOKEN }}
  PROD_WORKSPACE_TOKEN: ${{ secrets.PROD_WORKSPACE_TOKEN }}

jobs:
  staging:
    concurrency: mlops_stacks_gcp_fs-staging-bundle-job
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v3
        with:
          # Validate the PR's code; fall back to the pushed SHA for manual runs.
          ref: ${{ github.event.pull_request.head.sha || github.sha }}
      - uses: databricks/setup-cli@v0.211.0
      - name: Validate Bundle For Staging
        id: validate
        env:
          DATABRICKS_TOKEN: ${{ env.STAGING_WORKSPACE_TOKEN }}
        run: |
          databricks bundle validate -t staging > ../validate_output.txt
      # Post the validation output on the PR so reviewers can see the resolved config.
      - name: Create Comment with Bundle Configuration
        uses: actions/github-script@v6
        id: comment
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            const fs = require('fs');
            const fileContents = fs.readFileSync('validate_output.txt', 'utf8');
            const output = `#### Bundle Staging Config Validated 🖌
            <details><summary>Staging Validation Output</summary>

            \`\`\`\n
            ${fileContents}
            \`\`\`

            </details>`
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: output
            })
  prod:
    concurrency: mlops_stacks_gcp_fs-prod-bundle-job
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v3
        with:
          ref: ${{ github.event.pull_request.head.sha || github.sha }}
      - uses: databricks/setup-cli@v0.211.0
      - name: Validate Bundle For Prod
        id: validate
        env:
          DATABRICKS_TOKEN: ${{ env.PROD_WORKSPACE_TOKEN }}
        run: |
          databricks bundle validate -t prod > ../validate_output.txt
      - name: Create Comment with Bundle Configuration
        uses: actions/github-script@v6
        id: comment
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            const fs = require('fs');
            const fileContents = fs.readFileSync('validate_output.txt', 'utf8');
            const output = `#### Bundle Prod Config Validated 🖌
            <details><summary>Prod Validation Output</summary>

            \`\`\`\n
            ${fileContents}
            \`\`\`

            </details>`
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: output
            })
19 changes: 19 additions & 0 deletions
19
.github/workflows/mlops_stacks_gcp_fs-lint-cicd-workflow-files.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
name: Lint CI/CD workflow files
on:
  pull_request:
    paths:
      - '.github/workflows/**'
  workflow_dispatch:

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Download actionlint
        id: get_actionlint
        # -fsSL: fail on HTTP errors (instead of piping an error page into bash),
        # stay quiet, and follow redirects.
        run: bash <(curl -fsSL https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash)
        shell: bash
      - name: Check workflow files
        run: ${{ steps.get_actionlint.outputs.executable }} -color
        shell: bash
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
name: Feature and Training Integration Tests for mlops_stacks_gcp_fs
on:
  workflow_dispatch:
  pull_request:

defaults:
  run:
    working-directory: ./mlops_stacks_gcp_fs/

env:
  DATABRICKS_TOKEN: ${{ secrets.STAGING_WORKSPACE_TOKEN }}

# Only one test run may touch the shared staging test target at a time.
concurrency: mlops_stacks_gcp_fs-feature-training-integration-test-staging

jobs:
  unit_tests:
    runs-on: ubuntu-latest
    steps:
      # v3, for consistency with the integration_test job and the other workflows.
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      # Feature store tests bring up a local Spark session, so Java is required.
      - uses: actions/setup-java@v2
        with:
          distribution: 'temurin'
          java-version: '11'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install -r ../test-requirements.txt
      - name: Run tests with pytest
        run: |
          pytest
  integration_test:
    # Gate the (slow, workspace-mutating) integration run on unit tests passing.
    needs: unit_tests
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
      - uses: databricks/setup-cli@v0.211.0
      - name: Validate Bundle For Test Deployment Target in Staging Workspace
        id: validate
        run: |
          databricks bundle validate -t test
      - name: Deploy Bundle to Test Deployment Target in Staging Workspace
        id: deploy
        run: |
          databricks bundle deploy -t test
      - name: Run Feature Engineering Workflow for Test Deployment Target in Staging Workspace
        id: feature_engineering
        run: |
          databricks bundle run write_feature_table_job -t test
      - name: Run Training Workflow for Test Deployment Target in Staging Workspace
        id: training
        run: |
          databricks bundle run model_training_job -t test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# mlops_stacks_gcp_fs

This directory contains Python code, notebooks, and ML asset configs related to one ML project.

See the [Project overview](../docs/project-overview.md) for details on the code structure of the project directory.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
41 changes: 41 additions & 0 deletions
41
mlops_stacks_gcp_fs/assets/batch-inference-workflow-asset.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# Reusable cluster spec, merged into each task via the `*new_cluster` alias.
new_cluster: &new_cluster
  new_cluster:
    num_workers: 3
    spark_version: 13.3.x-cpu-ml-scala2.12
    node_type_id: n2-highmem-4
    custom_tags:
      clusterSource: mlops-stack

# Reusable permission grant, merged into each job via the `*permissions` alias.
common_permissions: &permissions
  permissions:
    - level: CAN_VIEW
      group_name: users

resources:
  jobs:
    batch_inference_job:
      name: ${bundle.target}-mlops_stacks_gcp_fs-batch-inference-job
      tasks:
        - task_key: batch_inference_job
          <<: *new_cluster
          notebook_task:
            notebook_path: ../deployment/batch_inference/notebooks/BatchInference.py
            base_parameters:
              env: ${bundle.target}
              input_table_name: hive_metastore.default.taxi_scoring_sample_feature_store_inference_input
              output_table_name: ${bundle.target}_mlops_stacks_gcp_fs_predictions
              model_name: ${var.model_name}
              # git source information of current ML asset deployment. It will be persisted as part of the workflow run
              git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}

      schedule:
        quartz_cron_expression: "0 0 11 * * ?" # daily at 11am
        timezone_id: UTC
      <<: *permissions
      # If you want to turn on notifications for this job, please uncomment the below code,
      # and provide a list of emails to the on_failure argument.
      #
      # email_notifications:
      #   on_failure:
      #     - first@company.com
      #     - second@company.com
64 changes: 64 additions & 0 deletions
64
mlops_stacks_gcp_fs/assets/feature-engineering-workflow-asset.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# Reusable cluster spec, merged into the job cluster via the `*new_cluster` alias.
new_cluster: &new_cluster
  new_cluster:
    num_workers: 3
    spark_version: 13.3.x-cpu-ml-scala2.12
    node_type_id: n2-highmem-4
    custom_tags:
      clusterSource: mlops-stack

# Reusable permission grant, merged into each job via the `*permissions` alias.
common_permissions: &permissions
  permissions:
    - level: CAN_VIEW
      group_name: users

resources:
  jobs:
    write_feature_table_job:
      name: ${bundle.target}-mlops_stacks_gcp_fs-write-feature-table-job
      job_clusters:
        - job_cluster_key: write_feature_table_job_cluster
          <<: *new_cluster
      tasks:
        # Both tasks share one notebook; only the timestamp column, transform
        # module, and output table differ between pickup and dropoff features.
        - task_key: PickupFeatures
          job_cluster_key: write_feature_table_job_cluster
          notebook_task:
            notebook_path: ../feature_engineering/notebooks/GenerateAndWriteFeatures.py
            base_parameters:
              # TODO modify these arguments to reflect your setup.
              input_table_path: /databricks-datasets/nyctaxi-with-zipcodes/subsampled
              # TODO: Empty start/end dates will process the whole range. Update this as needed to process recent data.
              input_start_date: ""
              input_end_date: ""
              timestamp_column: tpep_pickup_datetime
              output_table_name: feature_store_taxi_example.${bundle.target}_mlops_stacks_gcp_fs_trip_pickup_features
              features_transform_module: pickup_features
              primary_keys: zip
              # git source information of current ML asset deployment. It will be persisted as part of the workflow run
              git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}
        - task_key: DropoffFeatures
          job_cluster_key: write_feature_table_job_cluster
          notebook_task:
            notebook_path: ../feature_engineering/notebooks/GenerateAndWriteFeatures.py
            base_parameters:
              # TODO: modify these arguments to reflect your setup.
              input_table_path: /databricks-datasets/nyctaxi-with-zipcodes/subsampled
              # TODO: Empty start/end dates will process the whole range. Update this as needed to process recent data.
              input_start_date: ""
              input_end_date: ""
              timestamp_column: tpep_dropoff_datetime
              output_table_name: feature_store_taxi_example.${bundle.target}_mlops_stacks_gcp_fs_trip_dropoff_features
              features_transform_module: dropoff_features
              primary_keys: zip
              # git source information of current ML asset deployment. It will be persisted as part of the workflow run
              git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}
      schedule:
        quartz_cron_expression: "0 0 7 * * ?" # daily at 7am
        timezone_id: UTC
      <<: *permissions
      # If you want to turn on notifications for this job, please uncomment the below code,
      # and provide a list of emails to the on_failure argument.
      #
      # email_notifications:
      #   on_failure:
      #     - first@company.com
      #     - second@company.com
Oops, something went wrong.