Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lakehouse monitoring integration #156

Merged
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions databricks_template_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -256,9 +256,22 @@
]
}
},
"input_include_feature_store": {
"input_inference_table_name": {
"order": 17,
"type": "string",
"description": "\nFully qualified name of inference table to attach monitoring to.\nThis table must already exist and service principals must have access.",
"default": "dev.my_mlops_project.predictions",
"skip_prompt_if": {
"properties": {
"input_setup_cicd_and_project": {
"const": "CICD_Only"
}
}
}
},
"input_include_feature_store": {
"order": 18,
"type": "string",
"description": "\nWhether to include Feature Store",
"default": "no",
"enum": ["no", "yes"],
Expand All @@ -271,7 +284,7 @@
}
},
"input_include_mlflow_recipes": {
"order": 18,
"order": 19,
"type": "string",
"description": "\nWhether to include MLflow Recipes",
"default": "no",
Expand Down
6 changes: 3 additions & 3 deletions template/{{.input_root_dir}}/README.md.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ contained in the following files:
│ │
│ ├── ml-artifacts-resource.yml <- ML resource config definition for model and experiment
│ │
│ ├── monitoring-workflow-resource.yml <- ML resource config definition for data monitoring workflow
│ ├── monitoring-resource.yml <- ML resource config definition for quality monitoring workflow
{{- else if (eq .input_include_feature_store `yes`) }}
│ ├── training <- Training folder contains Notebook that trains and registers the model with feature store support.
│ │
Expand Down Expand Up @@ -89,7 +89,7 @@ contained in the following files:
│ │
│ ├── ml-artifacts-resource.yml <- ML resource config definition for model and experiment
│ │
│ ├── monitoring-workflow-resource.yml <- ML resource config definition for data monitoring workflow
│ ├── monitoring-resource.yml <- ML resource config definition for quality monitoring workflow
{{- else }}
│ ├── training <- Folder for model development via MLflow recipes.
│ │ │
Expand Down Expand Up @@ -127,7 +127,7 @@ contained in the following files:
│ │
│ ├── ml-artifacts-resource.yml <- ML resource config definition for model and experiment
│ │
│ ├── monitoring-workflow-resource.yml <- ML resource config definition for data monitoring workflow
│ ├── monitoring-resource.yml <- ML resource config definition for quality monitoring workflow
{{- end }}
{{- end }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ contained in the following files:
│ │
│ ├── ml-artifacts-resource.yml <- ML resource config definition for model and experiment
│ │
│ ├── monitoring-workflow-resource.yml <- ML resource config definition for data monitoring workflow
│ ├── monitoring-resource.yml <- ML resource config definition for quality monitoring workflow
{{- else if (eq .input_include_feature_store `yes`) }}
│ ├── training <- Training folder contains Notebook that trains and registers the model with feature store support.
│ │
Expand Down Expand Up @@ -92,7 +92,7 @@ contained in the following files:
│ │
│ ├── ml-artifacts-resource.yml <- ML resource config definition for model and experiment
│ │
│ ├── monitoring-workflow-resource.yml <- ML resource config definition for data monitoring workflow
│ ├── monitoring-resource.yml <- ML resource config definition for quality monitoring workflow
{{- else }}
│ ├── training <- Folder for model development via MLflow recipes.
│ │ │
Expand Down Expand Up @@ -130,7 +130,7 @@ contained in the following files:
│ │
│ ├── ml-artifacts-resource.yml <- ML resource config definition for model and experiment
│ │
│ ├── monitoring-workflow-resource.yml <- ML resource config definition for data monitoring workflow
│ ├── monitoring-resource.yml <- ML resource config definition for quality monitoring workflow
{{- end }}
```

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ variables:
include:
# Resources folder contains ML artifact resources for the ML project that defines model and experiment
# And workflows resources for the ML project including model training -> validation -> deployment,
# {{- if (eq .input_include_feature_store `yes`) }} feature engineering, {{ end }} batch inference, data monitoring, metric refresh, alerts and triggering retraining
# {{- if (eq .input_include_feature_store `yes`) }} feature engineering, {{ end }} batch inference, quality monitoring, metric refresh, alerts and triggering retraining
- ./resources/*.yml

# Deployment Target specific values for workspace
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Monitoring

Databricks Data Monitoring is currently in Private Preview.

Please contact a Databricks representative for more information.
To enable monitoring as part of a scheduled Databricks workflow, please update all the TODOs in the [monitoring resource file](../resources/monitoring-resource.yml), and refer to
[{{template `project_name_alphanumeric_underscore` .}}/resources/README.md](../resources/README.md). The implementation supports monitoring of batch inference tables directly.
For real time inference tables, unpacking is required before monitoring can be attached.
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import sys
import pathlib

# Make the project root importable so this module can be used from notebooks.
sys.path.append(str(pathlib.Path(__file__).parent.parent.parent.resolve()))

"""The SQL query is divided into three main parts. The first part selects the top 5
values of the metric to be monitored, ordered by the time window.
```sql
( SELECT
    {metric_to_monitor}
  FROM
    {table_name_under_monitor}_profile_metrics
  where
    column_name = ":table"
    and slice_key IS NULL
    and model_id != "*"
    and log_type = "INPUT"
  ORDER BY
    window DESC
  LIMIT
    5
)
```
The `column_name = ":table"` and `slice_key IS NULL` conditions ensure that the metric
is selected for the entire table within the given granularity. The `log_type = "INPUT"`
condition ensures that the primary table metrics are considered, but not the baseline
table metrics. The `model_id != "*"` condition ensures that the metric aggregated across
all model IDs is not selected.

The second part of the query selects the metric values that exceed
the defined threshold:
```sql
WHERE
    {metric_to_monitor} > {metric_violation_threshold}
```
The final part of the query sets the `query_result` to 1 if the threshold violation
occurred more than twice within the checking interval, and 0 otherwise:
```sql
SELECT
    CASE
        WHEN COUNT(*) > 2 THEN 1
        ELSE 0
    END AS query_result
```
"""

# Query template; the caller fills in the placeholders via str.format with
# table_name_under_monitor, metric_to_monitor, and metric_violation_threshold.
sql_query = """SELECT
CASE
    WHEN COUNT(*) > 2 THEN 1
    ELSE 0
END AS query_result
FROM
(
    SELECT
        {metric_to_monitor}
    FROM
        {table_name_under_monitor}_profile_metrics
    where
        column_name = ":table"
        and slice_key IS NULL
        and model_id != "*"
        and log_type = "INPUT"
    ORDER BY
        window DESC
    LIMIT
        5
)
WHERE
    {metric_to_monitor} > {metric_violation_threshold}"""
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Databricks notebook source
##################################################################################
# This notebook runs a sql query and set the result as job task value
#
# This notebook has the following parameters:
#
# * table_name_under_monitor (required) - The name of a table that is currently being monitored
# * metric_to_monitor (required) - Metric to be monitored for threshold violation
# * metric_violation_threshold (required) - Threshold value for metric violation
arpitjasa-db marked this conversation as resolved.
Show resolved Hide resolved
##################################################################################

# List of input args needed to run the notebook as a job.
# Provide them via DB widgets or notebook arguments.
#
# Name of the table that is currently being monitored
dbutils.widgets.text(
"table_name_under_monitor", "{{ .input_inference_table_name }}", label="Full (three-Level) table name"
)
# Metric to be used for threshold violation check
dbutils.widgets.text(
"metric_to_monitor", "root_mean_squared_error", label="Metric to be monitored for threshold violation"
)

# Threshold value to be checked
dbutils.widgets.text(
"metric_violation_threshold", "100", label="Threshold value for metric violation"
)

# COMMAND ----------

import os
import sys
notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get())
%cd $notebook_path
%cd ..
sys.path.append("../..")

# COMMAND ----------

from metric_violation_check_query import sql_query

table_name_under_monitor = dbutils.widgets.get("table_name_under_monitor")
metric_to_monitor = dbutils.widgets.get("metric_to_monitor")
metric_violation_threshold = dbutils.widgets.get("metric_violation_threshold")

formatted_sql_query = sql_query.format(
table_name_under_monitor=table_name_under_monitor,
metric_to_monitor=metric_to_monitor,
metric_violation_threshold=metric_violation_threshold)
is_metric_violated = bool(spark.sql(formatted_sql_query).toPandas()["query_result"][0])

dbutils.jobs.taskValues.set("is_metric_violated", is_metric_violated)


Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ During databricks CLI bundles deployment, the root config file will be loaded, v
ML Resource Configurations in this directory:
- model workflow (`{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml`)
- batch inference workflow (`{{template `project_name_alphanumeric_underscore` .}}/resources/batch-inference-workflow-resource.yml`)
- monitoring workflow (`{{template `project_name_alphanumeric_underscore` .}}/resources/monitoring-workflow-resource.yml`)
- monitoring resource and workflow (`{{template `project_name_alphanumeric_underscore` .}}/resources/monitoring-resource.yml`)
- feature engineering workflow (`{{template `project_name_alphanumeric_underscore` .}}/resources/feature-engineering-workflow-resource.yml`)
- model definition and experiment definition (`{{template `project_name_alphanumeric_underscore` .}}/resources/ml-artifacts-resource.yml`)

Expand Down Expand Up @@ -143,6 +143,39 @@ Model validation contains three components:
To set up and enable model validation, update [validation.py](../validation/validation.py) to return desired custom metrics and validation thresholds, then
resolve the `TODOs` in the ModelValidation task of [model-workflow-resource.yml](./model-workflow-resource.yml).


### Setting up monitoring
The monitoring workflow focuses on building a plug-and-play stack component for monitoring feature and model drift, and for retraining the model when the
defined violation threshold is exceeded, given the ground-truth labels.

Its central purpose is to track production model performance and feature distributions, and to compare different model versions.

Monitoring contains three components:
* [metric_violation_check_query.py](../monitoring/metric_violation_check_query.py) defines a query that checks for violation of the monitored metric.
* [notebooks/MonitoredMetricViolationCheck](../monitoring/notebooks/MonitoredMetricViolationCheck.py) acts as an entry point, executing the violation check query against the monitored inference table.
It emits a boolean value based on the query result.
* [monitoring-resource.yml](./monitoring-resource.yml) contains the resource config, inputs parameters for monitoring, and orchestrates model retraining based on monitoring. It first runs the [notebooks/MonitoredMetricViolationCheck](../monitoring/notebooks/MonitoredMetricViolationCheck.py)
entry point then decides whether to execute the model retraining workflow.

To set up and enable monitoring:
* If it is not done already, generate the inference table, join it with ground-truth labels, and update the table name in [monitoring-resource.yml](./monitoring-resource.yml).
* Resolve the `TODOs` in [monitoring-resource.yml](./monitoring-resource.yml)
* OPTIONAL: Update the query in [metric_violation_check_query.py](../monitoring/metric_violation_check_query.py) to customize when the metric is considered to be in violation.

Retraining Constraints:
The retraining job has constraints for optimal functioning:
* Labels must be provided by the user, joined correctly for retraining history, and available on time with the retraining frequency.
* Retraining Frequency is tightly coupled with the granularity of the monitor. Users should take into account and ensure that their retraining frequency is equal to or close to the granularity of the monitor.
* If the granularity of the monitor is 1 day and retraining frequency is 1 hour, the job will preemptively stop as there is no new data to evaluate retraining criteria
* If the granularity of the monitor is 1 day and retraining frequency is 1 week, retraining would be stale and not be efficient

Permissions:
Permissions for monitoring are inherited from the original table's permissions.
* Users who own the monitored table or its parent catalog/schema can create, update, and view monitors.
* Users with read permissions on the monitored table can view its monitor.

Therefore, ensure that service principals are the owners or have the necessary permissions to manage the monitored table.

## Develop and test config changes

### databricks CLI bundles schema overview
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@ resources:
name: ${var.model_name}
description: MLflow registered model for the "{{ .input_project_name }}" ML Project for ${bundle.target} deployment target.
<<: *permissions
depends_on:
- resources.jobs.model_training_job.id
- resources.jobs.batch_inference_job.id
{{- else -}}
registered_models:
model:
Expand All @@ -29,9 +26,6 @@ resources:
schema_name: {{ .input_schema_name }}
comment: Registered model in Unity Catalog for the "{{ .input_project_name }}" ML Project for ${bundle.target} deployment target.
<<: *grants
depends_on:
- resources.jobs.model_training_job.id
- resources.jobs.batch_inference_job.id{{end}}

experiments:
experiment:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Please complete all the TODOs in this file.
# NOTE: Monitoring only works on Unity Catalog tables.

new_cluster: &new_cluster
  new_cluster:
    num_workers: 3
    spark_version: 13.3.x-cpu-ml-scala2.12
    node_type_id: {{template `cloud_specific_node_type_id` .}}
    custom_tags:
      clusterSource: mlops-stacks_{{template `stacks_version` .}}

common_permissions: &permissions
  permissions:
    - level: CAN_VIEW
      group_name: users

# NOTE(review): the original file declared the top-level `resources:` key twice
# (once for quality_monitors, once for jobs). Duplicate mapping keys are invalid
# YAML and most parsers keep only the last occurrence, silently dropping the
# quality monitor. Both resource groups are merged under a single key here.
resources:
  quality_monitors:
    {{ .input_project_name }}_quality_monitor:
      full_name: {{ .input_inference_table_name }}
      # TODO: Update the output schema name as per your requirements
      output_schema_name: ${bundle.target}.{{ .input_project_name }}
      # TODO: Update the below parameters as per your requirements
      inference_log:
        granularities: ["1 day"]
        model_id_col: model_id
        prediction_col: predictions
        label_col: labels
        problem_type: PROBLEM_TYPE_REGRESSION
        timestamp_col: timestamp
      schedule:
        quartz_cron_expression: "0 0 8 * * ?" # Run every day at 8am
        timezone_id: UTC

  jobs:
    retraining_job:
      name: ${bundle.target}-{{ .input_project_name }}-monitoring-retraining-job
      tasks:
        - task_key: monitored_metric_violation_check
          <<: *new_cluster
          notebook_task:
            notebook_path: ../monitoring/notebooks/MonitoredMetricViolationCheck.py
            base_parameters:
              env: ${bundle.target}
              table_name_under_monitor: {{ .input_inference_table_name }}
              # TODO: Update the metric to be monitored and violation threshold
              metric_to_monitor: root_mean_squared_error
              metric_violation_threshold: 100
              # git source information of current ML resource deployment. It will be persisted as part of the workflow run
              git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}

        - task_key: is_metric_violated
          depends_on:
            - task_key: monitored_metric_violation_check
          condition_task:
            op: EQUAL_TO
            left: "{{"{{tasks.monitored_metric_violation_check.values.is_metric_violated}}"}}"
            right: "true"
            git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}

        - task_key: trigger_retraining
          depends_on:
            - task_key: is_metric_violated
              outcome: "true"
          run_job_task:
            job_id: ${resources.jobs.model_training_job.id}
            git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}

      schedule:
        quartz_cron_expression: "0 0 18 * * ?" # daily at 6pm
        timezone_id: UTC
      <<: *permissions
      # If you want to turn on notifications for this job, please uncomment the below code,
      # and provide a list of emails to the on_failure argument.
      #
      # email_notifications:
      #   on_failure:
      #     - first@company.com
      #     - second@company.com
This file was deleted.

Loading