Commit to github

smuralikdbx committed Jul 15, 2024
1 parent 2f425d5 commit 2ff35cc
Showing 17 changed files with 449 additions and 4 deletions.
8 changes: 4 additions & 4 deletions bundle.yml
@@ -3,7 +3,7 @@ bundle:
name: medium_post_report

workspace:
-host: https://e2-dogfood.staging.cloud.databricks.com/
+host: https://adb-2541733722036151.11.azuredatabricks.net/

resources:

@@ -39,7 +39,7 @@ resources:
new_cluster:
spark_version: 13.1.x-scala2.12
num_workers: 1
-node_type_id: i3.xlarge
+node_type_id: Standard_DS3_v2

environments:
development:
@@ -51,7 +51,7 @@ environments:

qa: # This environment is when deploying test runs from a pull request on GitHub.
workspace:
-host: https://e2-demo-west.cloud.databricks.com/
+host: https://adb-2541733722036151.11.azuredatabricks.net/
resources:
pipelines:
medium_metrics_pipeline:
@@ -62,7 +62,7 @@

production:
workspace:
-host: https://e2-demo-west.cloud.databricks.com/
+host: https://adb-2541733722036151.11.azuredatabricks.net/
resources:
pipelines:
medium_metrics_pipeline:
8 changes: 8 additions & 0 deletions demo_for_dbas_python/.gitignore
@@ -0,0 +1,8 @@
.databricks/
build/
dist/
__pycache__/
*.egg-info
.venv/
scratch/**
!scratch/README.md
3 changes: 3 additions & 0 deletions demo_for_dbas_python/.vscode/__builtins__.pyi
@@ -0,0 +1,3 @@
# Typings for Pylance in Visual Studio Code
# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md
from databricks.sdk.runtime import *
47 changes: 47 additions & 0 deletions demo_for_dbas_python/README.md
@@ -0,0 +1,47 @@
# demo_for_dbas_python

The 'demo_for_dbas_python' project was generated using the default-python template.

## Getting started

1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html

2. Authenticate to your Databricks workspace, if you have not done so already:
```
$ databricks configure
```

3. To deploy a development copy of this project, type:
```
$ databricks bundle deploy --target dev
```
(Note that "dev" is the default target, so the `--target` parameter
is optional here.)

This deploys everything that's defined for this project.
For example, the default template would deploy a job called
`[dev yourname] demo_for_dbas_python_job` to your workspace.
You can find that job by opening your workspace and clicking on **Workflows**.

4. Similarly, to deploy a production copy, type:
```
$ databricks bundle deploy --target prod
```

Note that the default job from the template has a schedule that runs every day
(defined in resources/demo_for_dbas_python_job.yml). The schedule
is paused when deploying in development mode (see
https://docs.databricks.com/dev-tools/bundles/deployment-modes.html).

5. To run a job or pipeline, use the "run" command:
```
$ databricks bundle run
```

6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for
**Databricks Connect** for instructions on running the included Python code from a different IDE.

7. For documentation on the Databricks asset bundles format used
for this project, and for CI/CD configuration, see
https://docs.databricks.com/dev-tools/bundles/index.html.
41 changes: 41 additions & 0 deletions demo_for_dbas_python/databricks.yml
@@ -0,0 +1,41 @@
# This is a Databricks asset bundle definition for demo_for_dbas_python.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
name: demo_for_dbas_python

include:
- resources/*.yml

targets:
# The 'dev' target, for development purposes. This target is the default.
dev:
# We use 'mode: development' to indicate this is a personal development copy:
# - Deployed resources get prefixed with '[dev my_user_name]'
# - Any job schedules and triggers are paused by default
# - The 'development' mode is used for Delta Live Tables pipelines
mode: development
default: true
workspace:
host: https://adb-2541733722036151.11.azuredatabricks.net

## Optionally, there could be a 'staging' target here.
## (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/ci-cd.html.)
#
# staging:
# workspace:
# host: https://adb-2541733722036151.11.azuredatabricks.net

# The 'prod' target, used for production deployment.
prod:
# We use 'mode: production' to indicate this is a production deployment.
# Doing so enables strict verification of the settings below.
mode: production
workspace:
host: https://adb-2541733722036151.11.azuredatabricks.net
# We always use /Users/satish.muralikrishnan@databricks.com for all resources to make sure we only have a single copy.
# If this path results in an error, please make sure you have a recent version of the CLI installed.
root_path: /Users/satish.muralikrishnan@databricks.com/.bundle/${bundle.name}/${bundle.target}
run_as:
# This runs as satish.muralikrishnan@databricks.com in production. We could also use a service principal here,
# see https://docs.databricks.com/dev-tools/bundles/permissions.html.
user_name: satish.muralikrishnan@databricks.com
22 changes: 22 additions & 0 deletions demo_for_dbas_python/fixtures/.gitkeep
@@ -0,0 +1,22 @@
# Fixtures

This folder is reserved for fixtures, such as CSV files.

Below is an example of how to load fixtures as a data frame:

```
import pandas as pd
import os

def get_absolute_path(*relative_parts):
if 'dbutils' in globals():
base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore
path = os.path.normpath(os.path.join(base_dir, *relative_parts))
return path if path.startswith("/Workspace") else "/Workspace" + path
else:
return os.path.join(*relative_parts)

csv_file = get_absolute_path("..", "fixtures", "mycsv.csv")
df = pd.read_csv(csv_file)
display(df)  # display() is available in Databricks notebooks; use df.head() elsewhere
```
3 changes: 3 additions & 0 deletions demo_for_dbas_python/pytest.ini
@@ -0,0 +1,3 @@
[pytest]
testpaths = tests
pythonpath = src
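
With this configuration, pytest collects tests from `tests/` and resolves imports against `src/`, so the package can be exercised without building or installing the wheel first. The `tests/` folder itself is not rendered in this diff; the sketch below shows what a minimal test could look like, assuming a Spark session is available through Databricks Connect or a cluster runtime:

```
from demo_for_dbas_python.main import get_spark, get_taxis

# Minimal smoke test: 'pythonpath = src' in pytest.ini lets this import
# resolve directly from src/ without installing the package.
def test_main():
    taxis = get_taxis(get_spark())
    assert taxis.count() > 5
```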
29 changes: 29 additions & 0 deletions demo_for_dbas_python/requirements-dev.txt
@@ -0,0 +1,29 @@
## requirements-dev.txt: dependencies for local development.
##
## For defining dependencies used by jobs in Databricks Workflows, see
## https://docs.databricks.com/dev-tools/bundles/library-dependencies.html

## Add code completion support for DLT
databricks-dlt

## pytest is the default package used for testing
pytest

## Dependencies for building wheel files
setuptools
wheel

## databricks-connect can be used to run parts of this project locally.
## See https://docs.databricks.com/dev-tools/databricks-connect.html.
##
## databricks-connect is automatically installed if you're using the Databricks
## extension for Visual Studio Code
## (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html).
##
## To manually install databricks-connect, either follow the instructions
## at https://docs.databricks.com/dev-tools/databricks-connect.html
## to install the package system-wide, or uncomment the line below to install
## a version of db-connect that corresponds to the Databricks Runtime version
## used for this project.
#
# databricks-connect>=13.3,<13.4
48 changes: 48 additions & 0 deletions demo_for_dbas_python/resources/demo_for_dbas_python_job.yml
@@ -0,0 +1,48 @@
# The main job for demo_for_dbas_python.
resources:
jobs:
demo_for_dbas_python_job:
name: demo_for_dbas_python_job

schedule:
# Run every day at 8:37 AM
quartz_cron_expression: '44 37 8 * * ?'
timezone_id: Europe/Amsterdam

email_notifications:
on_failure:
- satish.muralikrishnan@databricks.com

tasks:
- task_key: notebook_task
job_cluster_key: job_cluster
notebook_task:
notebook_path: ../src/notebook.ipynb

- task_key: refresh_pipeline
depends_on:
- task_key: notebook_task
pipeline_task:
pipeline_id: ${resources.pipelines.demo_for_dbas_python_pipeline.id}

- task_key: main_task
depends_on:
- task_key: refresh_pipeline
job_cluster_key: job_cluster
python_wheel_task:
package_name: demo_for_dbas_python
entry_point: main
libraries:
# By default we just include the .whl file generated for the demo_for_dbas_python package.
# See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
# for more information on how to add other libraries.
- whl: ../dist/*.whl

job_clusters:
- job_cluster_key: job_cluster
new_cluster:
spark_version: 13.3.x-scala2.12
node_type_id: Standard_D3_v2
autoscale:
min_workers: 1
max_workers: 4
12 changes: 12 additions & 0 deletions demo_for_dbas_python/resources/demo_for_dbas_python_pipeline.yml
@@ -0,0 +1,12 @@
# The main pipeline for demo_for_dbas_python
resources:
pipelines:
demo_for_dbas_python_pipeline:
name: demo_for_dbas_python_pipeline
target: demo_for_dbas_python_${bundle.environment}
libraries:
- notebook:
path: ../src/dlt_pipeline.ipynb

configuration:
bundle.sourcePath: /Workspace/${workspace.file_path}/src
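
The `../src/dlt_pipeline.ipynb` notebook referenced above is not rendered in this diff. The sketch below shows how such a notebook can pick up the `bundle.sourcePath` value set in the `configuration` block to import code from `src/`; the view, table, and filter are illustrative assumptions, not contents of this commit:

```
import sys

import dlt
from pyspark.sql.functions import expr

# Make the bundle's src/ folder importable; 'bundle.sourcePath' is the
# configuration key set in demo_for_dbas_python_pipeline.yml, and 'spark'
# is provided by the pipeline runtime.
sys.path.append(spark.conf.get("bundle.sourcePath", "."))
from demo_for_dbas_python import main

@dlt.view
def taxi_raw():
    # Reuse the shared loader from src/demo_for_dbas_python/main.py
    return main.get_taxis(spark)

@dlt.table
def filtered_taxis():
    return dlt.read("taxi_raw").filter(expr("fare_amount < 30"))
```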
4 changes: 4 additions & 0 deletions demo_for_dbas_python/scratch/README.md
@@ -0,0 +1,4 @@
# scratch

This folder is reserved for personal, exploratory notebooks.
By default these are not committed to Git, as 'scratch' is listed in .gitignore.
37 changes: 37 additions & 0 deletions demo_for_dbas_python/setup.py
@@ -0,0 +1,37 @@
"""
setup.py configuration script describing how to build and package this project.
This file is primarily used by the setuptools library and typically should not
be executed directly. See README.md for how to deploy, test, and run
the demo_for_dbas_python project.
"""
from setuptools import setup, find_packages

import sys
sys.path.append('./src')

import datetime
import demo_for_dbas_python

setup(
name="demo_for_dbas_python",
# We use a timestamp as the local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers)
# to ensure that changes to the wheel package are picked up when it is used on all-purpose clusters
version=demo_for_dbas_python.__version__ + "+" + datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S"),
url="https://databricks.com",
author="satish.muralikrishnan@databricks.com",
description="wheel file based on demo_for_dbas_python/src",
packages=find_packages(where='./src'),
package_dir={'': 'src'},
entry_points={
"packages": [
"main=demo_for_dbas_python.main:main"
]
},
install_requires=[
# Dependencies in case the output wheel file is used as a library dependency.
# For defining dependencies, when this package is used in Databricks, see:
# https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
"setuptools"
],
)
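
For illustration, the version expression above produces a PEP 440 local version like the one below, so every build yields a distinct wheel and all-purpose clusters pick up new code instead of a cached copy (the timestamp shown is hypothetical):

```
import datetime

# The '+YYYYMMDD.HHMMSS' local segment changes on every build,
# e.g. "0.0.1+20240715.083744"
version = "0.0.1" + "+" + datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S")
print(version)
```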
1 change: 1 addition & 0 deletions demo_for_dbas_python/src/demo_for_dbas_python/__init__.py
@@ -0,0 +1 @@
__version__ = "0.0.1"
21 changes: 21 additions & 0 deletions demo_for_dbas_python/src/demo_for_dbas_python/main.py
@@ -0,0 +1,21 @@
from pyspark.sql import SparkSession, DataFrame

def get_taxis(spark: SparkSession) -> DataFrame:
return spark.read.table("samples.nyctaxi.trips")


# Create a new Databricks Connect session. If this fails,
# check that you have configured Databricks Connect correctly.
# See https://docs.databricks.com/dev-tools/databricks-connect.html.
def get_spark() -> SparkSession:
try:
from databricks.connect import DatabricksSession
return DatabricksSession.builder.getOrCreate()
except ImportError:
return SparkSession.builder.getOrCreate()

def main():
get_taxis(get_spark()).show(5)

if __name__ == '__main__':
main()