Merge branch 'main' into sean/dropwizard-logging
sullis authored Oct 15, 2024
2 parents 46c4863 + ac01e2d commit d55adf4
Showing 7 changed files with 68 additions and 22 deletions.
5 changes: 3 additions & 2 deletions .asf.yaml
@@ -45,13 +45,14 @@ github:
required_status_checks:
# strict means "Require branches to be up to date before merging".
strict: true
# contexts are the names of checks that must pass
# Contexts are the names of checks that must pass. This is the value
# of the job's `name` property if it's present.
contexts:
- markdown-link-check
- build
- regtest
- site
- helm-tests
- "Helm tests"

features:
wiki: false
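The comment added above is the practical point of this change: each entry under `contexts` must match the name GitHub reports for the check, which is the job's `name` property when one is set (otherwise the job key). A hypothetical workflow fragment — not the actual Polaris workflow file — illustrating why the required context becomes `"Helm tests"` rather than the job key `helm-tests`:

```
# Hypothetical sketch only; job key, chart path, and steps are illustrative.
jobs:
  helm-tests:
    name: "Helm tests"   # this display name is what branch protection sees
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: helm lint helm/polaris
```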
2 changes: 1 addition & 1 deletion .github/workflows/check-md-link.yml
@@ -41,5 +41,5 @@ jobs:
with:
use-quiet-mode: 'yes'
config-file: '.github/workflows/check-md-link-config.json'
folder-path: 'regtests, regtests/client/python/docs, regtests/client/python, .github, build-logic, polaris-core, polaris-service, extension, spec, k8, notebooks'
folder-path: 'regtests, regtests/client/python/docs, regtests/client/python, .github, build-logic, polaris-core, polaris-service, extension, spec, k8, getting-started'
file-path: 'CHAT_BYLAWS.md, CODE_OF_CONDUCT.md, CONTRIBUTING.md, README.md SECURITY.md'
4 changes: 2 additions & 2 deletions .gitignore
@@ -26,8 +26,8 @@ regtests/output/
/polaris-venv/
/pyproject.toml

# Notebooks
notebooks/.ipynb_checkpoints/
# Notebook Checkpoints
**/.ipynb_checkpoints/

# Metastore
metastore_db/
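For illustration, the broadened `**/` glob ignores checkpoint directories wherever notebooks live in the tree, not only under the old top-level `notebooks/` directory. A quick way to confirm which rule matches (the second path assumes the getting-started layout introduced by this commit):

```
# Shows the .gitignore rule that matches each path
git check-ignore -v notebooks/.ipynb_checkpoints/
git check-ignore -v getting-started/spark/notebooks/.ipynb_checkpoints/
```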
45 changes: 45 additions & 0 deletions getting-started/spark/README.md
@@ -0,0 +1,45 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

# Getting Started with Apache Spark and Apache Polaris

This getting-started guide provides a `docker-compose` file that sets up [Apache Spark](https://spark.apache.org/) with Apache Polaris, configured as an Iceberg REST Catalog in Spark.
A Jupyter notebook is used to run PySpark.

## Run the `docker-compose` file
To start the containers defined in the `docker-compose` file, run this command from the repository's root directory:
```
docker-compose -f getting-started/spark/docker-compose.yml up
```

This spins up two container services:
* The `polaris` service, which runs Apache Polaris with an in-memory metastore
* The `jupyter` service, which runs a Jupyter notebook server with PySpark
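Once the containers are up, a quick sanity check (run from the repository root) is to list the Compose services and confirm both report an `Up` state:

```
docker-compose -f getting-started/spark/docker-compose.yml ps
```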

## Access the Jupyter notebook interface
In the Jupyter notebook container log, look for the URL used to access the Jupyter notebook. The URL has the format `http://127.0.0.1:8888/lab?token=<token>`.

Open that URL in a browser and navigate to [`notebooks/SparkPolaris.ipynb`](http://127.0.0.1:8888/lab/tree/notebooks/SparkPolaris.ipynb). <!-- markdown-link-check-disable-line -->
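If you would rather pull the URL out of the logs than scroll through container output, something like this (using the `jupyter` service name from the compose file) should surface it:

```
docker-compose -f getting-started/spark/docker-compose.yml logs jupyter | grep "127.0.0.1:8888"
```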

## Change the Polaris credential
The Polaris service creates a new root credential on startup. Find this credential in the Polaris service log and update the `polaris_credential` variable in the first cell of the Jupyter notebook, as sketched below.
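A sketch of the two steps, using the `polaris` service name from the compose file; the credential appears in the log after the `realm: default-realm root principal credentials:` marker that the notebook also mentions:

```
# 1. Find the root credential in the Polaris service log
docker-compose -f getting-started/spark/docker-compose.yml logs polaris | grep "root principal credentials"

# 2. Paste the <client_id>:<client_secret> pair into the first notebook cell, e.g.
#    polaris_credential = '<client_id>:<client_secret>'
```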

## Run the Jupyter notebook
You can now run all cells in the notebook or write your own code!
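As a small example of "your own code": once the notebook's Spark session is configured against the `polaris` catalog, a new cell along these lines should work (the namespace name here is made up):

```
# Create a namespace in the Polaris-backed catalog and list what exists.
# `spark` is the session built earlier in the notebook; `polaris` is the
# catalog name used in its configuration.
spark.sql("CREATE NAMESPACE IF NOT EXISTS polaris.demo_ns")
spark.sql("SHOW NAMESPACES IN polaris").show()
```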
@@ -20,7 +20,7 @@
services:
polaris:
build:
context: .
context: ../../
network: host
ports:
- "8181:8181"
@@ -37,8 +37,8 @@ services:
retries: 5
jupyter:
build:
context: .
dockerfile: ./notebooks/Dockerfile
context: ../../ # this is necessary to expose `regtests/` dir to notebooks/Dockerfile
dockerfile: ./getting-started/spark/notebooks/Dockerfile
network: host
ports:
- "8888:8888"
@@ -57,4 +57,4 @@ volumes:
driver_opts:
o: bind
type: none
device: ./notebooks
device: ./notebooks/
File renamed without changes.
@@ -21,8 +21,11 @@
"from polaris.catalog.api_client import ApiClient as CatalogApiClient\n",
"from polaris.catalog.api_client import Configuration as CatalogApiClientConfiguration\n",
"\n",
"client_id = 'b3b6497353b33ea7'\n",
"client_secret = '623a67ee71d75825238e3e269df5cdac' # pragma: allowlist secret\n",
"# (CHANGE ME): This credential changes on every Polaris service restart\n",
"# In the Polaris log, look for the `realm: default-realm root principal credentials:` string\n",
"polaris_credential = '35df9f8a34199df0:101b9d35700032416210ad2d39b1b4e3' # pragma: allowlist secret\n",
"\n",
"client_id, client_secret = polaris_credential.split(\":\")\n",
"client = CatalogApiClient(CatalogApiClientConfiguration(username=client_id,\n",
" password=client_secret,\n",
" host='http://polaris:8181/api/catalog'))\n",
@@ -42,8 +45,7 @@
"source": [
"# Create our first catalog\n",
"\n",
"* Creates a catalog named `polaris_catalog` that writes to a specified location in S3.\n",
"* An AWS IAM role is specified - this role is assumed whenever we read or write data in the catalog"
"* Creates a catalog named `polaris_catalog` that writes to a specified location in the Local Filesystem."
]
},
{
@@ -59,11 +61,9 @@
" host='http://polaris:8181/api/management/v1'))\n",
"root_client = PolarisDefaultApi(client)\n",
"\n",
"storage_conf = AwsStorageConfigInfo(storage_type=\"S3\",\n",
" allowed_locations=[\"s3://datalake-storage-team/polaris_test/\"],\n",
" role_arn=\"arn:aws:iam::631484165566:role/datalake-storage-integration-role\")\n",
"storage_conf = FileStorageConfigInfo(storage_type=\"FILE\", allowed_locations=[\"file:///tmp\"])\n",
"catalog_name = 'polaris_demo'\n",
"catalog = Catalog(name=catalog_name, type='INTERNAL', properties={\"default-base-location\": \"s3://datalake-storage-team/polaris_test/polaris_catalog\"},\n",
"catalog = Catalog(name=catalog_name, type='INTERNAL', properties={\"default-base-location\": \"file:///tmp/polaris/\"},\n",
" storage_config_info=storage_conf)\n",
"catalog.storage_config_info = storage_conf\n",
"root_client.create_catalog(create_catalog_request=CreateCatalogRequest(catalog=catalog))\n",
@@ -272,7 +272,7 @@
" .config(\"spark.sql.catalog.polaris.credential\", f\"{engineer_principal.credentials.client_id}:{engineer_principal.credentials.client_secret}\")\n",
"\n",
" # Set the warehouse to the name of the catalog we created\n",
" .config(\"spark.sql.catalog.polaris.warehouse\", 'polaris_demo')\n",
" .config(\"spark.sql.catalog.polaris.warehouse\", catalog_name)\n",
"\n",
" # Scope set to PRINCIPAL_ROLE:ALL\n",
" .config(\"spark.sql.catalog.polaris.scope\", 'PRINCIPAL_ROLE:ALL')\n",
@@ -454,7 +454,7 @@
" return codecs.decode(\"1F\", \"hex\").decode(\"UTF-8\").join(namespace)\n",
"\n",
"# Call loadTable\n",
"tbl_meta = collado_client.load_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
"tbl_meta = collado_client.load_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
"display(JSON(tbl_meta.to_dict(), expanded=True))"
]
},
@@ -604,7 +604,7 @@
},
"outputs": [],
"source": [
"tbl_meta = pm_client.load_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
"tbl_meta = pm_client.load_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
"display(JSON(tbl_meta.to_dict(), expanded=True))"
]
},
@@ -632,7 +632,7 @@
},
"outputs": [],
"source": [
"pm_client.drop_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
"pm_client.drop_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
]
},
{
@@ -775,7 +775,7 @@
"# The ops_client fails to do any real damage even though the engineer normally has DROP_TABLE privileges\n",
"ops_client = IcebergCatalogAPI(CatalogApiClient(CatalogApiClientConfiguration(access_token=ops_token.access_token,\n",
" host='http://polaris:8181/api/catalog')))\n",
"ops_client.drop_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
"ops_client.drop_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
]
}
],
