diff --git a/.asf.yaml b/.asf.yaml
index 217cc5250..dc4016038 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -45,13 +45,14 @@ github:
       required_status_checks:
         # strict means "Require branches to be up to date before merging".
         strict: true
-        # contexts are the names of checks that must pass
+        # Contexts are the names of checks that must pass. This is the value
+        # of the job's `name` property if it's present.
         contexts:
           - markdown-link-check
           - build
           - regtest
           - site
-          - helm-tests
+          - "Helm tests"
 
   features:
     wiki: false
diff --git a/.github/workflows/check-md-link.yml b/.github/workflows/check-md-link.yml
index 6cdb4195e..00a163593 100644
--- a/.github/workflows/check-md-link.yml
+++ b/.github/workflows/check-md-link.yml
@@ -41,5 +41,5 @@ jobs:
       with:
         use-quiet-mode: 'yes'
         config-file: '.github/workflows/check-md-link-config.json'
-        folder-path: 'regtests, regtests/client/python/docs, regtests/client/python, .github, build-logic, polaris-core, polaris-service, extension, spec, k8, notebooks'
+        folder-path: 'regtests, regtests/client/python/docs, regtests/client/python, .github, build-logic, polaris-core, polaris-service, extension, spec, k8, getting-started'
         file-path: 'CHAT_BYLAWS.md, CODE_OF_CONDUCT.md, CONTRIBUTING.md, README.md SECURITY.md'
diff --git a/.gitignore b/.gitignore
index 3855dceb5..62beb3bcc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,8 +26,8 @@ regtests/output/
 /polaris-venv/
 /pyproject.toml
 
-# Notebooks
-notebooks/.ipynb_checkpoints/
+# Notebook Checkpoints
+**/.ipynb_checkpoints/
 
 # Metastore
 metastore_db/
diff --git a/getting-started/spark/README.md b/getting-started/spark/README.md
new file mode 100644
index 000000000..55e4f9d94
--- /dev/null
+++ b/getting-started/spark/README.md
@@ -0,0 +1,45 @@
+
+
+# Getting Started with Apache Spark and Apache Polaris
+
+This getting started guide provides a `docker-compose` file to set up [Apache Spark](https://spark.apache.org/) with Apache Polaris. Apache Polaris is configured as an Iceberg REST Catalog in Spark.
+A Jupyter notebook is used to run PySpark.
+
+## Run the `docker-compose` file
+To bring up the services defined in the `docker-compose` file, run this command from the repo's root directory:
+```
+docker-compose -f getting-started/spark/docker-compose.yml up
+```
+
+This will spin up two container services:
+* The `polaris` service for running Apache Polaris using an in-memory metastore
+* The `jupyter` service for running a Jupyter notebook with PySpark
+
+## Access the Jupyter notebook interface
+In the Jupyter notebook container log, look for the URL to access the Jupyter notebook. The URL should be in the format `http://127.0.0.1:8888/lab?token=`.
+
+Open the Jupyter notebook in a browser and navigate to [`notebooks/SparkPolaris.ipynb`](http://127.0.0.1:8888/lab/tree/notebooks/SparkPolaris.ipynb).
+
+## Change the Polaris credential
+The Polaris service creates a new root credential on every startup. Find this credential in the Polaris service log and set the `polaris_credential` variable in the first cell of the Jupyter notebook to that value.
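+
+For example, with the services started from the compose file above, a command along these lines should surface the credential in the `polaris` service log (the exact log wording may vary between Polaris versions):
+```
+docker-compose -f getting-started/spark/docker-compose.yml logs polaris | grep "root principal credentials"
+```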
+
+## Run the Jupyter notebook
+You can now run all cells in the notebook or write your own code!
diff --git a/docker-compose-jupyter.yml b/getting-started/spark/docker-compose.yml
similarity index 88%
rename from docker-compose-jupyter.yml
rename to getting-started/spark/docker-compose.yml
index 97a6d1cec..4bda0320c 100644
--- a/docker-compose-jupyter.yml
+++ b/getting-started/spark/docker-compose.yml
@@ -20,7 +20,7 @@ services:
   polaris:
     build:
-      context: .
+      context: ../../
       network: host
     ports:
       - "8181:8181"
@@ -37,8 +37,8 @@ services:
   jupyter:
     build:
-      context: .
-      dockerfile: ./notebooks/Dockerfile
+      context: ../../ # needed to expose the `regtests/` dir to notebooks/Dockerfile
+      dockerfile: ./getting-started/spark/notebooks/Dockerfile
       network: host
     ports:
       - "8888:8888"
@@ -57,4 +57,4 @@ volumes:
     driver_opts:
       o: bind
       type: none
-      device: ./notebooks
+      device: ./notebooks/
diff --git a/notebooks/Dockerfile b/getting-started/spark/notebooks/Dockerfile
similarity index 100%
rename from notebooks/Dockerfile
rename to getting-started/spark/notebooks/Dockerfile
diff --git a/notebooks/SparkPolaris.ipynb b/getting-started/spark/notebooks/SparkPolaris.ipynb
similarity index 95%
rename from notebooks/SparkPolaris.ipynb
rename to getting-started/spark/notebooks/SparkPolaris.ipynb
index 6510c670f..deb74e928 100644
--- a/notebooks/SparkPolaris.ipynb
+++ b/getting-started/spark/notebooks/SparkPolaris.ipynb
@@ -21,8 +21,11 @@
     "from polaris.catalog.api_client import ApiClient as CatalogApiClient\n",
     "from polaris.catalog.api_client import Configuration as CatalogApiClientConfiguration\n",
     "\n",
-    "client_id = 'b3b6497353b33ea7'\n",
-    "client_secret = '623a67ee71d75825238e3e269df5cdac' # pragma: allowlist secret\n",
+    "# (CHANGE ME): This credential changes on every Polaris service restart\n",
+    "# In the Polaris log, look for the `realm: default-realm root principal credentials:` string\n",
+    "polaris_credential = '35df9f8a34199df0:101b9d35700032416210ad2d39b1b4e3' # pragma: allowlist secret\n",
+    "\n",
+    "client_id, client_secret = polaris_credential.split(\":\")\n",
     "client = CatalogApiClient(CatalogApiClientConfiguration(username=client_id,\n",
     "                                                        password=client_secret,\n",
     "                                                        host='http://polaris:8181/api/catalog'))\n",
@@ -42,8 +45,7 @@
    "source": [
     "# Create our first catalog\n",
     "\n",
-    "* Creates a catalog named `polaris_catalog` that writes to a specified location in S3.\n",
-    "* An AWS IAM role is specified - this role is assumed whenever we read or write data in the catalog"
+    "* Creates a catalog named `polaris_demo` that writes to a specified location on the local filesystem."
    ]
   },
   {
@@ -59,11 +61,9 @@
     "  host='http://polaris:8181/api/management/v1'))\n",
     "root_client = PolarisDefaultApi(client)\n",
     "\n",
-    "storage_conf = AwsStorageConfigInfo(storage_type=\"S3\",\n",
-    "                                    allowed_locations=[\"s3://datalake-storage-team/polaris_test/\"],\n",
-    "                                    role_arn=\"arn:aws:iam::631484165566:role/datalake-storage-integration-role\")\n",
+    "storage_conf = FileStorageConfigInfo(storage_type=\"FILE\", allowed_locations=[\"file:///tmp\"])\n",
     "catalog_name = 'polaris_demo'\n",
-    "catalog = Catalog(name=catalog_name, type='INTERNAL', properties={\"default-base-location\": \"s3://datalake-storage-team/polaris_test/polaris_catalog\"},\n",
+    "catalog = Catalog(name=catalog_name, type='INTERNAL', properties={\"default-base-location\": \"file:///tmp/polaris/\"},\n",
     "                  storage_config_info=storage_conf)\n",
     "catalog.storage_config_info = storage_conf\n",
     "root_client.create_catalog(create_catalog_request=CreateCatalogRequest(catalog=catalog))\n",
@@ -272,7 +272,7 @@
     "  .config(\"spark.sql.catalog.polaris.credential\", f\"{engineer_principal.credentials.client_id}:{engineer_principal.credentials.client_secret}\")\n",
     "\n",
     "  # Set the warehouse to the name of the catalog we created\n",
-    "  .config(\"spark.sql.catalog.polaris.warehouse\", 'polaris_demo')\n",
+    "  .config(\"spark.sql.catalog.polaris.warehouse\", catalog_name)\n",
     "\n",
     "  # Scope set to PRINCIPAL_ROLE:ALL\n",
     "  .config(\"spark.sql.catalog.polaris.scope\", 'PRINCIPAL_ROLE:ALL')\n",
@@ -454,7 +454,7 @@
     "    return codecs.decode(\"1F\", \"hex\").decode(\"UTF-8\").join(namespace)\n",
     "\n",
     "# Call loadTable\n",
-    "tbl_meta = collado_client.load_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
+    "tbl_meta = collado_client.load_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
     "display(JSON(tbl_meta.to_dict(), expanded=True))"
    ]
   },
@@ -604,7 +604,7 @@
    },
    "outputs": [],
    "source": [
-    "tbl_meta = pm_client.load_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
+    "tbl_meta = pm_client.load_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
     "display(JSON(tbl_meta.to_dict(), expanded=True))"
    ]
   },
@@ -632,7 +632,7 @@
    },
    "outputs": [],
    "source": [
-    "pm_client.drop_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
+    "pm_client.drop_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
    ]
   },
   {
@@ -775,7 +775,7 @@
     "# The ops_client fails to do any real damage even though the engineer normally has DROP_TABLE privileges\n",
     "ops_client = IcebergCatalogAPI(CatalogApiClient(CatalogApiClientConfiguration(access_token=ops_token.access_token,\n",
     "                                                                              host='http://polaris:8181/api/catalog')))\n",
-    "ops_client.drop_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
+    "ops_client.drop_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
    ]
   }
  ],