From 66fd0bfda356896c99877774eebe39d3307f36a4 Mon Sep 17 00:00:00 2001 From: tfx-team Date: Wed, 10 Jul 2024 11:22:38 -0700 Subject: [PATCH] internal inly PiperOrigin-RevId: 651091420 --- .../tfx/CSV_Downloader_Component.ipynb | 365 ++++++++++++++++++ tfx/components/statistics_gen/executor.py | 9 +- tfx/dsl/compiler/placeholder_utils.py | 7 - tfx/dsl/compiler/placeholder_utils_test.py | 4 +- tfx/dsl/placeholder/placeholder_test_util.py | 5 +- tfx/utils/stats_utils.py | 21 + 6 files changed, 398 insertions(+), 13 deletions(-) create mode 100644 docs/tutorials/tfx/CSV_Downloader_Component.ipynb create mode 100644 tfx/utils/stats_utils.py diff --git a/docs/tutorials/tfx/CSV_Downloader_Component.ipynb b/docs/tutorials/tfx/CSV_Downloader_Component.ipynb new file mode 100644 index 00000000000..938f01043d3 --- /dev/null +++ b/docs/tutorials/tfx/CSV_Downloader_Component.ipynb @@ -0,0 +1,365 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "nl4XCJN9g8Bc" + }, + "source": [ + "Copyright 2023 The TensorFlow Authors.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dIUc9Zh3hM6H" + }, + "outputs": [], + "source": [ + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wU-hMBZVmyCo" + }, + "source": [ + "# TFX Pipeline Tutorial for Large Language Model using CNN Daily Dataset\n", + "\n", + "In this codelab, we use KerasNLP to load a pre-trained Large Language Model (LLM) - GPT-2 model - finetune it to a dataset. The dataset that is used in this demo is CNN daily dataset. Note that GPT-2 is used here only to demonstrate the end-to-end process; the techniques and tooling introduced in this codelab are potentially transferrable to other generative language models such as Google T5." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nJAp-HxKiKsE" + }, + "source": [ + "\u003cdiv class=\"devsite-table-wrapper\"\u003e\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", + "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tfx/tutorials/tfx/penguin_simple\"\u003e\n", + "\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\"/\u003eView on TensorFlow.org\u003c/a\u003e\u003c/td\u003e\n", + "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/tfx/penguin_simple.ipynb\"\u003e\n", + "\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\"\u003eRun in Google Colab\u003c/a\u003e\u003c/td\u003e\n", + "\u003ctd\u003e\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tfx/tree/master/docs/tutorials/tfx/penguin_simple.ipynb\"\u003e\n", + "\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\"\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\n", + "\u003ctd\u003e\u003ca href=\"https://storage.googleapis.com/tensorflow_docs/tfx/docs/tutorials/tfx/penguin_simple.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\u003c/td\u003e\n", + "\u003c/table\u003e\u003c/div\u003e" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3MK3ryPikKtj" + }, + "source": [ + "# Before You Begin\n", + "\n", + "Colab offers different kinds of runtimes. Make sure to go to **Runtime -\u003e Change runtime** type and choose the GPU Hardware Accelerator runtime (which should have \u003e12G System RAM and ~15G GPU RAM) since you will finetune the GPT-2 model." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MMmMNdV1jZAS" + }, + "source": [ + "# Set Up\n", + "\n", + "We first install the TFX Python package." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "C23ItymvmVth" + }, + "source": [ + "## Upgrade Pip\n", + "To avoid upgrading Pip in a system when running locally, check to make sure that we are running in Colab. Local systems can of course be upgraded separately." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cfSG5IFamUq7" + }, + "outputs": [], + "source": [ + "try:\n", + " import colab\n", + " !pip install --upgrade pip\n", + "except:\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "te56mTWomdLq" + }, + "source": [ + "## Install TFX\n", + "\n", + "TFX is currently experiencing issues with Python 3.10 in Colab.\n", + "Therefore, simply running the command\n", + "```\n", + "!pip install -U tfx\n", + "```\n", + "to install tfx **will fail**. Hence, follow the code below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TGlfiX4PmcjZ" + }, + "outputs": [], + "source": [ + "%%shell\n", + "update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 3\n", + "curl -O https://bootstrap.pypa.io/get-pip.py\n", + "python get-pip.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nYHRZQjQEcS7" + }, + "outputs": [], + "source": [ + "# 1) TFX relies on an old version of google-api-core so we let google-auth float\n", + "# for the install. We grep it out below:\n", + "!grep -v google-auth /etc/requirements.core.in \u003e requirements.txt\n", + "\n", + "# 2) httplib2 should be included in /etc/requirements.core.in but it's not for\n", + "# reasons. We ensure it's included:\n", + "!grep httplib2 /etc/requirements.user.in \u003e\u003e requirements.txt\n", + "\n", + "# 3) google.colab package is not available as a wheel. We symlink that in so\n", + "# it's on the sys.path of Python 3.8:\n", + "!mkdir /usr/local/lib/python3.8/dist-packages/google\n", + "!ln -s /usr/local/lib/python3.10/dist-packages/google/colab /usr/local/lib/python3.8/dist-packages/google/colab\n", + "\n", + "# Now with those pre-requisites out of the way:\n", + "!pip install tfx==1.13.0 -r requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5MiV2iFkiqbL" + }, + "outputs": [], + "source": [ + "!pip install keras_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wZo6NOYQEcS7" + }, + "source": [ + "# Imports\n", + "Let's first get our imports out of the way." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VDhX6vgUEcS7" + }, + "outputs": [], + "source": [ + "from tensorflow import keras\n", + "from tfx.types import Channel\n", + "from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LJaN_u_8tEwi" + }, + "source": [ + "## Did you restart the runtime?\n", + "\n", + "If you are using Google Colab, the first time that you run the cell above, you must restart the runtime by clicking above \"RESTART RUNTIME\" button or using \"Runtime \u003e Restart runtime ...\" menu. This is because of the way that Colab loads packages.\n", + "\n", + "Check the TensorFlow and TFX versions." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fac1XkwrnXW6" + }, + "source": [ + "Let's check the library versions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VNwD6G4TXrlq" + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "print('TensorFlow version: {}'.format(tf.__version__))\n", + "from tfx import v1 as tfx\n", + "print('TFX version: {}'.format(tfx.__version__))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LnvgEYNwtMhJ" + }, + "source": [ + "## Set up variables\n", + "There are some variables used to define a pipeline. You can customize these variables as you want. By default all output from the pipeline will be generated under the current directory." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yVFcsQhWkbkw" + }, + "source": [ + "# CSV Downloader\n", + "In order to make the pipeline more efficient and possible for automation, it is useful to have a component that takes in a download link to the CSV file to be downloaded. Furthermore, one important goal of TFX production ML pipeline is to collect metadata containing information about the pipeline components, their executions, and resulting artifacts. In other words, the purpose of the metadata is to analyze the lineage of pipeline components and debug issues, and the CSV Downloader Component would help the users logging and tracking information about the source of the data and the preprocessing steps that the data have undergone before entering the pipeline. In this section, we declare a new artifact called CSVdoc and develop a custom component -- CSV Downloader -- which stores information about the dataset and downloads the CSV file in the CSVdoc artifact's URI." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Jc1JTbjjo0bd" + }, + "outputs": [], + "source": [ + "from tfx.types import artifact\n", + "from tfx import types\n", + "\n", + "Property = artifact.Property\n", + "PropertyType = artifact.PropertyType\n", + "\n", + "URL_PROPERTY = Property(type=PropertyType.STRING)\n", + "PATH_PROPERTY = Property(type=PropertyType.STRING)\n", + "\n", + "class CsvDoc(types.Artifact):\n", + " \"\"\" Artifact that contains the CSV dataset.\n", + "\n", + " - 'url' : saves the source of the original data.\n", + " - 'path': saves the path to the CSV file.\n", + " \"\"\"\n", + "\n", + " TYPE_NAME = 'CsvDoc'\n", + " PROPERTIES = {\n", + " 'url' : URL_PROPERTY,\n", + " 'path': PATH_PROPERTY,\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9Qks2al5X1Us" + }, + "outputs": [], + "source": [ + "from absl import logging\n", + "import requests\n", + "import os\n", + "import tfx.v1 as tfx\n", + "from tfx.dsl.component.experimental.decorators import component\n", + "\n", + "@tfx.dsl.components.component\n", + "def CsvDownloaderComponent(\n", + " url: tfx.dsl.components.Parameter[str],\n", + " file_name: tfx.dsl.components.Parameter[str],\n", + " saved_file: tfx.dsl.components.OutputArtifact[CsvDoc],\n", + ") -\u003e None:\n", + " response = requests.get(url)\n", + " saved_file.url = url\n", + " if response.status_code == 200:\n", + " file_path = os.path.join(saved_file.uri, file_name)\n", + " saved_file.path = file_path\n", + " url_content = response.content\n", + " with open(file_path, 'wb') as csv_file:\n", + " csv_file.write(url_content)\n", + " logging.info(f\"CSV file saved successfully at {file_path}\")\n", + " else:\n", + " raise Exception(\"CSV file failed to be saved.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3D3O4L6hYBBt" + }, + "outputs": [], + "source": [ + "downloader = CsvDownloaderComponent(\n", + " url = 'https://drive.google.com/uc?id=1YdZsJlRafqxiNSl0nHQkwR7rzrNlN9LI\u0026export=download', file_name ='testing_doc.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fGm5cG6cYE10" + }, + "outputs": [], + "source": [ + "from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext\n", + "context = InteractiveContext()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SHpBtrduYG7U" + }, + "outputs": [], + "source": [ + "context.run(downloader, enable_cache = False)" + ] + } + ], + "metadata": { + "colab": { + "name": "CSV_Downloader_Component.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tfx/components/statistics_gen/executor.py b/tfx/components/statistics_gen/executor.py index 23aad742214..9b4a4fbdedf 100644 --- a/tfx/components/statistics_gen/executor.py +++ b/tfx/components/statistics_gen/executor.py @@ -18,7 +18,6 @@ from absl import logging import tensorflow_data_validation as tfdv from tensorflow_data_validation.statistics import stats_options as options -from tensorflow_data_validation.utils import dashboard_util from tfx import types from tfx.components.statistics_gen import stats_artifact_utils from tfx.components.util import examples_utils @@ -28,6 +27,8 @@ from tfx.types import standard_component_specs from tfx.utils import io_utils from tfx.utils import json_utils +from tfx.utils.google import stats_utils +# from tensorflow_data_validation.utils import dashboard_util # Default file name for stats generated. @@ -151,7 +152,8 @@ def Do( try: statistics_artifact.set_string_custom_property( - STATS_DASHBOARD_LINK, dashboard_util.generate_stats_dashboard_link() + STATS_DASHBOARD_LINK, + stats_utils.generate_stats_dashboard_link(statistics_artifact), ) except Exception as e: # pylint: disable=broad-except # log on failures to not bring down Statsgen jobs @@ -235,7 +237,8 @@ def Do( # Update sample rate for each split in stats_options if # sample_rate_by_split is provided split_stats_options = tfdv.StatsOptions.from_json( - stats_options.to_json()) + stats_options.to_json() + ) if sample_rate_by_split: sample_rate = sample_rate_by_split.get(split, None) if sample_rate is not None: diff --git a/tfx/dsl/compiler/placeholder_utils.py b/tfx/dsl/compiler/placeholder_utils.py index 884b75e68fa..527481a9706 100644 --- a/tfx/dsl/compiler/placeholder_utils.py +++ b/tfx/dsl/compiler/placeholder_utils.py @@ -129,13 +129,6 @@ def resolve_placeholder_expression( return result -def empty_placeholder_context() -> ResolutionContext: - """Returns an empty placeholder context.""" - return ResolutionContext( - exec_info=data_types.ExecutionInfo(), - ) - - class _Operation(enum.Enum): """Alias for Operation enum types in placeholder.proto.""" diff --git a/tfx/dsl/compiler/placeholder_utils_test.py b/tfx/dsl/compiler/placeholder_utils_test.py index 49fe6446d98..0594e588409 100644 --- a/tfx/dsl/compiler/placeholder_utils_test.py +++ b/tfx/dsl/compiler/placeholder_utils_test.py @@ -1689,8 +1689,8 @@ def testMakeProtoOpResolvesProto(self): placeholder_pb2.PlaceholderExpression(), ) resolved_proto = placeholder_utils.resolve_placeholder_expression( - placeholder_expression, placeholder_utils.empty_placeholder_context() - ) + placeholder_expression, placeholder_utils.ResolutionContext( + exec_info=data_types.ExecutionInfo())) self.assertProtoEquals( """ splits: "train" diff --git a/tfx/dsl/placeholder/placeholder_test_util.py b/tfx/dsl/placeholder/placeholder_test_util.py index b58729147c4..aade7e64461 100644 --- a/tfx/dsl/placeholder/placeholder_test_util.py +++ b/tfx/dsl/placeholder/placeholder_test_util.py @@ -17,6 +17,7 @@ from tfx.dsl.compiler import placeholder_utils from tfx.dsl.placeholder import placeholder_base +from tfx.orchestration.portable import data_types def resolve( @@ -38,7 +39,9 @@ def resolve( return placeholder_utils.resolve_placeholder_expression( placeholder.encode(), resolution_context - or placeholder_utils.empty_placeholder_context(), + or placeholder_utils.ResolutionContext( + exec_info=data_types.ExecutionInfo() + ), ) diff --git a/tfx/utils/stats_utils.py b/tfx/utils/stats_utils.py new file mode 100644 index 00000000000..8e607c4c4a5 --- /dev/null +++ b/tfx/utils/stats_utils.py @@ -0,0 +1,21 @@ +# Copyright 2024 Google LLC. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""stats_utils. + +This module is the parity for internal implementation, not available in OSS. +""" + + +def generate_stats_dashboard_link(unused_statistics_artifact): + return ''