From 002709354c9193af1981c7a308f6f2d299ffbcd4 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Thu, 24 Oct 2024 12:09:30 -0700
Subject: [PATCH] initial commit

Signed-off-by: Sarah Yurick
---
NOTE(review): this copy was recovered from a whitespace-collapsed paste.
Line breaks, indentation and blank context lines were reconstructed so that
every hunk matches its "@@" counts and the diffstat below; no token of the
patch was changed. Whitespace of context lines is inferred -- regenerate
with `git format-patch` from the commit above before applying.

NOTE(review): points to confirm against the full tree:
- ArgumentHelper.__init__ now registers "--version"/"-v" on every
  construction. argparse raises on conflicting option strings by default,
  so wrapping the same parser twice, or a parser that already defines
  "-v", would fail. Verify no caller does this.
- utils/script_utils.py imports __version__ from the package root, where it
  is assigned on the last lines of nemo_curator/__init__.py. If the package
  __init__ (directly or indirectly) imports script_utils earlier, this is a
  circular import. Verify the import order.
- In add_id.py the description text is the first positional argument of
  argparse.ArgumentParser, which is `prog`, not `description` (existing
  behaviour; this patch only prepends the version banner to that string).

 nemo_curator/__init__.py | 3 +++
 nemo_curator/scripts/add_id.py | 5 ++++-
 .../classifiers/fineweb_edu_classifier_inference.py | 11 ++++++++---
 nemo_curator/utils/script_utils.py | 11 +++++++++++
 setup.py | 1 +
 5 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/nemo_curator/__init__.py b/nemo_curator/__init__.py
index c9982b72..3d4bfcf4 100644
--- a/nemo_curator/__init__.py
+++ b/nemo_curator/__init__.py
@@ -29,3 +29,6 @@
 # See https://github.com/NVIDIA/NeMo-Curator/issues/33
 # This also happens when reading and writing to files
 dask.config.set({"dataframe.convert-string": False})
+
+# See setup.py
+__version__ = "0.4.0"
diff --git a/nemo_curator/scripts/add_id.py b/nemo_curator/scripts/add_id.py
index bc88db2b..3b2a988d 100644
--- a/nemo_curator/scripts/add_id.py
+++ b/nemo_curator/scripts/add_id.py
@@ -16,6 +16,7 @@
 import random
 
 import nemo_curator
+from nemo_curator import __version__
 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk
 from nemo_curator.utils.file_utils import (
@@ -54,7 +55,9 @@ def main(args):
 
 def attach_args(
     parser=argparse.ArgumentParser(
-        """
+        f"""
+NVIDIA NeMo Curator -- v{__version__}
+
 Adds unique identifiers to each document in the dataset.
 Creates a new ID field with name specified by the argument
 "--id-field-name" within each json.
diff --git a/nemo_curator/scripts/classifiers/fineweb_edu_classifier_inference.py b/nemo_curator/scripts/classifiers/fineweb_edu_classifier_inference.py
index 3e704848..e7662e18 100644
--- a/nemo_curator/scripts/classifiers/fineweb_edu_classifier_inference.py
+++ b/nemo_curator/scripts/classifiers/fineweb_edu_classifier_inference.py
@@ -17,6 +17,9 @@
 import warnings
 
 os.environ["RAPIDS_NO_INITIALIZE"] = "1"
+
+from nemo_curator import __version__
+
 from nemo_curator.classifiers import FineWebEduClassifier
 from nemo_curator.datasets import DocumentDataset
 
@@ -29,13 +32,15 @@
 
 
 def main():
-    args = ArgumentHelper.parse_distributed_classifier_args().parse_args()
+    args = ArgumentHelper.parse_distributed_classifier_args(
+        description=f"\nNVIDIA NeMo Curator -- v{__version__}\n\nRun FineWeb-Edu classifier inference"
+    ).parse_args()
     print(f"Arguments parsed = {args}", flush=True)
 
     client_args = ArgumentHelper.parse_client_args(args)
     client_args["cluster_type"] = "gpu"
     client = get_client(**client_args)
-    print("Starting Fineweb classifier inference", flush=True)
+    print("Starting FineWeb-Edu classifier inference", flush=True)
     global_st = time.time()
     files_per_run = len(client.scheduler_info()["workers"]) * 2
 
@@ -95,7 +100,7 @@ def main():
 
     global_et = time.time()
     print(
-        f"Total time taken for domain classifier inference: {global_et-global_st} s",
+        f"Total time taken for FineWeb-Edu classifier inference: {global_et-global_st} s",
         flush=True,
     )
     client.close()
diff --git a/nemo_curator/utils/script_utils.py b/nemo_curator/utils/script_utils.py
index f591bfe0..50806f06 100644
--- a/nemo_curator/utils/script_utils.py
+++ b/nemo_curator/utils/script_utils.py
@@ -16,6 +16,8 @@
 
 import psutil
 
+from nemo_curator import __version__
+
 
 class ArgumentHelper:
     """
@@ -24,6 +26,15 @@ class ArgumentHelper:
 
     def __init__(self, parser: argparse.ArgumentParser):
         self.parser = parser
+        self.attach_version_arg()
+
+    def attach_version_arg(self):
+        self.parser.add_argument(
+            "--version", "-v",
+            action="version",
+            version=f"NVIDIA NeMo Curator -- v{__version__}",
+            help="Show the version and exit."
+        )
 
     @staticmethod
     def attach_bool_arg(
diff --git a/setup.py b/setup.py
index 8b35bc8e..cceccc75 100644
--- a/setup.py
+++ b/setup.py
@@ -101,6 +101,7 @@ def req_file(filename, folder="requirements"):
             "domain_classifier_inference=nemo_curator.scripts.classifiers.domain_classifier_inference:console_script",
             "quality_classifier_inference=nemo_curator.scripts.classifiers.quality_classifier_inference:console_script",
             "aegis_classifier_inference=nemo_curator.scripts.classifiers.aegis_classifier_inference:console_script",
+            "fineweb_edu_classifier_inference=nemo_curator.scripts.classifiers.fineweb_edu_classifier_inference:console_script",
             "verify_classification_results=nemo_curator.scripts.verify_classification_results:console_script",
             "blend_datasets=nemo_curator.scripts.blend_datasets:console_script",
             "semdedup_extract_embeddings=nemo_curator.scripts.semdedup.compute_embeddings:console_script",