Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
  • Loading branch information
sarahyurick committed Oct 24, 2024
1 parent 7d7767b commit 0027093
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 4 deletions.
3 changes: 3 additions & 0 deletions nemo_curator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,6 @@
# See https://github.com/NVIDIA/NeMo-Curator/issues/33
# This also happens when reading and writing to files
dask.config.set({"dataframe.convert-string": False})

# Package version string; see setup.py
__version__ = "0.4.0"
5 changes: 4 additions & 1 deletion nemo_curator/scripts/add_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import random

import nemo_curator
from nemo_curator import __version__
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk
from nemo_curator.utils.file_utils import (
Expand Down Expand Up @@ -54,7 +55,9 @@ def main(args):

def attach_args(
parser=argparse.ArgumentParser(
"""
f"""
NVIDIA NeMo Curator -- v{__version__}
Adds unique identifiers to each document in the dataset.
Creates a new ID field with name specified by the argument
"--id-field-name" within each json.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
import warnings

os.environ["RAPIDS_NO_INITIALIZE"] = "1"

from nemo_curator import __version__

from nemo_curator.classifiers import FineWebEduClassifier
from nemo_curator.datasets import DocumentDataset

Expand All @@ -29,13 +32,15 @@


def main():
args = ArgumentHelper.parse_distributed_classifier_args().parse_args()
args = ArgumentHelper.parse_distributed_classifier_args(
description=f"\nNVIDIA NeMo Curator -- v{__version__}\n\nRun FineWeb-Edu classifier inference"
).parse_args()
print(f"Arguments parsed = {args}", flush=True)

client_args = ArgumentHelper.parse_client_args(args)
client_args["cluster_type"] = "gpu"
client = get_client(**client_args)
print("Starting Fineweb classifier inference", flush=True)
print("Starting FineWeb-Edu classifier inference", flush=True)
global_st = time.time()
files_per_run = len(client.scheduler_info()["workers"]) * 2

Expand Down Expand Up @@ -95,7 +100,7 @@ def main():

global_et = time.time()
print(
f"Total time taken for domain classifier inference: {global_et-global_st} s",
f"Total time taken for FineWeb-Edu classifier inference: {global_et-global_st} s",
flush=True,
)
client.close()
Expand Down
11 changes: 11 additions & 0 deletions nemo_curator/utils/script_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

import psutil

from nemo_curator import __version__


class ArgumentHelper:
"""
Expand All @@ -24,6 +26,15 @@ class ArgumentHelper:

def __init__(self, parser: argparse.ArgumentParser):
    """Wrap ``parser`` and register the version flag on it.

    Args:
        parser: The argument parser this helper stores as ``self.parser``.
    """
    self.parser = parser
    # Must run after the assignment: attach_version_arg() reads self.parser.
    self.attach_version_arg()

def attach_version_arg(self):
    """Add a ``--version`` / ``-v`` flag to ``self.parser``.

    The flag uses argparse's built-in ``version`` action, which prints the
    banner built from ``__version__`` and exits.
    """
    version_banner = f"NVIDIA NeMo Curator -- v{__version__}"
    self.parser.add_argument(
        "--version",
        "-v",
        action="version",
        version=version_banner,
        help="Show the version and exit.",
    )

@staticmethod
def attach_bool_arg(
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ def req_file(filename, folder="requirements"):
"domain_classifier_inference=nemo_curator.scripts.classifiers.domain_classifier_inference:console_script",
"quality_classifier_inference=nemo_curator.scripts.classifiers.quality_classifier_inference:console_script",
"aegis_classifier_inference=nemo_curator.scripts.classifiers.aegis_classifier_inference:console_script",
"fineweb_edu_classifier_inference=nemo_curator.scripts.classifiers.fineweb_edu_classifier_inference:console_script",
"verify_classification_results=nemo_curator.scripts.verify_classification_results:console_script",
"blend_datasets=nemo_curator.scripts.blend_datasets:console_script",
"semdedup_extract_embeddings=nemo_curator.scripts.semdedup.compute_embeddings:console_script",
Expand Down

0 comments on commit 0027093

Please sign in to comment.