diff --git a/expts/scripts/build_dataset.py b/expts/scripts/build_dataset.py
deleted file mode 100755
index dc598c5..0000000
--- a/expts/scripts/build_dataset.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/env python
-import datasets
-import pandas as pd
-import safe as sf
-
-def convert_to_safe(sm, params)
-
-if __name__ == "__main__":
-    dt = datasets.load_dataset(
-        "/storage/shared_data/manu/zinc_dump/",
-        streaming=False,
-        num_proc=32,
-        cache_dir="/storage/shared_data/manu/.cache",
-    )
-    dt = dt.cast_column("id", datasets.Value("string"))
-
-    updated_dt = dt.map(update_info, num_proc=32, batched=True, batch_size=5000)
-    # unichem_df = pd.read_parquet("/storage/shared_data/manu/unichem", engine="fastparquet")
-    # unichem_df["id"] = unichem_df["id"].astype("str")
-    # unichem_dt_tmp = datasets.Dataset.from_pandas(unichem_df.drop(columns=["parquet_partition"]))
-
-    # # 80% train, 20% test + validation
-    # train_test_valid = unichem_dt_tmp.train_test_split(test_size=0.2)
-    # # split the 20 % test into test and validation
-    # test_valid = train_test_valid["test"].train_test_split(test_size=0.5)
-    # # gather everyone if you want to have a single DatasetDict
-    # unichem_dt = datasets.DatasetDict(
-    #     {
-    #         "train": train_test_valid["train"],
-    #         "test": test_valid["test"],
-    #         "validation": test_valid["train"],
-    #     }
-    # )
-
-    # unichem_dt = unichem_dt.select_columns(["id", "smiles", "source"])
-    # print("SAVING the unichem to disk")
-    unichem_dt = datasets.load_dataset(
-        "/storage/shared_data/manu/processed_unichem",
-        cache_dir="/storage/shared_data/manu/.cache",
-        num_proc=32,
-    )
-
-    test_dt = datasets.concatenate_datasets([unichem_dt["test"], updated_dt["test"]])
-    validation_dt = datasets.concatenate_datasets([unichem_dt["validation"], dt["validation"]])
-    train_dt = datasets.concatenate_datasets([unichem_dt["train"], dt["train"]])
-    dataset = datasets.DatasetDict(dict(train=train_dt, test=test_dt, validation=validation_dt))
-    dataset.save_to_disk("/storage/shared_data/manu/processed_zinc_unichem", max_shard_size="1GB")
diff --git a/expts/scripts/tokenizer_trainer.py b/expts/scripts/tokenizer_trainer.py
index 03e2dc1..4694edf 100755
--- a/expts/scripts/tokenizer_trainer.py
+++ b/expts/scripts/tokenizer_trainer.py
@@ -12,28 +12,50 @@ class TokenizerTrainingArguments:
     """
     Configuration for tokenizer training.
     """
+
     tokenizer_type: Optional[str] = field(
         default="bpe", metadata={"help": "Type of the tokenizer to train."}
     )
     base_tokenizer: Optional[str] = field(
-        default=None, metadata={"help": "Optional base tokenizer to you. Otherwise, the tokenizer will be learnt from scratch using the safe tokenizer."}
+        default=None,
+        metadata={
+            "help": "Optional base tokenizer to use. Otherwise, the tokenizer will be learnt from scratch using the safe tokenizer."
+        },
     )
     splitter: Optional[str] = field(
         default=None, metadata={"help": "Presplitter to use to train SAFE tokenizer."}
-    )
+    )
     dataset: str = field(
         default=None, metadata={"help": "Path to the dataset to load for training the tokenizer."}
     )
-    text_column: Optional[str] = field(default="inputs", metadata={"help": "Column containing text data to process."})
-    vocab_size: Optional[int] = field(default=1000, metadata={"help": "Target vocab size of the final tokenizer."})
-    batch_size: Optional[int] = field(default=100, metadata={"help": "Batch size for training the tokenizer."})
+    text_column: Optional[str] = field(
+        default="inputs", metadata={"help": "Column containing text data to process."}
+    )
+    vocab_size: Optional[int] = field(
+        default=1000, metadata={"help": "Target vocab size of the final tokenizer."}
+    )
+    batch_size: Optional[int] = field(
+        default=100, metadata={"help": "Batch size for training the tokenizer."}
+    )
     n_examples: Optional[int] = field(
         default=None, metadata={"help": "Number of examples to train the tokenizer on."}
     )
-    tokenizer_name: Optional[str] = field(default="safe", metadata={"help": "Name of new tokenizer."})
-    outfile: Optional[str] = field(default=None, metadata={"help": "Path to the local save of the trained tokenizer"})
-    all_split: Optional[bool] = field(default=False, metadata={"help": "Whether to use all the splits or just the train split if only that is available."})
-    push_to_hub: Optional[bool] = field(default=False, metadata={"help": "Whether to push saved tokenizer to the hub."})
+    tokenizer_name: Optional[str] = field(
+        default="safe", metadata={"help": "Name of new tokenizer."}
+    )
+    outfile: Optional[str] = field(
+        default=None, metadata={"help": "Path to the local save of the trained tokenizer"}
+    )
+    all_split: Optional[bool] = field(
+        default=False,
+        metadata={
+            "help": "Whether to use all the splits or just the train split if only that is available."
+        },
+    )
+    push_to_hub: Optional[bool] = field(
+        default=False, metadata={"help": "Whether to push saved tokenizer to the hub."}
+    )
+

 if __name__ == "__main__":
     # Configuration
@@ -52,15 +74,16 @@ class TokenizerTrainingArguments:
         tokenizer = AutoTokenizer.from_pretrained(args.base_tokenizer)
         tokenizer = tokenizer.train_new_from_iterator(dataset_iterator, vocab_size=args.vocab_size)
     else:
-        tokenizer = SAFETokenizer(tokenizer_type=args.tokenizer_type, splitter=args.splitter, trainer_args={'vocab_size':args.vocab_size})
+        tokenizer = SAFETokenizer(
+            tokenizer_type=args.tokenizer_type,
+            splitter=args.splitter,
+            trainer_args={"vocab_size": args.vocab_size},
+        )
         tokenizer.train_from_iterator(dataset_iterator)

-    tokenizer_name = f"{args.tokenizer_name}-{args.tokenizer_type}-{args.vocab_size}"
-    # also save locally to the outfile specified
+    tokenizer_name = f"{args.tokenizer_name}-{args.tokenizer_type}-{args.vocab_size}"
+    # also save locally to the outfile specified
     if args.outfile is not None:
         tokenizer.save(args.outfile)
     tokenizer = tokenizer.get_pretrained()
     tokenizer.save_pretrained(tokenizer_name, push_to_hub=args.push_to_hub)
-
-
-
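
Usage sketch (not part of the patch): a minimal way to exercise the SAFETokenizer branch of the reformatted tokenizer_trainer.py. The import path, dataset location, and the way the text iterator is built are assumptions for illustration; the SAFETokenizer constructor arguments and the train_from_iterator / save / get_pretrained / save_pretrained calls mirror the ones in the diff above.

    import datasets
    from safe.tokenizer import SAFETokenizer  # import path assumed; adjust to how this repo exposes SAFETokenizer

    # Hypothetical dataset; the script reads the "inputs" text column by default (text_column).
    dt = datasets.load_dataset("/path/to/safe_dataset", split="train")
    dataset_iterator = (row["inputs"] for row in dt)

    # Mirrors the defaults declared in TokenizerTrainingArguments.
    tokenizer = SAFETokenizer(
        tokenizer_type="bpe",
        splitter=None,
        trainer_args={"vocab_size": 1000},
    )
    tokenizer.train_from_iterator(dataset_iterator)

    tokenizer.save("safe_tokenizer.json")  # local copy, as the script does when args.outfile is set
    hf_tokenizer = tokenizer.get_pretrained()
    hf_tokenizer.save_pretrained("safe-bpe-1000")  # follows the {tokenizer_name}-{tokenizer_type}-{vocab_size} pattern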