
Commit: wip
maclandrol committed Aug 8, 2023
1 parent 7460917 commit 0447c44
Showing 2 changed files with 38 additions and 62 deletions.
47 changes: 0 additions & 47 deletions expts/scripts/build_dataset.py

This file was deleted.

53 changes: 38 additions & 15 deletions expts/scripts/tokenizer_trainer.py
@@ -12,28 +12,50 @@ class TokenizerTrainingArguments:
"""
Configuration for tokenizer training.
"""

tokenizer_type: Optional[str] = field(
default="bpe", metadata={"help": "Type of the tokenizer to train."}
)
base_tokenizer: Optional[str] = field(
default=None, metadata={"help": "Optional base tokenizer to you. Otherwise, the tokenizer will be learnt from scratch using the safe tokenizer."}
default=None,
metadata={
"help": "Optional base tokenizer to you. Otherwise, the tokenizer will be learnt from scratch using the safe tokenizer."
},
)
splitter: Optional[str] = field(
default=None, metadata={"help": "Presplitter to use to train SAFE tokenizer."}
)
)
dataset: str = field(
default=None, metadata={"help": "Path to the dataset to load for training the tokenizer."}
)
text_column: Optional[str] = field(default="inputs", metadata={"help": "Column containing text data to process."})
vocab_size: Optional[int] = field(default=1000, metadata={"help": "Target vocab size of the final tokenizer."})
batch_size: Optional[int] = field(default=100, metadata={"help": "Batch size for training the tokenizer."})
text_column: Optional[str] = field(
default="inputs", metadata={"help": "Column containing text data to process."}
)
vocab_size: Optional[int] = field(
default=1000, metadata={"help": "Target vocab size of the final tokenizer."}
)
batch_size: Optional[int] = field(
default=100, metadata={"help": "Batch size for training the tokenizer."}
)
n_examples: Optional[int] = field(
default=None, metadata={"help": "Number of examples to train the tokenizer on."}
)
tokenizer_name: Optional[str] = field(default="safe", metadata={"help": "Name of new tokenizer."})
outfile: Optional[str] = field(default=None, metadata={"help": "Path to the local save of the trained tokenizer"})
all_split: Optional[bool] = field(default=False, metadata={"help": "Whether to use all the splits or just the train split if only that is available."})
push_to_hub: Optional[bool] = field(default=False, metadata={"help": "Whether to push saved tokenizer to the hub."})
tokenizer_name: Optional[str] = field(
default="safe", metadata={"help": "Name of new tokenizer."}
)
outfile: Optional[str] = field(
default=None, metadata={"help": "Path to the local save of the trained tokenizer"}
)
all_split: Optional[bool] = field(
default=False,
metadata={
"help": "Whether to use all the splits or just the train split if only that is available."
},
)
push_to_hub: Optional[bool] = field(
default=False, metadata={"help": "Whether to push saved tokenizer to the hub."}
)


if __name__ == "__main__":
# Configuration
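The dataclass above follows the standard Hugging Face dataclass-arguments pattern; the parsing step itself sits outside the visible hunks. A minimal sketch of how `args` is presumably produced, assuming the script uses `transformers.HfArgumentParser` (an assumption; this commit does not show it):

    # Hypothetical parsing step (assumes HfArgumentParser; not part of this diff).
    from transformers import HfArgumentParser

    parser = HfArgumentParser(TokenizerTrainingArguments)
    (args,) = parser.parse_args_into_dataclasses()
    # Each field becomes a CLI flag, e.g.:
    #   python tokenizer_trainer.py --dataset data/corpus --vocab_size 1000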
@@ -52,15 +74,16 @@ class TokenizerTrainingArguments:
         tokenizer = AutoTokenizer.from_pretrained(args.base_tokenizer)
         tokenizer = tokenizer.train_new_from_iterator(dataset_iterator, vocab_size=args.vocab_size)
     else:
-        tokenizer = SAFETokenizer(tokenizer_type=args.tokenizer_type, splitter=args.splitter, trainer_args={'vocab_size':args.vocab_size})
+        tokenizer = SAFETokenizer(
+            tokenizer_type=args.tokenizer_type,
+            splitter=args.splitter,
+            trainer_args={"vocab_size": args.vocab_size},
+        )
         tokenizer.train_from_iterator(dataset_iterator)
-        tokenizer_name = f"{args.tokenizer_name}-{args.tokenizer_type}-{args.vocab_size}"
-        # also save locally to the outfile specified
+    tokenizer_name = f"{args.tokenizer_name}-{args.tokenizer_type}-{args.vocab_size}"
+    # also save locally to the outfile specified
     if args.outfile is not None:
         tokenizer.save(args.outfile)
         tokenizer = tokenizer.get_pretrained()
 
     tokenizer.save_pretrained(tokenizer_name, push_to_hub=args.push_to_hub)
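`dataset_iterator` is built earlier in the script, outside the lines this diff touches. Given the `text_column`, `batch_size`, and `n_examples` fields above, a plausible sketch of it, assuming the corpus is loaded with the Hugging Face `datasets` library (the loading call and names are illustrative, not from the commit):

    # Hypothetical batch iterator over the text column; the real one is not shown in this diff.
    from datasets import load_dataset

    dataset = load_dataset(args.dataset, split="train")
    n_examples = args.n_examples or len(dataset)

    def batch_iterator():
        # Yield one list of strings per batch, stopping after n_examples rows.
        for i in range(0, n_examples, args.batch_size):
            yield dataset[i : i + args.batch_size][args.text_column]

    dataset_iterator = batch_iterator()

Either training path accepts such an iterator of text batches: `train_new_from_iterator` when refining a base Hugging Face tokenizer, `train_from_iterator` when training the `SAFETokenizer` from scratch.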


