Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
maclandrol committed Aug 8, 2023
1 parent 471b3c2 commit 7460917
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 13 deletions.
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,4 @@ site/

.idea/

- scripts/
-
data/
5 changes: 2 additions & 3 deletions expts/scripts/tokenizer_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@ class TokenizerTrainingArguments:
tokenizer_name: Optional[str] = field(default="safe", metadata={"help": "Name of new tokenizer."})
outfile: Optional[str] = field(default=None, metadata={"help": "Path to the local save of the trained tokenizer"})
all_split: Optional[bool] = field(default=False, metadata={"help": "Whether to use all the splits or just the train split if only that is available."})
- push_to_hub: Optional[bool] = field(default=True, metadata={"help": "Whether to push saved tokenizer to the hub."})
-
-
+ push_to_hub: Optional[bool] = field(default=False, metadata={"help": "Whether to push saved tokenizer to the hub."})

if __name__ == "__main__":
# Configuration
Expand All @@ -61,6 +59,7 @@ class TokenizerTrainingArguments:
if args.outfile is not None:
tokenizer.save(args.outfile)
tokenizer = tokenizer.get_pretrained()

tokenizer.save_pretrained(tokenizer_name, push_to_hub=args.push_to_hub)


Expand Down
15 changes: 7 additions & 8 deletions safe/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,13 +155,13 @@ def set_special_tokens(cls, tokenizer, bos_token=CLS_TOKEN, eos_token=SEP_TOKEN)
bos_token: Optional bos token to use
eos_token: Optional eos token to use
"""
- tokenizer._pad_token = PADDING_TOKEN
- tokenizer._cls_token = CLS_TOKEN
- tokenizer._sep_token = SEP_TOKEN
- tokenizer._mask_token = MASK_TOKEN
- tokenizer._unk_token = UNK_TOKEN
- tokenizer._eos_token = eos_token
- tokenizer._bos_token = bos_token
+ tokenizer.pad_token = PADDING_TOKEN
+ tokenizer.cls_token = CLS_TOKEN
+ tokenizer.sep_token = SEP_TOKEN
+ tokenizer.mask_token = MASK_TOKEN
+ tokenizer.unk_token = UNK_TOKEN
+ tokenizer.eos_token = eos_token
+ tokenizer.bos_token = bos_token
if isinstance(tokenizer, Tokenizer):
tokenizer.add_special_tokens(
[
Expand Down Expand Up @@ -267,7 +267,6 @@ def save_pretrained(self, *args, **kwargs):
"""Save pretrained tokenizer"""
self.tokenizer.save_pretrained(*args, **kwargs)


def save(self, file_name=None):
r"""
Saves the :class:`~tokenizers.Tokenizer` to the file at the given path.
Expand Down

0 comments on commit 7460917

Please sign in to comment.