diff --git a/expts/scripts/build_dataset.py b/expts/scripts/build_dataset.py
deleted file mode 100755
index dc598c5..0000000
--- a/expts/scripts/build_dataset.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/env python
-import datasets
-import pandas as pd
-import safe as sf
-
-def convert_to_safe(sm, params)
-
-if __name__ == "__main__":
-    dt = datasets.load_dataset(
-        "/storage/shared_data/manu/zinc_dump/",
-        streaming=False,
-        num_proc=32,
-        cache_dir="/storage/shared_data/manu/.cache",
-    )
-    dt = dt.cast_column("id", datasets.Value("string"))
-
-    updated_dt = dt.map(update_info, num_proc=32, batched=True, batch_size=5000)
-    # unichem_df = pd.read_parquet("/storage/shared_data/manu/unichem", engine="fastparquet")
-    # unichem_df["id"] = unichem_df["id"].astype("str")
-    # unichem_dt_tmp = datasets.Dataset.from_pandas(unichem_df.drop(columns=["parquet_partition"]))
-
-    # # 80% train, 20% test + validation
-    # train_test_valid = unichem_dt_tmp.train_test_split(test_size=0.2)
-    # # split the 20 % test into test and validation
-    # test_valid = train_test_valid["test"].train_test_split(test_size=0.5)
-    # # gather everyone if you want to have a single DatasetDict
-    # unichem_dt = datasets.DatasetDict(
-    #     {
-    #         "train": train_test_valid["train"],
-    #         "test": test_valid["test"],
-    #         "validation": test_valid["train"],
-    #     }
-    # )
-
-    # unichem_dt = unichem_dt.select_columns(["id", "smiles", "source"])
-    # print("SAVING the unichem to disk")
-    unichem_dt = datasets.load_dataset(
-        "/storage/shared_data/manu/processed_unichem",
-        cache_dir="/storage/shared_data/manu/.cache",
-        num_proc=32,
-    )
-
-    test_dt = datasets.concatenate_datasets([unichem_dt["test"], updated_dt["test"]])
-    validation_dt = datasets.concatenate_datasets([unichem_dt["validation"], dt["validation"]])
-    train_dt = datasets.concatenate_datasets([unichem_dt["train"], dt["train"]])
-    dataset = datasets.DatasetDict(dict(train=train_dt, test=test_dt, validation=validation_dt))
-    dataset.save_to_disk("/storage/shared_data/manu/processed_zinc_unichem", max_shard_size="1GB")
diff --git a/expts/scripts/tokenizer_trainer.py b/expts/scripts/tokenizer_trainer.py
index 03e2dc1..4694edf 100755
--- a/expts/scripts/tokenizer_trainer.py
+++ b/expts/scripts/tokenizer_trainer.py
@@ -12,28 +12,50 @@ class TokenizerTrainingArguments:
     """
     Configuration for tokenizer training.
     """
+
     tokenizer_type: Optional[str] = field(
         default="bpe", metadata={"help": "Type of the tokenizer to train."}
     )
     base_tokenizer: Optional[str] = field(
-        default=None, metadata={"help": "Optional base tokenizer to you. Otherwise, the tokenizer will be learnt from scratch using the safe tokenizer."}
+        default=None,
+        metadata={
+            "help": "Optional base tokenizer to use. Otherwise, the tokenizer will be learnt from scratch using the safe tokenizer."
+        },
     )
     splitter: Optional[str] = field(
         default=None, metadata={"help": "Presplitter to use to train SAFE tokenizer."}
-    )
+    )
     dataset: str = field(
         default=None, metadata={"help": "Path to the dataset to load for training the tokenizer."}
     )
-    text_column: Optional[str] = field(default="inputs", metadata={"help": "Column containing text data to process."})
-    vocab_size: Optional[int] = field(default=1000, metadata={"help": "Target vocab size of the final tokenizer."})
-    batch_size: Optional[int] = field(default=100, metadata={"help": "Batch size for training the tokenizer."})
+    text_column: Optional[str] = field(
+        default="inputs", metadata={"help": "Column containing text data to process."}
+    )
+    vocab_size: Optional[int] = field(
+        default=1000, metadata={"help": "Target vocab size of the final tokenizer."}
+    )
+    batch_size: Optional[int] = field(
+        default=100, metadata={"help": "Batch size for training the tokenizer."}
+    )
     n_examples: Optional[int] = field(
         default=None, metadata={"help": "Number of examples to train the tokenizer on."}
     )
-    tokenizer_name: Optional[str] = field(default="safe", metadata={"help": "Name of new tokenizer."})
-    outfile: Optional[str] = field(default=None, metadata={"help": "Path to the local save of the trained tokenizer"})
-    all_split: Optional[bool] = field(default=False, metadata={"help": "Whether to use all the splits or just the train split if only that is available."})
-    push_to_hub: Optional[bool] = field(default=False, metadata={"help": "Whether to push saved tokenizer to the hub."})
+    tokenizer_name: Optional[str] = field(
+        default="safe", metadata={"help": "Name of new tokenizer."}
+    )
+    outfile: Optional[str] = field(
+        default=None, metadata={"help": "Path to the local save of the trained tokenizer"}
+    )
+    all_split: Optional[bool] = field(
+        default=False,
+        metadata={
+            "help": "Whether to use all the splits or just the train split if only that is available."
+        },
+    )
+    push_to_hub: Optional[bool] = field(
+        default=False, metadata={"help": "Whether to push saved tokenizer to the hub."}
+    )
+

 if __name__ == "__main__":
     # Configuration
@@ -52,15 +74,16 @@ class TokenizerTrainingArguments:
         tokenizer = AutoTokenizer.from_pretrained(args.base_tokenizer)
         tokenizer = tokenizer.train_new_from_iterator(dataset_iterator, vocab_size=args.vocab_size)
     else:
-        tokenizer = SAFETokenizer(tokenizer_type=args.tokenizer_type, splitter=args.splitter, trainer_args={'vocab_size':args.vocab_size})
+        tokenizer = SAFETokenizer(
+            tokenizer_type=args.tokenizer_type,
+            splitter=args.splitter,
+            trainer_args={"vocab_size": args.vocab_size},
+        )
         tokenizer.train_from_iterator(dataset_iterator)

-    tokenizer_name = f"{args.tokenizer_name}-{args.tokenizer_type}-{args.vocab_size}"
-    # also save locally to the outfile specified
+    tokenizer_name = f"{args.tokenizer_name}-{args.tokenizer_type}-{args.vocab_size}"
+    # also save locally to the outfile specified
     if args.outfile is not None:
         tokenizer.save(args.outfile)
     tokenizer = tokenizer.get_pretrained()
     tokenizer.save_pretrained(tokenizer_name, push_to_hub=args.push_to_hub)
-
-
-
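
Usage sketch (not part of the patch): a minimal way to exercise the SAFETokenizer branch of the reformatted tokenizer_trainer.py. The import path, dataset location, and the way the text iterator is built are assumptions for illustration; the SAFETokenizer constructor arguments and the train_from_iterator / save / get_pretrained / save_pretrained calls mirror the ones in the diff above.

    import datasets
    from safe.tokenizer import SAFETokenizer  # import path assumed; adjust to how this repo exposes SAFETokenizer

    # Hypothetical dataset; the script reads the "inputs" text column by default (text_column).
    dt = datasets.load_dataset("/path/to/safe_dataset", split="train")
    dataset_iterator = (row["inputs"] for row in dt)

    # Mirrors the defaults declared in TokenizerTrainingArguments.
    tokenizer = SAFETokenizer(
        tokenizer_type="bpe",
        splitter=None,
        trainer_args={"vocab_size": 1000},
    )
    tokenizer.train_from_iterator(dataset_iterator)

    tokenizer.save("safe_tokenizer.json")  # local copy, as the script does when args.outfile is set
    hf_tokenizer = tokenizer.get_pretrained()
    hf_tokenizer.save_pretrained("safe-bpe-1000")  # follows the {tokenizer_name}-{tokenizer_type}-{vocab_size} pattern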