,\n",
+ "ray_scope=last,\n",
+ "remove_unused_columns=True,\n",
+ "report_to=[],\n",
+ "resume_from_checkpoint=None,\n",
+ "run_name=/content/artifacts/checkpoints,\n",
+ "save_on_each_node=False,\n",
+ "save_safetensors=False,\n",
+ "save_steps=50,\n",
+ "save_strategy=steps,\n",
+ "save_total_limit=None,\n",
+ "seed=42,\n",
+ "sharded_ddp=[],\n",
+ "skip_memory_metrics=True,\n",
+ "tf32=None,\n",
+ "torch_compile=False,\n",
+ "torch_compile_backend=None,\n",
+ "torch_compile_mode=None,\n",
+ "torchdynamo=None,\n",
+ "tpu_metrics_debug=False,\n",
+ "tpu_num_cores=None,\n",
+ "use_cpu=False,\n",
+ "use_ipex=False,\n",
+ "use_legacy_prediction_loop=False,\n",
+ "use_mps_device=False,\n",
+ "warmup_ratio=0.02,\n",
+ "warmup_steps=100,\n",
+ "weight_decay=0.0,\n",
+ ")"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"from utils import parse_training_args\n",
"\n",
@@ -234,333 +609,38 @@
"training_args"
]
},
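
One thing worth flagging in the argument dump above: it shows both `warmup_ratio=0.02` and `warmup_steps=100`. In Hugging Face `TrainingArguments`, a non-zero `warmup_steps` overrides `warmup_ratio`, so the ratio only takes effect when the step count is left at 0. A quick standalone check (illustrative values; `/tmp/ckpt` is a throwaway path):

```python
from transformers import TrainingArguments

# warmup_steps > 0 takes precedence over warmup_ratio:
args = TrainingArguments(output_dir="/tmp/ckpt", warmup_ratio=0.02, warmup_steps=100)
print(args.get_warmup_steps(2000))  # 100

# With warmup_steps left at 0, the ratio applies:
args = TrainingArguments(output_dir="/tmp/ckpt", warmup_ratio=0.02, warmup_steps=0)
print(args.get_warmup_steps(2000))  # 40 == ceil(0.02 * 2000)
```
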
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "2ae3e5f9-e28e-457b-b6bf-a62a472241bf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# data = data[\"train\"].train_test_split(test_size=0.1)\n",
+ "# data\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "5bc91439-6108-445c-8f85-e6558c9f0677",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "! mkdir -p {trained_model_path_lora}"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
"id": "b33e407a-9d4f-49f6-a74b-b80db8cc3a8a",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
- "/usr/local/lib/python3.10/dist-packages/bitsandbytes/autograd/_functions.py:321: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
- " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " \n",
- "
\n",
- " [70/70 05:58, Epoch 0/1]\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " Step | \n",
- " Training Loss | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1 | \n",
- " 1.749700 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 1.570100 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 1.678600 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 1.652600 | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " 1.571400 | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " 1.336300 | \n",
- "
\n",
- " \n",
- " 7 | \n",
- " 1.410100 | \n",
- "
\n",
- " \n",
- " 8 | \n",
- " 1.619600 | \n",
- "
\n",
- " \n",
- " 9 | \n",
- " 1.407600 | \n",
- "
\n",
- " \n",
- " 10 | \n",
- " 1.249800 | \n",
- "
\n",
- " \n",
- " 11 | \n",
- " 1.332800 | \n",
- "
\n",
- " \n",
- " 12 | \n",
- " 1.284300 | \n",
- "
\n",
- " \n",
- " 13 | \n",
- " 1.209300 | \n",
- "
\n",
- " \n",
- " 14 | \n",
- " 1.449600 | \n",
- "
\n",
- " \n",
- " 15 | \n",
- " 1.286200 | \n",
- "
\n",
- " \n",
- " 16 | \n",
- " 1.242400 | \n",
- "
\n",
- " \n",
- " 17 | \n",
- " 1.172300 | \n",
- "
\n",
- " \n",
- " 18 | \n",
- " 1.093300 | \n",
- "
\n",
- " \n",
- " 19 | \n",
- " 1.071700 | \n",
- "
\n",
- " \n",
- " 20 | \n",
- " 1.202900 | \n",
- "
\n",
- " \n",
- " 21 | \n",
- " 1.044200 | \n",
- "
\n",
- " \n",
- " 22 | \n",
- " 1.193000 | \n",
- "
\n",
- " \n",
- " 23 | \n",
- " 0.994300 | \n",
- "
\n",
- " \n",
- " 24 | \n",
- " 1.029900 | \n",
- "
\n",
- " \n",
- " 25 | \n",
- " 0.970600 | \n",
- "
\n",
- " \n",
- " 26 | \n",
- " 1.243500 | \n",
- "
\n",
- " \n",
- " 27 | \n",
- " 0.846200 | \n",
- "
\n",
- " \n",
- " 28 | \n",
- " 0.789700 | \n",
- "
\n",
- " \n",
- " 29 | \n",
- " 0.850500 | \n",
- "
\n",
- " \n",
- " 30 | \n",
- " 1.129400 | \n",
- "
\n",
- " \n",
- " 31 | \n",
- " 1.023200 | \n",
- "
\n",
- " \n",
- " 32 | \n",
- " 0.997900 | \n",
- "
\n",
- " \n",
- " 33 | \n",
- " 1.022900 | \n",
- "
\n",
- " \n",
- " 34 | \n",
- " 1.419800 | \n",
- "
\n",
- " \n",
- " 35 | \n",
- " 0.884000 | \n",
- "
\n",
- " \n",
- " 36 | \n",
- " 1.207200 | \n",
- "
\n",
- " \n",
- " 37 | \n",
- " 0.923300 | \n",
- "
\n",
- " \n",
- " 38 | \n",
- " 0.975200 | \n",
- "
\n",
- " \n",
- " 39 | \n",
- " 1.177900 | \n",
- "
\n",
- " \n",
- " 40 | \n",
- " 0.869100 | \n",
- "
\n",
- " \n",
- " 41 | \n",
- " 1.017900 | \n",
- "
\n",
- " \n",
- " 42 | \n",
- " 1.065500 | \n",
- "
\n",
- " \n",
- " 43 | \n",
- " 0.891700 | \n",
- "
\n",
- " \n",
- " 44 | \n",
- " 0.858800 | \n",
- "
\n",
- " \n",
- " 45 | \n",
- " 0.881300 | \n",
- "
\n",
- " \n",
- " 46 | \n",
- " 0.825700 | \n",
- "
\n",
- " \n",
- " 47 | \n",
- " 0.882800 | \n",
- "
\n",
- " \n",
- " 48 | \n",
- " 0.970900 | \n",
- "
\n",
- " \n",
- " 49 | \n",
- " 0.857600 | \n",
- "
\n",
- " \n",
- " 50 | \n",
- " 1.070600 | \n",
- "
\n",
- " \n",
- " 51 | \n",
- " 1.215500 | \n",
- "
\n",
- " \n",
- " 52 | \n",
- " 1.050100 | \n",
- "
\n",
- " \n",
- " 53 | \n",
- " 1.196000 | \n",
- "
\n",
- " \n",
- " 54 | \n",
- " 0.881400 | \n",
- "
\n",
- " \n",
- " 55 | \n",
- " 0.980300 | \n",
- "
\n",
- " \n",
- " 56 | \n",
- " 0.954300 | \n",
- "
\n",
- " \n",
- " 57 | \n",
- " 0.788500 | \n",
- "
\n",
- " \n",
- " 58 | \n",
- " 1.064000 | \n",
- "
\n",
- " \n",
- " 59 | \n",
- " 0.959800 | \n",
- "
\n",
- " \n",
- " 60 | \n",
- " 1.022900 | \n",
- "
\n",
- " \n",
- " 61 | \n",
- " 0.848200 | \n",
- "
\n",
- " \n",
- " 62 | \n",
- " 0.814400 | \n",
- "
\n",
- " \n",
- " 63 | \n",
- " 0.775100 | \n",
- "
\n",
- " \n",
- " 64 | \n",
- " 0.906600 | \n",
- "
\n",
- " \n",
- " 65 | \n",
- " 0.943300 | \n",
- "
\n",
- " \n",
- " 66 | \n",
- " 0.894900 | \n",
- "
\n",
- " \n",
- " 67 | \n",
- " 0.797500 | \n",
- "
\n",
- " \n",
- " 68 | \n",
- " 0.746600 | \n",
- "
\n",
- " \n",
- " 69 | \n",
- " 0.590100 | \n",
- "
\n",
- " \n",
- " 70 | \n",
- " 0.920300 | \n",
- "
\n",
- " \n",
- "
"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"trainer = transformers.Trainer(\n",
" model=model,\n",
" train_dataset=data[\"train\"],\n",
+ "# eval_dataset=data[\"test\"],\n",
" args=training_args,\n",
" data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n",
")\n",
@@ -572,99 +652,49 @@
"resume_from_checkpoint = checkpoint_path.is_dir() and any(checkpoint_path.iterdir())\n",
"if resume_from_checkpoint:\n",
" print(\"Resuming from checkpoint:\", list(checkpoint_path.rglob(\"\")))\n",
- "trainer.train(resume_from_checkpoint=resume_from_checkpoint)\n",
- "\n",
- "trainer.save_model(trained_model_path)"
+ "trainer.train(resume_from_checkpoint=resume_from_checkpoint)"
]
},
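
The resume check in this cell only verifies that the checkpoint directory is non-empty; if it contains stray files but no `checkpoint-*` subdirectory, `trainer.train(resume_from_checkpoint=True)` raises. A slightly more defensive variant (a sketch using the same `checkpoint_path` and `trainer` variables) resolves the newest checkpoint explicitly:

```python
from transformers.trainer_utils import get_last_checkpoint

# get_last_checkpoint returns the newest checkpoint-* subdirectory, or None.
last_ckpt = get_last_checkpoint(str(checkpoint_path)) if checkpoint_path.is_dir() else None
if last_ckpt:
    print("Resuming from checkpoint:", last_ckpt)
trainer.train(resume_from_checkpoint=last_ckpt)  # None -> fresh run
```
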
{
"cell_type": "code",
"execution_count": null,
- "id": "dea4e68e-57a7-48bd-bad9-f03dfe3f8a06",
+ "id": "172e47a7-400e-4f82-a5e3-38135ecf532f",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
- "To disable this warning, you can either:\n",
- "\t- Avoid using `tokenizers` before the fork if possible\n",
- "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
- "Wed Jul 5 22:32:35 2023 \n",
- "+-----------------------------------------------------------------------------+\n",
- "| NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 |\n",
- "|-------------------------------+----------------------+----------------------+\n",
- "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
- "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
- "| | | MIG M. |\n",
- "|===============================+======================+======================|\n",
- "| 0 NVIDIA L4 Off | 00000000:00:04.0 Off | 0 |\n",
- "| N/A 76C P0 57W / 72W | 7314MiB / 23034MiB | 89% Default |\n",
- "| | | N/A |\n",
- "+-------------------------------+----------------------+----------------------+\n",
- "| 1 NVIDIA L4 Off | 00000000:00:05.0 Off | 0 |\n",
- "| N/A 71C P0 51W / 72W | 5592MiB / 23034MiB | 1% Default |\n",
- "| | | N/A |\n",
- "+-------------------------------+----------------------+----------------------+\n",
- " \n",
- "+-----------------------------------------------------------------------------+\n",
- "| Processes: |\n",
- "| GPU GI CI PID Type Process name GPU Memory |\n",
- "| ID ID Usage |\n",
- "|=============================================================================|\n",
- "+-----------------------------------------------------------------------------+\n",
- "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
- "To disable this warning, you can either:\n",
- "\t- Avoid using `tokenizers` before the fork if possible\n",
- "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
- "total 13G\n",
- "4.0K drwxr-xr-x 1 root root 4.0K Jul 5 22:32 .\n",
- "8.0K drwxr-xr-x 1 root root 4.0K Jul 5 10:00 ..\n",
- "4.0K -rw-r--r-- 1 root root 440 Jul 5 22:32 README.md\n",
- "4.0K -rw-r--r-- 1 root root 407 Jul 5 22:32 adapter_config.json\n",
- " 19M -rw-r--r-- 1 root root 19M Jul 5 22:32 adapter_model.bin\n",
- "4.0K -rw-r--r-- 1 root root 707 Jul 5 22:23 config.json\n",
- "4.0K -rw-r--r-- 1 root root 116 Jul 5 22:23 generation_config.json\n",
- "9.3G -rw-r--r-- 1 root root 9.3G Jul 5 22:23 pytorch_model-00001-of-00002.bin\n",
- "3.7G -rw-r--r-- 1 root root 3.7G Jul 5 22:23 pytorch_model-00002-of-00002.bin\n",
- " 20K -rw-r--r-- 1 root root 17K Jul 5 22:23 pytorch_model.bin.index.json\n",
- "4.0K -rw-r--r-- 1 root root 3.9K Jul 5 22:32 training_args.bin\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "! nvidia-smi"
+ "model.save_pretrained(trained_model_path_lora)\n",
+ "model"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "f2f5dc80",
+ "id": "dea4e68e-57a7-48bd-bad9-f03dfe3f8a06",
"metadata": {},
"outputs": [],
"source": [
- "! ls -lash {trained_model_path}"
+ "! ls -lash {trained_model_path_lora}"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "3837a68d-6a98-494e-a890-1135509c546e",
+ "id": "09db36b7-ead6-4368-9bfb-13ba1ba800a5",
"metadata": {},
"outputs": [],
"source": [
- "print(model)"
+ "model = model.merge_and_unload().half()\n",
+ "model"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "09db36b7-ead6-4368-9bfb-13ba1ba800a5",
+ "id": "270a9a72-3a12-4d83-aa7d-2d167cb28cb4",
"metadata": {},
"outputs": [],
"source": [
- "model = model.merge_and_unload()"
+ "! ls -l {trained_model_path}"
]
},
{
@@ -672,18 +702,7 @@
"execution_count": null,
"id": "260e9d79-6eb8-4516-bf8f-825a25606391",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
- "To disable this warning, you can either:\n",
- "\t- Avoid using `tokenizers` before the fork if possible\n",
- "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"model.save_pretrained(trained_model_path)\n",
"tokenizer.save_pretrained(trained_model_path)"
@@ -694,26 +713,7 @@
"execution_count": null,
"id": "6d90a920-fb22-4291-8466-411ff41e31be",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
- "To disable this warning, you can either:\n",
- "\t- Avoid using `tokenizers` before the fork if possible\n",
- "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
- "total 13G\n",
- "4.0K drwxr-xr-x 1 root root 4.0K Jul 5 22:35 .\n",
- "8.0K drwxr-xr-x 1 root root 4.0K Jul 5 10:00 ..\n",
- "4.0K -rw-r--r-- 1 root root 707 Jul 5 22:35 config.json\n",
- "4.0K -rw-r--r-- 1 root root 116 Jul 5 22:35 generation_config.json\n",
- "9.3G -rw-r--r-- 1 root root 9.3G Jul 5 22:35 pytorch_model-00001-of-00002.bin\n",
- "3.7G -rw-r--r-- 1 root root 3.7G Jul 5 22:35 pytorch_model-00002-of-00002.bin\n",
- " 20K -rw-r--r-- 1 root root 17K Jul 5 22:35 pytorch_model.bin.index.json\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"! ls -lash {trained_model_path}"
]
@@ -771,7 +771,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.5"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/model-trainer-huggingface/src/utils.py b/model-trainer-huggingface/src/utils.py
index ddbe983..6db1cd8 100644
--- a/model-trainer-huggingface/src/utils.py
+++ b/model-trainer-huggingface/src/utils.py
@@ -9,9 +9,11 @@
def parse_training_args(params: typing.Mapping) -> TrainingArguments:
typed_params = dict(
per_device_train_batch_size=1,
+ per_device_eval_batch_size=1,
gradient_accumulation_steps=4,
- warmup_steps=2,
- learning_rate=2e-4,
+ warmup_ratio=0.02,
+ learning_rate=3e-5,
+ lr_scheduler_type="cosine",
fp16=True,
logging_steps=1,
output_dir="/content/artifacts/checkpoints",
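
The new `utils.py` defaults pair a lower learning rate (3e-5 instead of 2e-4) with a cosine schedule and ratio-based warmup. A small standalone sketch of the resulting learning-rate curve, using a dummy optimizer and an illustrative 1000-step horizon:

```python
import torch
from transformers import get_cosine_schedule_with_warmup

total_steps = 1000
opt = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=3e-5)
sched = get_cosine_schedule_with_warmup(
    opt,
    num_warmup_steps=int(0.02 * total_steps),  # warmup_ratio=0.02 -> 20 steps
    num_training_steps=total_steps,
)

lrs = []
for _ in range(total_steps):
    lrs.append(sched.get_last_lr()[0])
    opt.step()
    sched.step()

# Linear ramp to the 3e-5 peak over the first 20 steps, then cosine decay toward 0:
print(lrs[0], lrs[20], lrs[500], lrs[-1])
```
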