diff --git a/model-trainer-huggingface/requirements.txt b/model-trainer-huggingface/requirements.txt index a4157a1..82e8ef7 100644 --- a/model-trainer-huggingface/requirements.txt +++ b/model-trainer-huggingface/requirements.txt @@ -8,6 +8,7 @@ bitsandbytes ipywidgets datasets protobuf +sentencepiece scipy einops diff --git a/model-trainer-huggingface/src/train.ipynb b/model-trainer-huggingface/src/train.ipynb index f6299c9..c5a3155 100644 --- a/model-trainer-huggingface/src/train.ipynb +++ b/model-trainer-huggingface/src/train.ipynb @@ -25,7 +25,29 @@ "execution_count": 1, "id": "86ccd646", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'dataset_urls': 'https://huggingface.co/datasets/weaviate/WithRetrieval-SchemaSplit-Train-80/resolve/main/WithRetrieval-Random-Train-80.json',\n", + " 'inference_prompt_template': '## Instruction\\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.\\n\\nOnly use the API reference to understand the syntax of the request.\\n\\n## Natural Language Query\\n{nlcommand}\\n\\n## Schema\\n{schema}\\n\\n## API reference\\n{apiRef}\\n\\n## Answer\\n```graphql\\n',\n", + " 'logging_steps': 50,\n", + " 'modules_to_save': 'embed_tokens, lm_head',\n", + " 'num_train_epochs': 3,\n", + " 'per_device_eval_batch_size': 4,\n", + " 'per_device_train_batch_size': 4,\n", + " 'prompt_template': '## Instruction\\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.\\n\\nOnly use the API reference to understand the syntax of the request.\\n\\n## Natural Language Query\\n{nlcommand}\\n\\n## Schema\\n{schema}\\n\\n## API reference\\n{apiRef}\\n\\n## Answer\\n{output}\\n',\n", + " 'push_to_hub': 'substratusai/wgql-WithRetrieval-SchemaSplit-Train-80',\n", + " 'save_steps': 50,\n", + " 'target_modules': 'q_proj, up_proj, o_proj, k_proj, down_proj, gate_proj, v_proj',\n", + " 'warmup_steps': 100}" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import json\n", "from pathlib import Path\n", @@ -34,48 +56,37 @@ "params_path = Path(\"/content/params.json\")\n", "if params_path.is_file():\n", " with params_path.open(\"r\", encoding=\"UTF-8\") as params_file:\n", - " params = json.load(params_file)\n" + " params = json.load(params_file)\n", + "\n", + "\n", + "params" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "9fafd16b-d8c9-47bf-9116-c27b1d43a019", "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "Found cached dataset json (/root/.cache/huggingface/datasets/json/default-2d69f16079490881/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)\n" + "Using the following URLs for the dataset: ['https://huggingface.co/datasets/weaviate/WithRetrieval-SchemaSplit-Train-80/resolve/main/WithRetrieval-Random-Train-80.json']\n" ] }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "67dee641ab0a490e9413ed9924ae1859", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00\n" + ] + } + ], "source": [ "default_prompt = \"\"\"\n", "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n", @@ -139,56 +265,173 @@ "\"\"\"\n", "\n", "prompt = params.get(\"prompt_template\", default_prompt)\n", - "print(prompt.format_map(data[\"train\"][0]))" + "\n", + "eos_token = tokenizer.convert_ids_to_tokens(model.config.eos_token_id)\n", + "if prompt[-len(eos_token):] != eos_token:\n", + " prompt = prompt + eos_token\n", + "\n", + "print(prompt)\n" ] }, { "cell_type": "code", - "execution_count": null, - "id": "f5dd944b-e2bd-4bfd-a5fa-55bc90239926", + "execution_count": 6, + "id": "0abf96e1-3bc1-4ae7-80ac-c2e585e9c7c1", "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-2d69f16079490881/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-7224d2ad124fbca0.arrow\n" + "Fri Oct 20 03:33:09 2023 \n", + "+-----------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 |\n", + "|-------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", + "| | | MIG M. |\n", + "|===============================+======================+======================|\n", + "| 0 NVIDIA L4 Off | 00000000:00:04.0 Off | 0 |\n", + "| N/A 68C P0 33W / 72W | 3570MiB / 23034MiB | 1% Default |\n", + "| | | N/A |\n", + "+-------------------------------+----------------------+----------------------+\n", + "| 1 NVIDIA L4 Off | 00000000:00:05.0 Off | 0 |\n", + "| N/A 75C P0 36W / 72W | 4096MiB / 23034MiB | 1% Default |\n", + "| | | N/A |\n", + "+-------------------------------+----------------------+----------------------+\n", + "| 2 NVIDIA L4 Off | 00000000:00:06.0 Off | 0 |\n", + "| N/A 72C P0 32W / 72W | 4096MiB / 23034MiB | 1% Default |\n", + "| | | N/A |\n", + "+-------------------------------+----------------------+----------------------+\n", + "| 3 NVIDIA L4 Off | 00000000:00:07.0 Off | 0 |\n", + "| N/A 75C P0 38W / 72W | 3570MiB / 23034MiB | 1% Default |\n", + "| | | N/A |\n", + "+-------------------------------+----------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------+\n", + "| Processes: |\n", + "| GPU GI CI PID Type Process name GPU Memory |\n", + "| ID ID Usage |\n", + "|=============================================================================|\n", + "+-----------------------------------------------------------------------------+\n" ] - }, + } + ], + "source": [ + "! 
nvidia-smi" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "4d1e1795-c783-4ddf-999e-f1de19258928", + "metadata": {}, + "source": [ + "Prompt before fine tuning" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f5dd944b-e2bd-4bfd-a5fa-55bc90239926", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LlamaTokenizerFast(name_or_path='/content/model/', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False), added_tokens_decoder={\n", + "\t0: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t1: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t2: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t32000: AddedToken(\"[PAD]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from typing import Dict\n", + "# source: https://github.com/artidoro/qlora\n", + "DEFAULT_PAD_TOKEN = params.get(\"pad_token\", \"[PAD]\")\n", + "\n", + "def smart_tokenizer_and_embedding_resize(\n", + " special_tokens_dict: Dict,\n", + " tokenizer: transformers.PreTrainedTokenizer,\n", + " model: transformers.PreTrainedModel,\n", + "):\n", + " \"\"\"Resize tokenizer and embedding.\n", + "\n", + " Note: This is the unoptimized version that may make your embedding size not be divisible by 64.\n", + " \"\"\"\n", + " num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)\n", + " model.resize_token_embeddings(len(tokenizer))\n", + " if num_new_tokens > 0:\n", + " input_embeddings_data = model.get_input_embeddings().weight.data\n", + " output_embeddings_data = model.get_output_embeddings().weight.data\n", + "\n", + " input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)\n", + " output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)\n", + "\n", + " input_embeddings_data[-num_new_tokens:] = input_embeddings_avg\n", + " output_embeddings_data[-num_new_tokens:] = output_embeddings_avg\n", + "\n", + "if tokenizer._pad_token is None:\n", + " smart_tokenizer_and_embedding_resize(\n", + " special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),\n", + " tokenizer=tokenizer,\n", + " model=model,\n", + " )\n", + "\n", + "if isinstance(tokenizer, transformers.LlamaTokenizer):\n", + " # LLaMA tokenizer may not have correct special tokens set.\n", + " # Check and add them if missing to prevent them from being parsed into different tokens.\n", + " # Note that these are present in the vocabulary.\n", + " # Note also that `model.config.pad_token_id` is 0 which corresponds to `` token.\n", + " print('Adding special tokens.')\n", + " tokenizer.add_special_tokens({\n", + " \"eos_token\": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),\n", + " \"bos_token\": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),\n", + " \"unk_token\": tokenizer.convert_ids_to_tokens(\n", + " model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id\n", + " ),\n", + " })\n", + "\n", + "tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": 
"e78b510d", + "metadata": {}, + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "DatasetDict({\n", - " train: Dataset({\n", - " features: ['prompt', 'completion'],\n", - " num_rows: 282\n", - " })\n", - "})\n", "After tokenizing: DatasetDict({\n", " train: Dataset({\n", - " features: ['prompt', 'completion', 'input_ids', 'token_type_ids', 'attention_mask'],\n", - " num_rows: 282\n", + " features: ['input', 'output', 'nlcommand', 'apiRef', 'apiRefPath', 'schema', 'schemaPath', 'input_ids', 'attention_mask'],\n", + " num_rows: 3190\n", " })\n", "})\n" ] } ], "source": [ - "if tokenizer.pad_token is None:\n", - " tokenizer.add_special_tokens({'pad_token': '[PAD]'})\n", - " model.resize_token_embeddings(len(tokenizer))\n", + "from typing import Dict\n", "\n", - "print(data)\n", - "data = data.map(lambda x: tokenizer(\n", - " prompt.format_map(x), padding='max_length', truncation=True))\n", + "data = data.map(lambda x: tokenizer(prompt.format_map(x)))\n", "\n", "print(\"After tokenizing:\", data)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "5dae6c6f-3ae1-4697-852e-fce24a82b9e8", "metadata": {}, "outputs": [ @@ -196,23 +439,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "trainable params: 4,718,592 || all params: 6,926,439,296 || trainable%: 0.06812435363037071\n" + "LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules=['q_proj', 'up_proj', 'o_proj', 'k_proj', 'down_proj', 'gate_proj', 'v_proj'], lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['embed_tokens', 'lm_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)\n", + "trainable params: 564,281,344 || all params: 7,040,552,960 || trainable%: 8.01473047935144\n" ] } ], "source": [ "from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training\n", "\n", - "lora_config2 = LoraConfig(\n", - " r=16,\n", - " lora_alpha=32,\n", - " lora_dropout=0.05,\n", - " bias=\"none\",\n", - " task_type=\"CAUSAL_LM\"\n", - ")\n", "target_modules = params.get(\"target_modules\")\n", "if target_modules:\n", - " lora_config2.target_modules = [mod.strip() for mod in target_modules.split(\",\")]\n", + " target_modules = [mod.strip() for mod in target_modules.split(\",\")]\n", + "\n", + "modules_to_save = params.get(\"modules_to_save\")\n", + "if modules_to_save:\n", + " modules_to_save = [mod.strip() for mod in modules_to_save.split(\",\")]\n", + "\n", + "lora_config2 = LoraConfig(\n", + " r=16,\n", + " lora_alpha=16,\n", + " lora_dropout=0.05,\n", + " bias=\"none\",\n", + " task_type=\"CAUSAL_LM\",\n", + " target_modules=target_modules,\n", + " modules_to_save = modules_to_save\n", + ")\n", + "print(lora_config2)\n", "\n", "model = prepare_model_for_kbit_training(model)\n", "\n", @@ -223,10 +475,133 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "70a3e36c-62cf-45aa-8f37-0db0e40857dc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "TrainingArguments(\n", + "_n_gpu=4,\n", + "adafactor=False,\n", + "adam_beta1=0.9,\n", + "adam_beta2=0.999,\n", + "adam_epsilon=1e-08,\n", + "auto_find_batch_size=False,\n", + "bf16=False,\n", + "bf16_full_eval=False,\n", + "data_seed=None,\n", + "dataloader_drop_last=False,\n", + "dataloader_num_workers=0,\n", + "dataloader_pin_memory=True,\n", + "ddp_backend=None,\n", + "ddp_broadcast_buffers=None,\n", + 
"ddp_bucket_cap_mb=None,\n", + "ddp_find_unused_parameters=None,\n", + "ddp_timeout=1800,\n", + "debug=[],\n", + "deepspeed=None,\n", + "disable_tqdm=False,\n", + "dispatch_batches=None,\n", + "do_eval=False,\n", + "do_predict=False,\n", + "do_train=False,\n", + "eval_accumulation_steps=None,\n", + "eval_delay=0,\n", + "eval_steps=None,\n", + "evaluation_strategy=no,\n", + "fp16=True,\n", + "fp16_backend=auto,\n", + "fp16_full_eval=False,\n", + "fp16_opt_level=O1,\n", + "fsdp=[],\n", + "fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},\n", + "fsdp_min_num_params=0,\n", + "fsdp_transformer_layer_cls_to_wrap=None,\n", + "full_determinism=False,\n", + "gradient_accumulation_steps=4,\n", + "gradient_checkpointing=False,\n", + "greater_is_better=None,\n", + "group_by_length=False,\n", + "half_precision_backend=auto,\n", + "hub_always_push=False,\n", + "hub_model_id=None,\n", + "hub_private_repo=False,\n", + "hub_strategy=every_save,\n", + "hub_token=,\n", + "ignore_data_skip=False,\n", + "include_inputs_for_metrics=False,\n", + "include_tokens_per_second=False,\n", + "jit_mode_eval=False,\n", + "label_names=None,\n", + "label_smoothing_factor=0.0,\n", + "learning_rate=3e-05,\n", + "length_column_name=length,\n", + "load_best_model_at_end=False,\n", + "local_rank=0,\n", + "log_level=passive,\n", + "log_level_replica=warning,\n", + "log_on_each_node=True,\n", + "logging_dir=/content/artifacts/checkpoints/runs/Oct20_03-34-43_wgqlg-withretrieval-schemasplit-train-80-v4-model-notebook,\n", + "logging_first_step=False,\n", + "logging_nan_inf_filter=True,\n", + "logging_steps=50,\n", + "logging_strategy=steps,\n", + "lr_scheduler_type=cosine,\n", + "max_grad_norm=1.0,\n", + "max_steps=-1,\n", + "metric_for_best_model=None,\n", + "mp_parameters=,\n", + "no_cuda=False,\n", + "num_train_epochs=3.0,\n", + "optim=paged_adamw_32bit,\n", + "optim_args=None,\n", + "output_dir=/content/artifacts/checkpoints,\n", + "overwrite_output_dir=False,\n", + "past_index=-1,\n", + "per_device_eval_batch_size=4,\n", + "per_device_train_batch_size=4,\n", + "prediction_loss_only=False,\n", + "push_to_hub=False,\n", + "push_to_hub_model_id=None,\n", + "push_to_hub_organization=None,\n", + "push_to_hub_token=,\n", + "ray_scope=last,\n", + "remove_unused_columns=True,\n", + "report_to=[],\n", + "resume_from_checkpoint=None,\n", + "run_name=/content/artifacts/checkpoints,\n", + "save_on_each_node=False,\n", + "save_safetensors=False,\n", + "save_steps=50,\n", + "save_strategy=steps,\n", + "save_total_limit=None,\n", + "seed=42,\n", + "sharded_ddp=[],\n", + "skip_memory_metrics=True,\n", + "tf32=None,\n", + "torch_compile=False,\n", + "torch_compile_backend=None,\n", + "torch_compile_mode=None,\n", + "torchdynamo=None,\n", + "tpu_metrics_debug=False,\n", + "tpu_num_cores=None,\n", + "use_cpu=False,\n", + "use_ipex=False,\n", + "use_legacy_prediction_loop=False,\n", + "use_mps_device=False,\n", + "warmup_ratio=0.02,\n", + "warmup_steps=100,\n", + "weight_decay=0.0,\n", + ")" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from utils import parse_training_args\n", "\n", @@ -234,333 +609,38 @@ "training_args" ] }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2ae3e5f9-e28e-457b-b6bf-a62a472241bf", + "metadata": {}, + "outputs": [], + "source": [ + "# data = data[\"train\"].train_test_split(test_size=0.1)\n", + "# data\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "5bc91439-6108-445c-8f85-e6558c9f0677", + 
"metadata": {}, + "outputs": [], + "source": [ + "! mkdir -p {trained_model_path_lora}" + ] + }, { "cell_type": "code", "execution_count": null, "id": "b33e407a-9d4f-49f6-a74b-b80db8cc3a8a", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n", - "/usr/local/lib/python3.10/dist-packages/bitsandbytes/autograd/_functions.py:321: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", - " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
- "[removed output: transformers training-progress HTML: [70/70 05:58, Epoch 0/1]; Step / Training Loss table, 70 rows, loss falling from 1.7497 at step 1 to 0.9203 at step 70]\n"
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "trainer = transformers.Trainer(\n", " model=model,\n", " train_dataset=data[\"train\"],\n", + "# eval_dataset=data[\"test\"],\n", " args=training_args,\n", " data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n", ")\n", @@ -572,99 +652,49 @@ "resume_from_checkpoint = checkpoint_path.is_dir() and any(checkpoint_path.iterdir())\n", "if resume_from_checkpoint:\n", " print(\"Resuming from checkpoint:\", list(checkpoint_path.rglob(\"\")))\n", - "trainer.train(resume_from_checkpoint=resume_from_checkpoint)\n", - "\n", - "trainer.save_model(trained_model_path)" + "trainer.train(resume_from_checkpoint=resume_from_checkpoint)" ] }, { "cell_type": "code", "execution_count": null, - "id": "dea4e68e-57a7-48bd-bad9-f03dfe3f8a06", + "id": "172e47a7-400e-4f82-a5e3-38135ecf532f", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "Wed Jul 5 22:32:35 2023 \n", - "+-----------------------------------------------------------------------------+\n", - "| NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 |\n", - "|-------------------------------+----------------------+----------------------+\n", - "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", - "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", - "| | | MIG M. |\n", - "|===============================+======================+======================|\n", - "| 0 NVIDIA L4 Off | 00000000:00:04.0 Off | 0 |\n", - "| N/A 76C P0 57W / 72W | 7314MiB / 23034MiB | 89% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - "| 1 NVIDIA L4 Off | 00000000:00:05.0 Off | 0 |\n", - "| N/A 71C P0 51W / 72W | 5592MiB / 23034MiB | 1% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - " \n", - "+-----------------------------------------------------------------------------+\n", - "| Processes: |\n", - "| GPU GI CI PID Type Process name GPU Memory |\n", - "| ID ID Usage |\n", - "|=============================================================================|\n", - "+-----------------------------------------------------------------------------+\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "total 13G\n", - "4.0K drwxr-xr-x 1 root root 4.0K Jul 5 22:32 .\n", - "8.0K drwxr-xr-x 1 root root 4.0K Jul 5 10:00 ..\n", - "4.0K -rw-r--r-- 1 root root 440 Jul 5 22:32 README.md\n", - "4.0K -rw-r--r-- 1 root root 407 Jul 5 22:32 adapter_config.json\n", - " 19M -rw-r--r-- 1 root root 19M Jul 5 22:32 adapter_model.bin\n", - "4.0K -rw-r--r-- 1 root root 707 Jul 5 22:23 config.json\n", - "4.0K -rw-r--r-- 1 root root 116 Jul 5 22:23 generation_config.json\n", - "9.3G -rw-r--r-- 1 root root 9.3G Jul 5 22:23 pytorch_model-00001-of-00002.bin\n", - "3.7G -rw-r--r-- 1 root root 3.7G Jul 5 22:23 pytorch_model-00002-of-00002.bin\n", - " 20K -rw-r--r-- 1 root root 17K Jul 5 22:23 pytorch_model.bin.index.json\n", - "4.0K -rw-r--r-- 1 root root 3.9K Jul 5 22:32 training_args.bin\n" - ] - } - ], + "outputs": [], "source": [ - "! nvidia-smi" + "model.save_pretrained(trained_model_path_lora)\n", + "model" ] }, { "cell_type": "code", "execution_count": null, - "id": "f2f5dc80", + "id": "dea4e68e-57a7-48bd-bad9-f03dfe3f8a06", "metadata": {}, "outputs": [], "source": [ - "! ls -lash {trained_model_path}" + "! ls -lash {trained_model_path_lora}" ] }, { "cell_type": "code", "execution_count": null, - "id": "3837a68d-6a98-494e-a890-1135509c546e", + "id": "09db36b7-ead6-4368-9bfb-13ba1ba800a5", "metadata": {}, "outputs": [], "source": [ - "print(model)" + "model = model.merge_and_unload().half()\n", + "model" ] }, { "cell_type": "code", "execution_count": null, - "id": "09db36b7-ead6-4368-9bfb-13ba1ba800a5", + "id": "270a9a72-3a12-4d83-aa7d-2d167cb28cb4", "metadata": {}, "outputs": [], "source": [ - "model = model.merge_and_unload()" + "! ls -l {trained_model_path}" ] }, { @@ -672,18 +702,7 @@ "execution_count": null, "id": "260e9d79-6eb8-4516-bf8f-825a25606391", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - } - ], + "outputs": [], "source": [ "model.save_pretrained(trained_model_path)\n", "tokenizer.save_pretrained(trained_model_path)" @@ -694,26 +713,7 @@ "execution_count": null, "id": "6d90a920-fb22-4291-8466-411ff41e31be", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "total 13G\n", - "4.0K drwxr-xr-x 1 root root 4.0K Jul 5 22:35 .\n", - "8.0K drwxr-xr-x 1 root root 4.0K Jul 5 10:00 ..\n", - "4.0K -rw-r--r-- 1 root root 707 Jul 5 22:35 config.json\n", - "4.0K -rw-r--r-- 1 root root 116 Jul 5 22:35 generation_config.json\n", - "9.3G -rw-r--r-- 1 root root 9.3G Jul 5 22:35 pytorch_model-00001-of-00002.bin\n", - "3.7G -rw-r--r-- 1 root root 3.7G Jul 5 22:35 pytorch_model-00002-of-00002.bin\n", - " 20K -rw-r--r-- 1 root root 17K Jul 5 22:35 pytorch_model.bin.index.json\n" - ] - } - ], + "outputs": [], "source": [ "! ls -lash {trained_model_path}" ] @@ -771,7 +771,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/model-trainer-huggingface/src/utils.py b/model-trainer-huggingface/src/utils.py index ddbe983..6db1cd8 100644 --- a/model-trainer-huggingface/src/utils.py +++ b/model-trainer-huggingface/src/utils.py @@ -9,9 +9,11 @@ def parse_training_args(params: typing.Mapping) -> TrainingArguments: typed_params = dict( per_device_train_batch_size=1, + per_device_eval_batch_size=1, gradient_accumulation_steps=4, - warmup_steps=2, - learning_rate=2e-4, + warmup_ratio=0.02, + learning_rate=3e-5, + lr_scheduler_type="cosine", fp16=True, logging_steps=1, output_dir="/content/artifacts/checkpoints",
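
The new first-cell output shows the dataset being resolved from `params["dataset_urls"]`. The loading call itself is not visible in this hunk; below is a minimal sketch of the assumed flow, using the standard `datasets` JSON loader (the exact call in the notebook may differ).

```python
from datasets import load_dataset

# params is the dict loaded from /content/params.json at the top of the notebook.
# dataset_urls may hold several comma-separated URLs; the captured run uses one.
urls = [u.strip() for u in params["dataset_urls"].split(",")]
print("Using the following URLs for the dataset:", urls)

# Assumed loader: fetches the Weaviate JSON split and returns a DatasetDict
# with a single "train" split (3,190 rows in the captured output).
data = load_dataset("json", data_files=urls)
```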
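The prompt cell now guarantees the template ends with the model's EOS token; without it, the fine-tuned model never learns to stop after its GraphQL answer. An equivalent form of the check, for reference:

```python
# Equivalent to the slice comparison in the notebook: look up the EOS token
# from the model config (more robust than assuming "</s>") and append it
# only when the template does not already end with it.
eos_token = tokenizer.convert_ids_to_tokens(model.config.eos_token_id)
if not prompt.endswith(eos_token):
    prompt = prompt + eos_token
```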
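Why `modules_to_save=['embed_tokens', 'lm_head']` and the much larger trainable fraction: resizing the vocabulary for the new `[PAD]` token changes the input and output embedding matrices, so they must be trained and saved in full alongside the LoRA adapters. That is why trainable parameters grow from ~4.7M (0.07%) in the previous run to ~564M (8.0%) here, and why the notebook now saves the adapter first and only then merges:

```python
# Recap of the save sequence the notebook now uses (paths defined earlier):
model.save_pretrained(trained_model_path_lora)  # LoRA adapter plus full embed_tokens / lm_head
model = model.merge_and_unload().half()         # fold adapter deltas into base weights, cast to fp16
model.save_pretrained(trained_model_path)       # standalone merged checkpoint
tokenizer.save_pretrained(trained_model_path)   # tokenizer including the added [PAD] token
```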
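`parse_training_args` in utils.py supplies the defaults changed by this diff; values from params.json override them, which is how the printed `TrainingArguments` ends up with `warmup_steps=100`, `logging_steps=50`, `num_train_epochs=3.0`, and per-device batch size 4. The override mechanics are not part of this patch; the sketch below is a plausible reading, with the merge logic an assumption for illustration.

```python
import typing

from transformers import TrainingArguments


def parse_training_args(params: typing.Mapping) -> TrainingArguments:
    # Defaults as set in utils.py by this diff.
    typed_params = dict(
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_ratio=0.02,
        learning_rate=3e-5,
        lr_scheduler_type="cosine",
        fp16=True,
        logging_steps=1,
        output_dir="/content/artifacts/checkpoints",
    )
    # Assumed merge: any params.json key naming a TrainingArguments field
    # overrides the default above. push_to_hub is excluded because params.json
    # stores a repo name there, while the printed args show push_to_hub=False,
    # so the real helper evidently handles that key separately.
    special = {"push_to_hub"}
    overrides = {
        k: v
        for k, v in params.items()
        if k in TrainingArguments.__dataclass_fields__ and k not in special
    }
    return TrainingArguments(**{**typed_params, **overrides})
```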