diff --git a/model-trainer-huggingface/requirements.txt b/model-trainer-huggingface/requirements.txt index a4157a1..82e8ef7 100644 --- a/model-trainer-huggingface/requirements.txt +++ b/model-trainer-huggingface/requirements.txt @@ -8,6 +8,7 @@ bitsandbytes ipywidgets datasets protobuf +sentencepiece scipy einops diff --git a/model-trainer-huggingface/src/train.ipynb b/model-trainer-huggingface/src/train.ipynb index f6299c9..c5a3155 100644 --- a/model-trainer-huggingface/src/train.ipynb +++ b/model-trainer-huggingface/src/train.ipynb @@ -25,7 +25,29 @@ "execution_count": 1, "id": "86ccd646", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'dataset_urls': 'https://huggingface.co/datasets/weaviate/WithRetrieval-SchemaSplit-Train-80/resolve/main/WithRetrieval-Random-Train-80.json',\n", + " 'inference_prompt_template': '## Instruction\\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.\\n\\nOnly use the API reference to understand the syntax of the request.\\n\\n## Natural Language Query\\n{nlcommand}\\n\\n## Schema\\n{schema}\\n\\n## API reference\\n{apiRef}\\n\\n## Answer\\n```graphql\\n',\n", + " 'logging_steps': 50,\n", + " 'modules_to_save': 'embed_tokens, lm_head',\n", + " 'num_train_epochs': 3,\n", + " 'per_device_eval_batch_size': 4,\n", + " 'per_device_train_batch_size': 4,\n", + " 'prompt_template': '## Instruction\\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.\\n\\nOnly use the API reference to understand the syntax of the request.\\n\\n## Natural Language Query\\n{nlcommand}\\n\\n## Schema\\n{schema}\\n\\n## API reference\\n{apiRef}\\n\\n## Answer\\n{output}\\n',\n", + " 'push_to_hub': 'substratusai/wgql-WithRetrieval-SchemaSplit-Train-80',\n", + " 'save_steps': 50,\n", + " 'target_modules': 'q_proj, up_proj, o_proj, k_proj, down_proj, gate_proj, v_proj',\n", + " 'warmup_steps': 100}" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import json\n", "from pathlib import Path\n", @@ -34,48 +56,37 @@ "params_path = Path(\"/content/params.json\")\n", "if params_path.is_file():\n", " with params_path.open(\"r\", encoding=\"UTF-8\") as params_file:\n", - " params = json.load(params_file)\n" + " params = json.load(params_file)\n", + "\n", + "\n", + "params" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "9fafd16b-d8c9-47bf-9116-c27b1d43a019", "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "Found cached dataset json (/root/.cache/huggingface/datasets/json/default-2d69f16079490881/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)\n" + "Using the following URLs for the dataset: ['https://huggingface.co/datasets/weaviate/WithRetrieval-SchemaSplit-Train-80/resolve/main/WithRetrieval-Random-Train-80.json']\n" ] }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "67dee641ab0a490e9413ed9924ae1859", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00\n" + ] + } + ], "source": [ "default_prompt = \"\"\"\n", "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n", @@ -139,56 +265,173 @@ "\"\"\"\n", "\n", "prompt = params.get(\"prompt_template\", default_prompt)\n", - "print(prompt.format_map(data[\"train\"][0]))" + "\n", + "eos_token = tokenizer.convert_ids_to_tokens(model.config.eos_token_id)\n", + "if prompt[-len(eos_token):] != eos_token:\n", + " prompt = prompt + eos_token\n", + "\n", + "print(prompt)\n" ] }, { "cell_type": "code", - "execution_count": null, - "id": "f5dd944b-e2bd-4bfd-a5fa-55bc90239926", + "execution_count": 6, + "id": "0abf96e1-3bc1-4ae7-80ac-c2e585e9c7c1", "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-2d69f16079490881/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-7224d2ad124fbca0.arrow\n" + "Fri Oct 20 03:33:09 2023 \n", + "+-----------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 |\n", + "|-------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", + "| | | MIG M. |\n", + "|===============================+======================+======================|\n", + "| 0 NVIDIA L4 Off | 00000000:00:04.0 Off | 0 |\n", + "| N/A 68C P0 33W / 72W | 3570MiB / 23034MiB | 1% Default |\n", + "| | | N/A |\n", + "+-------------------------------+----------------------+----------------------+\n", + "| 1 NVIDIA L4 Off | 00000000:00:05.0 Off | 0 |\n", + "| N/A 75C P0 36W / 72W | 4096MiB / 23034MiB | 1% Default |\n", + "| | | N/A |\n", + "+-------------------------------+----------------------+----------------------+\n", + "| 2 NVIDIA L4 Off | 00000000:00:06.0 Off | 0 |\n", + "| N/A 72C P0 32W / 72W | 4096MiB / 23034MiB | 1% Default |\n", + "| | | N/A |\n", + "+-------------------------------+----------------------+----------------------+\n", + "| 3 NVIDIA L4 Off | 00000000:00:07.0 Off | 0 |\n", + "| N/A 75C P0 38W / 72W | 3570MiB / 23034MiB | 1% Default |\n", + "| | | N/A |\n", + "+-------------------------------+----------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------+\n", + "| Processes: |\n", + "| GPU GI CI PID Type Process name GPU Memory |\n", + "| ID ID Usage |\n", + "|=============================================================================|\n", + "+-----------------------------------------------------------------------------+\n" ] - }, + } + ], + "source": [ + "! 
nvidia-smi" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "4d1e1795-c783-4ddf-999e-f1de19258928", + "metadata": {}, + "source": [ + "Prompt before fine tuning" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f5dd944b-e2bd-4bfd-a5fa-55bc90239926", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LlamaTokenizerFast(name_or_path='/content/model/', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False), added_tokens_decoder={\n", + "\t0: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t1: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t2: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t32000: AddedToken(\"[PAD]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from typing import Dict\n", + "# source: https://github.com/artidoro/qlora\n", + "DEFAULT_PAD_TOKEN = params.get(\"pad_token\", \"[PAD]\")\n", + "\n", + "def smart_tokenizer_and_embedding_resize(\n", + " special_tokens_dict: Dict,\n", + " tokenizer: transformers.PreTrainedTokenizer,\n", + " model: transformers.PreTrainedModel,\n", + "):\n", + " \"\"\"Resize tokenizer and embedding.\n", + "\n", + " Note: This is the unoptimized version that may make your embedding size not be divisible by 64.\n", + " \"\"\"\n", + " num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)\n", + " model.resize_token_embeddings(len(tokenizer))\n", + " if num_new_tokens > 0:\n", + " input_embeddings_data = model.get_input_embeddings().weight.data\n", + " output_embeddings_data = model.get_output_embeddings().weight.data\n", + "\n", + " input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)\n", + " output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)\n", + "\n", + " input_embeddings_data[-num_new_tokens:] = input_embeddings_avg\n", + " output_embeddings_data[-num_new_tokens:] = output_embeddings_avg\n", + "\n", + "if tokenizer._pad_token is None:\n", + " smart_tokenizer_and_embedding_resize(\n", + " special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),\n", + " tokenizer=tokenizer,\n", + " model=model,\n", + " )\n", + "\n", + "if isinstance(tokenizer, transformers.LlamaTokenizer):\n", + " # LLaMA tokenizer may not have correct special tokens set.\n", + " # Check and add them if missing to prevent them from being parsed into different tokens.\n", + " # Note that these are present in the vocabulary.\n", + " # Note also that `model.config.pad_token_id` is 0 which corresponds to `` token.\n", + " print('Adding special tokens.')\n", + " tokenizer.add_special_tokens({\n", + " \"eos_token\": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),\n", + " \"bos_token\": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),\n", + " \"unk_token\": tokenizer.convert_ids_to_tokens(\n", + " model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id\n", + " ),\n", + " })\n", + "\n", + "tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": 
"e78b510d", + "metadata": {}, + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "DatasetDict({\n", - " train: Dataset({\n", - " features: ['prompt', 'completion'],\n", - " num_rows: 282\n", - " })\n", - "})\n", "After tokenizing: DatasetDict({\n", " train: Dataset({\n", - " features: ['prompt', 'completion', 'input_ids', 'token_type_ids', 'attention_mask'],\n", - " num_rows: 282\n", + " features: ['input', 'output', 'nlcommand', 'apiRef', 'apiRefPath', 'schema', 'schemaPath', 'input_ids', 'attention_mask'],\n", + " num_rows: 3190\n", " })\n", "})\n" ] } ], "source": [ - "if tokenizer.pad_token is None:\n", - " tokenizer.add_special_tokens({'pad_token': '[PAD]'})\n", - " model.resize_token_embeddings(len(tokenizer))\n", + "from typing import Dict\n", "\n", - "print(data)\n", - "data = data.map(lambda x: tokenizer(\n", - " prompt.format_map(x), padding='max_length', truncation=True))\n", + "data = data.map(lambda x: tokenizer(prompt.format_map(x)))\n", "\n", "print(\"After tokenizing:\", data)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "5dae6c6f-3ae1-4697-852e-fce24a82b9e8", "metadata": {}, "outputs": [ @@ -196,23 +439,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "trainable params: 4,718,592 || all params: 6,926,439,296 || trainable%: 0.06812435363037071\n" + "LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules=['q_proj', 'up_proj', 'o_proj', 'k_proj', 'down_proj', 'gate_proj', 'v_proj'], lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['embed_tokens', 'lm_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)\n", + "trainable params: 564,281,344 || all params: 7,040,552,960 || trainable%: 8.01473047935144\n" ] } ], "source": [ "from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training\n", "\n", - "lora_config2 = LoraConfig(\n", - " r=16,\n", - " lora_alpha=32,\n", - " lora_dropout=0.05,\n", - " bias=\"none\",\n", - " task_type=\"CAUSAL_LM\"\n", - ")\n", "target_modules = params.get(\"target_modules\")\n", "if target_modules:\n", - " lora_config2.target_modules = [mod.strip() for mod in target_modules.split(\",\")]\n", + " target_modules = [mod.strip() for mod in target_modules.split(\",\")]\n", + "\n", + "modules_to_save = params.get(\"modules_to_save\")\n", + "if modules_to_save:\n", + " modules_to_save = [mod.strip() for mod in modules_to_save.split(\",\")]\n", + "\n", + "lora_config2 = LoraConfig(\n", + " r=16,\n", + " lora_alpha=16,\n", + " lora_dropout=0.05,\n", + " bias=\"none\",\n", + " task_type=\"CAUSAL_LM\",\n", + " target_modules=target_modules,\n", + " modules_to_save = modules_to_save\n", + ")\n", + "print(lora_config2)\n", "\n", "model = prepare_model_for_kbit_training(model)\n", "\n", @@ -223,10 +475,133 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "70a3e36c-62cf-45aa-8f37-0db0e40857dc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "TrainingArguments(\n", + "_n_gpu=4,\n", + "adafactor=False,\n", + "adam_beta1=0.9,\n", + "adam_beta2=0.999,\n", + "adam_epsilon=1e-08,\n", + "auto_find_batch_size=False,\n", + "bf16=False,\n", + "bf16_full_eval=False,\n", + "data_seed=None,\n", + "dataloader_drop_last=False,\n", + "dataloader_num_workers=0,\n", + "dataloader_pin_memory=True,\n", + "ddp_backend=None,\n", + "ddp_broadcast_buffers=None,\n", + 
"ddp_bucket_cap_mb=None,\n", + "ddp_find_unused_parameters=None,\n", + "ddp_timeout=1800,\n", + "debug=[],\n", + "deepspeed=None,\n", + "disable_tqdm=False,\n", + "dispatch_batches=None,\n", + "do_eval=False,\n", + "do_predict=False,\n", + "do_train=False,\n", + "eval_accumulation_steps=None,\n", + "eval_delay=0,\n", + "eval_steps=None,\n", + "evaluation_strategy=no,\n", + "fp16=True,\n", + "fp16_backend=auto,\n", + "fp16_full_eval=False,\n", + "fp16_opt_level=O1,\n", + "fsdp=[],\n", + "fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},\n", + "fsdp_min_num_params=0,\n", + "fsdp_transformer_layer_cls_to_wrap=None,\n", + "full_determinism=False,\n", + "gradient_accumulation_steps=4,\n", + "gradient_checkpointing=False,\n", + "greater_is_better=None,\n", + "group_by_length=False,\n", + "half_precision_backend=auto,\n", + "hub_always_push=False,\n", + "hub_model_id=None,\n", + "hub_private_repo=False,\n", + "hub_strategy=every_save,\n", + "hub_token=,\n", + "ignore_data_skip=False,\n", + "include_inputs_for_metrics=False,\n", + "include_tokens_per_second=False,\n", + "jit_mode_eval=False,\n", + "label_names=None,\n", + "label_smoothing_factor=0.0,\n", + "learning_rate=3e-05,\n", + "length_column_name=length,\n", + "load_best_model_at_end=False,\n", + "local_rank=0,\n", + "log_level=passive,\n", + "log_level_replica=warning,\n", + "log_on_each_node=True,\n", + "logging_dir=/content/artifacts/checkpoints/runs/Oct20_03-34-43_wgqlg-withretrieval-schemasplit-train-80-v4-model-notebook,\n", + "logging_first_step=False,\n", + "logging_nan_inf_filter=True,\n", + "logging_steps=50,\n", + "logging_strategy=steps,\n", + "lr_scheduler_type=cosine,\n", + "max_grad_norm=1.0,\n", + "max_steps=-1,\n", + "metric_for_best_model=None,\n", + "mp_parameters=,\n", + "no_cuda=False,\n", + "num_train_epochs=3.0,\n", + "optim=paged_adamw_32bit,\n", + "optim_args=None,\n", + "output_dir=/content/artifacts/checkpoints,\n", + "overwrite_output_dir=False,\n", + "past_index=-1,\n", + "per_device_eval_batch_size=4,\n", + "per_device_train_batch_size=4,\n", + "prediction_loss_only=False,\n", + "push_to_hub=False,\n", + "push_to_hub_model_id=None,\n", + "push_to_hub_organization=None,\n", + "push_to_hub_token=,\n", + "ray_scope=last,\n", + "remove_unused_columns=True,\n", + "report_to=[],\n", + "resume_from_checkpoint=None,\n", + "run_name=/content/artifacts/checkpoints,\n", + "save_on_each_node=False,\n", + "save_safetensors=False,\n", + "save_steps=50,\n", + "save_strategy=steps,\n", + "save_total_limit=None,\n", + "seed=42,\n", + "sharded_ddp=[],\n", + "skip_memory_metrics=True,\n", + "tf32=None,\n", + "torch_compile=False,\n", + "torch_compile_backend=None,\n", + "torch_compile_mode=None,\n", + "torchdynamo=None,\n", + "tpu_metrics_debug=False,\n", + "tpu_num_cores=None,\n", + "use_cpu=False,\n", + "use_ipex=False,\n", + "use_legacy_prediction_loop=False,\n", + "use_mps_device=False,\n", + "warmup_ratio=0.02,\n", + "warmup_steps=100,\n", + "weight_decay=0.0,\n", + ")" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from utils import parse_training_args\n", "\n", @@ -234,333 +609,38 @@ "training_args" ] }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2ae3e5f9-e28e-457b-b6bf-a62a472241bf", + "metadata": {}, + "outputs": [], + "source": [ + "# data = data[\"train\"].train_test_split(test_size=0.1)\n", + "# data\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "5bc91439-6108-445c-8f85-e6558c9f0677", + 
"metadata": {}, + "outputs": [], + "source": [ + "! mkdir -p {trained_model_path_lora}" + ] + }, { "cell_type": "code", "execution_count": null, "id": "b33e407a-9d4f-49f6-a74b-b80db8cc3a8a", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n", - "/usr/local/lib/python3.10/dist-packages/bitsandbytes/autograd/_functions.py:321: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", - " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
- "[removed output: transformers training-progress HTML: [70/70 05:58, Epoch 0/1]; Step / Training Loss table, 70 rows, loss falling from 1.7497 at step 1 to 0.9203 at step 70]\n"
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "trainer = transformers.Trainer(\n", " model=model,\n", " train_dataset=data[\"train\"],\n", + "# eval_dataset=data[\"test\"],\n", " args=training_args,\n", " data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n", ")\n", @@ -572,99 +652,49 @@ "resume_from_checkpoint = checkpoint_path.is_dir() and any(checkpoint_path.iterdir())\n", "if resume_from_checkpoint:\n", " print(\"Resuming from checkpoint:\", list(checkpoint_path.rglob(\"\")))\n", - "trainer.train(resume_from_checkpoint=resume_from_checkpoint)\n", - "\n", - "trainer.save_model(trained_model_path)" + "trainer.train(resume_from_checkpoint=resume_from_checkpoint)" ] }, { "cell_type": "code", "execution_count": null, - "id": "dea4e68e-57a7-48bd-bad9-f03dfe3f8a06", + "id": "172e47a7-400e-4f82-a5e3-38135ecf532f", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "Wed Jul 5 22:32:35 2023 \n", - "+-----------------------------------------------------------------------------+\n", - "| NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 |\n", - "|-------------------------------+----------------------+----------------------+\n", - "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", - "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", - "| | | MIG M. |\n", - "|===============================+======================+======================|\n", - "| 0 NVIDIA L4 Off | 00000000:00:04.0 Off | 0 |\n", - "| N/A 76C P0 57W / 72W | 7314MiB / 23034MiB | 89% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - "| 1 NVIDIA L4 Off | 00000000:00:05.0 Off | 0 |\n", - "| N/A 71C P0 51W / 72W | 5592MiB / 23034MiB | 1% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - " \n", - "+-----------------------------------------------------------------------------+\n", - "| Processes: |\n", - "| GPU GI CI PID Type Process name GPU Memory |\n", - "| ID ID Usage |\n", - "|=============================================================================|\n", - "+-----------------------------------------------------------------------------+\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "total 13G\n", - "4.0K drwxr-xr-x 1 root root 4.0K Jul 5 22:32 .\n", - "8.0K drwxr-xr-x 1 root root 4.0K Jul 5 10:00 ..\n", - "4.0K -rw-r--r-- 1 root root 440 Jul 5 22:32 README.md\n", - "4.0K -rw-r--r-- 1 root root 407 Jul 5 22:32 adapter_config.json\n", - " 19M -rw-r--r-- 1 root root 19M Jul 5 22:32 adapter_model.bin\n", - "4.0K -rw-r--r-- 1 root root 707 Jul 5 22:23 config.json\n", - "4.0K -rw-r--r-- 1 root root 116 Jul 5 22:23 generation_config.json\n", - "9.3G -rw-r--r-- 1 root root 9.3G Jul 5 22:23 pytorch_model-00001-of-00002.bin\n", - "3.7G -rw-r--r-- 1 root root 3.7G Jul 5 22:23 pytorch_model-00002-of-00002.bin\n", - " 20K -rw-r--r-- 1 root root 17K Jul 5 22:23 pytorch_model.bin.index.json\n", - "4.0K -rw-r--r-- 1 root root 3.9K Jul 5 22:32 training_args.bin\n" - ] - } - ], + "outputs": [], "source": [ - "! nvidia-smi" + "model.save_pretrained(trained_model_path_lora)\n", + "model" ] }, { "cell_type": "code", "execution_count": null, - "id": "f2f5dc80", + "id": "dea4e68e-57a7-48bd-bad9-f03dfe3f8a06", "metadata": {}, "outputs": [], "source": [ - "! ls -lash {trained_model_path}" + "! ls -lash {trained_model_path_lora}" ] }, { "cell_type": "code", "execution_count": null, - "id": "3837a68d-6a98-494e-a890-1135509c546e", + "id": "09db36b7-ead6-4368-9bfb-13ba1ba800a5", "metadata": {}, "outputs": [], "source": [ - "print(model)" + "model = model.merge_and_unload().half()\n", + "model" ] }, { "cell_type": "code", "execution_count": null, - "id": "09db36b7-ead6-4368-9bfb-13ba1ba800a5", + "id": "270a9a72-3a12-4d83-aa7d-2d167cb28cb4", "metadata": {}, "outputs": [], "source": [ - "model = model.merge_and_unload()" + "! ls -l {trained_model_path}" ] }, { @@ -672,18 +702,7 @@ "execution_count": null, "id": "260e9d79-6eb8-4516-bf8f-825a25606391", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - } - ], + "outputs": [], "source": [ "model.save_pretrained(trained_model_path)\n", "tokenizer.save_pretrained(trained_model_path)" @@ -694,26 +713,7 @@ "execution_count": null, "id": "6d90a920-fb22-4291-8466-411ff41e31be", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "total 13G\n", - "4.0K drwxr-xr-x 1 root root 4.0K Jul 5 22:35 .\n", - "8.0K drwxr-xr-x 1 root root 4.0K Jul 5 10:00 ..\n", - "4.0K -rw-r--r-- 1 root root 707 Jul 5 22:35 config.json\n", - "4.0K -rw-r--r-- 1 root root 116 Jul 5 22:35 generation_config.json\n", - "9.3G -rw-r--r-- 1 root root 9.3G Jul 5 22:35 pytorch_model-00001-of-00002.bin\n", - "3.7G -rw-r--r-- 1 root root 3.7G Jul 5 22:35 pytorch_model-00002-of-00002.bin\n", - " 20K -rw-r--r-- 1 root root 17K Jul 5 22:35 pytorch_model.bin.index.json\n" - ] - } - ], + "outputs": [], "source": [ "! ls -lash {trained_model_path}" ] @@ -771,7 +771,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/model-trainer-huggingface/src/utils.py b/model-trainer-huggingface/src/utils.py index ddbe983..6db1cd8 100644 --- a/model-trainer-huggingface/src/utils.py +++ b/model-trainer-huggingface/src/utils.py @@ -9,9 +9,11 @@ def parse_training_args(params: typing.Mapping) -> TrainingArguments: typed_params = dict( per_device_train_batch_size=1, + per_device_eval_batch_size=1, gradient_accumulation_steps=4, - warmup_steps=2, - learning_rate=2e-4, + warmup_ratio=0.02, + learning_rate=3e-5, + lr_scheduler_type="cosine", fp16=True, logging_steps=1, output_dir="/content/artifacts/checkpoints",
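
The new first-cell output shows the dataset being resolved from `params["dataset_urls"]`. The loading call itself is not visible in this hunk; below is a minimal sketch of the assumed flow, using the standard `datasets` JSON loader (the exact call in the notebook may differ).

```python
from datasets import load_dataset

# params is the dict loaded from /content/params.json at the top of the notebook.
# dataset_urls may hold several comma-separated URLs; the captured run uses one.
urls = [u.strip() for u in params["dataset_urls"].split(",")]
print("Using the following URLs for the dataset:", urls)

# Assumed loader: fetches the Weaviate JSON split and returns a DatasetDict
# with a single "train" split (3,190 rows in the captured output).
data = load_dataset("json", data_files=urls)
```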
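The prompt cell now guarantees the template ends with the model's EOS token; without it, the fine-tuned model never learns to stop after its GraphQL answer. An equivalent form of the check, for reference:

```python
# Equivalent to the slice comparison in the notebook: look up the EOS token
# from the model config (more robust than assuming "</s>") and append it
# only when the template does not already end with it.
eos_token = tokenizer.convert_ids_to_tokens(model.config.eos_token_id)
if not prompt.endswith(eos_token):
    prompt = prompt + eos_token
```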
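Why `modules_to_save=['embed_tokens', 'lm_head']` and the much larger trainable fraction: resizing the vocabulary for the new `[PAD]` token changes the input and output embedding matrices, so they must be trained and saved in full alongside the LoRA adapters. That is why trainable parameters grow from ~4.7M (0.07%) in the previous run to ~564M (8.0%) here, and why the notebook now saves the adapter first and only then merges:

```python
# Recap of the save sequence the notebook now uses (paths defined earlier):
model.save_pretrained(trained_model_path_lora)  # LoRA adapter plus full embed_tokens / lm_head
model = model.merge_and_unload().half()         # fold adapter deltas into base weights, cast to fp16
model.save_pretrained(trained_model_path)       # standalone merged checkpoint
tokenizer.save_pretrained(trained_model_path)   # tokenizer including the added [PAD] token
```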
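`parse_training_args` in utils.py supplies the defaults changed by this diff; values from params.json override them, which is how the printed `TrainingArguments` ends up with `warmup_steps=100`, `logging_steps=50`, `num_train_epochs=3.0`, and per-device batch size 4. The override mechanics are not part of this patch; the sketch below is a plausible reading, with the merge logic an assumption for illustration.

```python
import typing

from transformers import TrainingArguments


def parse_training_args(params: typing.Mapping) -> TrainingArguments:
    # Defaults as set in utils.py by this diff.
    typed_params = dict(
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_ratio=0.02,
        learning_rate=3e-5,
        lr_scheduler_type="cosine",
        fp16=True,
        logging_steps=1,
        output_dir="/content/artifacts/checkpoints",
    )
    # Assumed merge: any params.json key naming a TrainingArguments field
    # overrides the default above. push_to_hub is excluded because params.json
    # stores a repo name there, while the printed args show push_to_hub=False,
    # so the real helper evidently handles that key separately.
    special = {"push_to_hub"}
    overrides = {
        k: v
        for k, v in params.items()
        if k in TrainingArguments.__dataclass_fields__ and k not in special
    }
    return TrainingArguments(**{**typed_params, **overrides})
```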