Add Model Stats to track inference time and add gradio launch from model #13

Merged 6 commits on Oct 18, 2024
Binary file added assets/demo/000b9c365c9e307a.jpg
Binary file added assets/demo/00aa2580828a9009.jpg
Binary file added assets/demo/0a67368e57e5f3b2.jpg
Binary file added assets/demo/0a6ee446579d2885.jpg
Binary file added assets/demo/0a763eb264e84549.jpg
Binary file added assets/demo/0a76d1694905b51d.jpg
Binary file added assets/demo/0a8caaad03cfd733.jpg
Binary file added assets/demo/0ac1f98f5a2f7416.jpg
Binary file added assets/demo/0aec83541fdc1f29.jpg
Binary file added assets/demo/0aff95480396cd85.jpg
144 changes: 109 additions & 35 deletions nbs/quickstart.ipynb
@@ -29,11 +29,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {
"id": "5_DEOCy61Mlg"
},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"'2.2.0+cu121'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import torch\n",
"\n",
@@ -49,15 +60,26 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1MCW7-AN16Rq",
"outputId": "9520b079-79a0-45f1-c7a8-7f0deb3cfe68"
},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.cuda.is_available()"
]
@@ -120,7 +142,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -189,7 +211,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -203,10 +225,10 @@
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-10-17 00:44:23.897\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.transformers.moondream\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m24\u001b[0m - \u001b[1mModel: vikhyatk/moondream2\u001b[0m\n",
"\u001b[32m2024-10-17 00:44:23.898\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.transformers.moondream\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m25\u001b[0m - \u001b[1mRevision: 2024-08-26\u001b[0m\n",
"\u001b[32m2024-10-17 00:44:23.898\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.transformers.moondream\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1mDevice: cuda\u001b[0m\n",
"\u001b[32m2024-10-17 00:44:23.898\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.transformers.moondream\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mDtype: float16\u001b[0m\n",
"\u001b[32m2024-10-19 00:49:55.881\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.transformers.moondream\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m24\u001b[0m - \u001b[1mModel: vikhyatk/moondream2\u001b[0m\n",
"\u001b[32m2024-10-19 00:49:55.881\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.transformers.moondream\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m25\u001b[0m - \u001b[1mRevision: 2024-08-26\u001b[0m\n",
"\u001b[32m2024-10-19 00:49:55.882\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.transformers.moondream\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1mDevice: cuda\u001b[0m\n",
"\u001b[32m2024-10-19 00:49:55.882\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.transformers.moondream\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mDtype: float16\u001b[0m\n",
"PhiForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.\n",
" - If you're using `trust_remote_code=True`, you can get rid of this warning by loading the model with an auto class. See https://huggingface.co/docs/transformers/en/model_doc/auto#auto-classes\n",
" - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).\n",
@@ -227,7 +249,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -238,7 +260,7 @@
"<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=773x767>"
]
},
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -260,16 +282,16 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'An anime-style illustration depicts a young girl with white hair and green eyes, wearing a white jacket and holding a large burger, seated at a table with a plate of food in front of her.'"
"'An anime-style illustration depicts a young girl with white hair and green eyes, wearing a white jacket, holding a large burger in her hands and smiling.'"
]
},
"execution_count": 4,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -290,16 +312,16 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The image depicts a young girl with long, white hair and blue eyes sitting at a table, holding a large burger in her hands. The background shows a dimly lit room with a window, suggesting an indoor setting.'"
"'The image depicts a young girl with long, white hair and blue eyes sitting at a table, holding a large burger in her hands. The background shows a cozy indoor setting with a window and a chair visible.'"
]
},
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -315,12 +337,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"If you'd like to see the generation info, you can set `verbose=True`. This might be useful if you're running some sort of benchmark on the inference time."
"If you'd like to see the inference stats, you can do so by calling the `print_stats` method. This might be useful if you're running some sort of benchmark on the inference time."
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -330,28 +352,42 @@
"outputId": "54903d6b-f81e-4f28-da4d-a4398bd6e532"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-10-17 00:44:32.383\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.transformers.moondream\u001b[0m:\u001b[36minfer\u001b[0m:\u001b[36m91\u001b[0m - \u001b[1mInference time: 687.3982 ms\u001b[0m\n",
"\u001b[32m2024-10-17 00:44:32.384\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.transformers.moondream\u001b[0m:\u001b[36minfer\u001b[0m:\u001b[36m92\u001b[0m - \u001b[1mDevice: cuda\u001b[0m\n",
"\u001b[32m2024-10-17 00:44:32.384\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.transformers.moondream\u001b[0m:\u001b[36minfer\u001b[0m:\u001b[36m93\u001b[0m - \u001b[1mDtype: torch.float16\u001b[0m\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\"> Model Stats </span>\n",
"╭───────────────────────────┬─────────────────────╮\n",
"│<span style=\"font-weight: bold\"> Attribute </span>│<span style=\"font-weight: bold\"> Value </span>│\n",
"├───────────────────────────┼─────────────────────┤\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Model ID </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> vikhyatk/moondream2 </span>│\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Device </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> cuda </span>│\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Dtype </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> torch.float16 </span>│\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Number of Inferences </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> 2 </span>│\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Total Inference Time (ms) </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> 1511.7485 </span>│\n",
"│<span style=\"color: #008080; text-decoration-color: #008080\"> Average Latency (ms) </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> 755.8742 </span>│\n",
"╰───────────────────────────┴─────────────────────╯\n",
"</pre>\n"
],
"text/plain": [
"'The image depicts a young girl with long, white hair and blue eyes sitting at a table, holding a large burger in her hands. The background shows a dimly lit room with a window, suggesting an indoor setting.'"
"\u001b[3m Model Stats \u001b[0m\n",
"╭───────────────────────────┬─────────────────────╮\n",
"│\u001b[1m \u001b[0m\u001b[1mAttribute \u001b[0m\u001b[1m \u001b[0m│\u001b[1m \u001b[0m\u001b[1mValue \u001b[0m\u001b[1m \u001b[0m│\n",
"├───────────────────────────┼─────────────────────┤\n",
"│\u001b[36m \u001b[0m\u001b[36mModel ID \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mvikhyatk/moondream2\u001b[0m\u001b[35m \u001b[0m│\n",
"│\u001b[36m \u001b[0m\u001b[36mDevice \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mcuda \u001b[0m\u001b[35m \u001b[0m│\n",
"│\u001b[36m \u001b[0m\u001b[36mDtype \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mtorch.float16 \u001b[0m\u001b[35m \u001b[0m│\n",
"│\u001b[36m \u001b[0m\u001b[36mNumber of Inferences \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m│\n",
"│\u001b[36m \u001b[0m\u001b[36mTotal Inference Time (ms)\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m1511.7485 \u001b[0m\u001b[35m \u001b[0m│\n",
"│\u001b[36m \u001b[0m\u001b[36mAverage Latency (ms) \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m755.8742 \u001b[0m\u001b[35m \u001b[0m│\n",
"╰───────────────────────────┴─────────────────────╯\n"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
"output_type": "display_data"
}
],
"source": [
"model.infer(image, prompt, verbose=True)"
"model.stats.print_stats()"
]
},
{
Expand All @@ -365,7 +401,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -375,7 +411,7 @@
" 'The image depicts a young girl with long, white hair and blue eyes sitting at a table, holding a large burger in her hands. The background shows a cozy indoor setting with a window and a chair visible.']"
]
},
"execution_count": 7,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -384,6 +420,44 @@
"model.infer_batch([image, image], [prompt, prompt])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For convenience, you can also launch a Gradio interface to interact with the model."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"* Running on local URL: http://127.0.0.1:7860\n",
"\n",
"To create a public link, set `share=True` in `launch()`.\n"
]
},
{
"data": {
"text/html": [
"<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"model.launch_gradio()"
]
},
{
"cell_type": "markdown",
"metadata": {},
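The stats table in the quickstart output tracks model ID, device, dtype, number of inferences, total inference time, and average latency. The bookkeeping behind such a table could look like the sketch below. This is a hypothetical illustration only, not the actual xinfer implementation; the `ModelStats` class, `record` method, and the timing pattern are assumptions, with only `print_stats` and the table fields taken from the diff.

```python
import time
from dataclasses import dataclass


@dataclass
class ModelStats:
    """Accumulates per-call inference timings, mirroring the fields
    shown in the quickstart's stats table (hypothetical sketch)."""

    model_id: str
    device: str
    dtype: str
    num_inferences: int = 0
    total_inference_ms: float = 0.0

    def record(self, elapsed_ms: float) -> None:
        # Called once per inference with the measured wall-clock time.
        self.num_inferences += 1
        self.total_inference_ms += elapsed_ms

    @property
    def average_latency_ms(self) -> float:
        # Guard against division by zero before any inference has run.
        if self.num_inferences == 0:
            return 0.0
        return self.total_inference_ms / self.num_inferences

    def print_stats(self) -> None:
        # Plain-text rendering; the notebook output uses a rich table.
        print(f"Model ID: {self.model_id}")
        print(f"Device: {self.device}")
        print(f"Dtype: {self.dtype}")
        print(f"Number of Inferences: {self.num_inferences}")
        print(f"Total Inference Time (ms): {self.total_inference_ms:.4f}")
        print(f"Average Latency (ms): {self.average_latency_ms:.4f}")


# Timing one call and recording it (the inference itself is elided):
stats = ModelStats("vikhyatk/moondream2", "cuda", "torch.float16")
start = time.perf_counter()
# ... run model inference here ...
stats.record((time.perf_counter() - start) * 1000)
```

With two recorded inferences, `average_latency_ms` is simply the total divided by the count, matching the relationship between the "Total Inference Time" and "Average Latency" rows in the output above.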