From 058eed727c9742a3a254928396cd7f34969cf3f1 Mon Sep 17 00:00:00 2001 From: Dickson Neoh Date: Wed, 6 Nov 2024 23:02:58 +0800 Subject: [PATCH] Open browser to fastapi docs page when running model_serve (#54) * open browser * add serving test basic * add option to automatically open browser --- nbs/serving.ipynb | 390 +------------------------------------- tests/test_serve_model.py | 22 +++ xinfer/serve.py | 25 ++- 3 files changed, 48 insertions(+), 389 deletions(-) create mode 100644 tests/test_serve_model.py diff --git a/nbs/serving.ipynb b/nbs/serving.ipynb index b20ebc4..302f7f4 100644 --- a/nbs/serving.ipynb +++ b/nbs/serving.ipynb @@ -2,387 +2,17 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m2024-11-01 17:17:02.368\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.core\u001b[0m:\u001b[36mlist_models\u001b[0m:\u001b[36m55\u001b[0m - \u001b[1mShowing interactive table in Jupyter Notebook. Type in the search bar to filter the models.\u001b[0m\n" - ] - }, - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "\n", - "\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "
ImplementationModel IDInput --> Output
\n", - "\n", - "
\n", - "Loading ITables v2.2.2 from the init_notebook_mode cell...\n", - "(need help?)
\n", - "\n" - ], - "text/plain": [ - " Implementation Model ID \\\n", - "0 timm timm/bat_resnext26ts.ch_in1k \n", - "1 timm timm/beit_base_patch16_224.in22k_ft_in22k_in1k \n", - "2 timm timm/beit_base_patch16_384.in22k_ft_in22k_in1k \n", - "3 timm timm/beit_large_patch16_224.in22k_ft_in22k_in1k \n", - "4 timm timm/beit_large_patch16_384.in22k_ft_in22k_in1k \n", - "... ... ... \n", - "1291 vllm vllm/allenai/Molmo-7B-D-0924 \n", - "1292 vllm vllm/allenai/Molmo-7B-O-0924 \n", - "1293 vllm vllm/allenai/Molmo-72B-0924 \n", - "1294 vllm vllm/microsoft/Phi-3.5-vision-instruct \n", - "1295 vllm vllm/microsoft/Phi-3-vision-128k-instruct \n", - "\n", - " Input --> Output \n", - "0 image --> categories \n", - "1 image --> categories \n", - "2 image --> categories \n", - "3 image --> categories \n", - "4 image --> categories \n", - "... ... \n", - "1291 image-text --> text \n", - "1292 image-text --> text \n", - "1293 image-text --> text \n", - "1294 image-text --> text \n", - "1295 image-text --> text \n", - "\n", - "[1296 rows x 3 columns]" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import xinfer\n", - "\n", - "xinfer.list_models(interactive=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# model = xinfer.create_model(\"vllm/microsoft/Phi-3.5-vision-instruct\", device=\"cuda\", dtype=\"float16\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-11-01 17:17:03,267\tINFO worker.py:1807 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "INFO 2024-11-01 17:17:05,020 serve 55531 api.py:277 - Started Serve in namespace \"serve\".\n", - "INFO 2024-11-01 17:17:05,029 serve 55531 api.py:259 - Connecting to existing Serve app in namespace \"serve\". New http options will not be applied.\n", - "WARNING 2024-11-01 17:17:05,030 serve 55531 api.py:85 - The new client HTTP config differs from the existing one in the following fields: ['location']. The new HTTP config is ignored.\n", - "\u001b[36m(ServeController pid=55808)\u001b[0m INFO 2024-11-01 17:17:05,094 controller 55808 deployment_state.py:1604 - Deploying new version of Deployment(name='XInferModel', app='default') (initial target replicas: 1).\n", - "\u001b[36m(ProxyActor pid=55813)\u001b[0m INFO 2024-11-01 17:17:05,004 proxy 192.168.100.60 proxy.py:1191 - Proxy starting on node 0e1972e04709e7ecaa3814bc42e11be9dc4187d42edafa6099b4b6a9 (HTTP port: 8000).\n", - "\u001b[36m(ServeController pid=55808)\u001b[0m INFO 2024-11-01 17:17:05,199 controller 55808 deployment_state.py:1850 - Adding 1 replica to Deployment(name='XInferModel', app='default').\n", - "\u001b[36m(ServeReplica:default:XInferModel pid=55811)\u001b[0m 2024-11-01 17:17:09.266 | INFO | xinfer.models:__init__:63 - Model: vikhyatk/moondream2\n", - "\u001b[36m(ServeReplica:default:XInferModel pid=55811)\u001b[0m 2024-11-01 17:17:09.267 | INFO | xinfer.models:__init__:64 - Device: cuda\n", - "\u001b[36m(ServeReplica:default:XInferModel pid=55811)\u001b[0m 2024-11-01 17:17:09.267 | INFO | xinfer.models:__init__:65 - Dtype: float16\n", - "\u001b[36m(ServeReplica:default:XInferModel pid=55811)\u001b[0m PhiForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.\n", - "\u001b[36m(ServeReplica:default:XInferModel pid=55811)\u001b[0m - If you're using `trust_remote_code=True`, you can get rid of this warning by loading the model with an auto class. See https://huggingface.co/docs/transformers/en/model_doc/auto#auto-classes\n", - "\u001b[36m(ServeReplica:default:XInferModel pid=55811)\u001b[0m - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).\n", - "\u001b[36m(ServeReplica:default:XInferModel pid=55811)\u001b[0m - If you are not the owner of the model architecture class, please contact the model code owner to update it.\n", - "INFO 2024-11-01 17:17:16,118 serve 55531 client.py:492 - Deployment 'XInferModel:bdc2x8r1' is ready at `http://127.0.0.1:8000/`. component=serve deployment=XInferModel\n", - "INFO 2024-11-01 17:17:16,119 serve 55531 api.py:549 - Deployed app 'default' successfully.\n", - "\u001b[36m(ServeReplica:default:XInferModel pid=55811)\u001b[0m INFO 2024-11-01 17:17:38,529 default_XInferModel u9micml4 f99f8d3f-f6e2-4efc-94f2-305d21308560 /infer replica.py:378 - __CALL__ OK 1260.5ms\n", - "WARNING 2024-11-01 17:17:47,345 serve 55531 api.py:557 - Got KeyboardInterrupt, exiting...\n", - "\u001b[32m2024-11-01 17:17:47.346\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.serve\u001b[0m:\u001b[36mserve_model\u001b[0m:\u001b[36m94\u001b[0m - \u001b[1mReceiving shutdown signal. Cleaning up...\u001b[0m\n", - "\u001b[36m(ServeController pid=55808)\u001b[0m INFO 2024-11-01 17:17:47,355 controller 55808 deployment_state.py:1866 - Removing 1 replica from Deployment(name='XInferModel', app='default').\n", - "\u001b[36m(ServeController pid=55808)\u001b[0m INFO 2024-11-01 17:17:49,373 controller 55808 deployment_state.py:2191 - Replica(id='u9micml4', deployment='XInferModel', app='default') is stopped.\n" - ] - } - ], - "source": [ + "import xinfer\n", "xinfer.serve_model(\n", " \"vikhyatk/moondream2\",\n", " device=\"cuda\",\n", " dtype=\"float16\",\n", - " blocking=True,\n", + " blocking=False,\n", + " open_api_docs=True,\n", " # port=8001,\n", " # deployment_kwargs={\n", " # \"num_replicas\": 1,\n", @@ -393,17 +23,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO 2024-11-01 17:17:51,787 serve 55531 api.py:125 - Nothing to shut down. There's no Serve application running on this Ray cluster.\n" - ] - } - ], + "outputs": [], "source": [ "from ray import serve\n", "\n", diff --git a/tests/test_serve_model.py b/tests/test_serve_model.py new file mode 100644 index 0000000..5c0dd9c --- /dev/null +++ b/tests/test_serve_model.py @@ -0,0 +1,22 @@ +from ray import serve + +from xinfer.serve import serve_model + + +def test_serve_model(): + serve_model("vikhyatk/moondream2", blocking=False, open_api_docs=False) + + serve.shutdown() + + +def test_serve_model_custom_deployment(): + """Test model serving with custom deployment options""" + deployment_kwargs = {"num_replicas": 2, "ray_actor_options": {"num_cpus": 2}} + handle = serve_model( + "vikhyatk/moondream2", + deployment_kwargs=deployment_kwargs, + blocking=False, + open_api_docs=False, + ) + assert handle.deployment_id.name == "XInferModel" + serve.shutdown() diff --git a/xinfer/serve.py b/xinfer/serve.py index c2573c0..30da1b9 100644 --- a/xinfer/serve.py +++ b/xinfer/serve.py @@ -68,6 +68,7 @@ def serve_model( host: str = "127.0.0.1", port: int = 8000, blocking: bool = True, + open_api_docs: bool = True, **model_kwargs, ): deployment_kwargs = deployment_kwargs or {} @@ -84,12 +85,26 @@ def serve_model( app = deployment.bind(model_id, **model_kwargs) try: - handle = serve.run(app, blocking=blocking) - if not blocking: + handle = serve.run(app) + logger.info(f"Open FastAPI docs at http://{host}:{port}/docs") + if open_api_docs: + import webbrowser + + webbrowser.open(f"http://{host}:{port}/docs") + + if blocking: + try: + while True: + time.sleep(1) + except (KeyboardInterrupt, SystemExit): + logger.info("Receiving shutdown signal. Cleaning up...") + serve.shutdown() + else: logger.info( "Running server in non-blocking mode, remember to call serve.shutdown() to stop the server" ) - return handle # Return handle without shutting down - except (KeyboardInterrupt, SystemExit): - logger.info("Receiving shutdown signal. Cleaning up...") + return handle + except Exception as e: + logger.error(f"Error starting server: {str(e)}") serve.shutdown() + raise