From 43972e100d1257375584927f32cf3fd33fe74383 Mon Sep 17 00:00:00 2001 From: Andrew Truong Date: Thu, 24 Oct 2024 23:24:17 -0700 Subject: [PATCH] wip --- docs/docs/tutorial-eval.md | 346 ++++++++++-------- docs/docs/tutorial-rag.md | 722 ++++++++++++++++++++----------------- 2 files changed, 589 insertions(+), 479 deletions(-) diff --git a/docs/docs/tutorial-eval.md b/docs/docs/tutorial-eval.md index 2b4f202244..1d0be1772b 100644 --- a/docs/docs/tutorial-eval.md +++ b/docs/docs/tutorial-eval.md @@ -1,3 +1,7 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + + # Tutorial: Build an Evaluation pipeline To iterate on an application, we need a way to evaluate if it's improving. To do so, a common practice is to test it against the same set of examples when there is a change. Weave has a first-class way to track evaluations with `Model` & `Evaluation` classes. We have built the APIs to make minimal assumptions to allow for the flexibility to support a wide array of use-cases. @@ -17,50 +21,68 @@ Weave automatically captures when they are used and updates the version when the ::: -```python -import json -import openai -import weave - -# highlight-next-line -class ExtractFruitsModel(weave.Model): - model_name: str - prompt_template: str + + + ```python + import json + import openai + import weave # highlight-next-line - @weave.op() - # highlight-next-line - async def predict(self, sentence: str) -> dict: - client = openai.AsyncClient() - - response = await client.chat.completions.create( - model=self.model_name, - messages=[ - {"role": "user", "content": self.prompt_template.format(sentence=sentence)} - ], - ) - result = response.choices[0].message.content - if result is None: - raise ValueError("No response from model") - parsed = json.loads(result) - return parsed -``` - -You can instantiate `Model` objects as normal like this: + class ExtractFruitsModel(weave.Model): + model_name: str + prompt_template: str -```python -import asyncio -import weave + # highlight-next-line + @weave.op() + # highlight-next-line + async def predict(self, sentence: str) -> dict: + client = openai.AsyncClient() + + response = await client.chat.completions.create( + model=self.model_name, + messages=[ + {"role": "user", "content": self.prompt_template.format(sentence=sentence)} + ], + ) + result = response.choices[0].message.content + if result is None: + raise ValueError("No response from model") + parsed = json.loads(result) + return parsed + ``` + + + ```plaintext + This feature is not available in TypeScript yet. Stay tuned! + ``` + + -weave.init('intro-example') +You can instantiate `Model` objects as normal like this: -model = ExtractFruitsModel(model_name='gpt-3.5-turbo-1106', - prompt_template='Extract fields ("fruit": , "color": , "flavor": ) from the following text, as json: {sentence}') -sentence = "There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy." -print(asyncio.run(model.predict(sentence))) -# if you're in a Jupyter Notebook, run: -# await model.predict(sentence) -``` + + + ```python + import asyncio + import weave + + weave.init('intro-example') + + model = ExtractFruitsModel(model_name='gpt-3.5-turbo-1106', + prompt_template='Extract fields ("fruit": , "color": , "flavor": ) from the following text, as json: {sentence}') + sentence = "There are many fruits that were found on the recently discovered planet Goocrux. 
There are neoskizzles that grow there, which are purple and taste like candy." + print(asyncio.run(model.predict(sentence))) + # if you're in a Jupyter Notebook, run: + # await model.predict(sentence) + ``` + + + ```plaintext + This feature is not available in TypeScript yet. Stay tuned! + ``` + + :::note Checkout the [Models](/guides/core-types/models) guide to learn more. @@ -68,21 +90,32 @@ Checkout the [Models](/guides/core-types/models) guide to learn more. ## 2. Collect some examples -```python -sentences = ["There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.", -"Pounits are a bright green color and are more savory than sweet.", -"Finally, there are fruits called glowls, which have a very sour and bitter taste which is acidic and caustic, and a pale orange tinge to them."] -labels = [ - {'fruit': 'neoskizzles', 'color': 'purple', 'flavor': 'candy'}, - {'fruit': 'pounits', 'color': 'bright green', 'flavor': 'savory'}, - {'fruit': 'glowls', 'color': 'pale orange', 'flavor': 'sour and bitter'} -] -examples = [ - {'id': '0', 'sentence': sentences[0], 'target': labels[0]}, - {'id': '1', 'sentence': sentences[1], 'target': labels[1]}, - {'id': '2', 'sentence': sentences[2], 'target': labels[2]} -] -``` + + + ```python + sentences = [ + "There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.", + "Pounits are a bright green color and are more savory than sweet.", + "Finally, there are fruits called glowls, which have a very sour and bitter taste which is acidic and caustic, and a pale orange tinge to them." + ] + labels = [ + {'fruit': 'neoskizzles', 'color': 'purple', 'flavor': 'candy'}, + {'fruit': 'pounits', 'color': 'bright green', 'flavor': 'savory'}, + {'fruit': 'glowls', 'color': 'pale orange', 'flavor': 'sour and bitter'} + ] + examples = [ + {'id': '0', 'sentence': sentences[0], 'target': labels[0]}, + {'id': '1', 'sentence': sentences[1], 'target': labels[1]}, + {'id': '2', 'sentence': sentences[2], 'target': labels[2]} + ] + ``` + + + ```plaintext + This feature is not available in TypeScript yet. Stay tuned! + ``` + + ## 3. Evaluate a `Model` @@ -92,117 +125,136 @@ Here, we'll use a default scoring class `MultiTaskBinaryClassificationF1` and we Here `sentence` is passed to the model's predict function, and `target` is used in the scoring function, these are inferred based on the argument names of the `predict` and scoring functions. The `fruit` key needs to be outputted by the model's predict function and must also be existing as a column in the dataset (or outputted by the `preprocess_model_input` function if defined). 
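
If your dataset's column names don't line up with your `predict` arguments, `preprocess_model_input` is the hook for bridging the gap. The snippet below is a rough sketch only, assuming that this callable receives each raw dataset row and returns the keyword arguments for `predict`; `examples` is the list from step 2, and `fruit_name_score` is the scorer defined just below.

```python
# Hypothetical variant of the dataset above, with the input stored under `text` instead of `sentence`.
examples_with_text = [
    {'id': ex['id'], 'text': ex['sentence'], 'target': ex['target']} for ex in examples
]

# Assumption: this receives one raw dataset row and returns the kwargs passed to `predict`.
def preprocess_model_input(row: dict) -> dict:
    return {'sentence': row['text']}

evaluation = weave.Evaluation(
    dataset=examples_with_text,
    scorers=[fruit_name_score],
    preprocess_model_input=preprocess_model_input,
)
```

The rest of the pipeline is unchanged; only the mapping from dataset columns to `predict` arguments moves into the preprocessing step.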
-```python -import weave -from weave.flow.scorer import MultiTaskBinaryClassificationF1 + + + ```python + import weave + from weave.flow.scorer import MultiTaskBinaryClassificationF1 -weave.init('intro-example') + weave.init('intro-example') -@weave.op() -def fruit_name_score(target: dict, model_output: dict) -> dict: - return {'correct': target['fruit'] == model_output['fruit']} + @weave.op() + def fruit_name_score(target: dict, model_output: dict) -> dict: + return {'correct': target['fruit'] == model_output['fruit']} -# highlight-next-line -evaluation = weave.Evaluation( - # highlight-next-line - dataset=examples, # highlight-next-line - scorers=[ + evaluation = weave.Evaluation( # highlight-next-line - MultiTaskBinaryClassificationF1(class_names=["fruit", "color", "flavor"]), + dataset=examples, # highlight-next-line - fruit_name_score + scorers=[ + # highlight-next-line + MultiTaskBinaryClassificationF1(class_names=["fruit", "color", "flavor"]), + # highlight-next-line + fruit_name_score + # highlight-next-line + ], + # highlight-next-line + ) # highlight-next-line - ], -# highlight-next-line -) -# highlight-next-line -print(asyncio.run(evaluation.evaluate(model))) -# if you're in a Jupyter Notebook, run: -# await evaluation.evaluate(model) -``` + print(asyncio.run(evaluation.evaluate(model))) + # if you're in a Jupyter Notebook, run: + # await evaluation.evaluate(model) + ``` + + + ```plaintext + This feature is not available in TypeScript yet. Stay tuned! + ``` + + In some applications we want to create custom `Scorer` classes - where for example a standardized `LLMJudge` class should be created with specific parameters (e.g. chat model, prompt), specific scoring of each row, and specific calculation of an aggregate score. See the tutorial on defining a `Scorer` class in the next chapter on [Model-Based Evaluation of RAG applications](/tutorial-rag#optional-defining-a-scorer-class) for more information. ## 4. Pulling it all together -```python -import json -import asyncio -# highlight-next-line -import weave -# highlight-next-line -from weave.flow.scorer import MultiTaskBinaryClassificationF1 -import openai - -# We create a model class with one predict function. -# All inputs, predictions and parameters are automatically captured for easy inspection. + + + ```python + import json + import asyncio + # highlight-next-line + import weave + # highlight-next-line + from weave.flow.scorer import MultiTaskBinaryClassificationF1 + import openai -# highlight-next-line -class ExtractFruitsModel(weave.Model): - model_name: str - prompt_template: str + # We create a model class with one predict function. + # All inputs, predictions and parameters are automatically captured for easy inspection. # highlight-next-line + class ExtractFruitsModel(weave.Model): + model_name: str + prompt_template: str + + # highlight-next-line + @weave.op() + # highlight-next-line + async def predict(self, sentence: str) -> dict: + client = openai.AsyncClient() + + response = await client.chat.completions.create( + model=self.model_name, + messages=[ + {"role": "user", "content": self.prompt_template.format(sentence=sentence)} + ], + response_format={ "type": "json_object" } + ) + result = response.choices[0].message.content + if result is None: + raise ValueError("No response from model") + parsed = json.loads(result) + return parsed + + # We call init to begin capturing data in the project, intro-example. + weave.init('intro-example') + + # We create our model with our system prompt. 
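      # Note: `name` is just the label this Model object gets in Weave, while `model_name` is the OpenAI model that `predict` calls.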
+ model = ExtractFruitsModel(name='gpt4', + model_name='gpt-4-0125-preview', + prompt_template='Extract fields ("fruit": , "color": , "flavor") from the following text, as json: {sentence}') + sentences = ["There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.", + "Pounits are a bright green color and are more savory than sweet.", + "Finally, there are fruits called glowls, which have a very sour and bitter taste which is acidic and caustic, and a pale orange tinge to them."] + labels = [ + {'fruit': 'neoskizzles', 'color': 'purple', 'flavor': 'candy'}, + {'fruit': 'pounits', 'color': 'bright green', 'flavor': 'savory'}, + {'fruit': 'glowls', 'color': 'pale orange', 'flavor': 'sour and bitter'} + ] + examples = [ + {'id': '0', 'sentence': sentences[0], 'target': labels[0]}, + {'id': '1', 'sentence': sentences[1], 'target': labels[1]}, + {'id': '2', 'sentence': sentences[2], 'target': labels[2]} + ] + # If you have already published the Dataset, you can run: + # dataset = weave.ref('example_labels').get() + + # We define a scoring function to compare our model predictions with a ground truth label. @weave.op() + def fruit_name_score(target: dict, model_output: dict) -> dict: + return {'correct': target['fruit'] == model_output['fruit']} + + # Finally, we run an evaluation of this model. + # This will generate a prediction for each input example, and then score it with each scoring function. # highlight-next-line - async def predict(self, sentence: str) -> dict: - client = openai.AsyncClient() - - response = await client.chat.completions.create( - model=self.model_name, - messages=[ - {"role": "user", "content": self.prompt_template.format(sentence=sentence)} - ], - response_format={ "type": "json_object" } - ) - result = response.choices[0].message.content - if result is None: - raise ValueError("No response from model") - parsed = json.loads(result) - return parsed - -# We call init to begin capturing data in the project, intro-example. -weave.init('intro-example') - -# We create our model with our system prompt. -model = ExtractFruitsModel(name='gpt4', - model_name='gpt-4-0125-preview', - prompt_template='Extract fields ("fruit": , "color": , "flavor") from the following text, as json: {sentence}') -sentences = ["There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.", -"Pounits are a bright green color and are more savory than sweet.", -"Finally, there are fruits called glowls, which have a very sour and bitter taste which is acidic and caustic, and a pale orange tinge to them."] -labels = [ - {'fruit': 'neoskizzles', 'color': 'purple', 'flavor': 'candy'}, - {'fruit': 'pounits', 'color': 'bright green', 'flavor': 'savory'}, - {'fruit': 'glowls', 'color': 'pale orange', 'flavor': 'sour and bitter'} -] -examples = [ - {'id': '0', 'sentence': sentences[0], 'target': labels[0]}, - {'id': '1', 'sentence': sentences[1], 'target': labels[1]}, - {'id': '2', 'sentence': sentences[2], 'target': labels[2]} -] -# If you have already published the Dataset, you can run: -# dataset = weave.ref('example_labels').get() - -# We define a scoring function to compare our model predictions with a ground truth label. -@weave.op() -def fruit_name_score(target: dict, model_output: dict) -> dict: - return {'correct': target['fruit'] == model_output['fruit']} - -# Finally, we run an evaluation of this model. 
-# This will generate a prediction for each input example, and then score it with each scoring function. -# highlight-next-line -evaluation = weave.Evaluation( - name='fruit_eval', + evaluation = weave.Evaluation( + # highlight-next-line + name='fruit_eval', + # highlight-next-line + dataset=examples, scorers=[MultiTaskBinaryClassificationF1(class_names=["fruit", "color", "flavor"]), fruit_name_score], # highlight-next-line - dataset=examples, scorers=[MultiTaskBinaryClassificationF1(class_names=["fruit", "color", "flavor"]), fruit_name_score], -# highlight-next-line -) -print(asyncio.run(evaluation.evaluate(model))) -# if you're in a Jupyter Notebook, run: -# await evaluation.evaluate(model) -``` + ) + print(asyncio.run(evaluation.evaluate(model))) + # if you're in a Jupyter Notebook, run: + # await evaluation.evaluate(model) + ``` + + + ```plaintext + This feature is not available in TypeScript yet. Stay tuned! + ``` + + ## What's next? diff --git a/docs/docs/tutorial-rag.md b/docs/docs/tutorial-rag.md index 43fbf3d999..d87a52b69b 100644 --- a/docs/docs/tutorial-rag.md +++ b/docs/docs/tutorial-rag.md @@ -1,3 +1,7 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + + # Tutorial: Model-Based Evaluation of RAG applications Retrieval Augmented Generation (RAG) is a common way of building Generative AI applications that have access to custom knowledge bases. @@ -13,104 +17,122 @@ Check out the [RAG++ course](https://www.wandb.courses/courses/rag-in-production First, we compute the embeddings for our articles. You would typically do this once with your articles and put the embeddings & metadata in a database, but here we're doing it every time we run our script for simplicity. -```python -from openai import OpenAI -import weave -from weave import Model -import numpy as np -import json -import asyncio - -articles = [ - "Novo Nordisk and Eli Lilly rival soars 32 percent after promising weight loss drug results Shares of Denmarks Zealand Pharma shot 32 percent higher in morning trade, after results showed success in its liver disease treatment survodutide, which is also on trial as a drug to treat obesity. The trial “tells us that the 6mg dose is safe, which is the top dose used in the ongoing [Phase 3] obesity trial too,” one analyst said in a note. The results come amid feverish investor interest in drugs that can be used for weight loss.", - "Berkshire shares jump after big profit gain as Buffetts conglomerate nears $1 trillion valuation Berkshire Hathaway shares rose on Monday after Warren Buffetts conglomerate posted strong earnings for the fourth quarter over the weekend. Berkshires Class A and B shares jumped more than 1.5%, each. Class A shares are higher by more than 17% this year, while Class B has gained more than 18%. Berkshire was last valued at $930.1 billion, up from $905.5 billion where it closed on Friday, according to FactSet. Berkshire on Saturday posted fourth-quarter operating earnings of $8.481 billion, about 28 percent higher than the $6.625 billion from the year-ago period, driven by big gains in its insurance business. Operating earnings refers to profits from businesses across insurance, railroads and utilities. Meanwhile, Berkshires cash levels also swelled to record levels. 
The conglomerate held $167.6 billion in cash in the fourth quarter, surpassing the $157.2 billion record the conglomerate held in the prior quarter.", - "Highmark Health says its combining tech from Google and Epic to give doctors easier access to information Highmark Health announced it is integrating technology from Google Cloud and the health-care software company Epic Systems. The integration aims to make it easier for both payers and providers to access key information they need, even if its stored across multiple points and formats, the company said. Highmark is the parent company of a health plan with 7 million members, a provider network of 14 hospitals and other entities", - "Rivian and Lucid shares plunge after weak EV earnings reports Shares of electric vehicle makers Rivian and Lucid fell Thursday after the companies reported stagnant production in their fourth-quarter earnings after the bell Wednesday. Rivian shares sank about 25 percent, and Lucids stock dropped around 17 percent. Rivian forecast it will make 57,000 vehicles in 2024, slightly less than the 57,232 vehicles it produced in 2023. Lucid said it expects to make 9,000 vehicles in 2024, more than the 8,428 vehicles it made in 2023.", - "Mauritius blocks Norwegian cruise ship over fears of a potential cholera outbreak Local authorities on Sunday denied permission for the Norwegian Dawn ship, which has 2,184 passengers and 1,026 crew on board, to access the Mauritius capital of Port Louis, citing “potential health risks.” The Mauritius Ports Authority said Sunday that samples were taken from at least 15 passengers on board the cruise ship. A spokesperson for the U.S.-headquartered Norwegian Cruise Line Holdings said Sunday that 'a small number of guests experienced mild symptoms of a stomach-related illness' during Norwegian Dawns South Africa voyage.", - "Intuitive Machines lands on the moon in historic first for a U.S. company Intuitive Machines Nova-C cargo lander, named Odysseus after the mythological Greek hero, is the first U.S. spacecraft to soft land on the lunar surface since 1972. Intuitive Machines is the first company to pull off a moon landing — government agencies have carried out all previously successful missions. The company's stock surged in extended trading Thursday, after falling 11 percent in regular trading.", - "Lunar landing photos: Intuitive Machines Odysseus sends back first images from the moon Intuitive Machines cargo moon lander Odysseus returned its first images from the surface. Company executives believe the lander caught its landing gear sideways on the moon's surface while touching down and tipped over. Despite resting on its side, the company's historic IM-1 mission is still operating on the moon.", -] - -def docs_to_embeddings(docs: list) -> list: - openai = OpenAI() - document_embeddings = [] - for doc in docs: - response = ( - openai.embeddings.create(input=doc, model="text-embedding-3-small") - .data[0] - .embedding - ) - document_embeddings.append(response) - return document_embeddings + + + ```python + from openai import OpenAI + import weave + from weave import Model + import numpy as np + import json + import asyncio + + articles = [ + "Novo Nordisk and Eli Lilly rival soars 32 percent after promising weight loss drug results Shares of Denmarks Zealand Pharma shot 32 percent higher in morning trade, after results showed success in its liver disease treatment survodutide, which is also on trial as a drug to treat obesity. 
The trial “tells us that the 6mg dose is safe, which is the top dose used in the ongoing [Phase 3] obesity trial too,” one analyst said in a note. The results come amid feverish investor interest in drugs that can be used for weight loss.", + "Berkshire shares jump after big profit gain as Buffetts conglomerate nears $1 trillion valuation Berkshire Hathaway shares rose on Monday after Warren Buffetts conglomerate posted strong earnings for the fourth quarter over the weekend. Berkshires Class A and B shares jumped more than 1.5%, each. Class A shares are higher by more than 17% this year, while Class B has gained more than 18%. Berkshire was last valued at $930.1 billion, up from $905.5 billion where it closed on Friday, according to FactSet. Berkshire on Saturday posted fourth-quarter operating earnings of $8.481 billion, about 28 percent higher than the $6.625 billion from the year-ago period, driven by big gains in its insurance business. Operating earnings refers to profits from businesses across insurance, railroads and utilities. Meanwhile, Berkshires cash levels also swelled to record levels. The conglomerate held $167.6 billion in cash in the fourth quarter, surpassing the $157.2 billion record the conglomerate held in the prior quarter.", + "Highmark Health says its combining tech from Google and Epic to give doctors easier access to information Highmark Health announced it is integrating technology from Google Cloud and the health-care software company Epic Systems. The integration aims to make it easier for both payers and providers to access key information they need, even if its stored across multiple points and formats, the company said. Highmark is the parent company of a health plan with 7 million members, a provider network of 14 hospitals and other entities", + "Rivian and Lucid shares plunge after weak EV earnings reports Shares of electric vehicle makers Rivian and Lucid fell Thursday after the companies reported stagnant production in their fourth-quarter earnings after the bell Wednesday. Rivian shares sank about 25 percent, and Lucids stock dropped around 17 percent. Rivian forecast it will make 57,000 vehicles in 2024, slightly less than the 57,232 vehicles it produced in 2023. Lucid said it expects to make 9,000 vehicles in 2024, more than the 8,428 vehicles it made in 2023.", + "Mauritius blocks Norwegian cruise ship over fears of a potential cholera outbreak Local authorities on Sunday denied permission for the Norwegian Dawn ship, which has 2,184 passengers and 1,026 crew on board, to access the Mauritius capital of Port Louis, citing “potential health risks.” The Mauritius Ports Authority said Sunday that samples were taken from at least 15 passengers on board the cruise ship. A spokesperson for the U.S.-headquartered Norwegian Cruise Line Holdings said Sunday that 'a small number of guests experienced mild symptoms of a stomach-related illness' during Norwegian Dawns South Africa voyage.", + "Intuitive Machines lands on the moon in historic first for a U.S. company Intuitive Machines Nova-C cargo lander, named Odysseus after the mythological Greek hero, is the first U.S. spacecraft to soft land on the lunar surface since 1972. Intuitive Machines is the first company to pull off a moon landing — government agencies have carried out all previously successful missions. 
The company's stock surged in extended trading Thursday, after falling 11 percent in regular trading.", + "Lunar landing photos: Intuitive Machines Odysseus sends back first images from the moon Intuitive Machines cargo moon lander Odysseus returned its first images from the surface. Company executives believe the lander caught its landing gear sideways on the moon's surface while touching down and tipped over. Despite resting on its side, the company's historic IM-1 mission is still operating on the moon.", + ] -article_embeddings = docs_to_embeddings(articles) # Note: you would typically do this once with your articles and put the embeddings & metadata in a database -``` + def docs_to_embeddings(docs: list) -> list: + openai = OpenAI() + document_embeddings = [] + for doc in docs: + response = ( + openai.embeddings.create(input=doc, model="text-embedding-3-small") + .data[0] + .embedding + ) + document_embeddings.append(response) + return document_embeddings + + article_embeddings = docs_to_embeddings(articles) # Note: you would typically do this once with your articles and put the embeddings & metadata in a database + ``` + + + ```plaintext + This feature is not available in TypeScript yet. Stay tuned! + ``` + + ## 2. Create a RAG app Next, we wrap our retrieval function `get_most_relevant_document` with a `weave.op()` decorator and we create our `Model` class. We call `weave.init('rag-qa')` to begin tracking all the inputs and outputs of our functions for later inspection. -```python -from openai import OpenAI -import weave -from weave import Model -import numpy as np -import asyncio - -# highlight-next-line -@weave.op() -def get_most_relevant_document(query): - openai = OpenAI() - query_embedding = ( - openai.embeddings.create(input=query, model="text-embedding-3-small") - .data[0] - .embedding - ) - similarities = [ - np.dot(query_embedding, doc_emb) - / (np.linalg.norm(query_embedding) * np.linalg.norm(doc_emb)) - for doc_emb in article_embeddings - ] - # Get the index of the most similar document - most_relevant_doc_index = np.argmax(similarities) - return articles[most_relevant_doc_index] - -# highlight-next-line -class RAGModel(Model): - system_message: str - model_name: str = "gpt-3.5-turbo-1106" + + + ```python + from openai import OpenAI + import weave + from weave import Model + import numpy as np + import asyncio -# highlight-next-line + # highlight-next-line @weave.op() - def predict(self, question: str) -> dict: # note: `question` will be used later to select data from our evaluation rows - from openai import OpenAI - context = get_most_relevant_document(question) - client = OpenAI() - query = f"""Use the following information to answer the subsequent question. If the answer cannot be found, write "I don't know." - Context: - \"\"\" - {context} - \"\"\" - Question: {question}""" - response = client.chat.completions.create( - model=self.model_name, - messages=[ - {"role": "system", "content": self.system_message}, - {"role": "user", "content": query}, - ], - temperature=0.0, - response_format={"type": "text"}, + def get_most_relevant_document(query): + openai = OpenAI() + query_embedding = ( + openai.embeddings.create(input=query, model="text-embedding-3-small") + .data[0] + .embedding ) - answer = response.choices[0].message.content - return {'answer': answer, 'context': context} - -# highlight-next-line -weave.init('rag-qa') -model = RAGModel( - system_message="You are an expert in finance and answer questions related to finance, financial services, and financial markets. 
When responding based on provided information, be sure to cite the source." -) -model.predict("What significant result was reported about Zealand Pharma's obesity trial?") -``` + similarities = [ + np.dot(query_embedding, doc_emb) + / (np.linalg.norm(query_embedding) * np.linalg.norm(doc_emb)) + for doc_emb in article_embeddings + ] + # Get the index of the most similar document + most_relevant_doc_index = np.argmax(similarities) + return articles[most_relevant_doc_index] + + # highlight-next-line + class RAGModel(Model): + system_message: str + model_name: str = "gpt-3.5-turbo-1106" + + # highlight-next-line + @weave.op() + def predict(self, question: str) -> dict: # note: `question` will be used later to select data from our evaluation rows + from openai import OpenAI + context = get_most_relevant_document(question) + client = OpenAI() + query = f"""Use the following information to answer the subsequent question. If the answer cannot be found, write "I don't know." + Context: + \"\"\" + {context} + \"\"\" + Question: {question}""" + response = client.chat.completions.create( + model=self.model_name, + messages=[ + {"role": "system", "content": self.system_message}, + {"role": "user", "content": query}, + ], + temperature=0.0, + response_format={"type": "text"}, + ) + answer = response.choices[0].message.content + return {'answer': answer, 'context': context} + + # highlight-next-line + weave.init('rag-qa') + model = RAGModel( + system_message="You are an expert in finance and answer questions related to finance, financial services, and financial markets. When responding based on provided information, be sure to cite the source." + ) + model.predict("What significant result was reported about Zealand Pharma's obesity trial?") + ``` + + + ```plaintext + This feature is not available in TypeScript yet. Stay tuned! + ``` + + ## 3. Evaluating with an LLM Judge @@ -120,54 +142,63 @@ When there aren't simple ways to evaluate your application, one approach is to u As we did in the [Build an Evaluation pipeline tutorial](/tutorial-eval), we'll define a set of example rows to test our app against and a scoring function. Our scoring function will take one row and evaluate it. The input arguments should match with the corresponding keys in our row, so `question` here will be taken from the row dictionary. `model_output` is the output of the model. The input to the model will be taken from the example based on its input argument, so `question` here too. We're using `async` functions so they run fast in parallel. If you need a quick introduction to async, you can find one [here](https://docs.python.org/3/library/asyncio.html). -```python -from openai import OpenAI -import weave -import asyncio - -# highlight-next-line -@weave.op() -async def context_precision_score(question, model_output): - context_precision_prompt = """Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output. - Output in only valid JSON format. 
- - question: {question} - context: {context} - answer: {answer} - verdict: """ - client = OpenAI() - - prompt = context_precision_prompt.format( - question=question, - context=model_output['context'], - answer=model_output['answer'], - ) + + + ```python + from openai import OpenAI + import weave + import asyncio - response = client.chat.completions.create( - model="gpt-4-turbo-preview", - messages=[{"role": "user", "content": prompt}], - response_format={ "type": "json_object" } - ) - response_message = response.choices[0].message - response = json.loads(response_message.content) - return { - "verdict": int(response["verdict"]) == 1, - } - -questions = [ - {"question": "What significant result was reported about Zealand Pharma's obesity trial?"}, - {"question": "How much did Berkshire Hathaway's cash levels increase in the fourth quarter?"}, - {"question": "What is the goal of Highmark Health's integration of Google Cloud and Epic Systems technology?"}, - {"question": "What were Rivian and Lucid's vehicle production forecasts for 2024?"}, - {"question": "Why was the Norwegian Dawn cruise ship denied access to Mauritius?"}, - {"question": "Which company achieved the first U.S. moon landing since 1972?"}, - {"question": "What issue did Intuitive Machines' lunar lander encounter upon landing on the moon?"} -] -# highlight-next-line -evaluation = weave.Evaluation(dataset=questions, scorers=[context_precision_score]) -# highlight-next-line -asyncio.run(evaluation.evaluate(model)) # note: you'll need to define a model to evaluate -``` + # highlight-next-line + @weave.op() + async def context_precision_score(question, model_output): + context_precision_prompt = """Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output. + Output in only valid JSON format. + + question: {question} + context: {context} + answer: {answer} + verdict: """ + client = OpenAI() + + prompt = context_precision_prompt.format( + question=question, + context=model_output['context'], + answer=model_output['answer'], + ) + + response = client.chat.completions.create( + model="gpt-4-turbo-preview", + messages=[{"role": "user", "content": prompt}], + response_format={ "type": "json_object" } + ) + response_message = response.choices[0].message + response = json.loads(response_message.content) + return { + "verdict": int(response["verdict"]) == 1, + } + + questions = [ + {"question": "What significant result was reported about Zealand Pharma's obesity trial?"}, + {"question": "How much did Berkshire Hathaway's cash levels increase in the fourth quarter?"}, + {"question": "What is the goal of Highmark Health's integration of Google Cloud and Epic Systems technology?"}, + {"question": "What were Rivian and Lucid's vehicle production forecasts for 2024?"}, + {"question": "Why was the Norwegian Dawn cruise ship denied access to Mauritius?"}, + {"question": "Which company achieved the first U.S. moon landing since 1972?"}, + {"question": "What issue did Intuitive Machines' lunar lander encounter upon landing on the moon?"} + ] + # highlight-next-line + evaluation = weave.Evaluation(dataset=questions, scorers=[context_precision_score]) + # highlight-next-line + asyncio.run(evaluation.evaluate(model)) # note: you'll need to define a model to evaluate + ``` + + + ```plaintext + This feature is not available in TypeScript yet. Stay tuned! 
+ ``` + + ### Optional: Defining a `Scorer` class In some applications we want to create custom evaluation classes - where for example a standardized `LLMJudge` class should be created with specific parameters (e.g. chat model, prompt), specific scoring of each row, and specific calculation of an aggregate score. In order to do that Weave defines a list of ready-to-use `Scorer` classes and also makes it easy to create a custom `Scorer` - in the following we'll see how to create a custom `class CorrectnessLLMJudge(Scorer)`. @@ -181,79 +212,97 @@ On a high-level the steps to create custom Scorer are quite simple: - this function has to have a `@weave.op()` decorator. -```python -from weave.flow.scorer import Scorer -from weave import WeaveList - -class CorrectnessLLMJudge(Scorer): - prompt: str - model_name: str - device: str - - @weave.op() - async def score(self, model_output: Optional[dict], query: str, answer: str) -> Any: - """Score the correctness of the predictions by comparing the pred, query, target. - Args: - - model_output: the dict that will be provided by the model that is evaluated - - query: the question asked - as defined in the dataset - - answer: the target answer - as defined in the dataset - Returns: - - single dict {metric name: single evaluation value}""" - - # get_model is defined as general model getter based on provided params (OpenAI,HF...) - eval_model = get_model( - model_name = self.model_name, - prompt = self.prompt - device = self.device, - ) - # async evaluation to speed up evaluation - this doesn't have to be async - grade = await eval_model.async_predict( - { - "query": query, - "answer": answer, - "result": model_output.get("result"), + + + ```python + from weave.flow.scorer import Scorer + from weave import WeaveList + + class CorrectnessLLMJudge(Scorer): + prompt: str + model_name: str + device: str + + @weave.op() + async def score(self, model_output: Optional[dict], query: str, answer: str) -> Any: + """Score the correctness of the predictions by comparing the pred, query, target. + Args: + - model_output: the dict that will be provided by the model that is evaluated + - query: the question asked - as defined in the dataset + - answer: the target answer - as defined in the dataset + Returns: + - single dict {metric name: single evaluation value}""" + + # get_model is defined as general model getter based on provided params (OpenAI,HF...) + eval_model = get_model( + model_name = self.model_name, + prompt = self.prompt + device = self.device, + ) + # async evaluation to speed up evaluation - this doesn't have to be async + grade = await eval_model.async_predict( + { + "query": query, + "answer": answer, + "result": model_output.get("result"), + } + ) + # output parsing - could be done more reobustly with pydantic + evaluation = "incorrect" not in grade["text"].strip().lower() + + # the column name displayed in Weave + return {"correct": evaluation} + + @weave.op() + def summarize(self, score_rows: WeaveList) -> Optional[dict]: + """Aggregate all the scores that are calculated for each row by the scoring function. 
+ Args: + - score_rows: a WeaveList object, nested dict of metrics and scores + Returns: + - nested dict with the same structure as the input""" + + # if nothing is provided the weave.flow.scorer.auto_summarize function is used + # return auto_summarize(score_rows) + + valid_data = [x.get("correct") for x in score_rows if x.get("correct") is not None] + count_true = list(valid_data).count(True) + int_data = [int(x) for x in valid_data] + + sample_mean = np.mean(int_data) if int_data else 0 + sample_variance = np.var(int_data) if int_data else 0 + sample_error = np.sqrt(sample_variance / len(int_data)) if int_data else 0 + + # the extra "correct" layer is not necessary but adds structure in the UI + return { + "correct": { + "true_count": count_true, + "true_fraction": sample_mean, + "stderr": sample_error, + } } - ) - # output parsing - could be done more reobustly with pydantic - evaluation = "incorrect" not in grade["text"].strip().lower() - - # the column name displayed in Weave - return {"correct": evaluation} - - @weave.op() - def summarize(self, score_rows: WeaveList) -> Optional[dict]: - """Aggregate all the scores that are calculated for each row by the scoring function. - Args: - - score_rows: a WeaveList object, nested dict of metrics and scores - Returns: - - nested dict with the same structure as the input""" - - # if nothing is provided the weave.flow.scorer.auto_summarize function is used - # return auto_summarize(score_rows) - - valid_data = [x.get("correct") for x in score_rows if x.get("correct") is not None] - count_true = list(valid_data).count(True) - int_data = [int(x) for x in valid_data] - - sample_mean = np.mean(int_data) if int_data else 0 - sample_variance = np.var(int_data) if int_data else 0 - sample_error = np.sqrt(sample_variance / len(int_data)) if int_data else 0 - - # the extra "correct" layer is not necessary but adds structure in the UI - return { - "correct": { - "true_count": count_true, - "true_fraction": sample_mean, - "stderr": sample_error, - } - } -``` + ``` + + + ```plaintext + This feature is not available in TypeScript yet. Stay tuned! + ``` + + To use this as a scorer, you would initialize it and pass it to `scorers` argument in your `Evaluation like this: -```python -evaluation = weave.Evaluation(dataset=questions, scorers=[CorrectnessLLMJudge()]) -``` + + + ```python + evaluation = weave.Evaluation(dataset=questions, scorers=[CorrectnessLLMJudge()]) + ``` + + + ```plaintext + This feature is not available in TypeScript yet. Stay tuned! + ``` + + ## 4. Pulling it all together @@ -268,143 +317,152 @@ To get the same result for your RAG apps: Here, we show the code in it's entirety. -```python -from openai import OpenAI -import weave -from weave import Model -import numpy as np -import json -import asyncio - -# Examples we've gathered that we want to use for evaluations -articles = [ - "Novo Nordisk and Eli Lilly rival soars 32 percent after promising weight loss drug results Shares of Denmarks Zealand Pharma shot 32 percent higher in morning trade, after results showed success in its liver disease treatment survodutide, which is also on trial as a drug to treat obesity. The trial “tells us that the 6mg dose is safe, which is the top dose used in the ongoing [Phase 3] obesity trial too,” one analyst said in a note. 
The results come amid feverish investor interest in drugs that can be used for weight loss.", - "Berkshire shares jump after big profit gain as Buffetts conglomerate nears $1 trillion valuation Berkshire Hathaway shares rose on Monday after Warren Buffetts conglomerate posted strong earnings for the fourth quarter over the weekend. Berkshires Class A and B shares jumped more than 1.5%, each. Class A shares are higher by more than 17% this year, while Class B has gained more than 18%. Berkshire was last valued at $930.1 billion, up from $905.5 billion where it closed on Friday, according to FactSet. Berkshire on Saturday posted fourth-quarter operating earnings of $8.481 billion, about 28 percent higher than the $6.625 billion from the year-ago period, driven by big gains in its insurance business. Operating earnings refers to profits from businesses across insurance, railroads and utilities. Meanwhile, Berkshires cash levels also swelled to record levels. The conglomerate held $167.6 billion in cash in the fourth quarter, surpassing the $157.2 billion record the conglomerate held in the prior quarter.", - "Highmark Health says its combining tech from Google and Epic to give doctors easier access to information Highmark Health announced it is integrating technology from Google Cloud and the health-care software company Epic Systems. The integration aims to make it easier for both payers and providers to access key information they need, even if it's stored across multiple points and formats, the company said. Highmark is the parent company of a health plan with 7 million members, a provider network of 14 hospitals and other entities", - "Rivian and Lucid shares plunge after weak EV earnings reports Shares of electric vehicle makers Rivian and Lucid fell Thursday after the companies reported stagnant production in their fourth-quarter earnings after the bell Wednesday. Rivian shares sank about 25 percent, and Lucids stock dropped around 17 percent. Rivian forecast it will make 57,000 vehicles in 2024, slightly less than the 57,232 vehicles it produced in 2023. Lucid said it expects to make 9,000 vehicles in 2024, more than the 8,428 vehicles it made in 2023.", - "Mauritius blocks Norwegian cruise ship over fears of a potential cholera outbreak Local authorities on Sunday denied permission for the Norwegian Dawn ship, which has 2,184 passengers and 1,026 crew on board, to access the Mauritius capital of Port Louis, citing “potential health risks.” The Mauritius Ports Authority said Sunday that samples were taken from at least 15 passengers on board the cruise ship. A spokesperson for the U.S.-headquartered Norwegian Cruise Line Holdings said Sunday that 'a small number of guests experienced mild symptoms of a stomach-related illness' during Norwegian Dawns South Africa voyage.", - "Intuitive Machines lands on the moon in historic first for a U.S. company Intuitive Machines Nova-C cargo lander, named Odysseus after the mythological Greek hero, is the first U.S. spacecraft to soft land on the lunar surface since 1972. Intuitive Machines is the first company to pull off a moon landing — government agencies have carried out all previously successful missions. The company's stock surged in extended trading Thursday, after falling 11 percent in regular trading.", - "Lunar landing photos: Intuitive Machines Odysseus sends back first images from the moon Intuitive Machines cargo moon lander Odysseus returned its first images from the surface. 
Company executives believe the lander caught its landing gear sideways on the surface of the moon while touching down and tipped over. Despite resting on its side, the company's historic IM-1 mission is still operating on the moon.", -] - -def docs_to_embeddings(docs: list) -> list: - openai = OpenAI() - document_embeddings = [] - for doc in docs: - response = ( - openai.embeddings.create(input=doc, model="text-embedding-3-small") + + + ```python + from openai import OpenAI + import weave + from weave import Model + import numpy as np + import json + import asyncio + + # Examples we've gathered that we want to use for evaluations + articles = [ + "Novo Nordisk and Eli Lilly rival soars 32 percent after promising weight loss drug results Shares of Denmarks Zealand Pharma shot 32 percent higher in morning trade, after results showed success in its liver disease treatment survodutide, which is also on trial as a drug to treat obesity. The trial “tells us that the 6mg dose is safe, which is the top dose used in the ongoing [Phase 3] obesity trial too,” one analyst said in a note. The results come amid feverish investor interest in drugs that can be used for weight loss.", + "Berkshire shares jump after big profit gain as Buffetts conglomerate nears $1 trillion valuation Berkshire Hathaway shares rose on Monday after Warren Buffetts conglomerate posted strong earnings for the fourth quarter over the weekend. Berkshires Class A and B shares jumped more than 1.5%, each. Class A shares are higher by more than 17% this year, while Class B has gained more than 18%. Berkshire was last valued at $930.1 billion, up from $905.5 billion where it closed on Friday, according to FactSet. Berkshire on Saturday posted fourth-quarter operating earnings of $8.481 billion, about 28 percent higher than the $6.625 billion from the year-ago period, driven by big gains in its insurance business. Operating earnings refers to profits from businesses across insurance, railroads and utilities. Meanwhile, Berkshires cash levels also swelled to record levels. The conglomerate held $167.6 billion in cash in the fourth quarter, surpassing the $157.2 billion record the conglomerate held in the prior quarter.", + "Highmark Health says its combining tech from Google and Epic to give doctors easier access to information Highmark Health announced it is integrating technology from Google Cloud and the health-care software company Epic Systems. The integration aims to make it easier for both payers and providers to access key information they need, even if it's stored across multiple points and formats, the company said. Highmark is the parent company of a health plan with 7 million members, a provider network of 14 hospitals and other entities", + "Rivian and Lucid shares plunge after weak EV earnings reports Shares of electric vehicle makers Rivian and Lucid fell Thursday after the companies reported stagnant production in their fourth-quarter earnings after the bell Wednesday. Rivian shares sank about 25 percent, and Lucids stock dropped around 17 percent. Rivian forecast it will make 57,000 vehicles in 2024, slightly less than the 57,232 vehicles it produced in 2023. 
Lucid said it expects to make 9,000 vehicles in 2024, more than the 8,428 vehicles it made in 2023.", + "Mauritius blocks Norwegian cruise ship over fears of a potential cholera outbreak Local authorities on Sunday denied permission for the Norwegian Dawn ship, which has 2,184 passengers and 1,026 crew on board, to access the Mauritius capital of Port Louis, citing “potential health risks.” The Mauritius Ports Authority said Sunday that samples were taken from at least 15 passengers on board the cruise ship. A spokesperson for the U.S.-headquartered Norwegian Cruise Line Holdings said Sunday that 'a small number of guests experienced mild symptoms of a stomach-related illness' during Norwegian Dawns South Africa voyage.", + "Intuitive Machines lands on the moon in historic first for a U.S. company Intuitive Machines Nova-C cargo lander, named Odysseus after the mythological Greek hero, is the first U.S. spacecraft to soft land on the lunar surface since 1972. Intuitive Machines is the first company to pull off a moon landing — government agencies have carried out all previously successful missions. The company's stock surged in extended trading Thursday, after falling 11 percent in regular trading.", + "Lunar landing photos: Intuitive Machines Odysseus sends back first images from the moon Intuitive Machines cargo moon lander Odysseus returned its first images from the surface. Company executives believe the lander caught its landing gear sideways on the surface of the moon while touching down and tipped over. Despite resting on its side, the company's historic IM-1 mission is still operating on the moon.", + ] + + def docs_to_embeddings(docs: list) -> list: + openai = OpenAI() + document_embeddings = [] + for doc in docs: + response = ( + openai.embeddings.create(input=doc, model="text-embedding-3-small") + .data[0] + .embedding + ) + document_embeddings.append(response) + return document_embeddings + + article_embeddings = docs_to_embeddings(articles) # Note: you would typically do this once with your articles and put the embeddings & metadata in a database + + # We've added a decorator to our retrieval step + # highlight-next-line + @weave.op() + def get_most_relevant_document(query): + openai = OpenAI() + query_embedding = ( + openai.embeddings.create(input=query, model="text-embedding-3-small") .data[0] .embedding ) - document_embeddings.append(response) - return document_embeddings - -article_embeddings = docs_to_embeddings(articles) # Note: you would typically do this once with your articles and put the embeddings & metadata in a database - -# We've added a decorator to our retrieval step -# highlight-next-line -@weave.op() -def get_most_relevant_document(query): - openai = OpenAI() - query_embedding = ( - openai.embeddings.create(input=query, model="text-embedding-3-small") - .data[0] - .embedding + similarities = [ + np.dot(query_embedding, doc_emb) + / (np.linalg.norm(query_embedding) * np.linalg.norm(doc_emb)) + for doc_emb in article_embeddings + ] + # Get the index of the most similar document + most_relevant_doc_index = np.argmax(similarities) + return articles[most_relevant_doc_index] + + # We create a Model subclass with some details about our app, along with a predict function that produces a response + # highlight-next-line + class RAGModel(Model): + system_message: str + model_name: str = "gpt-3.5-turbo-1106" + + # highlight-next-line + @weave.op() + # highlight-next-line + def predict(self, question: str) -> dict: # note: `question` will be used later to select data 
from our evaluation rows + from openai import OpenAI + context = get_most_relevant_document(question) + client = OpenAI() + query = f"""Use the following information to answer the subsequent question. If the answer cannot be found, write "I don't know." + Context: + \"\"\" + {context} + \"\"\" + Question: {question}""" + response = client.chat.completions.create( + model=self.model_name, + messages=[ + {"role": "system", "content": self.system_message}, + {"role": "user", "content": query}, + ], + temperature=0.0, + response_format={"type": "text"}, + ) + answer = response.choices[0].message.content + return {'answer': answer, 'context': context} + + # highlight-next-line + weave.init('rag-qa') + # highlight-next-line + model = RAGModel( + system_message="You are an expert in finance and answer questions related to finance, financial services, and financial markets. When responding based on provided information, be sure to cite the source." ) - similarities = [ - np.dot(query_embedding, doc_emb) - / (np.linalg.norm(query_embedding) * np.linalg.norm(doc_emb)) - for doc_emb in article_embeddings - ] - # Get the index of the most similar document - most_relevant_doc_index = np.argmax(similarities) - return articles[most_relevant_doc_index] - -# We create a Model subclass with some details about our app, along with a predict function that produces a response -# highlight-next-line -class RAGModel(Model): - system_message: str - model_name: str = "gpt-3.5-turbo-1106" -# highlight-next-line + # Here is our scoring function uses our question and model_output to product a score + # highlight-next-line @weave.op() -# highlight-next-line - def predict(self, question: str) -> dict: # note: `question` will be used later to select data from our evaluation rows - from openai import OpenAI - context = get_most_relevant_document(question) + # highlight-next-line + async def context_precision_score(question, model_output): + context_precision_prompt = """Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output. + Output in only valid JSON format. + + question: {question} + context: {context} + answer: {answer} + verdict: """ client = OpenAI() - query = f"""Use the following information to answer the subsequent question. If the answer cannot be found, write "I don't know." - Context: - \"\"\" - {context} - \"\"\" - Question: {question}""" + + prompt = context_precision_prompt.format( + question=question, + context=model_output['context'], + answer=model_output['answer'], + ) + response = client.chat.completions.create( - model=self.model_name, - messages=[ - {"role": "system", "content": self.system_message}, - {"role": "user", "content": query}, - ], - temperature=0.0, - response_format={"type": "text"}, + model="gpt-4-turbo-preview", + messages=[{"role": "user", "content": prompt}], + response_format={ "type": "json_object" } ) - answer = response.choices[0].message.content - return {'answer': answer, 'context': context} - -# highlight-next-line -weave.init('rag-qa') -# highlight-next-line -model = RAGModel( - system_message="You are an expert in finance and answer questions related to finance, financial services, and financial markets. When responding based on provided information, be sure to cite the source." 
-) - -# Here is our scoring function uses our question and model_output to product a score -# highlight-next-line -@weave.op() -# highlight-next-line -async def context_precision_score(question, model_output): - context_precision_prompt = """Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output. - Output in only valid JSON format. - - question: {question} - context: {context} - answer: {answer} - verdict: """ - client = OpenAI() - - prompt = context_precision_prompt.format( - question=question, - context=model_output['context'], - answer=model_output['answer'], - ) + response_message = response.choices[0].message + response = json.loads(response_message.content) + return { + "verdict": int(response["verdict"]) == 1, + } - response = client.chat.completions.create( - model="gpt-4-turbo-preview", - messages=[{"role": "user", "content": prompt}], - response_format={ "type": "json_object" } - ) - response_message = response.choices[0].message - response = json.loads(response_message.content) - return { - "verdict": int(response["verdict"]) == 1, - } - -questions = [ - {"question": "What significant result was reported about Zealand Pharma's obesity trial?"}, - {"question": "How much did Berkshire Hathaway's cash levels increase in the fourth quarter?"}, - {"question": "What is the goal of Highmark Health's integration of Google Cloud and Epic Systems technology?"}, - {"question": "What were Rivian and Lucid's vehicle production forecasts for 2024?"}, - {"question": "Why was the Norwegian Dawn cruise ship denied access to Mauritius?"}, - {"question": "Which company achieved the first U.S. moon landing since 1972?"}, - {"question": "What issue did Intuitive Machines' lunar lander encounter upon landing on the moon?"} -] - -# We define an Evaluation object and pass our example questions along with scoring functions -# highlight-next-line -evaluation = weave.Evaluation(dataset=questions, scorers=[context_precision_score]) -# highlight-next-line -asyncio.run(evaluation.evaluate(model)) -``` + questions = [ + {"question": "What significant result was reported about Zealand Pharma's obesity trial?"}, + {"question": "How much did Berkshire Hathaway's cash levels increase in the fourth quarter?"}, + {"question": "What is the goal of Highmark Health's integration of Google Cloud and Epic Systems technology?"}, + {"question": "What were Rivian and Lucid's vehicle production forecasts for 2024?"}, + {"question": "Why was the Norwegian Dawn cruise ship denied access to Mauritius?"}, + {"question": "Which company achieved the first U.S. moon landing since 1972?"}, + {"question": "What issue did Intuitive Machines' lunar lander encounter upon landing on the moon?"} + ] + + # We define an Evaluation object and pass our example questions along with scoring functions + # highlight-next-line + evaluation = weave.Evaluation(dataset=questions, scorers=[context_precision_score]) + # highlight-next-line + asyncio.run(evaluation.evaluate(model)) + ``` + + + ```plaintext + This feature is not available in TypeScript yet. Stay tuned! + ``` + + ## Conclusion