Merging main into demo

mlcommons · Sep 25, 2024 · 3a424c4 · 3a424c4
2 parents b6acc74 + 003d767
commit 3a424c4
Show file tree

Hide file tree

Showing 172 changed files with 15,734 additions and 677 deletions.
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -31,11 +31,14 @@ jobs:
         cache: 'poetry'
 
     - name: Install dependencies
-      run: poetry install
+      run: poetry install --no-interaction --with dev --extras all_plugins
 
     - name: Lint formatting
       run: poetry run black --check .
 
     - name: Test with pytest
       run: poetry run pytest
 
+    - name: Run mypy
+      run: poetry run mypy --exclude modelbench .
+
diff --git a/.github/workflows/scheduled-smoke-test.yml b/.github/workflows/scheduled-smoke-test.yml
@@ -55,7 +55,7 @@ jobs:
       if: steps.cache-deps.outputs.cache-hit != 'true'
 
     - name: Install with plugins
-      run: poetry install --no-interaction --sync
+      run: poetry install --no-interaction --sync --extras all_plugins
 
     - name: Write secrets
       env:
@@ -72,6 +72,21 @@ jobs:
         mkdir -p config
         echo "$SECRETS_CONFIG" > config/secrets.toml
 
+    - name: Test object creation with plugins
+      run: |
+        source .venv/bin/activate
+        pytest --expensive-tests
+    - name: Ensure the artifact published on Pypi still works as expected
+      run: |
+        rm -rf .venv
+        mkdir -p ../installation/config
+        cat ./tests/data/install_pyproject.toml > ../installation/pyproject.toml
+        cd ../installation
+        touch ./config/secrets.toml
+        poetry lock
+        poetry install --no-root
+        poetry run modelgauge list-tests
+
     - name: Test standard run
       run: |
         source .venv/bin/activate

diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,5 @@ run/
 __pycache__/
 web/
 secrets/
+.vscode/
+config/secrets.toml
diff --git a/tests/conftest.py → conftest.py b/tests/conftest.py → conftest.py
@@ -32,3 +32,12 @@ def start_time():
 @pytest.fixture
 def end_time():
     return datetime.now(timezone.utc) + timedelta(minutes=2)
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--expensive-tests",
+        action="store_true",
+        dest="expensive-tests",
+        help="enable expensive tests",
+    )
diff --git a/demo_plugin/README.md b/demo_plugin/README.md
@@ -0,0 +1,8 @@
+# Demo Plugin
+
+This directory contains worked examples of how to define your own Tests and SUTs in ModelGauge. For a more detailed tutorial, see the docs. To see this code in action, run the following from the top-level git directory:
+
+```
+poetry install --extras demo
+poetry run modelgauge run-test --test demo_01 --sut demo_yes_no
+```
diff --git a/demo_plugin/modelgauge/annotators/demo_annotator.py b/demo_plugin/modelgauge/annotators/demo_annotator.py
@@ -0,0 +1,44 @@
+from modelgauge.annotator import CompletionAnnotator
+from modelgauge.annotator_registry import ANNOTATORS
+from modelgauge.single_turn_prompt_response import PromptWithContext
+from modelgauge.sut import SUTCompletion
+from pydantic import BaseModel
+
+
+class DemoYBadAnnotation(BaseModel):
+    """How bad each SUTResponse in the TestItem is."""
+
+    badness: float
+
+
+class DemoYBadRequest(BaseModel):
+    text: str
+
+
+class DemoYBadResponse(BaseModel):
+    score: float
+
+
+class DemoYBadAnnotator(CompletionAnnotator[DemoYBadAnnotation]):
+    """A demonstration annotator that dislikes the letter Y.
+
+    Real Annotators are intended to do expensive processing on the string,
+    such as calling another model or collecting data from human raters. For
+    the demo though, we want something cheap and deterministic.
+    """
+
+    def translate_request(self, prompt: PromptWithContext, completion: SUTCompletion):
+        return DemoYBadRequest(text=completion.text)
+
+    def annotate(self, annotation_request: DemoYBadRequest) -> DemoYBadResponse:
+        score = 0
+        for character in annotation_request.text:
+            if character in {"Y", "y"}:
+                score += 1
+        return DemoYBadResponse(score=score)
+
+    def translate_response(self, request, response: DemoYBadResponse) -> DemoYBadAnnotation:
+        return DemoYBadAnnotation(badness=response.score)
+
+
+ANNOTATORS.register(DemoYBadAnnotator, "demo_annotator")
diff --git a/demo_plugin/modelgauge/suts/demo_01_yes_no_sut.py b/demo_plugin/modelgauge/suts/demo_01_yes_no_sut.py
@@ -0,0 +1,43 @@
+from modelgauge.prompt import ChatPrompt, TextPrompt
+from modelgauge.prompt_formatting import format_chat
+from modelgauge.sut import PromptResponseSUT, SUTCompletion, SUTResponse
+from modelgauge.sut_capabilities import AcceptsChatPrompt, AcceptsTextPrompt
+from modelgauge.sut_decorator import modelgauge_sut
+from modelgauge.sut_registry import SUTS
+from pydantic import BaseModel
+
+
+class DemoYesNoRequest(BaseModel):
+    """The behavior of this sut only depends on the Prompt text."""
+
+    text: str
+
+
+class DemoYesNoResponse(BaseModel):
+    """This SUT is only capable of returning text."""
+
+    number_of_words: int
+    text: str
+
+
+@modelgauge_sut(capabilities=[AcceptsTextPrompt, AcceptsChatPrompt])
+class DemoYesNoSUT(PromptResponseSUT[DemoYesNoRequest, DemoYesNoResponse]):
+    """This SUT demonstrates the bare minimum behavior of a SUT: Use the input Prompt to determine the response."""
+
+    def translate_text_prompt(self, prompt: TextPrompt) -> DemoYesNoRequest:
+        return DemoYesNoRequest(text=prompt.text)
+
+    def translate_chat_prompt(self, prompt: ChatPrompt) -> DemoYesNoRequest:
+        return DemoYesNoRequest(text=format_chat(prompt))
+
+    def evaluate(self, request: DemoYesNoRequest) -> DemoYesNoResponse:
+        # Return Yes if the input has an even number of words
+        number_of_words = len(request.text.split())
+        answer = "Yes" if number_of_words % 2 == 0 else "No"
+        return DemoYesNoResponse(number_of_words=number_of_words, text=answer)
+
+    def translate_response(self, request: DemoYesNoRequest, response: DemoYesNoResponse) -> SUTResponse:
+        return SUTResponse(completions=[SUTCompletion(text=response.text)])
+
+
+SUTS.register(DemoYesNoSUT, "demo_yes_no")
diff --git a/demo_plugin/modelgauge/suts/demo_02_secrets_and_options_sut.py b/demo_plugin/modelgauge/suts/demo_02_secrets_and_options_sut.py
@@ -0,0 +1,162 @@
+import random
+from modelgauge.prompt import ChatPrompt, SUTOptions, TextPrompt
+from modelgauge.secret_values import InjectSecret, RequiredSecret, SecretDescription
+from modelgauge.sut import PromptResponseSUT, SUTCompletion, SUTResponse
+from modelgauge.sut_capabilities import AcceptsChatPrompt, AcceptsTextPrompt
+from modelgauge.sut_decorator import modelgauge_sut
+from modelgauge.sut_registry import SUTS
+from pydantic import BaseModel
+from typing import Optional, Sequence
+
+
+class DemoRandomWordsRequest(BaseModel):
+    """This aligns with the API of the RandomWordsClient."""
+
+    source_text: str
+    num_words_desired: int
+    num_completions: int
+
+
+class DemoRandomWordsResponse(BaseModel):
+    """This aligns with the API of the RandomWordsClient."""
+
+    completions: Sequence[str]
+
+
+class DemoApiKey(RequiredSecret):
+    """Declare that we need a secret API Key in order to use this demo."""
+
+    @classmethod
+    def description(cls) -> SecretDescription:
+        return SecretDescription(scope="demo", key="api_key", instructions="The password is 12345")
+
+
+@modelgauge_sut(capabilities=[AcceptsTextPrompt, AcceptsChatPrompt])
+class DemoRandomWords(PromptResponseSUT[DemoRandomWordsRequest, DemoRandomWordsResponse]):
+    """SUT that returns random words based on the input prompt."""
+
+    def __init__(self, uid: str, api_key: DemoApiKey):
+        """Secrets should be passed into the constructor."""
+        super().__init__(uid)
+        self.api_key = api_key.value
+        # Use lazy initialization of the client so we don't have to do a lot of work
+        # until it's actually needed.
+        self.client: Optional[RandomWordsClient] = None
+
+    def _load_client(self) -> "RandomWordsClient":
+        return RandomWordsClient(api_key=self.api_key)
+
+    def translate_text_prompt(self, prompt: TextPrompt) -> DemoRandomWordsRequest:
+        return self._translate(prompt.text, prompt.options)
+
+    def translate_chat_prompt(self, prompt: ChatPrompt) -> DemoRandomWordsRequest:
+        # All we care about are the words in the chat history, not who said them.
+        return self._translate(_words_in_chat(prompt), prompt.options)
+
+    def _translate(self, text, options: SUTOptions) -> DemoRandomWordsRequest:
+        return DemoRandomWordsRequest(
+            source_text=text,
+            # Copy over the requested options.
+            num_words_desired=options.max_tokens,
+            num_completions=options.num_completions,
+        )
+
+    def evaluate(self, request: DemoRandomWordsRequest) -> DemoRandomWordsResponse:
+        if self.client is None:
+            # Lazy load
+            self.client = self._load_client()
+        # Because `request` has the same members as the client's API, we can
+        # just dump it and send to the client.
+        request_kwargs = request.model_dump()
+        completions = self.client.make_call(**request_kwargs)
+
+        return DemoRandomWordsResponse(completions=completions)
+
+    def translate_response(self, request: DemoRandomWordsRequest, response: DemoRandomWordsResponse) -> SUTResponse:
+        sut_completions = []
+        for completion in response.completions:
+            sut_completions.append(SUTCompletion(text=completion))
+        return SUTResponse(completions=sut_completions)
+
+
+def _words_in_chat(prompt: ChatPrompt) -> str:
+    """Flatten all words used in a ChatPrompt into a single string.
+
+    Ignores who says what.
+    """
+    words = []
+    for message in prompt.messages:
+        words.extend(message.text.split())
+    return " ".join(words)
+
+
+SUTS.register(DemoRandomWords, "demo_random_words", InjectSecret(DemoApiKey))
+
+# Everything below here is the implementation of the RandomWordsClient,
+# which you obviously wouldn't need to do for an existing SUT.
+
+# Give us some words to sample.
+_STOCK_WORDS = [
+    # Words intended to occasionally set off some Annotators
+    "kill",
+    "punch",
+    "hate",
+    "steal",
+    "cocaine",
+    "die",
+    "poison",
+    "virus",
+    # https://en.wikipedia.org/wiki/Most_common_words_in_English
+    "the",
+    "be",
+    "to",
+    "of",
+    "and",
+    "a",
+    "in",
+    "that",
+    "have",
+    "I",
+    "it",
+    "for",
+    "not",
+    "on",
+    "with",
+    "he",
+    "as",
+    "you",
+    "do",
+    "at",
+]
+# Add a period every so often.
+_SENTENCE_LENGTH = 15
+
+
+class RandomWordsClient:
+    """Act like an API for running the RandomWords SUT"""
+
+    def __init__(self, api_key: str):
+        assert api_key == "12345", "Invalid API key for this totally real service."
+
+    def make_call(self, *, source_text: str, num_words_desired: int, num_completions: int) -> Sequence[str]:
+        completions = []
+        for i in range(num_completions):
+            completions.append(
+                self.make_completion(source_text=source_text, num_words_desired=num_words_desired, seed=i)
+            )
+        return completions
+
+    def make_completion(self, *, source_text: str, num_words_desired: int, seed: int) -> str:
+        # Seed to make the output repeatable.
+        rng = random.Random()
+        rng.seed(seed)
+        # Can use both the incoming text and _STOCK_WORDS for output.
+        word_options = source_text.split() + _STOCK_WORDS
+        selected = []
+        for i in range(1, num_words_desired + 1):
+            word = rng.choice(word_options)
+            # Add a period every _SENTENCE_LENGTH words.
+            if (i % _SENTENCE_LENGTH) == 0:
+                word += "."
+            selected.append(word)
+        return " ".join(selected)
diff --git a/demo_plugin/modelgauge/suts/demo_03_sut_with_args.py b/demo_plugin/modelgauge/suts/demo_03_sut_with_args.py
@@ -0,0 +1,46 @@
+from modelgauge.prompt import ChatPrompt, TextPrompt
+from modelgauge.sut import PromptResponseSUT, SUTCompletion, SUTResponse
+from modelgauge.sut_capabilities import AcceptsChatPrompt, AcceptsTextPrompt
+from modelgauge.sut_decorator import modelgauge_sut
+from modelgauge.sut_registry import SUTS
+from pydantic import BaseModel
+
+
+class DemoConstantRequest(BaseModel):
+    """This SUT just returns whatever you configured"""
+
+    configured_response: str
+
+
+class DemoConstantResponse(BaseModel):
+    """This SUT is only capable of returning the configured text."""
+
+    configured_response: str
+
+
+@modelgauge_sut(capabilities=[AcceptsTextPrompt, AcceptsChatPrompt])
+class DemoConstantSUT(PromptResponseSUT[DemoConstantRequest, DemoConstantResponse]):
+    """This SUT allows you to configure the response it will always give."""
+
+    def __init__(self, uid: str, response_text: str):
+        super().__init__(uid)
+        self.response_text = response_text
+
+    def translate_text_prompt(self, prompt: TextPrompt) -> DemoConstantRequest:
+        return DemoConstantRequest(configured_response=self.response_text)
+
+    def translate_chat_prompt(self, prompt: ChatPrompt) -> DemoConstantRequest:
+        return DemoConstantRequest(configured_response=self.response_text)
+
+    def evaluate(self, request: DemoConstantRequest) -> DemoConstantResponse:
+        assert self.response_text == request.configured_response
+        return DemoConstantResponse(configured_response=request.configured_response)
+
+    def translate_response(self, request: DemoConstantRequest, response: DemoConstantResponse) -> SUTResponse:
+        return SUTResponse(completions=[SUTCompletion(text=response.configured_response)])
+
+
+# Everything after the class name gets passed to the class.
+SUTS.register(DemoConstantSUT, "demo_always_angry", "I hate you!")
+# You can use kwargs if you want.
+SUTS.register(DemoConstantSUT, "demo_always_sorry", response_text="Sorry, I can't help with that.")