Skip to content

Commit

Permalink
Merging main into demo
Browse files Browse the repository at this point in the history
  • Loading branch information
wpietri committed Sep 25, 2024
2 parents b6acc74 + 003d767 commit 3a424c4
Show file tree
Hide file tree
Showing 172 changed files with 15,734 additions and 677 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,14 @@ jobs:
cache: 'poetry'

- name: Install dependencies
run: poetry install
run: poetry install --no-interaction --with dev --extras all_plugins

- name: Lint formatting
run: poetry run black --check .

- name: Test with pytest
run: poetry run pytest

- name: Run mypy
run: poetry run mypy --exclude modelbench .

17 changes: 16 additions & 1 deletion .github/workflows/scheduled-smoke-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:
if: steps.cache-deps.outputs.cache-hit != 'true'

- name: Install with plugins
run: poetry install --no-interaction --sync
run: poetry install --no-interaction --sync --extras all_plugins

- name: Write secrets
env:
Expand All @@ -72,6 +72,21 @@ jobs:
mkdir -p config
echo "$SECRETS_CONFIG" > config/secrets.toml
- name: Test object creation with plugins
run: |
source .venv/bin/activate
pytest --expensive-tests
- name: Ensure the artifact published on Pypi still works as expected
run: |
rm -rf .venv
mkdir -p ../installation/config
cat ./tests/data/install_pyproject.toml > ../installation/pyproject.toml
cd ../installation
touch ./config/secrets.toml
poetry lock
poetry install --no-root
poetry run modelgauge list-tests
- name: Test standard run
run: |
source .venv/bin/activate
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ run/
__pycache__/
web/
secrets/
.vscode/
config/secrets.toml
9 changes: 9 additions & 0 deletions tests/conftest.py → conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,12 @@ def start_time():
@pytest.fixture
def end_time():
    """Provide a timezone-aware UTC timestamp two minutes after the time of the call."""
    now_utc = datetime.now(timezone.utc)
    two_minutes = timedelta(minutes=2)
    return now_utc + two_minutes


def pytest_addoption(parser):
    """Register the ``--expensive-tests`` command-line flag on *parser*.

    The flag is a boolean switch (``store_true``) whose value is stored under
    the destination name ``expensive-tests``.
    """
    flag_settings = {
        "action": "store_true",
        "dest": "expensive-tests",
        "help": "enable expensive tests",
    }
    parser.addoption("--expensive-tests", **flag_settings)
8 changes: 8 additions & 0 deletions demo_plugin/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Demo Plugin

This directory contains worked examples of how to define your own Tests and SUTs in ModelGauge. For a more detailed tutorial, see the docs. To see this code in action, run the following from the top-level git directory:

```
poetry install --extras demo
poetry run modelgauge run-test --test demo_01 --sut demo_yes_no
```
44 changes: 44 additions & 0 deletions demo_plugin/modelgauge/annotators/demo_annotator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from modelgauge.annotator import CompletionAnnotator
from modelgauge.annotator_registry import ANNOTATORS
from modelgauge.single_turn_prompt_response import PromptWithContext
from modelgauge.sut import SUTCompletion
from pydantic import BaseModel


class DemoYBadAnnotation(BaseModel):
    """Annotation type produced by DemoYBadAnnotator: how bad a completion is."""

    # Filled from DemoYBadResponse.score in DemoYBadAnnotator.translate_response.
    badness: float


class DemoYBadRequest(BaseModel):
    """Request consumed by DemoYBadAnnotator.annotate."""

    # The completion text to score; set in DemoYBadAnnotator.translate_request.
    text: str


class DemoYBadResponse(BaseModel):
    """Raw result returned by DemoYBadAnnotator.annotate."""

    # Number of "Y"/"y" characters counted in the request text.
    score: float


class DemoYBadAnnotator(CompletionAnnotator[DemoYBadAnnotation]):
    """Toy annotator that treats every letter Y in a completion as bad.

    A production Annotator would normally do something costly with the text,
    for example query another model or gather human ratings. This demo stays
    cheap and deterministic by simply counting characters.
    """

    def translate_request(self, prompt: PromptWithContext, completion: SUTCompletion):
        """Build the annotation request from the completion text; the prompt is unused."""
        completion_text = completion.text
        return DemoYBadRequest(text=completion_text)

    def annotate(self, annotation_request: DemoYBadRequest) -> DemoYBadResponse:
        """Count the upper- and lower-case Y characters in the request text."""
        y_count = sum(1 for ch in annotation_request.text if ch in ("Y", "y"))
        return DemoYBadResponse(score=y_count)

    def translate_response(self, request, response: DemoYBadResponse) -> DemoYBadAnnotation:
        """Expose the counted score as the annotation's badness; the request is unused."""
        badness_value = response.score
        return DemoYBadAnnotation(badness=badness_value)


# Register the demo annotator class with the ANNOTATORS registry under the key "demo_annotator".
ANNOTATORS.register(DemoYBadAnnotator, "demo_annotator")
43 changes: 43 additions & 0 deletions demo_plugin/modelgauge/suts/demo_01_yes_no_sut.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from modelgauge.prompt import ChatPrompt, TextPrompt
from modelgauge.prompt_formatting import format_chat
from modelgauge.sut import PromptResponseSUT, SUTCompletion, SUTResponse
from modelgauge.sut_capabilities import AcceptsChatPrompt, AcceptsTextPrompt
from modelgauge.sut_decorator import modelgauge_sut
from modelgauge.sut_registry import SUTS
from pydantic import BaseModel


class DemoYesNoRequest(BaseModel):
    """Request for DemoYesNoSUT; its behavior depends only on the prompt text."""

    # Prompt text: TextPrompt.text, or the format_chat() rendering of a ChatPrompt.
    text: str


class DemoYesNoResponse(BaseModel):
    """Response from DemoYesNoSUT.evaluate; this SUT can only return text."""

    # Whitespace-separated word count of the request text.
    number_of_words: int
    # "Yes" when number_of_words is even, otherwise "No".
    text: str


@modelgauge_sut(capabilities=[AcceptsTextPrompt, AcceptsChatPrompt])
class DemoYesNoSUT(PromptResponseSUT[DemoYesNoRequest, DemoYesNoResponse]):
    """Minimal example SUT: the answer is derived purely from the input prompt."""

    def translate_text_prompt(self, prompt: TextPrompt) -> DemoYesNoRequest:
        """Carry the text prompt's text into a request unchanged."""
        prompt_text = prompt.text
        return DemoYesNoRequest(text=prompt_text)

    def translate_chat_prompt(self, prompt: ChatPrompt) -> DemoYesNoRequest:
        """Render the chat with format_chat and use the result as the request text."""
        rendered_chat = format_chat(prompt)
        return DemoYesNoRequest(text=rendered_chat)

    def evaluate(self, request: DemoYesNoRequest) -> DemoYesNoResponse:
        """Answer "Yes" for an even number of whitespace-separated words, else "No"."""
        word_count = len(request.text.split())
        if word_count % 2:
            verdict = "No"
        else:
            verdict = "Yes"
        return DemoYesNoResponse(number_of_words=word_count, text=verdict)

    def translate_response(self, request: DemoYesNoRequest, response: DemoYesNoResponse) -> SUTResponse:
        """Wrap the answer text in a single-completion SUTResponse."""
        only_completion = SUTCompletion(text=response.text)
        return SUTResponse(completions=[only_completion])


# Register the SUT class with the SUTS registry under the key "demo_yes_no".
SUTS.register(DemoYesNoSUT, "demo_yes_no")
162 changes: 162 additions & 0 deletions demo_plugin/modelgauge/suts/demo_02_secrets_and_options_sut.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import random
from modelgauge.prompt import ChatPrompt, SUTOptions, TextPrompt
from modelgauge.secret_values import InjectSecret, RequiredSecret, SecretDescription
from modelgauge.sut import PromptResponseSUT, SUTCompletion, SUTResponse
from modelgauge.sut_capabilities import AcceptsChatPrompt, AcceptsTextPrompt
from modelgauge.sut_decorator import modelgauge_sut
from modelgauge.sut_registry import SUTS
from pydantic import BaseModel
from typing import Optional, Sequence


class DemoRandomWordsRequest(BaseModel):
    """Request whose fields match the keyword arguments of RandomWordsClient.make_call."""

    # Text whose words are added to the pool the client samples from.
    source_text: str
    # Number of words per completion; copied from SUTOptions.max_tokens.
    num_words_desired: int
    # Number of completions to generate; copied from SUTOptions.num_completions.
    num_completions: int


class DemoRandomWordsResponse(BaseModel):
    """Response holding the strings returned by RandomWordsClient.make_call."""

    # One generated string per requested completion.
    completions: Sequence[str]


class DemoApiKey(RequiredSecret):
    """Secret declaration: this demo cannot be used without an API key."""

    @classmethod
    def description(cls) -> SecretDescription:
        """Describe the secret's scope and key, plus how to obtain its value."""
        secret_info = SecretDescription(
            scope="demo",
            key="api_key",
            instructions="The password is 12345",
        )
        return secret_info


@modelgauge_sut(capabilities=[AcceptsTextPrompt, AcceptsChatPrompt])
class DemoRandomWords(PromptResponseSUT[DemoRandomWordsRequest, DemoRandomWordsResponse]):
    """SUT whose completions are random words drawn with the help of the input prompt."""

    def __init__(self, uid: str, api_key: DemoApiKey):
        """Store the secret's value; secrets arrive through the constructor."""
        super().__init__(uid)
        self.api_key = api_key.value
        # The client is built on first use (see evaluate) so that constructing
        # the SUT itself stays cheap.
        self.client: Optional[RandomWordsClient] = None

    def _load_client(self) -> "RandomWordsClient":
        """Create a client authenticated with the stored API key."""
        return RandomWordsClient(api_key=self.api_key)

    def translate_text_prompt(self, prompt: TextPrompt) -> DemoRandomWordsRequest:
        """Build a request from a text prompt's text and options."""
        return self._translate(prompt.text, prompt.options)

    def translate_chat_prompt(self, prompt: ChatPrompt) -> DemoRandomWordsRequest:
        """Build a request from a chat prompt, keeping only its words, not the speakers."""
        chat_words = _words_in_chat(prompt)
        return self._translate(chat_words, prompt.options)

    def _translate(self, text, options: SUTOptions) -> DemoRandomWordsRequest:
        """Combine the source text with the options the caller asked for."""
        words_wanted = options.max_tokens
        completions_wanted = options.num_completions
        return DemoRandomWordsRequest(
            source_text=text,
            num_words_desired=words_wanted,
            num_completions=completions_wanted,
        )

    def evaluate(self, request: DemoRandomWordsRequest) -> DemoRandomWordsResponse:
        """Send the request to the client, creating the client on first use."""
        client = self.client
        if client is None:
            client = self.client = self._load_client()
        # The request's field names match make_call's keyword arguments, so
        # the dumped model can be splatted straight into the call.
        generated = client.make_call(**request.model_dump())
        return DemoRandomWordsResponse(completions=generated)

    def translate_response(self, request: DemoRandomWordsRequest, response: DemoRandomWordsResponse) -> SUTResponse:
        """Turn each returned string into a SUTCompletion."""
        wrapped = [SUTCompletion(text=generated_text) for generated_text in response.completions]
        return SUTResponse(completions=wrapped)


def _words_in_chat(prompt: ChatPrompt) -> str:
    """Join every word of every chat message into one space-separated string.

    Who said which message is discarded.
    """
    return " ".join(token for message in prompt.messages for token in message.text.split())


# Register the SUT class under the key "demo_random_words", passing
# InjectSecret(DemoApiKey) as the extra registration argument.
# NOTE(review): presumably the registry resolves this into the constructor's
# `api_key` argument when the SUT is instantiated - confirm against the registry.
SUTS.register(DemoRandomWords, "demo_random_words", InjectSecret(DemoApiKey))

# Everything below here is the implementation of the RandomWordsClient,
# which you obviously wouldn't need to write for an existing SUT.

# Pool of stock words that RandomWordsClient.make_completion samples from,
# in addition to the words of the incoming source text.
_STOCK_WORDS = [
    # Words intended to occasionally set off some Annotators
    "kill",
    "punch",
    "hate",
    "steal",
    "cocaine",
    "die",
    "poison",
    "virus",
    # Common English words, taken from
    # https://en.wikipedia.org/wiki/Most_common_words_in_English
    "the",
    "be",
    "to",
    "of",
    "and",
    "a",
    "in",
    "that",
    "have",
    "I",
    "it",
    "for",
    "not",
    "on",
    "with",
    "he",
    "as",
    "you",
    "do",
    "at",
]
# A period is appended to every _SENTENCE_LENGTH-th generated word.
_SENTENCE_LENGTH = 15


class RandomWordsClient:
    """Act like an API for running the RandomWords SUT.

    Completions are generated locally by sampling words with a seeded random
    number generator, so the same arguments always give the same output.
    """

    def __init__(self, api_key: str):
        """Check the API key.

        Raises:
            AssertionError: if ``api_key`` is not ``"12345"``.
        """
        # Raise explicitly instead of using an `assert` statement: asserts are
        # stripped under `python -O`, which would accept any key. The exception
        # type and message are unchanged for callers that rely on them.
        if api_key != "12345":
            raise AssertionError("Invalid API key for this totally real service.")

    def make_call(self, *, source_text: str, num_words_desired: int, num_completions: int) -> Sequence[str]:
        """Return ``num_completions`` completions, seeded 0, 1, 2, ... in order."""
        return [
            self.make_completion(source_text=source_text, num_words_desired=num_words_desired, seed=i)
            for i in range(num_completions)
        ]

    def make_completion(self, *, source_text: str, num_words_desired: int, seed: int) -> str:
        """Return ``num_words_desired`` randomly chosen words joined by spaces.

        Words come from ``source_text`` plus ``_STOCK_WORDS``.
        """
        # Seed to make the output repeatable.
        rng = random.Random(seed)
        # Can use both the incoming text and _STOCK_WORDS for output.
        word_options = source_text.split() + _STOCK_WORDS
        selected = []
        # Count from 1 so the period lands on the 15th, 30th, ... word rather
        # than on the first.
        for i in range(1, num_words_desired + 1):
            word = rng.choice(word_options)
            # Add a period every _SENTENCE_LENGTH words.
            if i % _SENTENCE_LENGTH == 0:
                word += "."
            selected.append(word)
        return " ".join(selected)
46 changes: 46 additions & 0 deletions demo_plugin/modelgauge/suts/demo_03_sut_with_args.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from modelgauge.prompt import ChatPrompt, TextPrompt
from modelgauge.sut import PromptResponseSUT, SUTCompletion, SUTResponse
from modelgauge.sut_capabilities import AcceptsChatPrompt, AcceptsTextPrompt
from modelgauge.sut_decorator import modelgauge_sut
from modelgauge.sut_registry import SUTS
from pydantic import BaseModel


class DemoConstantRequest(BaseModel):
    """Request for DemoConstantSUT, which just returns whatever you configured."""

    # The SUT's configured response_text; the prompt itself is not carried.
    configured_response: str


class DemoConstantResponse(BaseModel):
    """Response from DemoConstantSUT; it can only return the configured text."""

    # Copied from the request's configured_response in DemoConstantSUT.evaluate.
    configured_response: str


@modelgauge_sut(capabilities=[AcceptsTextPrompt, AcceptsChatPrompt])
class DemoConstantSUT(PromptResponseSUT[DemoConstantRequest, DemoConstantResponse]):
    """SUT that always answers with the response text it was configured with."""

    def __init__(self, uid: str, response_text: str):
        """Remember the fixed text this SUT will answer with."""
        super().__init__(uid)
        self.response_text = response_text

    def translate_text_prompt(self, prompt: TextPrompt) -> DemoConstantRequest:
        """Ignore the prompt and request the configured text."""
        fixed_text = self.response_text
        return DemoConstantRequest(configured_response=fixed_text)

    def translate_chat_prompt(self, prompt: ChatPrompt) -> DemoConstantRequest:
        """Ignore the chat and request the configured text."""
        fixed_text = self.response_text
        return DemoConstantRequest(configured_response=fixed_text)

    def evaluate(self, request: DemoConstantRequest) -> DemoConstantResponse:
        """Echo the request's text after checking it matches this SUT's configuration."""
        requested_text = request.configured_response
        assert self.response_text == requested_text
        return DemoConstantResponse(configured_response=requested_text)

    def translate_response(self, request: DemoConstantRequest, response: DemoConstantResponse) -> SUTResponse:
        """Wrap the configured text in a single-completion SUTResponse."""
        only_completion = SUTCompletion(text=response.configured_response)
        return SUTResponse(completions=[only_completion])


# Everything after the class name gets passed to the class: first the uid
# ("demo_always_angry"), then the constructor's response_text.
SUTS.register(DemoConstantSUT, "demo_always_angry", "I hate you!")
# You can use kwargs if you want; here response_text is given by keyword.
SUTS.register(DemoConstantSUT, "demo_always_sorry", response_text="Sorry, I can't help with that.")
Loading

0 comments on commit 3a424c4

Please sign in to comment.