General formatting #57

Merged (2 commits) on Feb 1, 2024

pyproject.toml (3 additions, 0 deletions)
@@ -34,3 +34,6 @@ pythonpath = [
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

+ [tool.black]
+ line-length = 120
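
Reviewer aside, not part of the diff: with this [tool.black] section in place, black picks the 120-character limit up from pyproject.toml, which is what produces the collapsed lines in the rest of this PR. A minimal sketch of the effect using what I believe is black's Python API (black.format_str and black.Mode); the snippet is illustrative and not code from this repository:

import black

# A wrapped call of the kind this PR collapses (compare src/coffee/benchmark.py below).
src = '''\
def value(self):
    return sum([s.normalized_value() for s in self.harm_scores]) / len(
        self.harm_scores
    )
'''

# With line_length=120 the joined body fits, so black emits it on a single line.
print(black.format_str(src, mode=black.Mode(line_length=120)))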

src/coffee/benchmark.py (1 addition, 3 deletions)
@@ -95,9 +95,7 @@ def __init__(
self.harm_scores = harm_scores

def value(self):
- return sum([s.normalized_value() for s in self.harm_scores]) / len(
-     self.harm_scores
- )
+ return sum([s.normalized_value() for s in self.harm_scores]) / len(self.harm_scores)

def stars(self):
return float_to_stars(self.value())
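
Side note for context, not part of the change: value() is just the arithmetic mean of the normalized harm scores, which stars() then maps onto the star scale. A tiny illustrative sketch, using a hypothetical stand-in for the harm score objects since their class isn't shown in this diff:

class FakeHarmScore:
    def __init__(self, v):
        self._v = v

    def normalized_value(self):
        return self._v


harm_scores = [FakeHarmScore(0.6), FakeHarmScore(0.8), FakeHarmScore(1.0)]
# Same expression as the reformatted return above: the mean of the normalized values.
print(sum([s.normalized_value() for s in harm_scores]) / len(harm_scores))  # ~0.8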

src/coffee/helm.py (7 additions, 22 deletions)
@@ -128,13 +128,9 @@ def load_scores(self):
if t.__class__ == BbqHelmTest:
test_sut_scores = self.hackily_extract_bbq_scores(focus, glob_path)
elif t.__class__ == RealToxicityPromptsHelmTest:
- test_sut_scores = self.hackily_extract_toxicity_scores(
-     focus, glob_path
- )
+ test_sut_scores = self.hackily_extract_toxicity_scores(focus, glob_path)
else:
- raise NotImplementedError(
-     f"need to add score extration for {t.__class__}"
- )
+ raise NotImplementedError(f"need to add score extration for {t.__class__}")
result.add(t, s, test_sut_scores)
return result

@@ -225,15 +221,11 @@ def _build_runspecs(self, suts, tests):
runspecs.append(r + separator + "model=" + s.key)
return runspecs

- def _execute(
-     self, command: List[str], output_dir: pathlib.Path
- ) -> subprocess.CompletedProcess:
+ def _execute(self, command: List[str], output_dir: pathlib.Path) -> subprocess.CompletedProcess:
if coffee.app_config.debug:
return self._run_with_debug_settings(command, output_dir)
else:
- return subprocess.run(
-     " ".join(command), shell=True, capture_output=True, cwd=output_dir
- )
+ return subprocess.run(" ".join(command), shell=True, capture_output=True, cwd=output_dir)

def _run_with_debug_settings(self, command, output_dir):
with subprocess.Popen(
@@ -257,13 +249,8 @@ def _make_output_dir(self):
return o

def _helm_command_for_runspecs(self, bbq_runspecs, max_instances):
- command = [
-     "python "
-     + str(pathlib.Path(__file__).parent.parent / "dubious_helm_cli_wrapper.py")
- ]
- command.extend(
-     ["--suite", "v1"]
- )  # this is a fixed string for now, which is probably wrong
+ command = ["python " + str(pathlib.Path(__file__).parent.parent / "dubious_helm_cli_wrapper.py")]
+ command.extend(["--suite", "v1"])  # this is a fixed string for now, which is probably wrong
command.extend(["-n", "1"]) # working around a bug
command.extend(["--max-eval-instances", str(max_instances)])

@@ -276,7 +263,5 @@ def _model_deployment_conf(self, sut: HelmSut):
"name": sut.key,
"tokenizer_name": sut.tokenizer_name,
"max_sequence_length": sut.tokenizer_max_length,
"client_spec": {
"class_name": "helm.proxy.clients.huggingface_client.HuggingFaceClient"
},
"client_spec": {"class_name": "helm.proxy.clients.huggingface_client.HuggingFaceClient"},
}

src/coffee/run.py (4 additions, 14 deletions)
@@ -37,27 +37,19 @@ def cli(output_dir: pathlib.Path, max_instances: int, debug: bool, web_only) ->
suts = [HelmSut.GPT2, HelmSut.PYTHIA_70M, HelmSut.FB_OPT_125M]
benchmark_scores = []
for benchmark_definition in [GeneralChatBotBenchmarkDefinition()]:
- print(
-     termcolor.colored(
-         f'Starting run for benchmark "{benchmark_definition.name()}"', "green"
-     )
- )
+ print(termcolor.colored(f'Starting run for benchmark "{benchmark_definition.name()}"', "green"))
harm_scores_by_sut = defaultdict(list)
for harm in benchmark_definition.harms():
print(termcolor.colored(f' Examining harm "term{harm.name()}"', "yellow"))

if web_only:
# this is a little sketchy for now, a quick fix to make testing HTML changes easier
- tests = itertools.chain(
-     *[harm.tests() for harm in benchmark_definition.harms()]
- )
+ tests = itertools.chain(*[harm.tests() for harm in benchmark_definition.harms()])
result = HelmResult(list(tests), suts, pathlib.Path("./run"), None)
else:
result = runner.run(harm.tests(), suts, max_instances)
if not result.success():
- print(
-     f"HELM execution failed with return code {result.execution_result.returncode}:"
- )
+ print(f"HELM execution failed with return code {result.execution_result.returncode}:")
print("stdout:")
print(result.helm_stdout())
print("stderr:")
@@ -75,9 +67,7 @@ def cli(output_dir: pathlib.Path, max_instances: int, debug: bool, web_only) ->
)
harm_scores_by_sut[sut].append(score)
for sut in suts:
- benchmark_scores.append(
-     BenchmarkScore(benchmark_definition, sut, harm_scores_by_sut[sut])
- )
+ benchmark_scores.append(BenchmarkScore(benchmark_definition, sut, harm_scores_by_sut[sut]))
print()
print(termcolor.colored(f"Benchmarking complete, rendering reports...", "green"))
static_site_generator = StaticSiteGenerator()

src/coffee/static_site_generator.py (8 additions, 25 deletions)
@@ -36,11 +36,7 @@ def display_stars(score, size) -> Markup:
<path d="M3.612 15.443c-.386.198-.824-.149-.746-.592l.83-4.73L.173 6.765c-.329-.314-.158-.888.283-.95l4.898-.696L7.538.792c.197-.39.73-.39.927 0l2.184 4.327 4.898.696c.441.062.612.636.282.95l-3.522 3.356.83 4.73c.078.443-.36.79-.746.592L8 13.187l-4.389 2.256z"/>
</svg></span>
"""
- final_html = (
-     (stars_html * stars)
-     + (half_star_html * half_star)
-     + (empty_stars_html * empty_stars)
- )
+ final_html = (stars_html * stars) + (half_star_html * half_star) + (empty_stars_html * empty_stars)
return Markup(final_html)


@@ -93,9 +89,7 @@ def display_stars(score, size) -> Markup:

class StaticSiteGenerator:
def __init__(self) -> None:
- self.env = Environment(
-     loader=PackageLoader("coffee"), autoescape=select_autoescape()
- )
+ self.env = Environment(loader=PackageLoader("coffee"), autoescape=select_autoescape())
self.env.filters["display_stars"] = display_stars

def _template_dir(self):
@@ -141,37 +135,26 @@ def _grouped_benchmark_scores(self, benchmark_scores: list[BenchmarkScore]) -> d
benchmark_scores_dict[benchmark_definition] = grouped_benchmark_scores_list
return benchmark_scores_dict

- def _generate_benchmarks_page(
-     self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path
- ) -> None:
+ def _generate_benchmarks_page(self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path) -> None:
self._write_file(
output=output_dir / "benchmarks.html",
template_name="benchmarks.html",
grouped_benchmark_scores=self._grouped_benchmark_scores(benchmark_scores),
show_benchmark_header=True,
)

- def _generate_benchmark_pages(
-     self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path
- ) -> None:
-     for benchmark_definition, benchmark_scores in self._grouped_benchmark_scores(
-         benchmark_scores
-     ).items():
+ def _generate_benchmark_pages(self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path) -> None:
+     for benchmark_definition, benchmark_scores in self._grouped_benchmark_scores(benchmark_scores).items():
for benchmark_score in benchmark_scores:
self._write_file(
- output=output_dir
- / f"{benchmark_score.benchmark_definition.path_name()}.html",
+ output=output_dir / f"{benchmark_score.benchmark_definition.path_name()}.html",
template_name="benchmark.html",
benchmark_definition=benchmark_definition,
- grouped_benchmark_scores=self._grouped_benchmark_scores(
-     benchmark_scores
- ),
+ grouped_benchmark_scores=self._grouped_benchmark_scores(benchmark_scores),
stars_description=STARS_DESCRIPTION,
)

- def _generate_test_report_pages(
-     self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path
- ) -> None:
+ def _generate_test_report_pages(self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path) -> None:
for benchmark_score in benchmark_scores:
self._write_file(
output=output_dir

tests/test_benchmark.py (9 additions, 9 deletions)
@@ -132,14 +132,14 @@ def test_toxicity_scoring(datafiles):


def test_quantize_stars():
- assert 0 == quantize_stars(0)
- assert 5 == quantize_stars(5)
+ assert quantize_stars(0) == 0
+ assert quantize_stars(5) == 5

- assert 4.5 == quantize_stars(4.5)
+ assert quantize_stars(4.5) == 4.5

- assert 4.5 == quantize_stars(4.26)
- assert 4.5 == quantize_stars(4.3)
- assert 4.5 == quantize_stars(4.4)
- assert 4.5 == quantize_stars(4.6)
- assert 4.5 == quantize_stars(4.7)
- assert 4.5 == quantize_stars(4.74)
+ assert quantize_stars(4.26) == 4.5
+ assert quantize_stars(4.3) == 4.5
+ assert quantize_stars(4.4) == 4.5
+ assert quantize_stars(4.6) == 4.5
+ assert quantize_stars(4.7) == 4.5
+ assert quantize_stars(4.74) == 4.5
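
These assertions read as: quantize_stars rounds to the nearest half star. The project's implementation isn't shown in this diff, but a function consistent with every assertion above would be the usual round-to-nearest-half sketch:

def quantize_stars(raw_score):
    # Round to the nearest 0.5 (e.g. 4.26..4.74 -> 4.5); matches the asserts above,
    # though the real implementation in this repo may differ.
    return round(2 * raw_score) / 2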

tests/test_helm_runner.py (8 additions, 11 deletions)
@@ -23,8 +23,8 @@ def test_cli_helm_runner_command(cwd_tmpdir):
runner.run([BbqHelmTest()], [HelmSut.GPT2])
shell_arguments = runner._execute.call_args.args[0]
runspecs = shell_arguments[shell_arguments.index("-r") + 1 :]
assert "bbq:subject=Age,model=openai/gpt2" == runspecs[0]
assert len(BbqHelmTest.CATEGORIES) == len(runspecs)
assert runspecs[0] == "bbq:subject=Age,model=openai/gpt2"
assert len(runspecs) == len(BbqHelmTest.CATEGORIES)


def test_runspec_without_params():
@@ -56,10 +56,7 @@ def test_cli_helm_runner_command_handles_huggingface_models_with_config(cwd_tmpd
assert fb["name"] == HelmSut.FB_OPT_125M.key
assert fb["tokenizer_name"] == HelmSut.FB_OPT_125M.tokenizer_name
assert fb["max_sequence_length"] == HelmSut.FB_OPT_125M.tokenizer_max_length
- assert (
-     fb["client_spec"]["class_name"]
-     == "helm.proxy.clients.huggingface_client.HuggingFaceClient"
- )
+ assert fb["client_spec"]["class_name"] == "helm.proxy.clients.huggingface_client.HuggingFaceClient"


@pytest.mark.datafiles(SIMPLE_BBQ_DATA)
@@ -68,8 +65,8 @@ def test_read_scores(datafiles):
scores = hr.load_scores()
sut_scores = scores.for_sut(HelmSut.GPT2)
assert "BbqHelmTest" in sut_scores
- assert 2 == len(sut_scores["BbqHelmTest"])
- assert 0.7 == sut_scores["BbqHelmTest"]["Age"]["bbq_accuracy"]
+ assert len(sut_scores["BbqHelmTest"]) == 2
+ assert sut_scores["BbqHelmTest"]["Age"]["bbq_accuracy"] == 0.7


def test_helmsut_basics():
@@ -78,6 +75,6 @@ def test_helmsut_basics():


def test_helmsut_huggingface():
- assert HelmSut.GPT2.huggingface == False
- assert HelmSut.FB_OPT_125M.huggingface == True
- assert HelmSut.PYTHIA_70M.huggingface == True
+ assert HelmSut.GPT2.huggingface is False
+ assert HelmSut.FB_OPT_125M.huggingface is True
+ assert HelmSut.PYTHIA_70M.huggingface is True
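
The switch from == to is here is the usual PEP 8 point about boolean singletons; a small illustration of the difference (illustrative only, not code from the repo):

flag = 1
assert flag == True        # truthy, equal-valued objects pass an equality check
assert not (flag is True)  # but only the actual bool singleton passes an identity check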