Commit

General formatting (#57)
* set black line length to 100; ensure tests have expectations last

* set black line length to 120
dhosterman authored Feb 1, 2024
1 parent 6776971 commit 4b7899d
Showing 7 changed files with 40 additions and 84 deletions.
pyproject.toml (3 changes: 3 additions & 0 deletions)
@@ -34,3 +34,6 @@ pythonpath = [
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
+
+[tool.black]
+line-length = 120
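With this section in place, black reads its line-length setting directly from pyproject.toml, so no extra flags are needed on the command line. A minimal sketch of a typical invocation from the repository root, assuming black is installed as a dev dependency in the poetry environment (the paths come from this diff; the command itself is illustrative):

    poetry run black src tests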
src/coffee/benchmark.py (4 changes: 1 addition & 3 deletions)
@@ -95,9 +95,7 @@ def __init__(
         self.harm_scores = harm_scores
 
     def value(self):
-        return sum([s.normalized_value() for s in self.harm_scores]) / len(
-            self.harm_scores
-        )
+        return sum([s.normalized_value() for s in self.harm_scores]) / len(self.harm_scores)
 
     def stars(self):
         return float_to_stars(self.value())
src/coffee/helm.py (29 changes: 7 additions & 22 deletions)
@@ -128,13 +128,9 @@ def load_scores(self):
                 if t.__class__ == BbqHelmTest:
                     test_sut_scores = self.hackily_extract_bbq_scores(focus, glob_path)
                 elif t.__class__ == RealToxicityPromptsHelmTest:
-                    test_sut_scores = self.hackily_extract_toxicity_scores(
-                        focus, glob_path
-                    )
+                    test_sut_scores = self.hackily_extract_toxicity_scores(focus, glob_path)
                 else:
-                    raise NotImplementedError(
-                        f"need to add score extration for {t.__class__}"
-                    )
+                    raise NotImplementedError(f"need to add score extration for {t.__class__}")
                 result.add(t, s, test_sut_scores)
         return result

@@ -225,15 +221,11 @@ def _build_runspecs(self, suts, tests):
             runspecs.append(r + separator + "model=" + s.key)
         return runspecs
 
-    def _execute(
-        self, command: List[str], output_dir: pathlib.Path
-    ) -> subprocess.CompletedProcess:
+    def _execute(self, command: List[str], output_dir: pathlib.Path) -> subprocess.CompletedProcess:
         if coffee.app_config.debug:
             return self._run_with_debug_settings(command, output_dir)
         else:
-            return subprocess.run(
-                " ".join(command), shell=True, capture_output=True, cwd=output_dir
-            )
+            return subprocess.run(" ".join(command), shell=True, capture_output=True, cwd=output_dir)
 
     def _run_with_debug_settings(self, command, output_dir):
         with subprocess.Popen(
@@ -257,13 +249,8 @@ def _make_output_dir(self):
         return o
 
     def _helm_command_for_runspecs(self, bbq_runspecs, max_instances):
-        command = [
-            "python "
-            + str(pathlib.Path(__file__).parent.parent / "dubious_helm_cli_wrapper.py")
-        ]
-        command.extend(
-            ["--suite", "v1"]
-        )  # this is a fixed string for now, which is probably wrong
+        command = ["python " + str(pathlib.Path(__file__).parent.parent / "dubious_helm_cli_wrapper.py")]
+        command.extend(["--suite", "v1"])  # this is a fixed string for now, which is probably wrong
         command.extend(["-n", "1"])  # working around a bug
         command.extend(["--max-eval-instances", str(max_instances)])

@@ -276,7 +263,5 @@ def _model_deployment_conf(self, sut: HelmSut):
             "name": sut.key,
             "tokenizer_name": sut.tokenizer_name,
             "max_sequence_length": sut.tokenizer_max_length,
-            "client_spec": {
-                "class_name": "helm.proxy.clients.huggingface_client.HuggingFaceClient"
-            },
+            "client_spec": {"class_name": "helm.proxy.clients.huggingface_client.HuggingFaceClient"},
         }
src/coffee/run.py (18 changes: 4 additions & 14 deletions)
@@ -37,27 +37,19 @@ def cli(output_dir: pathlib.Path, max_instances: int, debug: bool, web_only) ->
     suts = [HelmSut.GPT2, HelmSut.PYTHIA_70M, HelmSut.FB_OPT_125M]
     benchmark_scores = []
     for benchmark_definition in [GeneralChatBotBenchmarkDefinition()]:
-        print(
-            termcolor.colored(
-                f'Starting run for benchmark "{benchmark_definition.name()}"', "green"
-            )
-        )
+        print(termcolor.colored(f'Starting run for benchmark "{benchmark_definition.name()}"', "green"))
         harm_scores_by_sut = defaultdict(list)
         for harm in benchmark_definition.harms():
             print(termcolor.colored(f' Examining harm "term{harm.name()}"', "yellow"))
 
             if web_only:
                 # this is a little sketchy for now, a quick fix to make testing HTML changes easier
-                tests = itertools.chain(
-                    *[harm.tests() for harm in benchmark_definition.harms()]
-                )
+                tests = itertools.chain(*[harm.tests() for harm in benchmark_definition.harms()])
                 result = HelmResult(list(tests), suts, pathlib.Path("./run"), None)
             else:
                 result = runner.run(harm.tests(), suts, max_instances)
             if not result.success():
-                print(
-                    f"HELM execution failed with return code {result.execution_result.returncode}:"
-                )
+                print(f"HELM execution failed with return code {result.execution_result.returncode}:")
                 print("stdout:")
                 print(result.helm_stdout())
                 print("stderr:")
@@ -75,9 +67,7 @@ def cli(output_dir: pathlib.Path, max_instances: int, debug: bool, web_only) ->
                 )
                 harm_scores_by_sut[sut].append(score)
         for sut in suts:
-            benchmark_scores.append(
-                BenchmarkScore(benchmark_definition, sut, harm_scores_by_sut[sut])
-            )
+            benchmark_scores.append(BenchmarkScore(benchmark_definition, sut, harm_scores_by_sut[sut]))
     print()
     print(termcolor.colored(f"Benchmarking complete, rendering reports...", "green"))
     static_site_generator = StaticSiteGenerator()
src/coffee/static_site_generator.py (33 changes: 8 additions & 25 deletions)
@@ -36,11 +36,7 @@ def display_stars(score, size) -> Markup:
     <path d="M3.612 15.443c-.386.198-.824-.149-.746-.592l.83-4.73L.173 6.765c-.329-.314-.158-.888.283-.95l4.898-.696L7.538.792c.197-.39.73-.39.927 0l2.184 4.327 4.898.696c.441.062.612.636.282.95l-3.522 3.356.83 4.73c.078.443-.36.79-.746.592L8 13.187l-4.389 2.256z"/>
     </svg></span>
     """
-    final_html = (
-        (stars_html * stars)
-        + (half_star_html * half_star)
-        + (empty_stars_html * empty_stars)
-    )
+    final_html = (stars_html * stars) + (half_star_html * half_star) + (empty_stars_html * empty_stars)
     return Markup(final_html)


@@ -93,9 +89,7 @@ def display_stars(score, size) -> Markup:
 
 class StaticSiteGenerator:
     def __init__(self) -> None:
-        self.env = Environment(
-            loader=PackageLoader("coffee"), autoescape=select_autoescape()
-        )
+        self.env = Environment(loader=PackageLoader("coffee"), autoescape=select_autoescape())
         self.env.filters["display_stars"] = display_stars
 
     def _template_dir(self):
@@ -141,37 +135,26 @@ def _grouped_benchmark_scores(self, benchmark_scores: list[BenchmarkScore]) -> d
             benchmark_scores_dict[benchmark_definition] = grouped_benchmark_scores_list
         return benchmark_scores_dict
 
-    def _generate_benchmarks_page(
-        self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path
-    ) -> None:
+    def _generate_benchmarks_page(self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path) -> None:
         self._write_file(
             output=output_dir / "benchmarks.html",
             template_name="benchmarks.html",
             grouped_benchmark_scores=self._grouped_benchmark_scores(benchmark_scores),
             show_benchmark_header=True,
         )
 
-    def _generate_benchmark_pages(
-        self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path
-    ) -> None:
-        for benchmark_definition, benchmark_scores in self._grouped_benchmark_scores(
-            benchmark_scores
-        ).items():
+    def _generate_benchmark_pages(self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path) -> None:
+        for benchmark_definition, benchmark_scores in self._grouped_benchmark_scores(benchmark_scores).items():
             for benchmark_score in benchmark_scores:
                 self._write_file(
-                    output=output_dir
-                    / f"{benchmark_score.benchmark_definition.path_name()}.html",
+                    output=output_dir / f"{benchmark_score.benchmark_definition.path_name()}.html",
                     template_name="benchmark.html",
                     benchmark_definition=benchmark_definition,
-                    grouped_benchmark_scores=self._grouped_benchmark_scores(
-                        benchmark_scores
-                    ),
+                    grouped_benchmark_scores=self._grouped_benchmark_scores(benchmark_scores),
                     stars_description=STARS_DESCRIPTION,
                 )
 
-    def _generate_test_report_pages(
-        self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path
-    ) -> None:
+    def _generate_test_report_pages(self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path) -> None:
         for benchmark_score in benchmark_scores:
             self._write_file(
                 output=output_dir
tests/test_benchmark.py (18 changes: 9 additions & 9 deletions)
@@ -132,14 +132,14 @@ def test_toxicity_scoring(datafiles):
 
 
 def test_quantize_stars():
-    assert 0 == quantize_stars(0)
-    assert 5 == quantize_stars(5)
+    assert quantize_stars(0) == 0
+    assert quantize_stars(5) == 5
 
-    assert 4.5 == quantize_stars(4.5)
+    assert quantize_stars(4.5) == 4.5
 
-    assert 4.5 == quantize_stars(4.26)
-    assert 4.5 == quantize_stars(4.3)
-    assert 4.5 == quantize_stars(4.4)
-    assert 4.5 == quantize_stars(4.6)
-    assert 4.5 == quantize_stars(4.7)
-    assert 4.5 == quantize_stars(4.74)
+    assert quantize_stars(4.26) == 4.5
+    assert quantize_stars(4.3) == 4.5
+    assert quantize_stars(4.4) == 4.5
+    assert quantize_stars(4.6) == 4.5
+    assert quantize_stars(4.7) == 4.5
+    assert quantize_stars(4.74) == 4.5
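The flipped comparisons above follow the "expectations last" convention named in the commit message: the value under test sits on the left of == and the expected literal on the right, so failures read as actual versus expected. A minimal illustrative sketch of the pattern, using a made-up function rather than code from this repository:

    # hypothetical example, not part of the coffee codebase
    def double(x: int) -> int:
        return x * 2

    def test_double():
        # actual result on the left, expected value on the right
        assert double(21) == 42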
tests/test_helm_runner.py (19 changes: 8 additions & 11 deletions)
@@ -23,8 +23,8 @@ def test_cli_helm_runner_command(cwd_tmpdir):
     runner.run([BbqHelmTest()], [HelmSut.GPT2])
     shell_arguments = runner._execute.call_args.args[0]
     runspecs = shell_arguments[shell_arguments.index("-r") + 1 :]
-    assert "bbq:subject=Age,model=openai/gpt2" == runspecs[0]
-    assert len(BbqHelmTest.CATEGORIES) == len(runspecs)
+    assert runspecs[0] == "bbq:subject=Age,model=openai/gpt2"
+    assert len(runspecs) == len(BbqHelmTest.CATEGORIES)
 
 
 def test_runspec_without_params():
@@ -56,10 +56,7 @@ def test_cli_helm_runner_command_handles_huggingface_models_with_config(cwd_tmpd
     assert fb["name"] == HelmSut.FB_OPT_125M.key
     assert fb["tokenizer_name"] == HelmSut.FB_OPT_125M.tokenizer_name
     assert fb["max_sequence_length"] == HelmSut.FB_OPT_125M.tokenizer_max_length
-    assert (
-        fb["client_spec"]["class_name"]
-        == "helm.proxy.clients.huggingface_client.HuggingFaceClient"
-    )
+    assert fb["client_spec"]["class_name"] == "helm.proxy.clients.huggingface_client.HuggingFaceClient"
 
 
 @pytest.mark.datafiles(SIMPLE_BBQ_DATA)
@@ -68,8 +65,8 @@ def test_read_scores(datafiles):
     scores = hr.load_scores()
     sut_scores = scores.for_sut(HelmSut.GPT2)
     assert "BbqHelmTest" in sut_scores
-    assert 2 == len(sut_scores["BbqHelmTest"])
-    assert 0.7 == sut_scores["BbqHelmTest"]["Age"]["bbq_accuracy"]
+    assert len(sut_scores["BbqHelmTest"]) == 2
+    assert sut_scores["BbqHelmTest"]["Age"]["bbq_accuracy"] == 0.7
 
 
 def test_helmsut_basics():
@@ -78,6 +75,6 @@ def test_helmsut_basics():
 
 
 def test_helmsut_huggingface():
-    assert HelmSut.GPT2.huggingface == False
-    assert HelmSut.FB_OPT_125M.huggingface == True
-    assert HelmSut.PYTHIA_70M.huggingface == True
+    assert HelmSut.GPT2.huggingface is False
+    assert HelmSut.FB_OPT_125M.huggingface is True
+    assert HelmSut.PYTHIA_70M.huggingface is True
