Commit

General formatting (#57)
* set black line length to 100; ensure tests have expectations last

* set black line length to 120
dhosterman authored Feb 1, 2024
1 parent 6776971 commit 4b7899d
Showing 7 changed files with 40 additions and 84 deletions.
pyproject.toml (3 changes: 3 additions & 0 deletions)
@@ -34,3 +34,6 @@ pythonpath = [
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
+
+[tool.black]
+line-length = 120
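With this section in place, black reads its line-length setting directly from pyproject.toml, so no extra flags are needed on the command line. A minimal sketch of a typical invocation from the repository root, assuming black is installed as a dev dependency in the poetry environment (the paths come from this diff; the command itself is illustrative):

    poetry run black src tests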
src/coffee/benchmark.py (4 changes: 1 addition & 3 deletions)
@@ -95,9 +95,7 @@ def __init__(
         self.harm_scores = harm_scores
 
     def value(self):
-        return sum([s.normalized_value() for s in self.harm_scores]) / len(
-            self.harm_scores
-        )
+        return sum([s.normalized_value() for s in self.harm_scores]) / len(self.harm_scores)
 
     def stars(self):
         return float_to_stars(self.value())
src/coffee/helm.py (29 changes: 7 additions & 22 deletions)
@@ -128,13 +128,9 @@ def load_scores(self):
                 if t.__class__ == BbqHelmTest:
                     test_sut_scores = self.hackily_extract_bbq_scores(focus, glob_path)
                 elif t.__class__ == RealToxicityPromptsHelmTest:
-                    test_sut_scores = self.hackily_extract_toxicity_scores(
-                        focus, glob_path
-                    )
+                    test_sut_scores = self.hackily_extract_toxicity_scores(focus, glob_path)
                 else:
-                    raise NotImplementedError(
-                        f"need to add score extration for {t.__class__}"
-                    )
+                    raise NotImplementedError(f"need to add score extration for {t.__class__}")
                 result.add(t, s, test_sut_scores)
         return result

@@ -225,15 +221,11 @@ def _build_runspecs(self, suts, tests):
             runspecs.append(r + separator + "model=" + s.key)
         return runspecs
 
-    def _execute(
-        self, command: List[str], output_dir: pathlib.Path
-    ) -> subprocess.CompletedProcess:
+    def _execute(self, command: List[str], output_dir: pathlib.Path) -> subprocess.CompletedProcess:
         if coffee.app_config.debug:
             return self._run_with_debug_settings(command, output_dir)
         else:
-            return subprocess.run(
-                " ".join(command), shell=True, capture_output=True, cwd=output_dir
-            )
+            return subprocess.run(" ".join(command), shell=True, capture_output=True, cwd=output_dir)
 
     def _run_with_debug_settings(self, command, output_dir):
         with subprocess.Popen(
@@ -257,13 +249,8 @@ def _make_output_dir(self):
         return o
 
     def _helm_command_for_runspecs(self, bbq_runspecs, max_instances):
-        command = [
-            "python "
-            + str(pathlib.Path(__file__).parent.parent / "dubious_helm_cli_wrapper.py")
-        ]
-        command.extend(
-            ["--suite", "v1"]
-        )  # this is a fixed string for now, which is probably wrong
+        command = ["python " + str(pathlib.Path(__file__).parent.parent / "dubious_helm_cli_wrapper.py")]
+        command.extend(["--suite", "v1"])  # this is a fixed string for now, which is probably wrong
         command.extend(["-n", "1"])  # working around a bug
         command.extend(["--max-eval-instances", str(max_instances)])

@@ -276,7 +263,5 @@ def _model_deployment_conf(self, sut: HelmSut):
             "name": sut.key,
             "tokenizer_name": sut.tokenizer_name,
             "max_sequence_length": sut.tokenizer_max_length,
-            "client_spec": {
-                "class_name": "helm.proxy.clients.huggingface_client.HuggingFaceClient"
-            },
+            "client_spec": {"class_name": "helm.proxy.clients.huggingface_client.HuggingFaceClient"},
         }
src/coffee/run.py (18 changes: 4 additions & 14 deletions)
@@ -37,27 +37,19 @@ def cli(output_dir: pathlib.Path, max_instances: int, debug: bool, web_only) ->
     suts = [HelmSut.GPT2, HelmSut.PYTHIA_70M, HelmSut.FB_OPT_125M]
     benchmark_scores = []
     for benchmark_definition in [GeneralChatBotBenchmarkDefinition()]:
-        print(
-            termcolor.colored(
-                f'Starting run for benchmark "{benchmark_definition.name()}"', "green"
-            )
-        )
+        print(termcolor.colored(f'Starting run for benchmark "{benchmark_definition.name()}"', "green"))
         harm_scores_by_sut = defaultdict(list)
         for harm in benchmark_definition.harms():
             print(termcolor.colored(f' Examining harm "term{harm.name()}"', "yellow"))
 
             if web_only:
                 # this is a little sketchy for now, a quick fix to make testing HTML changes easier
-                tests = itertools.chain(
-                    *[harm.tests() for harm in benchmark_definition.harms()]
-                )
+                tests = itertools.chain(*[harm.tests() for harm in benchmark_definition.harms()])
                 result = HelmResult(list(tests), suts, pathlib.Path("./run"), None)
             else:
                 result = runner.run(harm.tests(), suts, max_instances)
             if not result.success():
-                print(
-                    f"HELM execution failed with return code {result.execution_result.returncode}:"
-                )
+                print(f"HELM execution failed with return code {result.execution_result.returncode}:")
                 print("stdout:")
                 print(result.helm_stdout())
                 print("stderr:")
@@ -75,9 +67,7 @@ def cli(output_dir: pathlib.Path, max_instances: int, debug: bool, web_only) ->
                 )
                 harm_scores_by_sut[sut].append(score)
         for sut in suts:
-            benchmark_scores.append(
-                BenchmarkScore(benchmark_definition, sut, harm_scores_by_sut[sut])
-            )
+            benchmark_scores.append(BenchmarkScore(benchmark_definition, sut, harm_scores_by_sut[sut]))
     print()
     print(termcolor.colored(f"Benchmarking complete, rendering reports...", "green"))
     static_site_generator = StaticSiteGenerator()
src/coffee/static_site_generator.py (33 changes: 8 additions & 25 deletions)
@@ -36,11 +36,7 @@ def display_stars(score, size) -> Markup:
     <path d="M3.612 15.443c-.386.198-.824-.149-.746-.592l.83-4.73L.173 6.765c-.329-.314-.158-.888.283-.95l4.898-.696L7.538.792c.197-.39.73-.39.927 0l2.184 4.327 4.898.696c.441.062.612.636.282.95l-3.522 3.356.83 4.73c.078.443-.36.79-.746.592L8 13.187l-4.389 2.256z"/>
     </svg></span>
     """
-    final_html = (
-        (stars_html * stars)
-        + (half_star_html * half_star)
-        + (empty_stars_html * empty_stars)
-    )
+    final_html = (stars_html * stars) + (half_star_html * half_star) + (empty_stars_html * empty_stars)
     return Markup(final_html)


@@ -93,9 +89,7 @@ def display_stars(score, size) -> Markup:
 
 class StaticSiteGenerator:
     def __init__(self) -> None:
-        self.env = Environment(
-            loader=PackageLoader("coffee"), autoescape=select_autoescape()
-        )
+        self.env = Environment(loader=PackageLoader("coffee"), autoescape=select_autoescape())
         self.env.filters["display_stars"] = display_stars
 
     def _template_dir(self):
@@ -141,37 +135,26 @@ def _grouped_benchmark_scores(self, benchmark_scores: list[BenchmarkScore]) -> d
             benchmark_scores_dict[benchmark_definition] = grouped_benchmark_scores_list
         return benchmark_scores_dict
 
-    def _generate_benchmarks_page(
-        self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path
-    ) -> None:
+    def _generate_benchmarks_page(self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path) -> None:
         self._write_file(
             output=output_dir / "benchmarks.html",
             template_name="benchmarks.html",
             grouped_benchmark_scores=self._grouped_benchmark_scores(benchmark_scores),
             show_benchmark_header=True,
         )
 
-    def _generate_benchmark_pages(
-        self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path
-    ) -> None:
-        for benchmark_definition, benchmark_scores in self._grouped_benchmark_scores(
-            benchmark_scores
-        ).items():
+    def _generate_benchmark_pages(self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path) -> None:
+        for benchmark_definition, benchmark_scores in self._grouped_benchmark_scores(benchmark_scores).items():
             for benchmark_score in benchmark_scores:
                 self._write_file(
-                    output=output_dir
-                    / f"{benchmark_score.benchmark_definition.path_name()}.html",
+                    output=output_dir / f"{benchmark_score.benchmark_definition.path_name()}.html",
                     template_name="benchmark.html",
                     benchmark_definition=benchmark_definition,
-                    grouped_benchmark_scores=self._grouped_benchmark_scores(
-                        benchmark_scores
-                    ),
+                    grouped_benchmark_scores=self._grouped_benchmark_scores(benchmark_scores),
                     stars_description=STARS_DESCRIPTION,
                 )
 
-    def _generate_test_report_pages(
-        self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path
-    ) -> None:
+    def _generate_test_report_pages(self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path) -> None:
         for benchmark_score in benchmark_scores:
             self._write_file(
                 output=output_dir
tests/test_benchmark.py (18 changes: 9 additions & 9 deletions)
@@ -132,14 +132,14 @@ def test_toxicity_scoring(datafiles):
 
 
 def test_quantize_stars():
-    assert 0 == quantize_stars(0)
-    assert 5 == quantize_stars(5)
+    assert quantize_stars(0) == 0
+    assert quantize_stars(5) == 5
 
-    assert 4.5 == quantize_stars(4.5)
+    assert quantize_stars(4.5) == 4.5
 
-    assert 4.5 == quantize_stars(4.26)
-    assert 4.5 == quantize_stars(4.3)
-    assert 4.5 == quantize_stars(4.4)
-    assert 4.5 == quantize_stars(4.6)
-    assert 4.5 == quantize_stars(4.7)
-    assert 4.5 == quantize_stars(4.74)
+    assert quantize_stars(4.26) == 4.5
+    assert quantize_stars(4.3) == 4.5
+    assert quantize_stars(4.4) == 4.5
+    assert quantize_stars(4.6) == 4.5
+    assert quantize_stars(4.7) == 4.5
+    assert quantize_stars(4.74) == 4.5
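The flipped comparisons above follow the "expectations last" convention named in the commit message: the value under test sits on the left of == and the expected literal on the right, so failures read as actual versus expected. A minimal illustrative sketch of the pattern, using a made-up function rather than code from this repository:

    # hypothetical example, not part of the coffee codebase
    def double(x: int) -> int:
        return x * 2

    def test_double():
        # actual result on the left, expected value on the right
        assert double(21) == 42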
tests/test_helm_runner.py (19 changes: 8 additions & 11 deletions)
@@ -23,8 +23,8 @@ def test_cli_helm_runner_command(cwd_tmpdir):
     runner.run([BbqHelmTest()], [HelmSut.GPT2])
     shell_arguments = runner._execute.call_args.args[0]
     runspecs = shell_arguments[shell_arguments.index("-r") + 1 :]
-    assert "bbq:subject=Age,model=openai/gpt2" == runspecs[0]
-    assert len(BbqHelmTest.CATEGORIES) == len(runspecs)
+    assert runspecs[0] == "bbq:subject=Age,model=openai/gpt2"
+    assert len(runspecs) == len(BbqHelmTest.CATEGORIES)
 
 
 def test_runspec_without_params():
@@ -56,10 +56,7 @@ def test_cli_helm_runner_command_handles_huggingface_models_with_config(cwd_tmpd
     assert fb["name"] == HelmSut.FB_OPT_125M.key
     assert fb["tokenizer_name"] == HelmSut.FB_OPT_125M.tokenizer_name
     assert fb["max_sequence_length"] == HelmSut.FB_OPT_125M.tokenizer_max_length
-    assert (
-        fb["client_spec"]["class_name"]
-        == "helm.proxy.clients.huggingface_client.HuggingFaceClient"
-    )
+    assert fb["client_spec"]["class_name"] == "helm.proxy.clients.huggingface_client.HuggingFaceClient"
 
 
 @pytest.mark.datafiles(SIMPLE_BBQ_DATA)
@@ -68,8 +65,8 @@ def test_read_scores(datafiles):
     scores = hr.load_scores()
     sut_scores = scores.for_sut(HelmSut.GPT2)
     assert "BbqHelmTest" in sut_scores
-    assert 2 == len(sut_scores["BbqHelmTest"])
-    assert 0.7 == sut_scores["BbqHelmTest"]["Age"]["bbq_accuracy"]
+    assert len(sut_scores["BbqHelmTest"]) == 2
+    assert sut_scores["BbqHelmTest"]["Age"]["bbq_accuracy"] == 0.7
 
 
 def test_helmsut_basics():
@@ -78,6 +75,6 @@ def test_helmsut_basics():
 
 
 def test_helmsut_huggingface():
-    assert HelmSut.GPT2.huggingface == False
-    assert HelmSut.FB_OPT_125M.huggingface == True
-    assert HelmSut.PYTHIA_70M.huggingface == True
+    assert HelmSut.GPT2.huggingface is False
+    assert HelmSut.FB_OPT_125M.huggingface is True
+    assert HelmSut.PYTHIA_70M.huggingface is True
