From 48454e8af91ed7a6166b130fb6639c98c9424219 Mon Sep 17 00:00:00 2001
From: Robert Knight <robertknight@gmail.com>
Date: Wed, 28 Feb 2024 02:02:48 +0000
Subject: [PATCH] Add Python script to run end-to-end tests

This makes it possible to add a new test case just by adding an image and its
expected text output to `ocrs-cli/test-data/`. The Python script can also more
easily report extra information, such as the runtime of each test.
---
 Makefile          |  9 +----
 tools/test-e2e.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+), 8 deletions(-)
 create mode 100755 tools/test-e2e.py

diff --git a/Makefile b/Makefile
index c5484fc..9d68a25 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,3 @@
-TMPDIR := $(or $(RUNNER_TEMP),/tmp)
-
 .PHONY: build
 build:
 	cargo build
@@ -26,12 +24,7 @@ test:
 
 .PHONY: test-e2e
 test-e2e:
-	# Simple test case
-	cargo run --release -p ocrs-cli ocrs-cli/test-data/why-rust.png -o $(TMPDIR)/why-rust.txt
-	diff --ignore-space-change -u $(TMPDIR)/why-rust.txt ocrs-cli/test-data/why-rust.expected.txt
-	# Long lines
-	cargo run --release -p ocrs-cli ocrs-cli/test-data/polar-bears.png -o $(TMPDIR)/polar-bears.txt
-	diff --ignore-space-change -u $(TMPDIR)/polar-bears.txt ocrs-cli/test-data/polar-bears.expected.txt
+	python tools/test-e2e.py ocrs-cli/test-data/
 
 .PHONY: wasm
 wasm:
diff --git a/tools/test-e2e.py b/tools/test-e2e.py
new file mode 100755
index 0000000..3b96083
--- /dev/null
+++ b/tools/test-e2e.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+
+from argparse import ArgumentParser
+import re
+import os
+from subprocess import run
+import sys
+import time
+
+
+def build_ocrs() -> None:  # Release build of ocrs-cli; raises if cargo fails.
+    run(["cargo", "build", "--release", "-p", "ocrs-cli"], check=True, text=True)
+
+
+def extract_text(image_path: str) -> str:
+    """Run the release `ocrs` binary on `image_path` and return its stdout."""
+    result = run(
+        # We run the binary directly here rather than use `cargo run` as it
+        # is slightly faster. The path is relative to the working directory.
+        ["target/release/ocrs", image_path],
+        check=True,  # Raise CalledProcessError if ocrs exits with an error.
+        text=True,
+        capture_output=True,
+    )
+    return result.stdout
+
+
+IMAGE_PAT = r"\.(jpeg|jpg|png|webp)$"  # File extensions treated as test images.
+
+
+def run_tests(test_case_dir: str) -> bool:
+    """
+    Compare extracted text for image files against expectations.
+
+    Each image in `test_case_dir` needs a "{image_stem}.expected.txt" file.
+    Images run in sorted order; surrounding whitespace is ignored.
+
+    Returns True if all test cases passed.
+    """
+    image_filenames = sorted(
+        path for path in os.listdir(test_case_dir) if re.search(IMAGE_PAT, path)
+    )
+
+    print(f"Testing {len(image_filenames)} images...")
+
+    errors = 0
+    for i, fname in enumerate(image_filenames):
+        basename = os.path.splitext(fname)[0]
+        expected_path = os.path.join(test_case_dir, f"{basename}.expected.txt")
+        with open(expected_path) as fp:
+            expected_text = fp.read()
+        # Flush so the test name is visible while ocrs is still running.
+        print(f"[{i+1}/{len(image_filenames)}] Testing {fname}", end="", flush=True)
+        start = time.perf_counter()
+        text = extract_text(os.path.join(test_case_dir, fname))
+        elapsed = time.perf_counter() - start
+        print(f" ({elapsed:0.2f}s)")
+
+        expected_text = expected_text.strip()
+        text = text.strip()
+
+        if text != expected_text:
+            print(f"Actual vs expected mismatch for {fname}")
+            errors += 1
+
+    if errors != 0:
+        print(f"{errors} tests failed")
+
+    return errors == 0
+
+
+# Script entry point: parse the test directory, build ocrs, run the tests.
+USAGE = """
+Run end-to-end tests of ocrs.
+
+Runs ocrs on a set of image files and compares the extracted text with
+expectations in `{imagename}.expected.txt` files.
+"""
+cli = ArgumentParser(description=USAGE)
+cli.add_argument("dir", help="Directory containing test images and expected outputs")
+test_dir = cli.parse_args().dir
+
+print("Building ocrs...")
+build_ocrs()
+
+# A non-zero exit status tells the caller (e.g. `make`) that a test failed.
+if not run_tests(test_dir):
+    sys.exit(1)