From 8419c40a085d67eb5832cecebf3ef8213112857d Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Fri, 12 Jul 2024 11:00:09 +0200 Subject: [PATCH] Mark tests that require librosa (#7044) * Implement test require_librosa * Mark tests that require librosa * Mark tests in test_audiofolder with require_librosa * Mark test in test_upstream_hub with require_librosa --- tests/features/test_audio.py | 22 ++++++++++++++++++++++ tests/packaged_modules/test_audiofolder.py | 10 ++++++++-- tests/packaged_modules/test_webdataset.py | 3 ++- tests/test_formatting.py | 13 ++++++++++++- tests/test_upstream_hub.py | 6 ++++-- tests/utils.py | 1 + 6 files changed, 49 insertions(+), 6 deletions(-) diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 5b58cb7c329..8848b34154a 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -8,6 +8,7 @@ from datasets.features import Audio, Features, Sequence, Value from ..utils import ( + require_librosa, require_sndfile, ) @@ -57,6 +58,7 @@ def test_audio_feature_type_to_arrow(): assert features.arrow_schema == pa.schema({"sequence_of_audios": pa.list_(Audio().pa_type)}) +@require_librosa @pytest.mark.parametrize( "build_example", [ @@ -81,6 +83,7 @@ def test_audio_feature_encode_example(shared_datadir, build_example): assert decoded_example.keys() == {"path", "array", "sampling_rate"} +@require_librosa @pytest.mark.parametrize( "build_example", [ @@ -101,6 +104,7 @@ def test_audio_feature_encode_example_pcm(shared_datadir, build_example): assert decoded_example.keys() == {"path", "array", "sampling_rate"} +@require_librosa @require_sndfile def test_audio_decode_example(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") @@ -115,6 +119,7 @@ def test_audio_decode_example(shared_datadir): Audio(decode=False).decode_example(audio_path) +@require_librosa @require_sndfile def test_audio_resampling(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") @@ -126,6 +131,7 @@ def test_audio_resampling(shared_datadir): assert decoded_example["sampling_rate"] == 16000 +@require_librosa @require_sndfile def test_audio_decode_example_mp3(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.mp3") @@ -137,6 +143,7 @@ def test_audio_decode_example_mp3(shared_datadir): assert decoded_example["sampling_rate"] == 44100 +@require_librosa @require_sndfile def test_audio_decode_example_opus(shared_datadir): audio_path = str(shared_datadir / "test_audio_48000.opus") @@ -148,6 +155,7 @@ def test_audio_decode_example_opus(shared_datadir): assert decoded_example["sampling_rate"] == 48000 +@require_librosa @pytest.mark.parametrize("sampling_rate", [16_000, 48_000]) def test_audio_decode_example_pcm(shared_datadir, sampling_rate): audio_path = str(shared_datadir / "test_audio_16000.pcm") @@ -160,6 +168,7 @@ def test_audio_decode_example_pcm(shared_datadir, sampling_rate): assert decoded_example["sampling_rate"] == sampling_rate +@require_librosa @require_sndfile def test_audio_resampling_mp3_different_sampling_rates(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.mp3") @@ -179,6 +188,7 @@ def test_audio_resampling_mp3_different_sampling_rates(shared_datadir): assert decoded_example["sampling_rate"] == 48000 +@require_librosa @require_sndfile def test_dataset_with_audio_feature(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") @@ -206,6 +216,7 @@ def test_dataset_with_audio_feature(shared_datadir): assert column[0]["sampling_rate"] == 44100 +@require_librosa @require_sndfile def test_dataset_with_audio_feature_tar_wav(tar_wav_path): audio_filename = "test_audio_44100.wav" @@ -236,6 +247,7 @@ def test_dataset_with_audio_feature_tar_wav(tar_wav_path): assert column[0]["sampling_rate"] == 44100 +@require_librosa @require_sndfile def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path): audio_filename = "test_audio_44100.mp3" @@ -300,6 +312,7 @@ def test_dataset_with_audio_feature_with_none(): assert item["nested"]["audio"] is None +@require_librosa @require_sndfile def test_resampling_at_loading_dataset_with_audio_feature(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") @@ -327,6 +340,7 @@ def test_resampling_at_loading_dataset_with_audio_feature(shared_datadir): assert column[0]["sampling_rate"] == 16000 +@require_librosa @require_sndfile def test_resampling_at_loading_dataset_with_audio_feature_mp3(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.mp3") @@ -354,6 +368,7 @@ def test_resampling_at_loading_dataset_with_audio_feature_mp3(shared_datadir): assert column[0]["sampling_rate"] == 16000 +@require_librosa @require_sndfile def test_resampling_after_loading_dataset_with_audio_feature(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") @@ -384,6 +399,7 @@ def test_resampling_after_loading_dataset_with_audio_feature(shared_datadir): assert column[0]["sampling_rate"] == 16000 +@require_librosa @require_sndfile def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.mp3") @@ -414,6 +430,7 @@ def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir) assert column[0]["sampling_rate"] == 16000 +@require_librosa @pytest.mark.parametrize( "build_data", [ @@ -438,6 +455,7 @@ def test_dataset_cast_to_audio_features(shared_datadir, build_data): assert item["audio"].keys() == {"path", "array", "sampling_rate"} +@require_librosa def test_dataset_concatenate_audio_features(shared_datadir): # we use a different data structure between 1 and 2 to make sure they are compatible with each other audio_path = str(shared_datadir / "test_audio_44100.wav") @@ -451,6 +469,7 @@ def test_dataset_concatenate_audio_features(shared_datadir): assert concatenated_dataset[1]["audio"]["array"].shape == dset2[0]["audio"]["array"].shape +@require_librosa def test_dataset_concatenate_nested_audio_features(shared_datadir): # we use a different data structure between 1 and 2 to make sure they are compatible with each other audio_path = str(shared_datadir / "test_audio_44100.wav") @@ -493,6 +512,7 @@ def process_text(example): assert item == {"audio": expected_audio, "text": "Hello World!"} +@require_librosa @require_sndfile def test_dataset_with_audio_feature_map_is_decoded(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") @@ -522,6 +542,7 @@ def process_audio_sampling_rate_by_batch(batch): assert item["double_sampling_rate"] == 88200 +@require_librosa @require_sndfile def test_formatted_dataset_with_audio_feature(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") @@ -585,6 +606,7 @@ def jsonl_audio_dataset_path(shared_datadir, tmp_path_factory): return path +@require_librosa @require_sndfile @pytest.mark.parametrize("streaming", [False, True]) def test_load_dataset_with_audio_feature(streaming, jsonl_audio_dataset_path, shared_datadir): diff --git a/tests/packaged_modules/test_audiofolder.py b/tests/packaged_modules/test_audiofolder.py index 3351fccf604..30dfd7a8d85 100644 --- a/tests/packaged_modules/test_audiofolder.py +++ b/tests/packaged_modules/test_audiofolder.py @@ -1,7 +1,6 @@ import shutil import textwrap -import librosa import numpy as np import pytest import soundfile as sf @@ -12,7 +11,7 @@ from datasets.download.streaming_download_manager import StreamingDownloadManager from datasets.packaged_modules.audiofolder.audiofolder import AudioFolder, AudioFolderConfig -from ..utils import require_sndfile +from ..utils import require_librosa, require_sndfile @pytest.fixture @@ -195,6 +194,8 @@ def data_files_with_two_splits_and_metadata(request, tmp_path, audio_file): @pytest.fixture def data_files_with_zip_archives(tmp_path, audio_file): + import librosa + data_dir = tmp_path / "audiofolder_data_dir_with_zip_archives" data_dir.mkdir(parents=True, exist_ok=True) archive_dir = data_dir / "archive" @@ -242,6 +243,7 @@ def test_config_raises_when_invalid_data_files(data_files) -> None: _ = AudioFolderConfig(name="name", data_files=data_files) +@require_librosa @require_sndfile # check that labels are inferred correctly from dir names def test_generate_examples_with_labels(data_files_with_labels_no_metadata, cache_dir): @@ -256,6 +258,7 @@ def test_generate_examples_with_labels(data_files_with_labels_no_metadata, cache assert dataset[1]["label"] == label_feature._str2int["uk"] +@require_librosa @require_sndfile @pytest.mark.parametrize("drop_metadata", [None, True, False]) @pytest.mark.parametrize("drop_labels", [None, True, False]) @@ -385,6 +388,7 @@ def test_generate_examples_with_metadata_that_misses_one_audio( ) +@require_librosa @require_sndfile @pytest.mark.parametrize("streaming", [False, True]) def test_data_files_with_metadata_and_single_split(streaming, cache_dir, data_files_with_one_split_and_metadata): @@ -403,6 +407,7 @@ def test_data_files_with_metadata_and_single_split(streaming, cache_dir, data_fi assert all(example["text"] is not None for example in dataset) +@require_librosa @require_sndfile @pytest.mark.parametrize("streaming", [False, True]) def test_data_files_with_metadata_and_multiple_splits(streaming, cache_dir, data_files_with_two_splits_and_metadata): @@ -421,6 +426,7 @@ def test_data_files_with_metadata_and_multiple_splits(streaming, cache_dir, data assert all(example["text"] is not None for example in dataset) +@require_librosa @require_sndfile @pytest.mark.parametrize("streaming", [False, True]) def test_data_files_with_metadata_and_archives(streaming, cache_dir, data_files_with_zip_archives): diff --git a/tests/packaged_modules/test_webdataset.py b/tests/packaged_modules/test_webdataset.py index e122275cec9..6cdd53b6cbf 100644 --- a/tests/packaged_modules/test_webdataset.py +++ b/tests/packaged_modules/test_webdataset.py @@ -7,7 +7,7 @@ from datasets import Audio, DownloadManager, Features, Image, Sequence, Value from datasets.packaged_modules.webdataset.webdataset import WebDataset -from ..utils import require_pil, require_sndfile, require_torch +from ..utils import require_librosa, require_pil, require_sndfile, require_torch @pytest.fixture @@ -159,6 +159,7 @@ def test_image_webdataset_missing_keys(image_wds_file): assert decoded["txt"] is None +@require_librosa @require_sndfile def test_audio_webdataset(audio_wds_file): data_files = {"train": [audio_wds_file]} diff --git a/tests/test_formatting.py b/tests/test_formatting.py index 2344bf2e38f..e190356f216 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -18,7 +18,15 @@ ) from datasets.table import InMemoryTable -from .utils import require_jax, require_pil, require_polars, require_sndfile, require_tf, require_torch +from .utils import ( + require_jax, + require_librosa, + require_pil, + require_polars, + require_sndfile, + require_tf, + require_torch, +) class AnyArray: @@ -300,6 +308,7 @@ def test_numpy_formatter_image(self): self.assertEqual(batch["image"][0].dtype, np.uint8) self.assertEqual(batch["image"][0].shape, (480, 640, 3)) + @require_librosa @require_sndfile def test_numpy_formatter_audio(self): pa_table = pa.table({"audio": [{"bytes": None, "path": str(AUDIO_PATH_1)}]}) @@ -419,6 +428,7 @@ def test_torch_formatter_image(self): self.assertEqual(batch["image"][0].shape, (3, 480, 640)) @require_torch + @require_librosa @require_sndfile def test_torch_formatter_audio(self): import torch @@ -602,6 +612,7 @@ def test_jax_formatter_image(self): self.assertEqual(batch["image"][0].shape, (480, 640, 3)) @require_jax + @require_librosa @require_sndfile def test_jax_formatter_audio(self): import jax.numpy as jnp diff --git a/tests/test_upstream_hub.py b/tests/test_upstream_hub.py index daa0841ac98..1e15584f41e 100644 --- a/tests/test_upstream_hub.py +++ b/tests/test_upstream_hub.py @@ -34,8 +34,9 @@ ) from datasets.utils.file_utils import cached_path from datasets.utils.hub import hf_dataset_url -from tests.fixtures.hub import CI_HUB_ENDPOINT, CI_HUB_USER, CI_HUB_USER_TOKEN -from tests.utils import for_all_test_methods, require_pil, require_sndfile, xfail_if_500_502_http_error + +from .fixtures.hub import CI_HUB_ENDPOINT, CI_HUB_USER, CI_HUB_USER_TOKEN +from .utils import for_all_test_methods, require_librosa, require_pil, require_sndfile, xfail_if_500_502_http_error pytestmark = pytest.mark.integration @@ -383,6 +384,7 @@ def test_push_dataset_to_hub_custom_features(self, temporary_repo): assert ds.features == hub_ds.features assert ds[:] == hub_ds[:] + @require_librosa @require_sndfile def test_push_dataset_to_hub_custom_features_audio(self, temporary_repo): audio_path = os.path.join(os.path.dirname(__file__), "features", "data", "test_audio_44100.wav") diff --git a/tests/utils.py b/tests/utils.py index 6448fe79272..92c2cad30fe 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -48,6 +48,7 @@ def parse_flag_from_env(key, default=False): require_zstandard = pytest.mark.skipif(not config.ZSTANDARD_AVAILABLE, reason="test requires zstandard") # Audio +require_librosa = pytest.mark.skipif(find_spec("librosa") is None, reason="test requires librosa") require_sndfile = pytest.mark.skipif( # On Windows and OS X, soundfile installs sndfile find_spec("soundfile") is None or version.parse(importlib.metadata.version("soundfile")) < version.parse("0.12.0"),