From 5ae4c9b685feee02cbd58d25210e51def7037079 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Herv=C3=A9=20BREDIN?=
Date: Fri, 17 May 2024 20:59:02 +0200
Subject: [PATCH] improve(io): use (faster) soundfile backend when available
 (#1711)

---
 CHANGELOG.md                     |  8 ++++++
 pyannote/audio/core/io.py        | 48 +++++++++++++++++++++++++++-----
 pyannote/audio/utils/protocol.py | 11 +++++++-
 3 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e48aafacb..accc0cc1b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,10 +2,18 @@
 
 ## develop
 
+### New features
+
+- feat(io): add option to select torchaudio `backend`
+
 ### Fixes
 
 - fix(task): fix wrong train/development split when training with (some) meta-protocols ([#1709](https://github.com/pyannote/pyannote-audio/issues/1709))
 
+### Improvements
+
+- improve(io): when available, default to using `soundfile` backend
+
 ## Version 3.2.0 (2024-05-08)
 
 ### New features
diff --git a/pyannote/audio/core/io.py b/pyannote/audio/core/io.py
index 8fafe69d3..bce9c4dbf 100644
--- a/pyannote/audio/core/io.py
+++ b/pyannote/audio/core/io.py
@@ -55,14 +55,34 @@
 """
 
 
-def get_torchaudio_info(file: AudioFile):
+def get_torchaudio_info(
+    file: AudioFile, backend: str = None
+) -> torchaudio.AudioMetaData:
     """Protocol preprocessor used to cache output of torchaudio.info
 
     This is useful to speed future random access to this file, e.g.
     in dataloaders using Audio.crop a lot....
+
+    Parameters
+    ----------
+    file : AudioFile
+    backend : str
+        torchaudio backend to use. Defaults to 'soundfile' if available,
+        or the first available backend.
+
+    Returns
+    -------
+    info : torchaudio.AudioMetaData
+        Audio file metadata
     """
 
-    info = torchaudio.info(file["audio"])
+    if not backend:
+        backends = (
+            torchaudio.list_audio_backends()
+        )  # e.g ['ffmpeg', 'soundfile', 'sox']
+        backend = "soundfile" if "soundfile" in backends else backends[0]
+
+    info = torchaudio.info(file["audio"], backend=backend)
 
     # rewind if needed
     if isinstance(file["audio"], IOBase):
@@ -82,6 +102,9 @@ class Audio:
         In case of multi-channel audio, convert to single-channel audio
         using one of the following strategies: select one channel at
         'random' or 'downmix' by averaging all channels.
+    backend : str
+        torchaudio backend to use. Defaults to 'soundfile' if available,
+        or the first available backend.
 
     Usage
     -----
@@ -179,11 +202,19 @@ def validate_file(file: AudioFile) -> Mapping:
 
         return file
 
-    def __init__(self, sample_rate=None, mono=None):
+    def __init__(self, sample_rate: int = None, mono=None, backend: str = None):
         super().__init__()
         self.sample_rate = sample_rate
         self.mono = mono
 
+        if not backend:
+            backends = (
+                torchaudio.list_audio_backends()
+            )  # e.g ['ffmpeg', 'soundfile', 'sox']
+            backend = "soundfile" if "soundfile" in backends else backends[0]
+
+        self.backend = backend
+
     def downmix_and_resample(self, waveform: Tensor, sample_rate: int) -> Tensor:
         """Downmix and resample
 
@@ -244,7 +275,7 @@ def get_duration(self, file: AudioFile) -> float:
             if "torchaudio.info" in file:
                 info = file["torchaudio.info"]
             else:
-                info = get_torchaudio_info(file)
+                info = get_torchaudio_info(file, backend=self.backend)
 
             frames = info.num_frames
             sample_rate = info.sample_rate
@@ -291,7 +322,7 @@ def __call__(self, file: AudioFile) -> Tuple[Tensor, int]:
             sample_rate = file["sample_rate"]
 
         elif "audio" in file:
-            waveform, sample_rate = torchaudio.load(file["audio"])
+            waveform, sample_rate = torchaudio.load(file["audio"], backend=self.backend)
 
             # rewind if needed
             if isinstance(file["audio"], IOBase):
@@ -349,7 +380,7 @@ def crop(
             sample_rate = info.sample_rate
 
         else:
-            info = get_torchaudio_info(file)
+            info = get_torchaudio_info(file, backend=self.backend)
             frames = info.num_frames
             sample_rate = info.sample_rate
 
@@ -401,7 +432,10 @@ def crop(
         else:
             try:
                 data, _ = torchaudio.load(
-                    file["audio"], frame_offset=start_frame, num_frames=num_frames
+                    file["audio"],
+                    frame_offset=start_frame,
+                    num_frames=num_frames,
+                    backend=self.backend,
                 )
                 # rewind if needed
                 if isinstance(file["audio"], IOBase):
diff --git a/pyannote/audio/utils/protocol.py b/pyannote/audio/utils/protocol.py
index 0cfe4ccf2..bca0e5942 100644
--- a/pyannote/audio/utils/protocol.py
+++ b/pyannote/audio/utils/protocol.py
@@ -20,7 +20,9 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+from functools import partial
 
+import torchaudio
 from pyannote.database import FileFinder, Protocol, get_annotated
 from pyannote.database.protocol import SpeakerVerificationProtocol
 
@@ -89,7 +91,14 @@ def check_protocol(protocol: Protocol) -> Protocol:
         if "waveform" not in file and "torchaudio.info" not in file:
 
-            protocol.preprocessors["torchaudio.info"] = get_torchaudio_info
+            # use soundfile when available (it usually is faster than ffmpeg for getting info)
+            backends = (
+                torchaudio.list_audio_backends()
+            )  # e.g ['ffmpeg', 'soundfile', 'sox']
+            backend = "soundfile" if "soundfile" in backends else backends[0]
+            protocol.preprocessors["torchaudio.info"] = partial(
+                get_torchaudio_info, backend=backend
+            )
 
             msg = (
                 f"Protocol {protocol.name} does not precompute the output of torchaudio.info(): "
                 f"adding a 'torchaudio.info' preprocessor for you to speed up dataloaders. "
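
Note (not part of the patch): a minimal usage sketch of the new `backend` option, assuming a local "audio.wav" file; it simply mirrors the backend-selection default that the diff above adds to `Audio.__init__`.

    import torchaudio
    from pyannote.audio.core.io import Audio

    # same default as the patch: prefer the soundfile backend, otherwise
    # fall back to the first backend torchaudio reports as available
    backends = torchaudio.list_audio_backends()  # e.g. ['ffmpeg', 'soundfile', 'sox']
    backend = "soundfile" if "soundfile" in backends else backends[0]

    # "audio.wav" is a placeholder path, not a file shipped with the patch
    audio = Audio(sample_rate=16000, mono="downmix", backend=backend)
    waveform, sample_rate = audio("audio.wav")
    print(waveform.shape, sample_rate)  # e.g. torch.Size([1, 160000]) 16000 for a 10 s file

Passing `backend=None` (the default) keeps the same behavior: the constructor picks soundfile when it is installed, which is typically faster than ffmpeg for metadata queries and random-access reads.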