Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve(io): use (faster) soundfile backend when available #1711

Merged
merged 3 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,18 @@

## develop

### New features

- feat(io): add option to select torchaudio `backend`

### Fixes

- fix(task): fix wrong train/development split when training with (some) meta-protocols ([#1709](https://github.com/pyannote/pyannote-audio/issues/1709))

### Improvements

- improve(io): when available, default to using `soundfile` backend

## Version 3.2.0 (2024-05-08)

### New features
Expand Down
48 changes: 41 additions & 7 deletions pyannote/audio/core/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,34 @@
"""


def get_torchaudio_info(file: AudioFile):
def get_torchaudio_info(
file: AudioFile, backend: str = None
) -> torchaudio.AudioMetaData:
"""Protocol preprocessor used to cache output of torchaudio.info

This is useful to speed future random access to this file, e.g.
in dataloaders using Audio.crop a lot.

Parameters
----------
file : AudioFile
backend : str
torchaudio backend to use. Defaults to 'soundfile' if available,
or the first available backend.

Returns
-------
info : torchaudio.AudioMetaData
Audio file metadata
"""

info = torchaudio.info(file["audio"])
if not backend:
backends = (
torchaudio.list_audio_backends()
) # e.g. ['ffmpeg', 'soundfile', 'sox']
backend = "soundfile" if "soundfile" in backends else backends[0]

info = torchaudio.info(file["audio"], backend=backend)

# rewind if needed
if isinstance(file["audio"], IOBase):
Expand All @@ -82,6 +102,9 @@ class Audio:
In case of multi-channel audio, convert to single-channel audio
using one of the following strategies: select one channel at
'random' or 'downmix' by averaging all channels.
backend : str
torchaudio backend to use. Defaults to 'soundfile' if available,
or the first available backend.

Usage
-----
Expand Down Expand Up @@ -179,11 +202,19 @@ def validate_file(file: AudioFile) -> Mapping:

return file

def __init__(self, sample_rate=None, mono=None):
def __init__(self, sample_rate: int = None, mono=None, backend: str = None):
super().__init__()
self.sample_rate = sample_rate
self.mono = mono

if not backend:
backends = (
torchaudio.list_audio_backends()
) # e.g. ['ffmpeg', 'soundfile', 'sox']
backend = "soundfile" if "soundfile" in backends else backends[0]

self.backend = backend

def downmix_and_resample(self, waveform: Tensor, sample_rate: int) -> Tensor:
"""Downmix and resample

Expand Down Expand Up @@ -244,7 +275,7 @@ def get_duration(self, file: AudioFile) -> float:
if "torchaudio.info" in file:
info = file["torchaudio.info"]
else:
info = get_torchaudio_info(file)
info = get_torchaudio_info(file, backend=self.backend)

frames = info.num_frames
sample_rate = info.sample_rate
Expand Down Expand Up @@ -291,7 +322,7 @@ def __call__(self, file: AudioFile) -> Tuple[Tensor, int]:
sample_rate = file["sample_rate"]

elif "audio" in file:
waveform, sample_rate = torchaudio.load(file["audio"])
waveform, sample_rate = torchaudio.load(file["audio"], backend=self.backend)

# rewind if needed
if isinstance(file["audio"], IOBase):
Expand Down Expand Up @@ -349,7 +380,7 @@ def crop(
sample_rate = info.sample_rate

else:
info = get_torchaudio_info(file)
info = get_torchaudio_info(file, backend=self.backend)
frames = info.num_frames
sample_rate = info.sample_rate

Expand Down Expand Up @@ -401,7 +432,10 @@ def crop(
else:
try:
data, _ = torchaudio.load(
file["audio"], frame_offset=start_frame, num_frames=num_frames
file["audio"],
frame_offset=start_frame,
num_frames=num_frames,
backend=self.backend,
)
# rewind if needed
if isinstance(file["audio"], IOBase):
Expand Down
11 changes: 10 additions & 1 deletion pyannote/audio/utils/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from functools import partial

import torchaudio
from pyannote.database import FileFinder, Protocol, get_annotated
from pyannote.database.protocol import SpeakerVerificationProtocol

Expand Down Expand Up @@ -89,7 +91,14 @@ def check_protocol(protocol: Protocol) -> Protocol:

if "waveform" not in file and "torchaudio.info" not in file:

protocol.preprocessors["torchaudio.info"] = get_torchaudio_info
# use soundfile when available (it usually is faster than ffmpeg for getting info)
backends = (
torchaudio.list_audio_backends()
) # e.g. ['ffmpeg', 'soundfile', 'sox']
backend = "soundfile" if "soundfile" in backends else backends[0]
protocol.preprocessors["torchaudio.info"] = partial(
get_torchaudio_info, backend=backend
)
msg = (
f"Protocol {protocol.name} does not precompute the output of torchaudio.info(): "
f"adding a 'torchaudio.info' preprocessor for you to speed up dataloaders. "
Expand Down
Loading