Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve(io): use (faster) soundfile backend when available #1711

Merged
merged 3 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,18 @@

## develop

### New features

- feat(io): add option to select torchaudio `backend`

### Fixes

- fix(task): fix wrong train/development split when training with (some) meta-protocols ([#1709](https://github.com/pyannote/pyannote-audio/issues/1709))

### Improvements

- improve(io): when available, default to using `soundfile` backend

## Version 3.2.0 (2024-05-08)

### New features
Expand Down
48 changes: 41 additions & 7 deletions pyannote/audio/core/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,34 @@
"""


def get_torchaudio_info(file: AudioFile):
def get_torchaudio_info(
file: AudioFile, backend: str = None
) -> torchaudio.AudioMetaData:
"""Protocol preprocessor used to cache output of torchaudio.info

This is useful to speed future random access to this file, e.g.
in dataloaders using Audio.crop a lot.

Parameters
----------
file : AudioFile
backend : str
torchaudio backend to use. Defaults to 'soundfile' if available,
or the first available backend.

Returns
-------
info : torchaudio.AudioMetaData
Audio file metadata
"""

info = torchaudio.info(file["audio"])
if not backend:
backends = (
torchaudio.list_audio_backends()
) # e.g. ['ffmpeg', 'soundfile', 'sox']
backend = "soundfile" if "soundfile" in backends else backends[0]

info = torchaudio.info(file["audio"], backend=backend)

# rewind if needed
if isinstance(file["audio"], IOBase):
Expand All @@ -82,6 +102,9 @@ class Audio:
In case of multi-channel audio, convert to single-channel audio
using one of the following strategies: select one channel at
'random' or 'downmix' by averaging all channels.
backend : str
torchaudio backend to use. Defaults to 'soundfile' if available,
or the first available backend.

Usage
-----
Expand Down Expand Up @@ -179,11 +202,19 @@ def validate_file(file: AudioFile) -> Mapping:

return file

def __init__(self, sample_rate=None, mono=None):
def __init__(self, sample_rate: int = None, mono=None, backend: str = None):
super().__init__()
self.sample_rate = sample_rate
self.mono = mono

if not backend:
backends = (
torchaudio.list_audio_backends()
) # e.g. ['ffmpeg', 'soundfile', 'sox']
backend = "soundfile" if "soundfile" in backends else backends[0]

self.backend = backend

def downmix_and_resample(self, waveform: Tensor, sample_rate: int) -> Tensor:
"""Downmix and resample

Expand Down Expand Up @@ -244,7 +275,7 @@ def get_duration(self, file: AudioFile) -> float:
if "torchaudio.info" in file:
info = file["torchaudio.info"]
else:
info = get_torchaudio_info(file)
info = get_torchaudio_info(file, backend=self.backend)

frames = info.num_frames
sample_rate = info.sample_rate
Expand Down Expand Up @@ -291,7 +322,7 @@ def __call__(self, file: AudioFile) -> Tuple[Tensor, int]:
sample_rate = file["sample_rate"]

elif "audio" in file:
waveform, sample_rate = torchaudio.load(file["audio"])
waveform, sample_rate = torchaudio.load(file["audio"], backend=self.backend)

# rewind if needed
if isinstance(file["audio"], IOBase):
Expand Down Expand Up @@ -349,7 +380,7 @@ def crop(
sample_rate = info.sample_rate

else:
info = get_torchaudio_info(file)
info = get_torchaudio_info(file, backend=self.backend)
frames = info.num_frames
sample_rate = info.sample_rate

Expand Down Expand Up @@ -401,7 +432,10 @@ def crop(
else:
try:
data, _ = torchaudio.load(
file["audio"], frame_offset=start_frame, num_frames=num_frames
file["audio"],
frame_offset=start_frame,
num_frames=num_frames,
backend=self.backend,
)
# rewind if needed
if isinstance(file["audio"], IOBase):
Expand Down
11 changes: 10 additions & 1 deletion pyannote/audio/utils/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from functools import partial

import torchaudio
from pyannote.database import FileFinder, Protocol, get_annotated
from pyannote.database.protocol import SpeakerVerificationProtocol

Expand Down Expand Up @@ -89,7 +91,14 @@ def check_protocol(protocol: Protocol) -> Protocol:

if "waveform" not in file and "torchaudio.info" not in file:

protocol.preprocessors["torchaudio.info"] = get_torchaudio_info
# use soundfile when available (it usually is faster than ffmpeg for getting info)
backends = (
torchaudio.list_audio_backends()
) # e.g. ['ffmpeg', 'soundfile', 'sox']
backend = "soundfile" if "soundfile" in backends else backends[0]
protocol.preprocessors["torchaudio.info"] = partial(
get_torchaudio_info, backend=backend
)
msg = (
f"Protocol {protocol.name} does not precompute the output of torchaudio.info(): "
f"adding a 'torchaudio.info' preprocessor for you to speed up dataloaders. "
Expand Down
Loading