Skip to content

Commit

Permalink
Merge branch 'develop' into patch-1
Browse files Browse the repository at this point in the history
  • Loading branch information
hbredin authored May 17, 2024
2 parents c89fd15 + 5ae4c9b commit 6d39322
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 8 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,18 @@

## develop

### New features

- feat(io): add option to select torchaudio `backend`

### Fixes

- fix(task): fix wrong train/development split when training with (some) meta-protocols ([#1709](https://github.com/pyannote/pyannote-audio/issues/1709))

### Improvements

- improve(io): when available, default to using `soundfile` backend

## Version 3.2.0 (2024-05-08)

### New features
Expand Down
48 changes: 41 additions & 7 deletions pyannote/audio/core/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,34 @@
"""


def get_torchaudio_info(file: AudioFile):
def get_torchaudio_info(
file: AudioFile, backend: str = None
) -> torchaudio.AudioMetaData:
"""Protocol preprocessor used to cache output of torchaudio.info
This is useful to speed up future random access to this file, e.g.
in dataloaders that call Audio.crop a lot.
Parameters
----------
file : AudioFile
backend : str
torchaudio backend to use. Defaults to 'soundfile' if available,
or the first available backend.
Returns
-------
info : torchaudio.AudioMetaData
Audio file metadata
"""

info = torchaudio.info(file["audio"])
if not backend:
backends = (
torchaudio.list_audio_backends()
) # e.g. ['ffmpeg', 'soundfile', 'sox']
backend = "soundfile" if "soundfile" in backends else backends[0]

info = torchaudio.info(file["audio"], backend=backend)

# rewind if needed
if isinstance(file["audio"], IOBase):
Expand All @@ -82,6 +102,9 @@ class Audio:
In case of multi-channel audio, convert to single-channel audio
using one of the following strategies: select one channel at
'random' or 'downmix' by averaging all channels.
backend : str
torchaudio backend to use. Defaults to 'soundfile' if available,
or the first available backend.
Usage
-----
Expand Down Expand Up @@ -179,11 +202,19 @@ def validate_file(file: AudioFile) -> Mapping:

return file

def __init__(self, sample_rate=None, mono=None):
def __init__(self, sample_rate: int = None, mono=None, backend: str = None):
super().__init__()
self.sample_rate = sample_rate
self.mono = mono

if not backend:
backends = (
torchaudio.list_audio_backends()
) # e.g. ['ffmpeg', 'soundfile', 'sox']
backend = "soundfile" if "soundfile" in backends else backends[0]

self.backend = backend

def downmix_and_resample(self, waveform: Tensor, sample_rate: int) -> Tensor:
"""Downmix and resample
Expand Down Expand Up @@ -244,7 +275,7 @@ def get_duration(self, file: AudioFile) -> float:
if "torchaudio.info" in file:
info = file["torchaudio.info"]
else:
info = get_torchaudio_info(file)
info = get_torchaudio_info(file, backend=self.backend)

frames = info.num_frames
sample_rate = info.sample_rate
Expand Down Expand Up @@ -291,7 +322,7 @@ def __call__(self, file: AudioFile) -> Tuple[Tensor, int]:
sample_rate = file["sample_rate"]

elif "audio" in file:
waveform, sample_rate = torchaudio.load(file["audio"])
waveform, sample_rate = torchaudio.load(file["audio"], backend=self.backend)

# rewind if needed
if isinstance(file["audio"], IOBase):
Expand Down Expand Up @@ -349,7 +380,7 @@ def crop(
sample_rate = info.sample_rate

else:
info = get_torchaudio_info(file)
info = get_torchaudio_info(file, backend=self.backend)
frames = info.num_frames
sample_rate = info.sample_rate

Expand Down Expand Up @@ -401,7 +432,10 @@ def crop(
else:
try:
data, _ = torchaudio.load(
file["audio"], frame_offset=start_frame, num_frames=num_frames
file["audio"],
frame_offset=start_frame,
num_frames=num_frames,
backend=self.backend,
)
# rewind if needed
if isinstance(file["audio"], IOBase):
Expand Down
11 changes: 10 additions & 1 deletion pyannote/audio/utils/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from functools import partial

import torchaudio
from pyannote.database import FileFinder, Protocol, get_annotated
from pyannote.database.protocol import SpeakerVerificationProtocol

Expand Down Expand Up @@ -89,7 +91,14 @@ def check_protocol(protocol: Protocol) -> Protocol:

if "waveform" not in file and "torchaudio.info" not in file:

protocol.preprocessors["torchaudio.info"] = get_torchaudio_info
# use soundfile when available (it is usually faster than ffmpeg for getting info)
backends = (
torchaudio.list_audio_backends()
) # e.g. ['ffmpeg', 'soundfile', 'sox']
backend = "soundfile" if "soundfile" in backends else backends[0]
protocol.preprocessors["torchaudio.info"] = partial(
get_torchaudio_info, backend=backend
)
msg = (
f"Protocol {protocol.name} does not precompute the output of torchaudio.info(): "
f"adding a 'torchaudio.info' preprocessor for you to speed up dataloaders. "
Expand Down

0 comments on commit 6d39322

Please sign in to comment.