Skip to content

Commit

Permalink
Ingest CSV labels into Hoplite DB.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 655991116
  • Loading branch information
sdenton4 authored and copybara-github committed Jul 25, 2024
1 parent 1281f45 commit 1864e01
Show file tree
Hide file tree
Showing 7 changed files with 359 additions and 195 deletions.
173 changes: 0 additions & 173 deletions chirp/data/soundscapes/dataset_fns.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,9 @@
"""Config utils specific to BirdClef Soundscape datasets."""

import csv
import json
import os

from chirp.data.soundscapes import soundscapes_lib
from chirp.taxonomy import annotations
from chirp.taxonomy import namespace_db
from etils import epath
import pandas as pd
import tensorflow as tf
Expand Down Expand Up @@ -92,57 +89,6 @@ def birdclef_metadata_features() -> dict[str, soundscapes_lib.MetadataFeature]:
return feature_types


def load_caples_annotations(annotations_path: epath.Path) -> pd.DataFrame:
  """Reads the Caples annotation CSV into a DataFrame of labeled segments.

  Args:
    annotations_path: Filepath for the Caples annotations CSV.

  Returns:
    The DataFrame built by `annotations.annotations_to_dataframe` from the
    parsed CSV rows.
  """

  def _filename(_, row):
    # The recording identifier lives in the 'fid' column.
    return row['fid'].strip()

  def _labels(row):
    # Several eBird codes may share one cell, separated by single spaces.
    return row['ebird_codes'].split(' ')

  def _is_bad_row(row):
    # Flags the one bad label in the dataset ('comros'). Presumably the reader
    # drops every row for which this returns True -- confirm against
    # `annotations.read_dataset_annotations_csvs`.
    return 'comros' in row['ebird_codes']

  parsed = annotations.read_dataset_annotations_csvs(
      [annotations_path],
      filename_fn=_filename,
      namespace='ebird2021',
      class_fn=_labels,
      start_time_fn=lambda row: float(row['start_time_s']),
      end_time_fn=lambda row: float(row['end_time_s']),
      filter_fn=_is_bad_row,
  )
  return annotations.annotations_to_dataframe(parsed)


def load_cornell_annotations(annotations_path: epath.Path) -> pd.DataFrame:
  """Reads the annotation CSV of a Cornell Zenodo dataset into a DataFrame.

  Args:
    annotations_path: Filepath for the dataset's annotations CSV.

  Returns:
    The DataFrame built by `annotations.annotations_to_dataframe` from the
    parsed CSV rows.
  """

  def _filename(_, row):
    return row['Filename'].strip()

  def _labels(row):
    # Each row carries exactly one code; '????' is rewritten to 'unknown'.
    code = row['Species eBird Code'].strip()
    return [code.replace('????', 'unknown')]

  parsed = annotations.read_dataset_annotations_csvs(
      [annotations_path],
      filename_fn=_filename,
      namespace='ebird2021',
      class_fn=_labels,
      start_time_fn=lambda row: float(row['Start Time (s)']),
      end_time_fn=lambda row: float(row['End Time (s)']),
      # No row is flagged by the filter for these datasets.
      filter_fn=lambda row: False,
  )
  return annotations.annotations_to_dataframe(parsed)


# TODO(tomdenton): Eliminate these 'combine' functions.
# Reading directly from the set of annotation files will be more direct and
# less error prone when updating datasets.
Expand Down Expand Up @@ -181,122 +127,3 @@ def combine_powdermill_annotations(
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(rows)


def load_powdermill_annotations(annotations_path: epath.Path) -> pd.DataFrame:
  """Reads the combined Powdermill annotation CSV into a DataFrame.

  Species labels are translated through the 'ibp2019_to_ebird2021' mapping of
  the namespace database; labels without an entry are kept as-is (stripped).

  Args:
    annotations_path: Filepath for the Powdermill annotations CSV.

  Returns:
    The DataFrame built by `annotations.annotations_to_dataframe` from the
    parsed CSV rows.
  """
  # Load the label mapping once, up front, rather than per row.
  mapping = namespace_db.load_db().mappings['ibp2019_to_ebird2021']
  code_lookup = mapping.mapped_pairs

  def _filename(_, row):
    return row['Filename'].strip()

  def _labels(row):
    species = row['Species'].strip()
    # Fall back to the original label when the mapping has no entry for it.
    return [code_lookup.get(species, species)]

  parsed = annotations.read_dataset_annotations_csvs(
      [annotations_path],
      filename_fn=_filename,
      namespace=mapping.target_namespace,
      class_fn=_labels,
      start_time_fn=lambda row: float(row['Begin Time (s)']),
      end_time_fn=lambda row: float(row['End Time (s)']),
      # No row is flagged by the filter for this dataset.
      filter_fn=lambda row: False,
  )
  return annotations.annotations_to_dataframe(parsed)


def load_weldy_annotations(annotations_path: epath.Path) -> pd.DataFrame:
  """Reads the Weldy Calltype annotation CSV into a DataFrame.

  Args:
    annotations_path: Filepath for the Weldy Calltype annotations CSV.

  Returns:
    The DataFrame built by `annotations.annotations_to_dataframe` from the
    parsed CSV rows.
  """

  def _filename(_, row):
    # Filenames are prefixed with the 'annotated_recordings/' directory.
    return 'annotated_recordings/' + row['file'].strip()

  def _labels(row):
    # The substitutions are order-dependent: 'unk' is expanded first, so a
    # label such as 'unk_chip' becomes 'unknown_chip' and is then collapsed to
    # 'unknown' by the last step.
    # NOTE(review): a label that already contains 'unknown' would come out as
    # 'unknownnown' after the first step -- confirm the raw data never has one.
    text = row['label']
    text = text.replace('unk', 'unknown')
    text = text.replace('impossible', 'unknown')
    text = text.replace('unknown_chip', 'unknown')
    return text.split(' ')

  parsed = annotations.read_dataset_annotations_csvs(
      [epath.Path(annotations_path)],
      filename_fn=_filename,
      namespace='weldy_calltype',
      class_fn=_labels,
      start_time_fn=lambda row: float(row['start']),
      end_time_fn=lambda row: float(row['end']),
      # No row is flagged by the filter for this dataset.
      filter_fn=lambda row: False,
  )
  return annotations.annotations_to_dataframe(parsed)


def load_anuraset_annotations(annotations_path: epath.Path) -> pd.DataFrame:
  """Reads the AnuraSet annotation CSV into a DataFrame.

  Args:
    annotations_path: Filepath for the AnuraSet annotations CSV.

  Returns:
    The DataFrame built by `annotations.annotations_to_dataframe` from the
    parsed CSV rows.
  """

  def _filename(_, row):
    # The path is '<prefix>/<filename>', where the prefix is the part of the
    # filename before its first underscore.
    name = row['filename']
    return os.path.join(name.split('_')[0], name.strip())

  def _is_ignored(row):
    # According to the dataset authors, the few labels containing '_LALSE'
    # should be ignored. Presumably the reader drops rows for which this
    # returns True -- confirm against
    # `annotations.read_dataset_annotations_csvs`.
    return '_LALSE' in row['label']

  parsed = annotations.read_dataset_annotations_csvs(
      [epath.Path(annotations_path)],
      filename_fn=_filename,
      namespace='anuraset',
      class_fn=lambda row: row['label'].split(' '),
      start_time_fn=lambda row: float(row['start_time_s']),
      end_time_fn=lambda row: float(row['end_time_s']),
      filter_fn=_is_ignored,
  )
  return annotations.annotations_to_dataframe(parsed)


def load_reef_annotations(
    annotations_path: epath.Path, *, clip_length_s: float = 1.88
) -> pd.DataFrame:
  """Loads a dataframe of all annotations from the reefs JSON file.

  Reef specific stuff:
    - All clips share one fixed length (1.88 sec by default), so every segment
      is given start time 0.0 and end time `clip_length_s`.
    - Only entries whose dataset_type is 'sound_event_dataset' are kept, as
      other entries are only soundscape (habitat level) labels or completely
      unlabeled.
    - In future, should this add a header to the df that specifies the region
      somehow? Allowing selection by regional datasets.

  Args:
    annotations_path: Path to dataset_v*.json.
    clip_length_s: Duration in seconds assigned to every clip.

  Returns:
    DataFrame with columns 'filename', 'start_time_s', 'end_time_s',
    'namespace' and 'label' (a one-element list). The columns are present even
    when no entry qualifies, so callers can always index them.
  """
  columns = ['filename', 'start_time_s', 'end_time_s', 'namespace', 'label']

  with annotations_path.open() as f:
    data = json.load(f)

  rows = [
      {
          'filename': entry.get('file_name', ''),
          'start_time_s': 0.0,
          'end_time_s': clip_length_s,
          'namespace': 'reefs',
          # To use the region.label format instead, use:
          # f"{entry.get('region', '')}.{entry.get('label', '')}"
          'label': [entry.get('label', '')],
      }
      for entry in data
      if entry.get('dataset_type') == 'sound_event_dataset'
  ]
  # Passing `columns` explicitly keeps the schema stable for an empty `rows`
  # list; without it pandas would return a frame with no columns at all.
  return pd.DataFrame(rows, columns=columns)
38 changes: 19 additions & 19 deletions chirp/data/soundscapes/soundscapes.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
from chirp import audio_utils
from chirp.data import tfds_features
from chirp.data.bird_taxonomy import bird_taxonomy
from chirp.data.soundscapes import dataset_fns
from chirp.data.soundscapes import soundscapes_lib
from chirp.taxonomy import annotations_fns
from etils import epath
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -137,15 +137,15 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
audio_glob='caples/audio/*',
interval_length_s=5.0,
localization_fn=audio_utils.slice_peaked_audio,
annotation_load_fn=dataset_fns.load_caples_annotations,
annotation_load_fn=annotations_fns.load_caples_annotations,
description='Annotated Caples recordings from 2018/2019.',
),
SoundscapesConfig(
name='caples_full_length',
class_list_name='caples',
audio_glob='caples/audio/*',
annotation_filename='caples.csv',
annotation_load_fn=dataset_fns.load_caples_annotations,
annotation_load_fn=annotations_fns.load_caples_annotations,
keep_unknown_annotation=True,
# Some recordings in Caples are only partially-annotated, so to avoid
# scoring legitimate model predictions as false positives we pad with
Expand All @@ -159,7 +159,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
audio_glob='hawaii/audio/*.flac',
interval_length_s=5.0,
localization_fn=audio_utils.slice_peaked_audio,
annotation_load_fn=dataset_fns.load_cornell_annotations,
annotation_load_fn=annotations_fns.load_cornell_annotations,
annotation_filename='annotations.csv',
keep_unknown_annotation=True,
description=(
Expand All @@ -171,7 +171,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
SoundscapesConfig(
name='hawaii_full_length',
audio_glob='hawaii/audio/*.flac',
annotation_load_fn=dataset_fns.load_cornell_annotations,
annotation_load_fn=annotations_fns.load_cornell_annotations,
annotation_filename='annotations.csv',
keep_unknown_annotation=True,
description=(
Expand All @@ -185,7 +185,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
audio_glob='ssw/audio/*.flac',
interval_length_s=5.0,
localization_fn=audio_utils.slice_peaked_audio,
annotation_load_fn=dataset_fns.load_cornell_annotations,
annotation_load_fn=annotations_fns.load_cornell_annotations,
annotation_filename='annotations.csv',
description=(
'Annotated Sapsucker Woods recordings. '
Expand All @@ -196,7 +196,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
SoundscapesConfig(
name='ssw_full_length',
audio_glob='ssw/audio/*.flac',
annotation_load_fn=dataset_fns.load_cornell_annotations,
annotation_load_fn=annotations_fns.load_cornell_annotations,
annotation_filename='annotations.csv',
keep_unknown_annotation=True,
description=(
Expand All @@ -208,7 +208,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
SoundscapesConfig(
name='coffee_farms',
audio_glob='coffee_farms/audio/*.flac',
annotation_load_fn=dataset_fns.load_cornell_annotations,
annotation_load_fn=annotations_fns.load_cornell_annotations,
annotation_filename='annotations.csv',
interval_length_s=5.0,
localization_fn=audio_utils.slice_peaked_audio,
Expand All @@ -222,7 +222,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
SoundscapesConfig(
name='coffee_farms_full_length',
audio_glob='coffee_farms/audio/*.flac',
annotation_load_fn=dataset_fns.load_cornell_annotations,
annotation_load_fn=annotations_fns.load_cornell_annotations,
annotation_filename='annotations.csv',
keep_unknown_annotation=True,
description=(
Expand All @@ -236,7 +236,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
audio_glob='high_sierras/audio/*.flac',
interval_length_s=5.0,
localization_fn=audio_utils.slice_peaked_audio,
annotation_load_fn=dataset_fns.load_cornell_annotations,
annotation_load_fn=annotations_fns.load_cornell_annotations,
annotation_filename='annotations.csv',
keep_unknown_annotation=True,
description=(
Expand All @@ -247,7 +247,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
SoundscapesConfig(
name='high_sierras_full_length',
audio_glob='high_sierras/audio/*.flac',
annotation_load_fn=dataset_fns.load_cornell_annotations,
annotation_load_fn=annotations_fns.load_cornell_annotations,
annotation_filename='annotations.csv',
keep_unknown_annotation=True,
description=(
Expand All @@ -261,7 +261,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
audio_glob='sierras_kahl/audio/*.flac',
interval_length_s=5.0,
localization_fn=audio_utils.slice_peaked_audio,
annotation_load_fn=dataset_fns.load_cornell_annotations,
annotation_load_fn=annotations_fns.load_cornell_annotations,
annotation_filename='annotations.csv',
keep_unknown_annotation=True,
description=(
Expand All @@ -272,7 +272,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
SoundscapesConfig(
name='sierras_kahl_full_length',
audio_glob='sierras_kahl/audio/*.flac',
annotation_load_fn=dataset_fns.load_cornell_annotations,
annotation_load_fn=annotations_fns.load_cornell_annotations,
annotation_filename='annotations.csv',
keep_unknown_annotation=True,
description=(
Expand All @@ -286,7 +286,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
audio_glob='powdermill/*/*.wav',
interval_length_s=5.0,
localization_fn=audio_utils.slice_peaked_audio,
annotation_load_fn=dataset_fns.load_powdermill_annotations,
annotation_load_fn=annotations_fns.load_powdermill_annotations,
description=(
'New England recordings from Powdermill Nature Reserve, '
'Rector, PA. https://doi.org/10.1002/ecy.3329'
Expand All @@ -297,7 +297,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
name='powdermill_full_length',
audio_glob='powdermill/*/*.wav',
annotation_filename='powdermill.csv',
annotation_load_fn=dataset_fns.load_powdermill_annotations,
annotation_load_fn=annotations_fns.load_powdermill_annotations,
keep_unknown_annotation=True,
description=(
'Full-length New England recordings from Powdermill '
Expand All @@ -311,7 +311,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
audio_glob='peru/audio/*.flac',
interval_length_s=5.0,
localization_fn=audio_utils.slice_peaked_audio,
annotation_load_fn=dataset_fns.load_cornell_annotations,
annotation_load_fn=annotations_fns.load_cornell_annotations,
annotation_filename='annotations.csv',
keep_unknown_annotation=True,
description=(
Expand All @@ -323,7 +323,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
SoundscapesConfig(
name='peru_full_length',
audio_glob='peru/audio/*.flac',
annotation_load_fn=dataset_fns.load_cornell_annotations,
annotation_load_fn=annotations_fns.load_cornell_annotations,
annotation_filename='annotations.csv',
keep_unknown_annotation=True,
description=(
Expand All @@ -335,7 +335,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
SoundscapesConfig(
name='weldy_calltype_full_length',
audio_glob='weldy_calltype/annotated_recordings/*.wav',
annotation_load_fn=dataset_fns.load_weldy_annotations,
annotation_load_fn=annotations_fns.load_weldy_annotations,
annotation_filename='annotations.csv',
keep_unknown_annotation=True,
description=(
Expand All @@ -347,7 +347,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
SoundscapesConfig(
name='anuraset_full_length',
audio_glob='anuraset/raw_data/*/*.wav',
annotation_load_fn=dataset_fns.load_anuraset_annotations,
annotation_load_fn=annotations_fns.load_anuraset_annotations,
annotation_filename='annotations.csv',
keep_unknown_annotation=True,
description=(
Expand Down
Loading

0 comments on commit 1864e01

Please sign in to comment.