Ingest CSV labels into Hoplite DB.

PiperOrigin-RevId: 655991116
google-research · Jul 25, 2024
commit 1864e01 (1 parent: 1281f45)

Showing 7 changed files with 359 additions and 195 deletions.
diff --git a/chirp/data/soundscapes/dataset_fns.py b/chirp/data/soundscapes/dataset_fns.py
@@ -16,12 +16,9 @@
 """Config utils specific to BirdClef Soundscape datasets."""
 
 import csv
-import json
 import os
 
 from chirp.data.soundscapes import soundscapes_lib
-from chirp.taxonomy import annotations
-from chirp.taxonomy import namespace_db
 from etils import epath
 import pandas as pd
 import tensorflow as tf
@@ -92,57 +89,6 @@ def birdclef_metadata_features() -> dict[str, soundscapes_lib.MetadataFeature]:
   return feature_types
 
 
-def load_caples_annotations(annotations_path: epath.Path) -> pd.DataFrame:
-  """Loads the dataframe of all caples annotations from annotation CSV.
-
-  Args:
-    annotations_path: Filepath for the annotations CSV.
-
-  Returns:
-    DataFrame of annotations.
-  """
-  filename_fn = lambda _, row: row['fid'].strip()
-  start_time_fn = lambda row: float(row['start_time_s'])
-  end_time_fn = lambda row: float(row['end_time_s'])
-  # Get rid of the one bad label in the dataset...
-  filter_fn = lambda row: 'comros' in row['ebird_codes']
-  class_fn = lambda row: row['ebird_codes'].split(' ')
-  annos = annotations.read_dataset_annotations_csvs(
-      [annotations_path],
-      filename_fn=filename_fn,
-      namespace='ebird2021',
-      class_fn=class_fn,
-      start_time_fn=start_time_fn,
-      end_time_fn=end_time_fn,
-      filter_fn=filter_fn,
-  )
-  segments = annotations.annotations_to_dataframe(annos)
-  return segments
-
-
-def load_cornell_annotations(annotations_path: epath.Path) -> pd.DataFrame:
-  """Load the annotations from a Cornell Zenodo dataset."""
-  start_time_fn = lambda row: float(row['Start Time (s)'])
-  end_time_fn = lambda row: float(row['End Time (s)'])
-  filter_fn = lambda row: False
-  class_fn = lambda row: [  # pylint: disable=g-long-lambda
-      row['Species eBird Code'].strip().replace('????', 'unknown')
-  ]
-
-  filename_fn = lambda filepath, row: row['Filename'].strip()
-  annos = annotations.read_dataset_annotations_csvs(
-      [annotations_path],
-      filename_fn=filename_fn,
-      namespace='ebird2021',
-      class_fn=class_fn,
-      start_time_fn=start_time_fn,
-      end_time_fn=end_time_fn,
-      filter_fn=filter_fn,
-  )
-  segments = annotations.annotations_to_dataframe(annos)
-  return segments
-
-
 # TODO(tomdenton): Eliminate these 'combine' functions.
 # Reading directly from the set of annotation files will be more direct and
 # less error prone when updating datasets.
@@ -181,122 +127,3 @@ def combine_powdermill_annotations(
     writer = csv.DictWriter(f, fieldnames=fieldnames)
     writer.writeheader()
     writer.writerows(rows)
-
-
-def load_powdermill_annotations(annotations_path: epath.Path) -> pd.DataFrame:
-  """Load the dataframe of all Powdermill annotations from annotation CSV."""
-  start_time_fn = lambda row: float(row['Begin Time (s)'])
-  end_time_fn = lambda row: float(row['End Time (s)'])
-  filter_fn = lambda row: False
-
-  # Convert dataset labels to ebird2021.
-  db = namespace_db.load_db()
-  ebird_mapping = db.mappings['ibp2019_to_ebird2021']
-  ebird_mapping_dict = ebird_mapping.mapped_pairs
-  class_fn = lambda row: [  # pylint: disable=g-long-lambda
-      ebird_mapping_dict.get(row['Species'].strip(), row['Species'].strip())
-  ]
-
-  annotation_filepaths = [annotations_path]
-  filename_fn = lambda filepath, row: row['Filename'].strip()
-  annos = annotations.read_dataset_annotations_csvs(
-      annotation_filepaths,
-      filename_fn=filename_fn,
-      namespace=ebird_mapping.target_namespace,
-      class_fn=class_fn,
-      start_time_fn=start_time_fn,
-      end_time_fn=end_time_fn,
-      filter_fn=filter_fn,
-  )
-  segments = annotations.annotations_to_dataframe(annos)
-  return segments
-
-
-def load_weldy_annotations(annotations_path: epath.Path) -> pd.DataFrame:
-  """Loads a dataframe of all annotations from the Weldy Calltype dataset."""
-  filename_fn = lambda _, row: 'annotated_recordings/' + row['file'].strip()
-  start_time_fn = lambda row: float(row['start'])
-  end_time_fn = lambda row: float(row['end'])
-  filter_fn = lambda row: False
-  class_fn = lambda row: (  # pylint: disable=g-long-lambda
-      row['label']
-      .replace('unk', 'unknown')
-      .replace('impossible', 'unknown')
-      .replace('unknown_chip', 'unknown')
-      .split(' ')
-  )
-  annos = annotations.read_dataset_annotations_csvs(
-      [epath.Path(annotations_path)],
-      filename_fn=filename_fn,
-      namespace='weldy_calltype',
-      class_fn=class_fn,
-      start_time_fn=start_time_fn,
-      end_time_fn=end_time_fn,
-      filter_fn=filter_fn,
-  )
-  segments = annotations.annotations_to_dataframe(annos)
-  return segments
-
-
-def load_anuraset_annotations(annotations_path: epath.Path) -> pd.DataFrame:
-  """Loads a dataframe of all annotations."""
-  filename_fn = lambda _, row: os.path.join(  # pylint: disable=g-long-lambda
-      row['filename'].split('_')[0], row['filename'].strip()
-  )
-  start_time_fn = lambda row: float(row['start_time_s'])
-  end_time_fn = lambda row: float(row['end_time_s'])
-  # There are a few SPECIES_LALSE labels which according to the authors should
-  # be ignored.
-  filter_fn = lambda row: '_LALSE' in row['label']
-  class_fn = lambda row: row['label'].split(' ')
-  annos = annotations.read_dataset_annotations_csvs(
-      [epath.Path(annotations_path)],
-      filename_fn=filename_fn,
-      namespace='anuraset',
-      class_fn=class_fn,
-      start_time_fn=start_time_fn,
-      end_time_fn=end_time_fn,
-      filter_fn=filter_fn,
-  )
-  segments = annotations.annotations_to_dataframe(annos)
-  return segments
-
-
-def load_reef_annotations(annotations_path: epath.Path) -> pd.DataFrame:
-  """Loads a dataframe of all annotations from the reefs JSON file.
-
-  Args:
-    annotations_path: path to dataset_v*.json.
-
-  Returns:
-    DataFrame of metadata parsed from the datasets
-    Reef specific stuff:
-    - All clips are 1.88sec long, so we fix all start and end times accordingly
-    - We only take entries for which the dataset_type is sound_event_dataset, as
-    other entries are only soundscape (habitat level) labels or just unlabeled
-    completely
-    - In future, should this add a header to the df that species the region
-    somehow? Allowing selection by regional datasets
-  """
-  # Read the JSON file
-  with annotations_path.open() as f:
-    data = json.load(f)
-  # Prepare a list of dictionaries for creating a DataFrame
-  rows = []
-  for entry in data:
-    # Include only entries with "dataset_type": "sound_event_dataset"
-    if entry.get('dataset_type') == 'sound_event_dataset':
-      label = entry.get('label', '')
-      # to use region.label format use:
-      # label = f"{entry.get('region', '')}.{entry.get('label', '')}"
-      row = {
-          'filename': entry.get('file_name', ''),
-          'start_time_s': 0.0,
-          'end_time_s': 1.88,
-          'namespace': 'reefs',
-          'label': [label],
-      }
-      rows.append(row)
-  # Create a DataFrame
-  segments = pd.DataFrame(rows)
-  return segments
diff --git a/chirp/data/soundscapes/soundscapes.py b/chirp/data/soundscapes/soundscapes.py
@@ -24,8 +24,8 @@
 from chirp import audio_utils
 from chirp.data import tfds_features
 from chirp.data.bird_taxonomy import bird_taxonomy
-from chirp.data.soundscapes import dataset_fns
 from chirp.data.soundscapes import soundscapes_lib
+from chirp.taxonomy import annotations_fns
 from etils import epath
 import numpy as np
 import pandas as pd
@@ -137,15 +137,15 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
           audio_glob='caples/audio/*',
           interval_length_s=5.0,
           localization_fn=audio_utils.slice_peaked_audio,
-          annotation_load_fn=dataset_fns.load_caples_annotations,
+          annotation_load_fn=annotations_fns.load_caples_annotations,
           description='Annotated Caples recordings from 2018/2019.',
       ),
       SoundscapesConfig(
           name='caples_full_length',
           class_list_name='caples',
           audio_glob='caples/audio/*',
           annotation_filename='caples.csv',
-          annotation_load_fn=dataset_fns.load_caples_annotations,
+          annotation_load_fn=annotations_fns.load_caples_annotations,
           keep_unknown_annotation=True,
           # Some recordings in Caples are only partially-annotated, so to avoid
           # scoring legitimate model predictions as false positives we pad with
@@ -159,7 +159,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
           audio_glob='hawaii/audio/*.flac',
           interval_length_s=5.0,
           localization_fn=audio_utils.slice_peaked_audio,
-          annotation_load_fn=dataset_fns.load_cornell_annotations,
+          annotation_load_fn=annotations_fns.load_cornell_annotations,
           annotation_filename='annotations.csv',
           keep_unknown_annotation=True,
           description=(
@@ -171,7 +171,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
       SoundscapesConfig(
           name='hawaii_full_length',
           audio_glob='hawaii/audio/*.flac',
-          annotation_load_fn=dataset_fns.load_cornell_annotations,
+          annotation_load_fn=annotations_fns.load_cornell_annotations,
           annotation_filename='annotations.csv',
           keep_unknown_annotation=True,
           description=(
@@ -185,7 +185,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
           audio_glob='ssw/audio/*.flac',
           interval_length_s=5.0,
           localization_fn=audio_utils.slice_peaked_audio,
-          annotation_load_fn=dataset_fns.load_cornell_annotations,
+          annotation_load_fn=annotations_fns.load_cornell_annotations,
           annotation_filename='annotations.csv',
           description=(
               'Annotated Sapsucker Woods recordings. '
@@ -196,7 +196,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
       SoundscapesConfig(
           name='ssw_full_length',
           audio_glob='ssw/audio/*.flac',
-          annotation_load_fn=dataset_fns.load_cornell_annotations,
+          annotation_load_fn=annotations_fns.load_cornell_annotations,
           annotation_filename='annotations.csv',
           keep_unknown_annotation=True,
           description=(
@@ -208,7 +208,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
       SoundscapesConfig(
           name='coffee_farms',
           audio_glob='coffee_farms/audio/*.flac',
-          annotation_load_fn=dataset_fns.load_cornell_annotations,
+          annotation_load_fn=annotations_fns.load_cornell_annotations,
           annotation_filename='annotations.csv',
           interval_length_s=5.0,
           localization_fn=audio_utils.slice_peaked_audio,
@@ -222,7 +222,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
       SoundscapesConfig(
           name='coffee_farms_full_length',
           audio_glob='coffee_farms/audio/*.flac',
-          annotation_load_fn=dataset_fns.load_cornell_annotations,
+          annotation_load_fn=annotations_fns.load_cornell_annotations,
           annotation_filename='annotations.csv',
           keep_unknown_annotation=True,
           description=(
@@ -236,7 +236,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
           audio_glob='high_sierras/audio/*.flac',
           interval_length_s=5.0,
           localization_fn=audio_utils.slice_peaked_audio,
-          annotation_load_fn=dataset_fns.load_cornell_annotations,
+          annotation_load_fn=annotations_fns.load_cornell_annotations,
           annotation_filename='annotations.csv',
           keep_unknown_annotation=True,
           description=(
@@ -247,7 +247,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
       SoundscapesConfig(
           name='high_sierras_full_length',
           audio_glob='high_sierras/audio/*.flac',
-          annotation_load_fn=dataset_fns.load_cornell_annotations,
+          annotation_load_fn=annotations_fns.load_cornell_annotations,
           annotation_filename='annotations.csv',
           keep_unknown_annotation=True,
           description=(
@@ -261,7 +261,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
           audio_glob='sierras_kahl/audio/*.flac',
           interval_length_s=5.0,
           localization_fn=audio_utils.slice_peaked_audio,
-          annotation_load_fn=dataset_fns.load_cornell_annotations,
+          annotation_load_fn=annotations_fns.load_cornell_annotations,
           annotation_filename='annotations.csv',
           keep_unknown_annotation=True,
           description=(
@@ -272,7 +272,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
       SoundscapesConfig(
           name='sierras_kahl_full_length',
           audio_glob='sierras_kahl/audio/*.flac',
-          annotation_load_fn=dataset_fns.load_cornell_annotations,
+          annotation_load_fn=annotations_fns.load_cornell_annotations,
           annotation_filename='annotations.csv',
           keep_unknown_annotation=True,
           description=(
@@ -286,7 +286,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
           audio_glob='powdermill/*/*.wav',
           interval_length_s=5.0,
           localization_fn=audio_utils.slice_peaked_audio,
-          annotation_load_fn=dataset_fns.load_powdermill_annotations,
+          annotation_load_fn=annotations_fns.load_powdermill_annotations,
           description=(
               'New England recordings from Powdermill Nature Reserve, '
               'Rector, PA. https://doi.org/10.1002/ecy.3329'
@@ -297,7 +297,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
           name='powdermill_full_length',
           audio_glob='powdermill/*/*.wav',
           annotation_filename='powdermill.csv',
-          annotation_load_fn=dataset_fns.load_powdermill_annotations,
+          annotation_load_fn=annotations_fns.load_powdermill_annotations,
           keep_unknown_annotation=True,
           description=(
               'Full-length New England recordings from Powdermill '
@@ -311,7 +311,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
           audio_glob='peru/audio/*.flac',
           interval_length_s=5.0,
           localization_fn=audio_utils.slice_peaked_audio,
-          annotation_load_fn=dataset_fns.load_cornell_annotations,
+          annotation_load_fn=annotations_fns.load_cornell_annotations,
           annotation_filename='annotations.csv',
           keep_unknown_annotation=True,
           description=(
@@ -323,7 +323,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
       SoundscapesConfig(
           name='peru_full_length',
           audio_glob='peru/audio/*.flac',
-          annotation_load_fn=dataset_fns.load_cornell_annotations,
+          annotation_load_fn=annotations_fns.load_cornell_annotations,
           annotation_filename='annotations.csv',
           keep_unknown_annotation=True,
           description=(
@@ -335,7 +335,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
       SoundscapesConfig(
           name='weldy_calltype_full_length',
           audio_glob='weldy_calltype/annotated_recordings/*.wav',
-          annotation_load_fn=dataset_fns.load_weldy_annotations,
+          annotation_load_fn=annotations_fns.load_weldy_annotations,
           annotation_filename='annotations.csv',
           keep_unknown_annotation=True,
           description=(
@@ -347,7 +347,7 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy):
       SoundscapesConfig(
           name='anuraset_full_length',
           audio_glob='anuraset/raw_data/*/*.wav',
-          annotation_load_fn=dataset_fns.load_anuraset_annotations,
+          annotation_load_fn=annotations_fns.load_anuraset_annotations,
           annotation_filename='annotations.csv',
           keep_unknown_annotation=True,
           description=(