Update global seabirds.

PiperOrigin-RevId: 550602484
google-research · Jul 24, 2023 · 000183e · 000183e
1 parent 166c672
commit 000183e
Show file tree

Hide file tree

Showing 73 changed files with 519,234 additions and 87,579 deletions.
diff --git a/chirp/data/soundevents/soundevents.py b/chirp/data/soundevents/soundevents.py
@@ -151,11 +151,11 @@ class Soundevents(tfds.core.GeneratorBasedBuilder):
   }
 
   def _info(self) -> tfds.core.DatasetInfo:
-    db = namespace_db.NamespaceDatabase.load_csvs()
+    db = namespace_db.load_db()
     dataset_class_list = db.class_lists[self.builder_config.class_list_name]
     logging.info(
         'Currently considering a total of %s soundevent.',
-        dataset_class_list.size,
+        len(dataset_class_list.classes),
     )
 
     full_length = self.builder_config.localization_fn is None

diff --git a/chirp/data/soundscapes/dataset_fns.py b/chirp/data/soundscapes/dataset_fns.py
@@ -189,9 +189,9 @@ def load_powdermill_annotations(annotations_path: epath.Path) -> pd.DataFrame:
   filter_fn = lambda row: False
 
   # Convert dataset labels to ebird2021.
-  db = namespace_db.NamespaceDatabase.load_csvs()
+  db = namespace_db.load_db()
   ebird_mapping = db.mappings['ibp2019_to_ebird2021']
-  ebird_mapping_dict = ebird_mapping.to_dict()
+  ebird_mapping_dict = ebird_mapping.mapped_pairs
   class_fn = lambda row: [  # pylint: disable=g-long-lambda
       ebird_mapping_dict.get(row['Species'].strip(), row['Species'].strip())
   ]

diff --git a/chirp/data/soundscapes/soundscapes.py b/chirp/data/soundscapes/soundscapes.py
@@ -343,7 +343,8 @@ def _info(self) -> tfds.core.DatasetInfo:
         self.builder_config.keep_unknown_annotation,
     )
     logging.info(
-        'Currently considering a total of %s species.', dataset_class_list.size
+        'Currently considering a total of %s species.',
+        len(dataset_class_list.classes),
     )
     full_length = self.builder_config.localization_fn is None
     if full_length:

diff --git a/chirp/data/soundscapes/soundscapes_lib.py b/chirp/data/soundscapes/soundscapes_lib.py
@@ -70,7 +70,7 @@ def load_class_list(
   Returns:
     The desired ClassList.
   """
-  db = namespace_db.NamespaceDatabase.load_csvs()
+  db = namespace_db.load_db()
   dataset_class_list = db.class_lists[class_list_name]
 
   if (
@@ -79,9 +79,8 @@ def load_class_list(
   ):
     # Create a new class list which includes the 'unknown' class.
     dataset_class_list = namespace.ClassList(
-        dataset_class_list.name + '_' + UNKNOWN_LABEL,
         dataset_class_list.namespace,
-        [UNKNOWN_LABEL] + list(dataset_class_list.classes),
+        (UNKNOWN_LABEL,) + dataset_class_list.classes,
     )
   return dataset_class_list
 
@@ -419,7 +418,7 @@ def _start_end_key(seg):
         continue
       # found an overlap!
       for label in seg['label']:
-        if label in class_list:
+        if label in class_list.classes:
           interval_labels.add(label)
         else:
           logging.info('dropping label not in class list: %s', str(label))

diff --git a/chirp/eval/callbacks.py b/chirp/eval/callbacks.py
@@ -174,7 +174,7 @@ def _load_learned_representations(self):
     """Loads classifier output weights from the separator."""
     label_csv_path = epath.Path(self.model_path) / 'label.csv'
     with label_csv_path.open('r') as f:
-      class_list = namespace.ClassList.from_csv('label', f)
+      class_list = namespace.ClassList.from_csv(f)
     # Load the output layer weights.
     variables_path = (
         epath.Path(self.model_path) / 'savedmodel/variables/variables'
@@ -187,7 +187,7 @@ def _load_learned_representations(self):
       if (
           len(v_shape) == 3
           and v_shape[0] == 1
-          and v_shape[-1] == class_list.size
+          and v_shape[-1] == len(class_list.classes)
       ):
         candidates.append(v)
     if not candidates:

diff --git a/chirp/export_utils.py b/chirp/export_utils.py
@@ -120,6 +120,9 @@ def export_converted_model(
     if class_lists is not None:
       for key, class_list in class_lists.items():
         with tf.io.gfile.GFile(os.path.join(workdir, f'{key}.csv'), 'w') as f:
+          # NOTE: Although the namespace is written to the file, there is no
+          # guarantee that the class list will still be compatible with the
+          # namespace if the latter gets updated.
           f.write(class_list.to_csv())
 
     if not export_tf_lite:

diff --git a/chirp/inference/models.py b/chirp/inference/models.py
@@ -234,9 +234,9 @@ def __post_init__(self):
       model_path = base_path / 'savedmodel'
       label_csv_path = base_path / 'label.csv'
 
-    self.model = tf.saved_model.load(model_path.as_posix())
+    self.model = tf.saved_model.load(model_path)
     with label_csv_path.open('r') as f:
-      self.class_list = namespace.ClassList.from_csv('label', f)
+      self.class_list = namespace.ClassList.from_csv(f)
 
     # Check whether the model support polymorphic batch shape.
     sig = self.model.signatures['serving_default']
@@ -266,7 +266,7 @@ def embed(self, audio_array: np.ndarray) -> interface.InferenceOutputs:
     )
 
     return interface.InferenceOutputs(
-        all_embeddings, {self.class_list.name: all_logits}, None
+        all_embeddings, {'label': all_logits}, None
     )
 
   def batch_embed(
@@ -291,9 +291,7 @@ def batch_embed(
         embeddings, framed_audio.shape[:2] + (embeddings.shape[-1],)
     )
 
-    return interface.InferenceOutputs(
-        embeddings, {self.class_list.name: logits}, None
-    )
+    return interface.InferenceOutputs(embeddings, {'label': logits}, None)
 
 
 @dataclasses.dataclass
@@ -325,7 +323,7 @@ def __post_init__(self):
     self.model = tf.saved_model.load(epath.Path(self.model_path) / 'savedmodel')
     label_csv_path = epath.Path(self.model_path) / 'label.csv'
     with label_csv_path.open('r') as f:
-      self.class_list = namespace.ClassList.from_csv('label', f)
+      self.class_list = namespace.ClassList.from_csv(f)
 
   def embed(self, audio_array: np.ndarray) -> interface.InferenceOutputs:
     # Drop samples to allow reshaping to frame_size
@@ -362,7 +360,7 @@ def embed(self, audio_array: np.ndarray) -> interface.InferenceOutputs:
     )
     all_embeddings = np.reshape(all_embeddings, [-1, all_embeddings.shape[-1]])
     return interface.InferenceOutputs(
-        all_embeddings, {self.class_list.name: all_logits}, sep_audio
+        all_embeddings, {'label': all_logits}, sep_audio
     )
 
   def batch_embed(self, audio_batch: np.ndarray) -> interface.InferenceOutputs:
@@ -592,7 +590,9 @@ def embed(self, audio_array: np.ndarray) -> interface.InferenceOutputs:
       )
     if self.make_logits:
       outputs['logits'] = {
-          'label': np.zeros([time_size, self.class_list.size], np.float32),
+          'label': np.zeros(
+              [time_size, len(self.class_list.classes)], np.float32
+          ),
       }
       outputs['logits']['label'] = self.convert_logits(
           outputs['logits']['label'], self.class_list, self.target_class_list

diff --git a/chirp/path_utils.py b/chirp/path_utils.py
@@ -18,12 +18,10 @@
 General utilities to help with handling paths.
 """
 import os
-from typing import Iterable
-from absl import logging
-from etils import epath
+import pathlib
 
 
-def get_absolute_epath(relative_path: str) -> epath.Path:
+def get_absolute_path(relative_path: os.PathLike[str] | str) -> pathlib.Path:
   """Returns the absolute pathlib.Path associated with the relative_path.
 
   Args:
@@ -32,11 +30,5 @@ def get_absolute_epath(relative_path: str) -> epath.Path:
   Returns:
     The absolute path to the resource.
   """
-  file_path = epath.Path(__file__).parent / relative_path
+  file_path = pathlib.Path(__file__).parent / relative_path
   return file_path
-
-
-def listdir(relative_path: str) -> Iterable[str]:
-  """List the contents of a directory in the Chirp project."""
-  absolute_path = get_absolute_epath(relative_path).as_posix()
-  return os.listdir(absolute_path)
diff --git a/chirp/preprocessing/pipeline.py b/chirp/preprocessing/pipeline.py
@@ -703,7 +703,7 @@ class ConvertBirdTaxonomyLabels(FeaturesPreprocessOp):
   output_masks: bool = True
 
   # The following members are for cached / stateful data.
-  db: namespace_db.NamespaceDatabase | None = None
+  db: namespace_db.TaxonomyDatabase | None = None
 
   def __post_init__(self):
     # Load the TaxonomyDatabase in post_init to avoid loading it repeatedly.
@@ -713,7 +713,7 @@ def __post_init__(self):
     # applied multiple times on different datasets. Otherwise, in subsequent
     # pipeline applications TF will attempt to re-use previous constants
     # belonging to a different tf.function.
-    self.db = namespace_db.NamespaceDatabase.load_csvs()
+    self.db = namespace_db.load_db()
 
   def load_tables(
       self, source_class_list: namespace.ClassList
@@ -756,7 +756,7 @@ def load_tables(
         target_taxa_classes = target_classes.apply_namespace_mapping(
             namespace_mapping
         )
-        namespace_table, _ = source_class_list.get_namespace_map_tf_lookup(
+        namespace_table = source_class_list.get_namespace_map_tf_lookup(
             namespace_mapping
         )
         class_table, label_mask = source_taxa_classes.get_class_map_tf_lookup(
@@ -861,19 +861,18 @@ def __call__(
       self, features: Features, dataset_info: tfds.core.DatasetInfo
   ) -> Features:
     source_classes = namespace.ClassList(
-        'dataset',
         self.source_namespace,
         # TODO(vdumoulin): generalize this to labels beyond 'ignore'.
         # Some dataset variants (e.g. bird_taxonomy/downstream_slice_peaked)
         # use an 'ignore' label which is not part of the eBirds taxonomy. We
         # ignore this label; the mapping tables return an 'unknown' default
         # value, so all 'ignore' labels will naturally be converted to
         # 'unknown'.
-        [
+        tuple(
             n
             for n in dataset_info.features[self.species_feature_name].names
             if n != 'ignore'
-        ],
+        ),
     )
     output_features = self.convert_features(features, source_classes)
     return output_features

diff --git a/chirp/projects/sfda/model_utils.py b/chirp/projects/sfda/model_utils.py
@@ -458,7 +458,7 @@ def prepare_audio_model(
   class_lists = class_utils.get_class_lists(
       target_class_list, add_taxonomic_labels=False
   )
-  num_classes = {k: v.size for (k, v) in class_lists.items()}
+  num_classes = {k: len(v.classes) for (k, v) in class_lists.items()}
   model = taxonomy_model.TaxonomyModel(
       num_classes=num_classes,
       encoder=model_config.encoder,