Add eBird parsing script.

PiperOrigin-RevId: 549690251
google-research · Jul 24, 2023 · ed408c5 · ed408c5
1 parent ae6721e
commit ed408c5
Show file tree

Hide file tree

Showing 5 changed files with 288,807 additions and 409,366 deletions.
diff --git a/chirp/data/bird_taxonomy/bird_taxonomy.py b/chirp/data/bird_taxonomy/bird_taxonomy.py
@@ -247,13 +247,17 @@ class BirdTaxonomy(tfds.core.GeneratorBasedBuilder):
               'chirp.audio_utils.slice_peaked_audio.'
           ),
           data_processing_query=fsu.QuerySequence([
-              fsu.filter_in_class_list('species_code', 'global_seabirds'),
+              fsu.filter_in_class_list(
+                  'species_code', 'ebird2021_global_seabirds'
+              ),
               fsu.scrub_all_but_class_list(
-                  'bg_species_codes', 'global_seabirds'
+                  'bg_species_codes', 'ebird2021_global_seabirds'
               ),
           ]),
           metadata_processing_query=fsu.QuerySequence([
-              fsu.filter_in_class_list('species_code', 'global_seabirds'),
+              fsu.filter_in_class_list(
+                  'species_code', 'ebird2021_global_seabirds'
+              ),
           ]),
       ),
   ]

diff --git a/chirp/taxonomy/ebird.py b/chirp/taxonomy/ebird.py
@@ -0,0 +1,175 @@
+# coding=utf-8
+# Copyright 2023 The Chirp Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Load eBird/Clements labels from source data."""
+from absl import app
+from absl import flags
+from chirp.taxonomy import namespace
+from chirp.taxonomy import namespace_db
+import numpy as np
+import pandas as pd
+
+_SOURCE_FILE = flags.DEFINE_string(  # Input: the eBird taxonomy CSV.
+    name='source_file', default='source_data/ebird_taxonomy_v2022.csv',
+    help='CSV file to load.',
+)
+_PREFIX = flags.DEFINE_string(  # Name prefix for everything generated.
+    name='prefix', default='ebird2022',
+    help='The prefix to attach to the generated namespaces, class lists, and'
+    ' mappings.',
+)
+_OUTPUT_FILE = flags.DEFINE_string(  # Destination for the JSON database.
+    name='output_file', default='taxonomy_database.json', help='Output file.'
+)
+
+SEABIRD_FAMILIES = {  # Families (lower-case) selected as seabirds.
+    'alcidae',
+    'fregatidae',
+    'laridae',
+    'scolopacidae',
+    'stercorariidae',
+    'sulidae',
+}
+
+SEABIRD_ORDERS = {  # Orders (lower-case) selected as seabirds.
+    'procellariiformes',
+    'sphenisciformes',
+}
+
+
+def main(argv: list[str]) -> None:
+  """Parse an eBird taxonomy CSV and add it to the taxonomy database.
+
+  Reads the CSV named by --source_file, derives namespaces, mappings and a
+  seabird class list from it, stores them under --prefix in the database
+  returned by `namespace_db.load_db`, validates the result and writes it to
+  --output_file.
+
+  Args:
+    argv: Command-line arguments; anything past the program name is an error.
+
+  Raises:
+    app.UsageError: If extra positional arguments are given.
+    ValueError: If a seabird family or order is missing from the CSV.
+  """
+  if len(argv) > 1:
+    raise app.UsageError('Too many command-line arguments.')
+  # Load the CSV data
+  df = pd.read_csv(_SOURCE_FILE.value)
+  # Lower-case the data
+  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)
+  # Extract the genus from the scientific name
+  df['genus'] = df['SCI_NAME'].str.split(' ').str[0]
+  # Only keep the scientific family name (ignore the common name)
+  df['family'] = df['FAMILY'].str.split(' ').str[0]
+  # Correction to spuhs: when the first word of the name is really the order
+  # or family, there is no genus, so clear it.
+  df.loc[
+      (df['CATEGORY'] == 'spuh')
+      & ((df['genus'] == df['ORDER1']) | (df['genus'] == df['family'])),
+      'genus',
+  ] = np.nan
+  # Report species as themselves
+  is_species = df['CATEGORY'] == 'species'
+  species_codes = df.loc[is_species, 'SPECIES_CODE']
+  df.loc[is_species, 'REPORT_AS'] = species_codes
+
+  # Namespaces (dictionary key is the name of the namespace)
+  namespaces = {
+      '': df['SPECIES_CODE'],
+      'species': species_codes,
+      'issf': df.loc[df['CATEGORY'] == 'issf', 'SPECIES_CODE'],
+      'genera': df['genus'].drop_duplicates().dropna(),
+      'families': df['family'].drop_duplicates().dropna(),
+      'orders': df['ORDER1'].drop_duplicates().dropna(),
+      'clements': df.loc[is_species, 'SCI_NAME'],
+  }
+
+  # The keys are (mapping name, source namespace, target namespace). Each
+  # value is a two-column frame: source class first, target class second.
+  mappings = {
+      # Only select rows which should be reported as a species. A membership
+      # test avoids a self-merge whose rows must stay aligned with `df`.
+      ('to_species', '', 'species'): df.loc[
+          df['REPORT_AS'].isin(species_codes), ['SPECIES_CODE', 'REPORT_AS']
+      ],
+  }
+
+  # Genus/family/order mappings, once for species only and once for all codes.
+  for mask, suffix in (
+      (is_species, 'species'),
+      (slice(None), ''),
+  ):
+    prefix = suffix + '_' if suffix else ''
+    mappings |= {
+        (prefix + 'to_genus', suffix, 'genera'): df[mask][
+            ['SPECIES_CODE', 'genus']
+        ],
+        (prefix + 'to_family', suffix, 'families'): df[mask][
+            ['SPECIES_CODE', 'family']
+        ],
+        (prefix + 'to_order', suffix, 'orders'): df[mask][
+            ['SPECIES_CODE', 'ORDER1']
+        ],
+    }
+
+  # Fail early if any expected seabird family or order is absent from the CSV.
+  if SEABIRD_FAMILIES - set(df['family']):
+    raise ValueError('seabird families not found in eBird data')
+  if SEABIRD_ORDERS - set(df['ORDER1']):
+    raise ValueError('seabird orders not found in eBird data')
+  seabirds = df[
+      df['family'].isin(SEABIRD_FAMILIES) | df['ORDER1'].isin(SEABIRD_ORDERS)
+  ]
+  # The keys are class list name, namespace
+  class_lists = {
+      ('global_seabirds', 'species'): seabirds.loc[
+          seabirds['CATEGORY'] == 'species', 'SPECIES_CODE'
+      ],
+  }
+
+  # Add the prefixes and write to the database. Validation is skipped on load
+  # and run once below, after the new entries have been added.
+  db = namespace_db.load_db(validate=False)
+
+  def add_prefix(name: str) -> str:
+    """Prefix `name` with --prefix, stripping any dangling underscores."""
+    return (_PREFIX.value + '_' + name).strip('_')
+
+  for name, classes in namespaces.items():
+    db.namespaces[add_prefix(name)] = namespace.Namespace(frozenset(classes))
+  for (name, source_ns, target_ns), mapping in mappings.items():
+    # Some spuhs don't have a genus, and this was set to nan. Drop these from
+    # the mappings.
+    mapping = mapping.dropna()
+    db.mappings[add_prefix(name)] = namespace.Mapping(
+        add_prefix(source_ns),
+        add_prefix(target_ns),
+        dict(zip(mapping.iloc[:, 0], mapping.iloc[:, 1])),
+    )
+  for (name, ns), classes in class_lists.items():
+    db.class_lists[add_prefix(name)] = namespace.ClassList(
+        add_prefix(ns), tuple(sorted(classes))
+    )
+
+  namespace_db.validate_taxonomy_database(db)
+
+  # Already validated above, so dump_db need not validate again.
+  with open(_OUTPUT_FILE.value, 'w', encoding='utf-8') as f:
+    f.write(namespace_db.dump_db(db, validate=False))
+
+
+if __name__ == '__main__':  # Run only when executed as a script.
+  app.run(main)
diff --git a/chirp/taxonomy/namespace_db.py b/chirp/taxonomy/namespace_db.py
@@ -113,8 +113,9 @@ def default(self, o):
     return super().default(o)
 
 
-def dump_db(taxonomy_database: TaxonomyDatabase) -> str:
-  validate_taxonomy_database(taxonomy_database)
+def dump_db(taxonomy_database: TaxonomyDatabase, validate: bool = True) -> str:
+  if validate:
+    validate_taxonomy_database(taxonomy_database)
   return json.dumps(
       dataclasses.asdict(taxonomy_database),
       cls=TaxonomyDatabaseEncoder,
@@ -124,10 +125,23 @@ def dump_db(taxonomy_database: TaxonomyDatabase) -> str:
 
 
 @functools.cache
-def load_db(path: str = TAXONOMY_DATABASE_FILENAME) -> TaxonomyDatabase:
+def load_db(
+    path: str = TAXONOMY_DATABASE_FILENAME, validate: bool = True
+) -> TaxonomyDatabase:
+  """Load the taxonomy database from a JSON file.
+
+  Args:
+    path: The JSON file to load.
+    validate: If true, validate the loaded database before returning it.
+
+  Returns:
+    The database built by `load_taxonomy_database`. Results are memoized by
+    `functools.cache`, so calls with the same arguments share one object.
+  """
   fileobj = open(path, "r")
   with fileobj as f:
     data = json.load(f)
   taxonomy_database = load_taxonomy_database(data)
-  validate_taxonomy_database(taxonomy_database)
+  if validate:
+    validate_taxonomy_database(taxonomy_database)
   return taxonomy_database