Skip to content

Commit

Permalink
Added process, test, and lookup zip creator code for glmcierra
Browse files Browse the repository at this point in the history
  • Loading branch information
Lucie2006 committed Jul 2, 2024
1 parent 8a0cb2f commit 2a05ff7
Show file tree
Hide file tree
Showing 10 changed files with 506 additions and 1 deletion.
1 change: 1 addition & 0 deletions mdx/granule_metadata_extractor/processing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,4 @@
from .process_hiwat import ExtractHiwatMetadata
from .process_sbuairmarimpacts import ExtractSbuairmarimpactsMetadata
from .process_sbumwrimpacts import ExtractSbumwrimpactsMetadata
from .process_glmcierra import ExtractGlmcierraMetadata
118 changes: 118 additions & 0 deletions mdx/granule_metadata_extractor/processing/process_glmcierra.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from ..src.extract_netcdf_metadata import ExtractNetCDFMetadata
import os
import numpy as np
from datetime import datetime, timedelta
from netCDF4 import Dataset

class ExtractGlmcierraMetadata(ExtractNetCDFMetadata):
    """
    Metadata extractor for glmcierra netCDF-4 granules.

    Reads the temporal range from the TIME_COVERAGE_START/END global
    attributes and the spatial bounding box from the FLASH_LAT/FLASH_LON
    variables of each granule.
    """

    def __init__(self, file_path):
        # NOTE: base-class __init__ intentionally not invoked here.
        #super().__init__(file_path)
        self.file_path = file_path
        # These five granules carry incorrect lat/lon values, so the
        # collection summary box is substituted for them temporarily.
        # After finishing in PROD, revisit and assign real summary metadata.
        self.file_excluded = ['OR_GLM-L2-CIERRA-DB_GOES-EAST_s20192931845000.nc',
                              'OR_GLM-L2-CIERRA-DB_GOES-EAST_s20193132345000.nc',
                              'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20203590215000.nc',
                              'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20203591600000.nc',
                              'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20210122000000.nc']
        self.fileformat = 'netCDF-4'

        # Pull time and space metadata straight out of the granule.
        nc_dataset = Dataset(file_path)
        (self.minTime, self.maxTime,
         self.SLat, self.NLat,
         self.WLon, self.ELon) = self.get_variables_min_max(nc_dataset, file_path)
        nc_dataset.close()

    def get_variables_min_max(self, datafile, filename):
        """
        Compute the temporal and spatial extremes of one granule.

        :param datafile: an open netCDF4 Dataset
        :param filename: path of the granule (basename checked against
                         the excluded-file list)
        :return: (minTime, maxTime, minlat, maxlat, minlon, maxlon)
        """
        # File format is probed here but not part of the return value.
        if datafile.file_format.startswith('NETCDF3'):
            file_type = "netCDF-3"
        else:
            file_type = "netCDF-4"

        flash_lats = np.array(datafile['FLASH_LAT'][:])
        flash_lons = np.array(datafile['FLASH_LON'][:])

        minlat = np.nanmin(flash_lats)
        maxlat = np.nanmax(flash_lats)
        minlon = np.nanmin(flash_lons)
        maxlon = np.nanmax(flash_lons)

        if filename.split('/')[-1] in self.file_excluded:
            # Known-bad lat/lon granule: substitute the collection summary box
            # 'north': '57.267', 'south': '-57.312', 'east': '180.0', 'west': '-180.0'
            maxlat, minlat, maxlon, minlon = 57.267, -57.312, 180., -180.

        minTime = datetime.strptime(datafile.TIME_COVERAGE_START, '%Y-%m-%d %H:%M:%SZ')
        maxTime = datetime.strptime(datafile.TIME_COVERAGE_END, '%Y-%m-%d %H:%M:%SZ')

        return minTime, maxTime, minlat, maxlat, minlon, maxlon

    def get_wnes_geometry(self, scale_factor=1.0, offset=0):
        """
        Build the bounding box from the stored lat/lon extremes.

        :param scale_factor: scale to apply when the data is not CF compliant
        :param offset: offset to apply when the data is not CF compliant
        :return: [west, north, east, south], longitudes normalized to [-180, 180]
        """
        scaled = [round(v * scale_factor + offset, 3)
                  for v in (self.NLat, self.SLat, self.ELon, self.WLon)]
        north, south, east, west = scaled
        return [self.convert_360_to_180(west), north,
                self.convert_360_to_180(east), south]

    def get_temporal(self, time_variable_key='time', units_variable='units', scale_factor=1.0,
                     offset=0,
                     date_format='%Y-%m-%dT%H:%M:%SZ'):
        """
        Format the temporal range captured at construction time.

        The time/units/scale/offset parameters are accepted for interface
        compatibility with the base extractor but are not used here.

        :param date_format: strftime format for the returned strings
        :return: (start_date, stop_date) as formatted strings
        """
        return self.minTime.strftime(date_format), self.maxTime.strftime(date_format)

    def get_metadata(self, ds_short_name, format='netCDF-4', version='1', **kwargs):
        """
        Assemble the granule-level metadata dictionary.

        :param ds_short_name: collection short name to record
        :param format: accepted for interface compatibility; self.fileformat is used
        :param version: collection VersionId to record
        :return: dict of CMR-style granule metadata fields
        """
        data = {}
        data['GranuleUR'] = os.path.basename(self.file_path)
        data['ShortName'] = ds_short_name
        data['BeginningDateTime'], data['EndingDateTime'] = self.get_temporal()

        west, north, east, south = self.get_wnes_geometry()
        data['WestBoundingCoordinate'] = str(west)
        data['NorthBoundingCoordinate'] = str(north)
        data['EastBoundingCoordinate'] = str(east)
        data['SouthBoundingCoordinate'] = str(south)
        data['checksum'] = self.get_checksum()
        data['SizeMBDataGranule'] = str(round(self.get_file_size_megabytes(), 2))
        data['DataFormat'] = self.fileformat
        data['VersionId'] = version
        return data


if __name__ == '__main__':
    # Smoke-test entry point for local runs against a test fixture.
    print('Extracting glmcierra Metadata')
    # NOTE(review): this points at a directory; point it at a concrete
    # .nc fixture file before running.
    path_to_file = "../../test/fixtures/"
    # Fixed: previously instantiated ExtractSbuceilimpactsMetadata, a
    # copy-paste from another extractor that is undefined in this module
    # and raised NameError.
    exnet = ExtractGlmcierraMetadata(path_to_file)
    metada = exnet.get_metadata("test")
    print(metada)
89 changes: 89 additions & 0 deletions mdx/granule_metadata_extractor/src/helpers/creators/glmcierra.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# create lookup zip for glmcierra
# for all future collections
from datetime import datetime, timedelta
from utils.mdx import MDX
import cProfile
import time
import math
import re

from netCDF4 import Dataset
import numpy as np

# Collection identifiers passed to MDX.process_collection().
short_name = "glmcierra"
provider_path = "glmcierra/"
file_type = "netCDF-4"

# These five granules carry incorrect lat/lon values; their bounding box is
# replaced with the collection summary box during extraction (see
# get_nc_metadata below).
file_excluded = ['OR_GLM-L2-CIERRA-DB_GOES-EAST_s20192931845000.nc',
                 'OR_GLM-L2-CIERRA-DB_GOES-EAST_s20193132345000.nc',
                 'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20203590215000.nc',
                 'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20203591600000.nc',
                 'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20210122000000.nc']

class MDXProcessing(MDX):
    """Lookup-zip creator for the glmcierra collection."""

    def __init__(self):
        super().__init__()

    def process(self, filename, file_obj_stream) -> dict:
        """
        Individual collection processing logic for spatial and temporal
        metadata extraction.

        :param filename: name of file to process
        :type filename: str
        :param file_obj_stream: file object stream to be processed
        :type file_obj_stream: botocore.response.StreamingBody
        """
        return self.get_nc_metadata(filename, file_obj_stream)

    def get_nc_metadata(self, filename, file_obj_stream):
        """
        Extract temporal and spatial metadata from one netCDF-4 granule
        read entirely in memory from the object stream.
        """
        print(filename)
        dataset = Dataset("in-mem-file", mode='r', memory=file_obj_stream.read())
        if dataset.file_format.startswith('NETCDF3'):
            granule_format = "netCDF-3"
        else:
            granule_format = "netCDF-4"

        lat_values = np.array(dataset['FLASH_LAT'][:])
        lon_values = np.array(dataset['FLASH_LON'][:])

        north = np.nanmax(lat_values)
        south = np.nanmin(lat_values)
        east = np.nanmax(lon_values)
        west = np.nanmin(lon_values)

        if filename.split('/')[-1] in file_excluded:
            # Known-bad lat/lon granule: substitute the summary box
            # "north": 57.267, "south": -57.312, "east": 180.0, "west": -180.0
            north, south, east, west = 57.267, -57.312, 180.0, -180.0

        start_time = datetime.strptime(dataset.TIME_COVERAGE_START, '%Y-%m-%d %H:%M:%SZ')
        end_time = datetime.strptime(dataset.TIME_COVERAGE_END, '%Y-%m-%d %H:%M:%SZ')
        dataset.close()
        return {
            "start": start_time,
            "end": end_time,
            "north": north,
            "south": south,
            "east": east,
            "west": west,
            "format": granule_format
        }

    def main(self):
        """Process the whole collection, then stop the worker instance."""
        self.process_collection(short_name, provider_path)
        self.shutdown_ec2()


if __name__ == '__main__':
    # Entry point: build the lookup zip for the glmcierra collection.
    MDXProcessing().main()
    # The below can be used to run a profiler and see which functions are
    # taking the most time to process
    # cProfile.run('MDXProcessing().main()', sort='tottime')
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from zipfile import ZipFile
import json
import os
import pathlib

# Collection summary bounding box applied to the bad granules:
# "north": 57.267, "south": -57.312, "east": 180.0, "west": -180.0

# Granules with bad lat/lon whose lookup entries must be overwritten
# with the collection summary box.
file_excluded = ['OR_GLM-L2-CIERRA-DB_GOES-EAST_s20192931845000.nc',
                 'OR_GLM-L2-CIERRA-DB_GOES-EAST_s20193132345000.nc',
                 'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20203590215000.nc',
                 'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20203591600000.nc',
                 'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20210122000000.nc']


# Move the existing lookup zip aside so a corrected one can be written.
script_dir = pathlib.Path(__file__).parent.absolute()
lookup_zip_path = os.path.join(script_dir, "../glmcierra.zip")
lookup_zip_path_orig = os.path.join(script_dir, "../glmcierra_orig.zip")
os.rename(lookup_zip_path, lookup_zip_path_orig)

# Load both JSON members from the original zip.
with ZipFile(lookup_zip_path_orig) as lookup_zip_orig:
    with lookup_zip_orig.open("lookup.json") as collection_lookup_orig:
        metadata = json.load(collection_lookup_orig)
    with lookup_zip_orig.open("summary.json") as summary_meta_orig:
        summary_meta = json.load(summary_meta_orig)

# Example lookup entry:
# {"OR_GLM-L2-CIERRA-DB_GOES-EAST_s20170122300000.nc": {"start": "2017-01-12T23:00:00Z", "end": "2017-01-12T23:14:59Z", "north": "56.429", "south": "-56.141", "east": "-33.987", "west": "-148.712", "format": "netCDF-4", "sizeMB": 0.92}

# Overwrite each excluded granule's box with the summary values.
for granule_name in file_excluded:
    entry = metadata[granule_name]
    entry["north"] = "57.267"
    entry["south"] = "-57.312"
    entry["east"] = "180.0"
    entry["west"] = "-180.0"

with open('./lookup.json', 'w') as fp:
    json.dump(metadata, fp)
with open('./summary.json', 'w') as fp:
    json.dump(summary_meta, fp)

# Equivalent to the command line: zip glmcierra.zip lookup.json summary.json
with ZipFile('../glmcierra.zip', 'w') as myzip:
    myzip.write('lookup.json')
    myzip.write('summary.json')
Binary file not shown.
Binary file not shown.
3 changes: 2 additions & 1 deletion mdx/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ def extract_netcdf_metadata(self, ds_short_name, version, access_url, netcdf_fil
"sondecpexcv": mdx.ExtractSondecpexcvMetadata,
"hiwat": mdx.ExtractHiwatMetadata,
"sbuairmarimpacts": mdx.ExtractSbuairmarimpactsMetadata,
"sbumwrimpacts": mdx.ExtractSbumwrimpactsMetadata
"sbumwrimpacts": mdx.ExtractSbumwrimpactsMetadata,
"glmcierra": mdx.ExtractGlmcierraMetadata
}

time_variable_key = netcdf_vars.get('time_var_key')
Expand Down
Binary file not shown.
Loading

0 comments on commit 2a05ff7

Please sign in to comment.