Skip to content

Commit

Permalink
Added process, test, and lookup zip creator code for glmcierra
Browse files Browse the repository at this point in the history
  • Loading branch information
Lucie2006 committed Jul 2, 2024
1 parent 8a0cb2f commit 2a05ff7
Show file tree
Hide file tree
Showing 10 changed files with 506 additions and 1 deletion.
1 change: 1 addition & 0 deletions mdx/granule_metadata_extractor/processing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,4 @@
from .process_hiwat import ExtractHiwatMetadata
from .process_sbuairmarimpacts import ExtractSbuairmarimpactsMetadata
from .process_sbumwrimpacts import ExtractSbumwrimpactsMetadata
from .process_glmcierra import ExtractGlmcierraMetadata
118 changes: 118 additions & 0 deletions mdx/granule_metadata_extractor/processing/process_glmcierra.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from ..src.extract_netcdf_metadata import ExtractNetCDFMetadata
import os
import numpy as np
from datetime import datetime, timedelta
from netCDF4 import Dataset

class ExtractGlmcierraMetadata(ExtractNetCDFMetadata):
    """
    Metadata extractor for glmcierra netCDF-4 granules.

    Reads the temporal range from the TIME_COVERAGE_START/END global
    attributes and the spatial bounding box from the FLASH_LAT/FLASH_LON
    variables of each granule.
    """

    def __init__(self, file_path):
        # NOTE: base-class __init__ intentionally not invoked here.
        #super().__init__(file_path)
        self.file_path = file_path
        # These five granules carry incorrect lat/lon values, so the
        # collection summary box is substituted for them temporarily.
        # After finishing in PROD, revisit and assign real summary metadata.
        self.file_excluded = ['OR_GLM-L2-CIERRA-DB_GOES-EAST_s20192931845000.nc',
                              'OR_GLM-L2-CIERRA-DB_GOES-EAST_s20193132345000.nc',
                              'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20203590215000.nc',
                              'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20203591600000.nc',
                              'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20210122000000.nc']
        self.fileformat = 'netCDF-4'

        # Pull time and space metadata straight out of the granule.
        nc_dataset = Dataset(file_path)
        (self.minTime, self.maxTime,
         self.SLat, self.NLat,
         self.WLon, self.ELon) = self.get_variables_min_max(nc_dataset, file_path)
        nc_dataset.close()

    def get_variables_min_max(self, datafile, filename):
        """
        Compute the temporal and spatial extremes of one granule.

        :param datafile: an open netCDF4 Dataset
        :param filename: path of the granule (basename checked against
                         the excluded-file list)
        :return: (minTime, maxTime, minlat, maxlat, minlon, maxlon)
        """
        # File format is probed here but not part of the return value.
        if datafile.file_format.startswith('NETCDF3'):
            file_type = "netCDF-3"
        else:
            file_type = "netCDF-4"

        flash_lats = np.array(datafile['FLASH_LAT'][:])
        flash_lons = np.array(datafile['FLASH_LON'][:])

        minlat = np.nanmin(flash_lats)
        maxlat = np.nanmax(flash_lats)
        minlon = np.nanmin(flash_lons)
        maxlon = np.nanmax(flash_lons)

        if filename.split('/')[-1] in self.file_excluded:
            # Known-bad lat/lon granule: substitute the collection summary box
            # 'north': '57.267', 'south': '-57.312', 'east': '180.0', 'west': '-180.0'
            maxlat, minlat, maxlon, minlon = 57.267, -57.312, 180., -180.

        minTime = datetime.strptime(datafile.TIME_COVERAGE_START, '%Y-%m-%d %H:%M:%SZ')
        maxTime = datetime.strptime(datafile.TIME_COVERAGE_END, '%Y-%m-%d %H:%M:%SZ')

        return minTime, maxTime, minlat, maxlat, minlon, maxlon

    def get_wnes_geometry(self, scale_factor=1.0, offset=0):
        """
        Build the bounding box from the stored lat/lon extremes.

        :param scale_factor: scale to apply when the data is not CF compliant
        :param offset: offset to apply when the data is not CF compliant
        :return: [west, north, east, south], longitudes normalized to [-180, 180]
        """
        scaled = [round(v * scale_factor + offset, 3)
                  for v in (self.NLat, self.SLat, self.ELon, self.WLon)]
        north, south, east, west = scaled
        return [self.convert_360_to_180(west), north,
                self.convert_360_to_180(east), south]

    def get_temporal(self, time_variable_key='time', units_variable='units', scale_factor=1.0,
                     offset=0,
                     date_format='%Y-%m-%dT%H:%M:%SZ'):
        """
        Format the temporal range captured at construction time.

        The time/units/scale/offset parameters are accepted for interface
        compatibility with the base extractor but are not used here.

        :param date_format: strftime format for the returned strings
        :return: (start_date, stop_date) as formatted strings
        """
        return self.minTime.strftime(date_format), self.maxTime.strftime(date_format)

    def get_metadata(self, ds_short_name, format='netCDF-4', version='1', **kwargs):
        """
        Assemble the granule-level metadata dictionary.

        :param ds_short_name: collection short name to record
        :param format: accepted for interface compatibility; self.fileformat is used
        :param version: collection VersionId to record
        :return: dict of CMR-style granule metadata fields
        """
        data = {}
        data['GranuleUR'] = os.path.basename(self.file_path)
        data['ShortName'] = ds_short_name
        data['BeginningDateTime'], data['EndingDateTime'] = self.get_temporal()

        west, north, east, south = self.get_wnes_geometry()
        data['WestBoundingCoordinate'] = str(west)
        data['NorthBoundingCoordinate'] = str(north)
        data['EastBoundingCoordinate'] = str(east)
        data['SouthBoundingCoordinate'] = str(south)
        data['checksum'] = self.get_checksum()
        data['SizeMBDataGranule'] = str(round(self.get_file_size_megabytes(), 2))
        data['DataFormat'] = self.fileformat
        data['VersionId'] = version
        return data


if __name__ == '__main__':
    # Smoke-test entry point for local runs against a test fixture.
    print('Extracting glmcierra Metadata')
    # NOTE(review): this points at a directory; point it at a concrete
    # .nc fixture file before running.
    path_to_file = "../../test/fixtures/"
    # Fixed: previously instantiated ExtractSbuceilimpactsMetadata, a
    # copy-paste from another extractor that is undefined in this module
    # and raised NameError.
    exnet = ExtractGlmcierraMetadata(path_to_file)
    metada = exnet.get_metadata("test")
    print(metada)
89 changes: 89 additions & 0 deletions mdx/granule_metadata_extractor/src/helpers/creators/glmcierra.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# create lookup zip for glmcierra
# for all future collections
from datetime import datetime, timedelta
from utils.mdx import MDX
import cProfile
import time
import math
import re

from netCDF4 import Dataset
import numpy as np

# Collection identifiers passed to MDX.process_collection().
short_name = "glmcierra"
provider_path = "glmcierra/"
file_type = "netCDF-4"

# These five granules carry incorrect lat/lon values; their bounding box is
# replaced with the collection summary box during extraction (see
# get_nc_metadata below).
file_excluded = ['OR_GLM-L2-CIERRA-DB_GOES-EAST_s20192931845000.nc',
                 'OR_GLM-L2-CIERRA-DB_GOES-EAST_s20193132345000.nc',
                 'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20203590215000.nc',
                 'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20203591600000.nc',
                 'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20210122000000.nc']

class MDXProcessing(MDX):
    """Lookup-zip creator for the glmcierra collection."""

    def __init__(self):
        super().__init__()

    def process(self, filename, file_obj_stream) -> dict:
        """
        Individual collection processing logic for spatial and temporal
        metadata extraction.

        :param filename: name of file to process
        :type filename: str
        :param file_obj_stream: file object stream to be processed
        :type file_obj_stream: botocore.response.StreamingBody
        """
        return self.get_nc_metadata(filename, file_obj_stream)

    def get_nc_metadata(self, filename, file_obj_stream):
        """
        Extract temporal and spatial metadata from one netCDF-4 granule
        read entirely in memory from the object stream.
        """
        print(filename)
        dataset = Dataset("in-mem-file", mode='r', memory=file_obj_stream.read())
        if dataset.file_format.startswith('NETCDF3'):
            granule_format = "netCDF-3"
        else:
            granule_format = "netCDF-4"

        lat_values = np.array(dataset['FLASH_LAT'][:])
        lon_values = np.array(dataset['FLASH_LON'][:])

        north = np.nanmax(lat_values)
        south = np.nanmin(lat_values)
        east = np.nanmax(lon_values)
        west = np.nanmin(lon_values)

        if filename.split('/')[-1] in file_excluded:
            # Known-bad lat/lon granule: substitute the summary box
            # "north": 57.267, "south": -57.312, "east": 180.0, "west": -180.0
            north, south, east, west = 57.267, -57.312, 180.0, -180.0

        start_time = datetime.strptime(dataset.TIME_COVERAGE_START, '%Y-%m-%d %H:%M:%SZ')
        end_time = datetime.strptime(dataset.TIME_COVERAGE_END, '%Y-%m-%d %H:%M:%SZ')
        dataset.close()
        return {
            "start": start_time,
            "end": end_time,
            "north": north,
            "south": south,
            "east": east,
            "west": west,
            "format": granule_format
        }

    def main(self):
        """Process the whole collection, then stop the worker instance."""
        self.process_collection(short_name, provider_path)
        self.shutdown_ec2()


if __name__ == '__main__':
    # Entry point: build the lookup zip for the glmcierra collection.
    MDXProcessing().main()
    # The below can be used to run a profiler and see which functions are
    # taking the most time to process
    # cProfile.run('MDXProcessing().main()', sort='tottime')
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from zipfile import ZipFile
import json
import os
import pathlib

# Collection summary bounding box applied to the bad granules:
# "north": 57.267, "south": -57.312, "east": 180.0, "west": -180.0

# Granules with bad lat/lon whose lookup entries must be overwritten
# with the collection summary box.
file_excluded = ['OR_GLM-L2-CIERRA-DB_GOES-EAST_s20192931845000.nc',
                 'OR_GLM-L2-CIERRA-DB_GOES-EAST_s20193132345000.nc',
                 'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20203590215000.nc',
                 'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20203591600000.nc',
                 'OR_GLM-L2-CIERRA-DB_GOES-WEST_s20210122000000.nc']


# Move the existing lookup zip aside so a corrected one can be written.
script_dir = pathlib.Path(__file__).parent.absolute()
lookup_zip_path = os.path.join(script_dir, "../glmcierra.zip")
lookup_zip_path_orig = os.path.join(script_dir, "../glmcierra_orig.zip")
os.rename(lookup_zip_path, lookup_zip_path_orig)

# Load both JSON members from the original zip.
with ZipFile(lookup_zip_path_orig) as lookup_zip_orig:
    with lookup_zip_orig.open("lookup.json") as collection_lookup_orig:
        metadata = json.load(collection_lookup_orig)
    with lookup_zip_orig.open("summary.json") as summary_meta_orig:
        summary_meta = json.load(summary_meta_orig)

# Example lookup entry:
# {"OR_GLM-L2-CIERRA-DB_GOES-EAST_s20170122300000.nc": {"start": "2017-01-12T23:00:00Z", "end": "2017-01-12T23:14:59Z", "north": "56.429", "south": "-56.141", "east": "-33.987", "west": "-148.712", "format": "netCDF-4", "sizeMB": 0.92}

# Overwrite each excluded granule's box with the summary values.
for granule_name in file_excluded:
    entry = metadata[granule_name]
    entry["north"] = "57.267"
    entry["south"] = "-57.312"
    entry["east"] = "180.0"
    entry["west"] = "-180.0"

with open('./lookup.json', 'w') as fp:
    json.dump(metadata, fp)
with open('./summary.json', 'w') as fp:
    json.dump(summary_meta, fp)

# Equivalent to the command line: zip glmcierra.zip lookup.json summary.json
with ZipFile('../glmcierra.zip', 'w') as myzip:
    myzip.write('lookup.json')
    myzip.write('summary.json')
Binary file not shown.
Binary file not shown.
3 changes: 2 additions & 1 deletion mdx/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ def extract_netcdf_metadata(self, ds_short_name, version, access_url, netcdf_fil
"sondecpexcv": mdx.ExtractSondecpexcvMetadata,
"hiwat": mdx.ExtractHiwatMetadata,
"sbuairmarimpacts": mdx.ExtractSbuairmarimpactsMetadata,
"sbumwrimpacts": mdx.ExtractSbumwrimpactsMetadata
"sbumwrimpacts": mdx.ExtractSbumwrimpactsMetadata,
"glmcierra": mdx.ExtractGlmcierraMetadata
}

time_variable_key = netcdf_vars.get('time_var_key')
Expand Down
Binary file not shown.
Loading

0 comments on commit 2a05ff7

Please sign in to comment.