Merge branch 'master' into update-pyhdf-version
DavisGauntUAH committed Jun 28, 2024
2 parents ed1e8d1 + b91b900 commit 039c73f
Showing 12 changed files with 990 additions and 1 deletion.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -4,7 +4,7 @@ ARG stage

COPY requirements*.txt /tmp/

-RUN pip install -r /tmp/requirements.txt --target "${LAMBDA_TASK_ROOT}"
+RUN pip install --upgrade --force-reinstall -r /tmp/requirements.txt --target "${LAMBDA_TASK_ROOT}"

# Only if stage is other than dev
ADD mdx ${LAMBDA_TASK_ROOT}
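
Presumably, the new flags address pip's behavior with --target: pip refuses to replace a package directory that already exists under the target path unless --upgrade is given, and --force-reinstall makes pip reinstall pinned requirements even when the same distribution is already present — which matters when bumping a pinned dependency such as pyhdf.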
Binary file not shown.
89 changes: 89 additions & 0 deletions mdx/granule_metadata_extractor/src/helpers/creators/cossirimpacts.py
@@ -0,0 +1,89 @@
# Create lookup zip for cossirimpacts
# for all future collections
from datetime import datetime, timedelta
from utils.mdx import MDX
import cProfile  # only used by the commented-out profiling hook at the bottom
import time      # only used by the commented-out timing code in main()

from netCDF4 import Dataset
import numpy as np

short_name = "cossirimpacts"
provider_path = "cossirimpacts/"
file_type = "netCDF-4"


class MDXProcessing(MDX):

    def __init__(self):
        super().__init__()

    def process(self, filename, file_obj_stream) -> dict:
        """
        Individual collection processing logic for spatial and temporal
        metadata extraction
        :param filename: name of file to process
        :type filename: str
        :param file_obj_stream: file object stream to be processed
        :type file_obj_stream: botocore.response.StreamingBody
        """
        return self.get_nc_metadata(filename, file_obj_stream)

    def get_nc_metadata(self, filename, file_obj_stream):
        """
        Extract temporal and spatial metadata from netCDF-4 files
        """
        nc = Dataset("in-mem-file", mode='r', memory=file_obj_stream.read())
        lat = np.array(nc['Latitude'])
        lon = np.array(nc['Longitude'])
        year = np.array(nc['Year'])
        mon = np.array(nc['Month'])
        day = np.array(nc['DayOfMonth'])
        hr = np.array(nc['Hour'])
        mn = np.array(nc['Minute'])
        sec = np.array(nc['Second'])

        # Set missing (-999.0 fill) values in lat and lon to np.nan
        lat[lat == -999.0] = np.nan
        lon[lon == -999.0] = np.nan
        north, south, east, west = [np.nanmax(lat), np.nanmin(lat),
                                    np.nanmax(lon), np.nanmin(lon)]

        start_time, end_time = [datetime(2100, 1, 1), datetime(1900, 1, 1)]
        for i in range(sec.shape[0]):
            for j in range(sec.shape[1]):
                if int(year[i, j]) != -999:
                    total_sec = int(hr[i, j]*3600 + mn[i, j]*60 + sec[i, j])
                    dt = datetime(int(year[i, j]), int(mon[i, j]),
                                  int(day[i, j])) + \
                        timedelta(seconds=total_sec)
                    start_time = min(dt, start_time)
                    end_time = max(dt, end_time)

        nc.close()
        return {
            "start": start_time,
            "end": end_time,
            "north": north,
            "south": south,
            "east": east,
            "west": west,
            "format": file_type
        }

    def main(self):
        # start_time = time.time()
        self.process_collection(short_name, provider_path)
        # elapsed_time = time.time() - start_time
        # print(f"Elapsed time in seconds: {elapsed_time}")
        self.shutdown_ec2()


if __name__ == '__main__':
    MDXProcessing().main()
    # The below can be used to run a profiler and see which functions are
    # taking the most time to process
    # cProfile.run('MDXProcessing().main()', sort='tottime')
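
For local testing, a sketch along the following lines can exercise get_nc_metadata without S3. It is not part of this commit: the synthetic swath values and the io.BytesIO stand-in for botocore's StreamingBody are illustrative assumptions, and MDXProcessing and file_type refer to the module above.

# Hypothetical local harness for get_nc_metadata -- not part of this commit.
# It writes a tiny netCDF-4 swath with one -999.0 fill cell and feeds the
# raw bytes through io.BytesIO, which exposes the same .read() interface
# as botocore's StreamingBody.
import io
import os
import tempfile

from netCDF4 import Dataset


def make_test_stream():
    tmp = tempfile.NamedTemporaryFile(suffix=".nc", delete=False)
    tmp.close()
    nc = Dataset(tmp.name, mode="w", format="NETCDF4")
    nc.createDimension("along", 2)
    nc.createDimension("across", 2)
    for name in ("Latitude", "Longitude", "Year", "Month",
                 "DayOfMonth", "Hour", "Minute", "Second"):
        nc.createVariable(name, "f8", ("along", "across"))
    nc["Latitude"][:] = [[38.0, 38.1], [-999.0, 38.2]]
    nc["Longitude"][:] = [[-75.0, -74.9], [-999.0, -74.8]]
    nc["Year"][:] = [[2023.0, 2023.0], [-999.0, 2023.0]]
    nc["Month"][:] = 1.0
    nc["DayOfMonth"][:] = 19.0
    nc["Hour"][:] = [[5.0, 5.0], [0.0, 6.0]]
    nc["Minute"][:] = 40.0
    nc["Second"][:] = [[53.0, 54.0], [0.0, 10.0]]
    nc.close()
    with open(tmp.name, "rb") as f:
        data = f.read()
    os.remove(tmp.name)
    return io.BytesIO(data)


# get_nc_metadata never touches self, so it can be called unbound here to
# sidestep whatever AWS setup MDX.__init__ performs.
meta = MDXProcessing.get_nc_metadata(None, "synthetic.nc", make_test_stream())
print(meta)  # expects start 05:40:53, end 06:40:10, bbox over non-fill cells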
81 changes: 81 additions & 0 deletions mdx/granule_metadata_extractor/src/helpers/creators/er2navimpacts.py
@@ -0,0 +1,81 @@
# Create lookup zip for er2navimpacts
# for all future collections
from datetime import datetime, timedelta
from utils.mdx import MDX
import cProfile  # only used by the commented-out profiling hook at the bottom
import time      # only used by the commented-out timing code in main()

short_name = "er2navimpacts"
provider_path = "er2navimpacts/fieldCampaigns/impacts/ER2Nav/data/"
file_type = "ASCII"


class MDXProcessing(MDX):

    def __init__(self):
        super().__init__()

    def process(self, filename, file_obj_stream) -> dict:
        """
        Individual collection processing logic for spatial and temporal
        metadata extraction
        :param filename: name of file to process
        :type filename: str
        :param file_obj_stream: file object stream to be processed
        :type file_obj_stream: botocore.response.StreamingBody
        """
        return self.read_metadata_ascii(filename, file_obj_stream)

    def read_metadata_ascii(self, filename, file_obj_stream):
        """
        Extract temporal and spatial metadata from ASCII files
        """
        print(filename)
        file_lines = []
        for encoded_line in file_obj_stream.iter_lines():
            file_lines.append(encoded_line.decode("utf-8"))

        num_header_lines = int(file_lines[0].split(',')[0])
        databuf = file_lines[num_header_lines:]
        minTime, maxTime, minlat, maxlat, minlon, maxlon = [
            datetime(2100, 1, 1), datetime(1900, 1, 1),
            90.0, -90.0, 180.0, -180.0]
        year = int(filename.split('/')[-1].split('_')[3][0:4])
        for i in range(len(databuf)):
            tkn = databuf[i].split(',')
            sec, doy, lat, lon = [int(tkn[0]), int(tkn[1]),
                                  float(tkn[2]), float(tkn[3])]
            dt = datetime(year, 1, 1) + timedelta(seconds=sec) + \
                timedelta(days=doy - 1)
            minTime = min(minTime, dt)
            maxTime = max(maxTime, dt)
            if lat != -9999.0 and lon != -9999.0:
                maxlat = max(maxlat, lat)
                minlat = min(minlat, lat)
                maxlon = max(maxlon, lon)
                minlon = min(minlon, lon)

        return {
            "start": minTime,
            "end": maxTime,
            "north": maxlat,
            "south": minlat,
            "east": maxlon,
            "west": minlon,
            "format": file_type
        }

    def main(self):
        # start_time = time.time()
        self.process_collection(short_name, provider_path)
        # elapsed_time = time.time() - start_time
        # print(f"Elapsed time in seconds: {elapsed_time}")
        self.shutdown_ec2()


if __name__ == '__main__':
    MDXProcessing().main()
    # The below can be used to run a profiler and see which functions are
    # taking the most time to process
    # cProfile.run('MDXProcessing().main()', sort='tottime')
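
As a sanity check on the timestamp arithmetic above (seconds-of-day plus day-of-year minus one, added onto January 1), here is a small worked example with made-up values rather than real ER-2 nav data:

# Illustrative values: seconds-of-day 200 on day-of-year 19 of 2023
# should land on 2023-01-19 00:03:20.
from datetime import datetime, timedelta

year, sec, doy = 2023, 200, 19
dt = datetime(year, 1, 1) + timedelta(seconds=sec) + timedelta(days=doy - 1)
assert dt == datetime(2023, 1, 19, 0, 3, 20)
print(dt)  # 2023-01-19 00:03:20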
107 changes: 107 additions & 0 deletions mdx/granule_metadata_extractor/src/helpers/creators/ualbsndimpacts.py
@@ -0,0 +1,107 @@
# Create lookup zip for ualbsndimpacts
# for all future collections
from datetime import datetime, timedelta
from utils.mdx import MDX
import cProfile  # only used by the commented-out profiling hook at the bottom
import time      # only used by the commented-out timing code in main()

short_name = "ualbsndimpacts"
provider_path = "ualbsndimpacts/"
file_type = "ASCII"


class MDXProcessing(MDX):

    def __init__(self):
        super().__init__()

    def process(self, filename, file_obj_stream) -> dict:
        """
        Individual collection processing logic for spatial and temporal
        metadata extraction
        :param filename: name of file to process
        :type filename: str
        :param file_obj_stream: file object stream to be processed
        :type file_obj_stream: botocore.response.StreamingBody
        """
        return self.read_metadata_ascii(filename, file_obj_stream)

    def read_metadata_ascii(self, filename, file_obj_stream):
        """
        Extract temporal and spatial metadata from ASCII files
        """
        print(filename)
        file_lines = []
        for encoded_line in file_obj_stream.iter_lines():
            file_lines.append(encoded_line.decode("ISO-8859-1"))

        num_header_lines = 3
        databuf = file_lines[num_header_lines:]
        minTime, maxTime, minlat, maxlat, minlon, maxlon = [
            datetime(2100, 1, 1), datetime(1900, 1, 1),
            90.0, -90.0, 180.0, -180.0]

        for i in range(len(databuf)):
            tkn = databuf[i].split()
            # tkn[0]: 1/19/2023
            # tkn[1]: 5:40:53
            # tkn[2]: PM
            # tkn[13]: 073°47'57.4"W
            # tkn[14]: 42°41'54.1"N

            dt = tkn[0].split('/')  # i.e., 1, 19, 2023
            tt = tkn[1].split(':')  # i.e., 5, 40, 53
            utc_time_str = ''.join([dt[2], dt[0].zfill(2), dt[1].zfill(2),
                                    'T', tt[0].zfill(2), tt[1].zfill(2),
                                    tt[2].zfill(2)])
            utc_time = datetime.strptime(utc_time_str, '%Y%m%dT%H%M%S')
            # Convert the 12-hour clock to 24-hour time: 12:xx:xx PM stays
            # at hour 12, and 12:xx:xx AM maps to hour 0
            if tkn[2] == 'PM' and utc_time.hour != 12:
                utc_time = utc_time + timedelta(hours=12)
            elif tkn[2] == 'AM' and utc_time.hour == 12:
                utc_time = utc_time - timedelta(hours=12)
            minTime = min(minTime, utc_time)
            maxTime = max(maxTime, utc_time)

            lon_str = tkn[13]  # i.e., '072°47\'35.1"W'
            deg = float(lon_str.split('°')[0])  # i.e., 72
            minu = float(lon_str.split('°')[1].split('\'')[0])  # i.e., 47
            sec = float(lon_str.split('°')[1].split('\'')[1].split('"')[0])  # i.e., 35.1
            lon = (sec/60. + minu)/60. + deg
            if lon_str.endswith('W'):
                lon = -1.*lon

            lat_str = tkn[14]  # i.e., '43°07\'30.8"N'
            deg = float(lat_str.split('°')[0])  # i.e., 43
            minu = float(lat_str.split('°')[1].split('\'')[0])  # i.e., 7
            sec = float(lat_str.split('°')[1].split('\'')[1].split('"')[0])  # i.e., 30.8
            lat = (sec/60. + minu)/60. + deg
            if lat_str.endswith('S'):
                lat = -1.*lat

            maxlat = max(maxlat, lat)
            minlat = min(minlat, lat)
            maxlon = max(maxlon, lon)
            minlon = min(minlon, lon)

        return {
            "start": minTime,
            "end": maxTime,
            "north": maxlat,
            "south": minlat,
            "east": maxlon,
            "west": minlon,
            "format": file_type
        }

    def main(self):
        # start_time = time.time()
        self.process_collection(short_name, provider_path)
        # elapsed_time = time.time() - start_time
        # print(f"Elapsed time in seconds: {elapsed_time}")
        self.shutdown_ec2()


if __name__ == '__main__':
    MDXProcessing().main()
    # The below can be used to run a profiler and see which functions are
    # taking the most time to process
    # cProfile.run('MDXProcessing().main()', sort='tottime')
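
The split chains above implement a standard degrees-minutes-seconds to decimal-degrees conversion. A standalone sketch of the same arithmetic with a worked value follows; parse_dms is an illustrative helper, not part of the module:

def parse_dms(dms: str) -> float:
    """Convert a token like 072°47'35.1"W to decimal degrees,
    negating western longitudes and southern latitudes."""
    deg, rest = dms.split('°')
    minu, rest = rest.split("'")
    sec = rest.split('"')[0]
    value = float(deg) + float(minu) / 60.0 + float(sec) / 3600.0
    return -value if dms.endswith(('W', 'S')) else value


# 72 + 47/60 + 35.1/3600 = 72.793083...; the trailing 'W' makes it negative.
assert abs(parse_dms("072°47'35.1\"W") - (-72.7930833)) < 1e-6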
Binary file not shown.
Binary file not shown.