From 0a5c8a9710776f4431c73fa74aecbf3872691ef1 Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Tue, 8 Oct 2024 15:22:01 -0500 Subject: [PATCH 1/3] Added lookup zip creator code musondeimpacts.py --- .../src/helpers/creators/musondeimpacts.py | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 mdx/granule_metadata_extractor/src/helpers/creators/musondeimpacts.py diff --git a/mdx/granule_metadata_extractor/src/helpers/creators/musondeimpacts.py b/mdx/granule_metadata_extractor/src/helpers/creators/musondeimpacts.py new file mode 100644 index 0000000..56203af --- /dev/null +++ b/mdx/granule_metadata_extractor/src/helpers/creators/musondeimpacts.py @@ -0,0 +1,164 @@ +# create lookup zip for musondeimpacts +# for all future collections +from datetime import datetime, timedelta +from utils.mdx import MDX +import cProfile +import time +import math +import re + +from netCDF4 import Dataset +import numpy as np + +short_name = "musondeimpacts" +provider_path = "musondeimpacts/fieldCampaigns/impacts/MU_sondes/data/" +file_type = "HDF-5" + + +class MDXProcessing(MDX): + + def __init__(self): + super().__init__() + self.fileformat = 'ASCII' + + self.utf8_list = ['IMPACTS_upperair_UMILL_radiosonde_202201291800_QCMiller.txt', + 'IMPACTS_upperair_UMILL_radiosonde_202201292000_QCMiller.txt', + 'IMPACTS_upperair_UMILL_radiosonde_202201292200_QCMiller.txt', + 'IMPACTS_upperair_UMILL_radiosonde_202202191800_QC.txt', + 'IMPACTS_upperair_UMILL_radiosonde_202202191500_QC.txt'] + + + def process(self, filename, file_obj_stream) -> dict: + """ + Individual collection processing logic for spatial and temporal + metadata extraction + :param filename: name of file to process + :type filename: str + :param file_obj_stream: file object stream to be processed + :type file_obj_stream: botocore.response.StreamingBody + """ + if filename.endswith('.cdf'): #HDF-5 + file_type = 'HDF-5' + return self.get_hdf_metadata(filename, file_obj_stream) + else: #ASCII + file_type = 'ASCII' + return self.get_ascii_metadata(filename, file_obj_stream) + + + def get_hdf_metadata(self, filename, file_obj_stream): + """ + Extract temporal and spatial metadata from HDF-5 files + """ + datafile = Dataset("in-mem-file", mode='r', memory=file_obj_stream.read()) + lats = datafile['lat'][:].flatten() + lons = datafile['lon'][:].flatten() + sec = datafile['time'][:].flatten() + ref_time_str = datafile['time'].units #'seconds since 2023-01-25 20:16:35' + ref_time = datetime.strptime(ref_time_str, 'seconds since %Y-%m-%d %H:%M:%S') + + start_time, end_time = [ref_time+timedelta(seconds=min(sec)), + ref_time+timedelta(seconds=max(sec))] + + north, south, east, west = [lats.max(), + lats.min(), + lons.max(), + lons.min()] + datafile.close() + return { + "start": start_time, + "end": end_time, + "north": north, + "south": south, + "east": east, + "west": west, + "format": file_type + } + + + def get_ascii_metadata(self, filename, file_obj_stream): + """ + Extract temporal and spatial metadata from ascii files + """ +######## + file_lines = [] + fn = filename.split('/')[-1] + if '_windsonde1_' in fn: #wind sonde file + #sample file: + #IMPACTS_upperair_UMILL_windsonde1_202201162100_QCTeare.txt + for encoded_line in file_obj_stream.iter_lines(): + file_lines.append(encoded_line.decode("utf-8")) + + for line in file_lines: + line = line.strip() # remove all the leading and trailing spaces from a string + if line.startswith('XXX '): + start_time_str = '20'+line.split()[-1] #i.e., 220116/1958 + minTime = datetime.strptime(start_time_str,'%Y%m%d/%H%M') + elif line.startswith('Site'): + tkn = line.split() + lat0 = float(tkn[1].split(',')[0].split('=')[-1]) + lon0 = float(tkn[2].split('=')[-1]) + maxlat, minlat, maxlon, minlon = [lat0+0.01, + lat0-0.01, + lon0+0.01, + lon0-0.01] + elif line.startswith('Saved by user: '): + maxTime = datetime.strptime(line,'Saved by user: User on %Y%m%d/%H%M UTC') + break + else: #radio sonde file, either utf-8 or utf-16-be (big endian) + if fn in self.utf8_list: + endian_type = 'utf_8' + else: #utf_16-be + endian_type = 'utf_16-be' + + count = 0 #account number of header lines for later use + for encoded_line in file_obj_stream.iter_lines(): + count = count + 1 + line = encoded_line.decode(endian_type,errors='ignore').strip() + file_lines.append(line) + + if line.startswith('Balloon release date and time'): + minTime = datetime.strptime(line.split()[-1].strip(),'%Y-%m-%dT%H:%M:%S') #i.e.,2022-01-29T13:07:23 + elif 'n Elapsed time TimeUTC' in line: + num_header_lines = count + 1 + break + + elap_sec = [] + lat = [] + lon = [] + for line in file_lines[num_header_lines:]: + if len(line) < 20 or 'row' in line: + continue + tkn = line.split() + elap_sec.append(float(tkn[1])) + lat.append(float(tkn[-2])) + lon.append(float(tkn[-1])) + maxTime = minTime + timedelta(seconds = max(elap_sec)) + maxlat, minlat, maxlon, minlon = [max(lat),min(lat),max(lon),min(lon)] + +######## + + + return { + "start": minTime, + "end": maxTime, + "north": maxlat, + "south": minlat, + "east": maxlon, + "west": minlon, + "format": file_type + } + + + def main(self): + # start_time = time.time() + self.process_collection(short_name, provider_path) + # elapsed_time = time.time() - start_time + # print(f"Elapsed time in seconds: {elapsed_time}") + self.shutdown_ec2() + + +if __name__ == '__main__': + MDXProcessing().main() + # The below can be use to run a profiler and see which functions are + # taking the most time to process + # cProfile.run('MDXProcessing().main()', sort='tottime') From 86318f6b05afdbe364c3f507a3abc3aa29329ecd Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Wed, 9 Oct 2024 17:13:51 +0000 Subject: [PATCH 2/3] Updated musondeimpacts.py and created musondeimpacts.zip --- .../src/helpers/creators/musondeimpacts.py | 63 ++++++++++-------- .../src/helpers/musondeimpacts.zip | Bin 0 -> 11407 bytes 2 files changed, 34 insertions(+), 29 deletions(-) create mode 100644 mdx/granule_metadata_extractor/src/helpers/musondeimpacts.zip diff --git a/mdx/granule_metadata_extractor/src/helpers/creators/musondeimpacts.py b/mdx/granule_metadata_extractor/src/helpers/creators/musondeimpacts.py index 56203af..0e862af 100644 --- a/mdx/granule_metadata_extractor/src/helpers/creators/musondeimpacts.py +++ b/mdx/granule_metadata_extractor/src/helpers/creators/musondeimpacts.py @@ -24,9 +24,8 @@ def __init__(self): self.utf8_list = ['IMPACTS_upperair_UMILL_radiosonde_202201291800_QCMiller.txt', 'IMPACTS_upperair_UMILL_radiosonde_202201292000_QCMiller.txt', 'IMPACTS_upperair_UMILL_radiosonde_202201292200_QCMiller.txt', - 'IMPACTS_upperair_UMILL_radiosonde_202202191800_QC.txt', - 'IMPACTS_upperair_UMILL_radiosonde_202202191500_QC.txt'] - + 'IMPACTS_upperair_UMILL_radiosonde_202202191500_QC.txt', + 'IMPACTS_upperair_UMILL_radiosonde_202202191800_QC.txt'] def process(self, filename, file_obj_stream) -> dict: """ @@ -50,19 +49,27 @@ def get_hdf_metadata(self, filename, file_obj_stream): Extract temporal and spatial metadata from HDF-5 files """ datafile = Dataset("in-mem-file", mode='r', memory=file_obj_stream.read()) - lats = datafile['lat'][:].flatten() - lons = datafile['lon'][:].flatten() - sec = datafile['time'][:].flatten() - ref_time_str = datafile['time'].units #'seconds since 2023-01-25 20:16:35' - ref_time = datetime.strptime(ref_time_str, 'seconds since %Y-%m-%d %H:%M:%S') - - start_time, end_time = [ref_time+timedelta(seconds=min(sec)), - ref_time+timedelta(seconds=max(sec))] - - north, south, east, west = [lats.max(), - lats.min(), - lons.max(), - lons.min()] + if '_windsonde_' in filename: + lat = float(datafile.latitude) + lon = float(datafile.longitude) + north, south, east, west = [lat+0.01, lat-0.01, lon+0.01, lon-0.01] + tkn = filename.split('.cdf')[0].split('_') + start_time = datetime.strptime(tkn[-2]+tkn[-1],'%Y%m%d%H%M%S') + end_time = start_time + else: + lats = datafile['lat'][:].flatten() + lons = datafile['lon'][:].flatten() + sec = datafile['time'][:].flatten() + ref_time_str = datafile['time'].units #'seconds since 2023-01-25 20:16:35' + ref_time = datetime.strptime(ref_time_str, 'seconds since %Y-%m-%d %H:%M:%S') + + start_time, end_time = [ref_time+timedelta(seconds=sec.min().item()), + ref_time+timedelta(seconds=sec.max().item())] + + north, south, east, west = [lats.max(), + lats.min(), + lons.max(), + lons.min()] datafile.close() return { "start": start_time, @@ -79,7 +86,6 @@ def get_ascii_metadata(self, filename, file_obj_stream): """ Extract temporal and spatial metadata from ascii files """ -######## file_lines = [] fn = filename.split('/')[-1] if '_windsonde1_' in fn: #wind sonde file @@ -105,21 +111,23 @@ def get_ascii_metadata(self, filename, file_obj_stream): maxTime = datetime.strptime(line,'Saved by user: User on %Y%m%d/%H%M UTC') break else: #radio sonde file, either utf-8 or utf-16-be (big endian) + print('fn=',fn) + endian_type = 'utf-16-be' if fn in self.utf8_list: - endian_type = 'utf_8' - else: #utf_16-be - endian_type = 'utf_16-be' + endian_type = 'utf-8' - count = 0 #account number of header lines for later use + #read lines and save into 'file_lines' list for encoded_line in file_obj_stream.iter_lines(): - count = count + 1 line = encoded_line.decode(endian_type,errors='ignore').strip() file_lines.append(line) - + + count = 0 #account number of header lines for later use + for line in file_lines: + count = count + 1 if line.startswith('Balloon release date and time'): - minTime = datetime.strptime(line.split()[-1].strip(),'%Y-%m-%dT%H:%M:%S') #i.e.,2022-01-29T13:07:23 - elif 'n Elapsed time TimeUTC' in line: - num_header_lines = count + 1 + minTime = datetime.strptime(line.split()[-1],'%Y-%m-%dT%H:%M:%S') #i.e.,2022-01-29T13:07:23 + elif line.startswith('s hh:mm:ss'): + num_header_lines = count break elap_sec = [] @@ -135,9 +143,6 @@ def get_ascii_metadata(self, filename, file_obj_stream): maxTime = minTime + timedelta(seconds = max(elap_sec)) maxlat, minlat, maxlon, minlon = [max(lat),min(lat),max(lon),min(lon)] -######## - - return { "start": minTime, "end": maxTime, diff --git a/mdx/granule_metadata_extractor/src/helpers/musondeimpacts.zip b/mdx/granule_metadata_extractor/src/helpers/musondeimpacts.zip new file mode 100644 index 0000000000000000000000000000000000000000..b159a3e4297dbb5b55a83dad6034923cd460a34f GIT binary patch literal 11407 zcmd5?O>bL86ir)T1IsR1R9Ucq`e{Dj*Nz4%sy2|Sju31ti#WoN#Fd?>(5h0G{38AV z3pPmbe~>tL#?Rw7Z{B!brKF^a+RnAp9-njXJ@?K#dVJ@-yL5~4mPp=v=KH~Y9SG6u{#=hUbe{X+wapHvq zt2JxF7mMZPkMsh?GREl*tHqT)kyTbz=0{9d=ARf9c_tN|yqImBDgc zP5Vd%uWHPB%~b=Xh17;cE%GMpkn*hLg(0J1Rq?E-%+YK^tIW9N-tmefVdSubSTO)D z=LN%8l7@w>`9`EIxR5xAWi^*wxYw%T7}VwTWWF%@x=vS7@f=W(#|ICW^VzD8T0Roh znCqJ5wai2LN@4H{tfdJ%YFQ@oZo=C9bkr*3UfE7rmC4d1Xq?(oMW_woEe*AY^Ru(r zGLE+(yt?SaTOd|*-oeY2DK)JEr3?~>S7wzo>UL6AncxXEN){2kemSPhRjHY)0H)Du zOUgEL6{v5*j^1m}+*nBt(2E{q3e|3U1;BzotJfA@qjrR)uBBz);YI4|F1?mZM=#`D z3SZ7VH%%!nXTc-%hVa@-l_F5F#DRIOB3&R%7vek)9_I=nI~3a{g;ED5iQ z@f(tJkt%0!8?MY&&PA=zl&$%2^#0%E%s5kN)tN!gXuN&&!hxWEYacJma%+M%eV1HN zhh3(mujbO6#Kl|=Bp3cM&1FEWF>(=>8}4tA+@Y;kCAVHQ)}j`&uN9$mi&~0K?n>*7 zTdhcRTvb(dYL!yvC{Ayc52#gwS6rLg+}Cu)h{Hty3aAS-T^rV9F9S$*+hfp}K#FxG z3hh;xF~N0Qj0yH_ahR$c1XFS;NyU$s(~HZO5wLO$(o`>4gk>#Jl5I*m86#&alc5`0 z39Tu=4z9>@)219+6yzeoDwjh1utYA!%G(G9q>D>MpfL-eEm5~jmP@M-o}GzFF4&wVd{>{*+AzhE6*HL z=b>s9FHJHgV}BN)Qfy(Slw#y8Xw_XmVM{TJN^4SBoXQaJ_WL)S8FKE-4{cv)M-CcXbC~3n;*$U4TsKddqt4=&)wwrFltV*xJ z>P%P;9ZVeQ1X`IM=qGHprqBw~kdA>Fz|x8t08?#%R@LL!m<*cDIVFaI*HW zDkRXxS;fwchE;!W+_2gTw^+4HC&wy?<{ehoCTW{hIM>}Ud%9~MOh$XRsCa3{*(_XK ztJ%QZ4!?7lhsVz*?TWNcQl$#1qBzk~RRN>x)uIL`TB_o;s5-MvZqsi=XJ{{>Gp;+O zSEVGW4A-JQoS0nk|B2$Xd`iFOHaHOm&UNlOK3QgWOBJnnhQHQ`i0m$!a#Iiz*DQ z%7(XLt}o#>bapdSWx2hlX^S?YWy&fUC5e`tpD6lJ8n#eGsr~yV9w==X1r|%tE=pky z`Ua&T(=$Lis|w@p+BglrJ4BmaflP815|`*voG`Yyj1hq{YeieuLWg>$w5_j=(-W=6 zddXWIAvsh0d$?sDeRJHg@4SM2$M!Sgfb2*{>#?BG^TSTFJj-px+@Qo=a&C)@D?u_V zOAjUPT*mN$?kP0>Tb9V)4OOv z-^@GL50!l$z_}oLmc@989VKW$rG=$sDq%sQ;tLWNZo@QP{<&IjVJm+eL5@c(TKxMATw&rNF-74I&U{r_<3t6`-pr>B3*`l{J z543VDI8piCmq7I5>U^>|ot)0kW-BbtRRMgrZr{Dr{~Rztny)vv=(nw>g8eu8AB}z6 d-rk#ApM2K-ppG8D{{cmB Date: Thu, 17 Oct 2024 12:53:43 -0500 Subject: [PATCH 3/3] Added cdf file extension --- mdx/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mdx/main.py b/mdx/main.py index 442942d..adbdfb8 100644 --- a/mdx/main.py +++ b/mdx/main.py @@ -589,7 +589,7 @@ def input_keys(self): return { 'input_key': r'^(.*)\.?(nc|tsv|txt|gif|tar|zip|png|kml|dat|gz|pdf|docx|kmz|xlsx|eos|csv' r'|hdf5|hdf|nc4|ict|xls|.*rest|h5|xlsx|1Hz|impacts_archive|\d{5}|ar2v|mat' - r'|he5|raw|bz2|grb2f\d{4}|_\d{2}-00-00)$', + r'|he5|raw|bz2|grb2f\d{4}|_\d{2}-00-00)|cdf$', 'lookup_key': r'^(.*).*$' }