From 0a5c8a9710776f4431c73fa74aecbf3872691ef1 Mon Sep 17 00:00:00 2001
From: Lucy Wang <wangl1@uah.edu>
Date: Tue, 8 Oct 2024 15:22:01 -0500
Subject: [PATCH 1/3] Added lookup zip creator code musondeimpacts.py

---
 .../src/helpers/creators/musondeimpacts.py    | 164 ++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 mdx/granule_metadata_extractor/src/helpers/creators/musondeimpacts.py

diff --git a/mdx/granule_metadata_extractor/src/helpers/creators/musondeimpacts.py b/mdx/granule_metadata_extractor/src/helpers/creators/musondeimpacts.py
new file mode 100644
index 0000000..56203af
--- /dev/null
+++ b/mdx/granule_metadata_extractor/src/helpers/creators/musondeimpacts.py
@@ -0,0 +1,164 @@
+# create lookup zip for musondeimpacts
+# for all future collections
+from datetime import datetime, timedelta
+from utils.mdx import MDX
+import cProfile
+import time
+import math
+import re
+
+from netCDF4 import Dataset
+import numpy as np
+
+short_name = "musondeimpacts"
+provider_path = "musondeimpacts/fieldCampaigns/impacts/MU_sondes/data/"
+file_type = "HDF-5"
+
+
+class MDXProcessing(MDX):
+
+    def __init__(self):
+        super().__init__()
+        self.fileformat = 'ASCII'
+
+        self.utf8_list = ['IMPACTS_upperair_UMILL_radiosonde_202201291800_QCMiller.txt',
+                          'IMPACTS_upperair_UMILL_radiosonde_202201292000_QCMiller.txt',
+                          'IMPACTS_upperair_UMILL_radiosonde_202201292200_QCMiller.txt',
+                          'IMPACTS_upperair_UMILL_radiosonde_202202191800_QC.txt',
+                          'IMPACTS_upperair_UMILL_radiosonde_202202191500_QC.txt']
+
+
+    def process(self, filename, file_obj_stream) -> dict:
+        """
+        Individual collection processing logic for spatial and temporal 
+        metadata extraction
+        :param filename: name of file to process
+        :type filename: str
+        :param file_obj_stream: file object stream to be processed
+        :type file_obj_stream: botocore.response.StreamingBody
+        """
+        if filename.endswith('.cdf'): #HDF-5
+            file_type = 'HDF-5'
+            return self.get_hdf_metadata(filename, file_obj_stream)
+        else: #ASCII
+            file_type = 'ASCII'
+            return self.get_ascii_metadata(filename, file_obj_stream)
+
+
+    def get_hdf_metadata(self, filename, file_obj_stream):
+        """
+        Extract temporal and spatial metadata from HDF-5 files
+        """
+        datafile = Dataset("in-mem-file", mode='r', memory=file_obj_stream.read())
+        lats = datafile['lat'][:].flatten()
+        lons = datafile['lon'][:].flatten()
+        sec = datafile['time'][:].flatten()
+        ref_time_str = datafile['time'].units #'seconds since 2023-01-25 20:16:35'
+        ref_time = datetime.strptime(ref_time_str, 'seconds since %Y-%m-%d %H:%M:%S')
+
+        start_time, end_time = [ref_time+timedelta(seconds=min(sec)),
+                                ref_time+timedelta(seconds=max(sec))]
+
+        north, south, east, west = [lats.max(),
+                                    lats.min(),
+                                    lons.max(),
+                                    lons.min()]
+        datafile.close()
+        return {
+            "start": start_time,
+            "end": end_time,
+            "north": north,
+            "south": south,
+            "east": east,
+            "west": west,
+            "format": file_type
+        }
+
+
+    def get_ascii_metadata(self, filename, file_obj_stream):
+        """
+        Extract temporal and spatial metadata from ascii files
+        """
+########
+        file_lines = []
+        fn = filename.split('/')[-1]
+        if '_windsonde1_' in fn: #wind sonde file
+           #sample file:
+           #IMPACTS_upperair_UMILL_windsonde1_202201162100_QCTeare.txt
+           for encoded_line in file_obj_stream.iter_lines():
+               file_lines.append(encoded_line.decode("utf-8"))
+
+           for line in file_lines:
+               line = line.strip() # remove all the leading and trailing spaces from a string
+               if line.startswith('XXX '):
+                  start_time_str = '20'+line.split()[-1] #i.e., 220116/1958
+                  minTime = datetime.strptime(start_time_str,'%Y%m%d/%H%M')
+               elif line.startswith('Site'):
+                  tkn = line.split()
+                  lat0 = float(tkn[1].split(',')[0].split('=')[-1])
+                  lon0 = float(tkn[2].split('=')[-1])
+                  maxlat, minlat, maxlon, minlon = [lat0+0.01,
+                                                    lat0-0.01,
+                                                    lon0+0.01,
+                                                    lon0-0.01]
+               elif line.startswith('Saved by user: '):
+                  maxTime = datetime.strptime(line,'Saved by user: User on %Y%m%d/%H%M UTC')
+                  break
+        else: #radio sonde file, either utf-8 or utf-16-be (big endian)
+           if fn in self.utf8_list:
+              endian_type = 'utf_8'
+           else: #utf_16-be
+              endian_type = 'utf_16-be'
+
+           count = 0  #account number of header lines for later use
+           for encoded_line in file_obj_stream.iter_lines():
+               count = count + 1
+               line = encoded_line.decode(endian_type,errors='ignore').strip()
+               file_lines.append(line)
+
+               if line.startswith('Balloon release date and time'):
+                  minTime = datetime.strptime(line.split()[-1].strip(),'%Y-%m-%dT%H:%M:%S') #i.e.,2022-01-29T13:07:23
+               elif 'n Elapsed time  TimeUTC' in line:
+                  num_header_lines = count + 1
+                  break
+
+           elap_sec = []
+           lat = []
+           lon = []
+           for line in file_lines[num_header_lines:]:
+               if len(line) < 20 or 'row' in line:
+                  continue
+               tkn = line.split()
+               elap_sec.append(float(tkn[1]))
+               lat.append(float(tkn[-2]))
+               lon.append(float(tkn[-1]))
+           maxTime = minTime + timedelta(seconds = max(elap_sec))
+           maxlat, minlat, maxlon, minlon = [max(lat),min(lat),max(lon),min(lon)]
+
+########
+
+
+        return {
+            "start": minTime,
+            "end": maxTime,
+            "north": maxlat,
+            "south": minlat,
+            "east": maxlon,
+            "west": minlon,
+            "format": "ASCII"  # the module-level file_type is always "HDF-5"
+        }
+
+
+    def main(self):
+        # start_time = time.time()
+        self.process_collection(short_name, provider_path)
+        # elapsed_time = time.time() - start_time
+        # print(f"Elapsed time in seconds: {elapsed_time}")
+        self.shutdown_ec2()
+
+
+if __name__ == '__main__':
+    MDXProcessing().main()
+    # The below can be used to run a profiler and see which functions are
+    # taking the most time to process
+    # cProfile.run('MDXProcessing().main()', sort='tottime')

From 86318f6b05afdbe364c3f507a3abc3aa29329ecd Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-10-11-130-19.us-west-2.compute.internal>
Date: Wed, 9 Oct 2024 17:13:51 +0000
Subject: [PATCH 2/3] Updated musondeimpacts.py and created musondeimpacts.zip

---
 .../src/helpers/creators/musondeimpacts.py    |  63 ++++++++++--------
 .../src/helpers/musondeimpacts.zip            | Bin 0 -> 11407 bytes
 2 files changed, 34 insertions(+), 29 deletions(-)
 create mode 100644 mdx/granule_metadata_extractor/src/helpers/musondeimpacts.zip

diff --git a/mdx/granule_metadata_extractor/src/helpers/creators/musondeimpacts.py b/mdx/granule_metadata_extractor/src/helpers/creators/musondeimpacts.py
index 56203af..0e862af 100644
--- a/mdx/granule_metadata_extractor/src/helpers/creators/musondeimpacts.py
+++ b/mdx/granule_metadata_extractor/src/helpers/creators/musondeimpacts.py
@@ -24,9 +24,8 @@ def __init__(self):
         self.utf8_list = ['IMPACTS_upperair_UMILL_radiosonde_202201291800_QCMiller.txt',
                           'IMPACTS_upperair_UMILL_radiosonde_202201292000_QCMiller.txt',
                           'IMPACTS_upperair_UMILL_radiosonde_202201292200_QCMiller.txt',
-                          'IMPACTS_upperair_UMILL_radiosonde_202202191800_QC.txt',
-                          'IMPACTS_upperair_UMILL_radiosonde_202202191500_QC.txt']
-
+                          'IMPACTS_upperair_UMILL_radiosonde_202202191500_QC.txt',
+                          'IMPACTS_upperair_UMILL_radiosonde_202202191800_QC.txt']
 
     def process(self, filename, file_obj_stream) -> dict:
         """
@@ -50,19 +49,27 @@ def get_hdf_metadata(self, filename, file_obj_stream):
         Extract temporal and spatial metadata from HDF-5 files
         """
         datafile = Dataset("in-mem-file", mode='r', memory=file_obj_stream.read())
-        lats = datafile['lat'][:].flatten()
-        lons = datafile['lon'][:].flatten()
-        sec = datafile['time'][:].flatten()
-        ref_time_str = datafile['time'].units #'seconds since 2023-01-25 20:16:35'
-        ref_time = datetime.strptime(ref_time_str, 'seconds since %Y-%m-%d %H:%M:%S')
-
-        start_time, end_time = [ref_time+timedelta(seconds=min(sec)),
-                                ref_time+timedelta(seconds=max(sec))]
-
-        north, south, east, west = [lats.max(),
-                                    lats.min(),
-                                    lons.max(),
-                                    lons.min()]
+        if '_windsonde_' in filename:
+            lat = float(datafile.latitude)
+            lon = float(datafile.longitude)
+            north, south, east, west = [lat+0.01, lat-0.01, lon+0.01, lon-0.01]
+            tkn = filename.split('.cdf')[0].split('_')
+            start_time = datetime.strptime(tkn[-2]+tkn[-1],'%Y%m%d%H%M%S')
+            end_time = start_time
+        else:
+            lats = datafile['lat'][:].flatten()
+            lons = datafile['lon'][:].flatten()
+            sec = datafile['time'][:].flatten()
+            ref_time_str = datafile['time'].units #'seconds since 2023-01-25 20:16:35'
+            ref_time = datetime.strptime(ref_time_str, 'seconds since %Y-%m-%d %H:%M:%S')
+
+            start_time, end_time = [ref_time+timedelta(seconds=sec.min().item()),
+                                    ref_time+timedelta(seconds=sec.max().item())]
+
+            north, south, east, west = [lats.max(),
+                                        lats.min(),
+                                        lons.max(),
+                                        lons.min()]
         datafile.close()
         return {
             "start": start_time,
@@ -79,7 +86,6 @@ def get_ascii_metadata(self, filename, file_obj_stream):
         """
         Extract temporal and spatial metadata from ascii files
         """
-########
         file_lines = []
         fn = filename.split('/')[-1]
         if '_windsonde1_' in fn: #wind sonde file
@@ -105,21 +111,23 @@ def get_ascii_metadata(self, filename, file_obj_stream):
                   maxTime = datetime.strptime(line,'Saved by user: User on %Y%m%d/%H%M UTC')
                   break
         else: #radio sonde file, either utf-8 or utf-16-be (big endian)
+           # default to UTF-16-BE; files named in self.utf8_list are UTF-8
+           endian_type = 'utf-16-be'
            if fn in self.utf8_list:
-              endian_type = 'utf_8'
-           else: #utf_16-be
-              endian_type = 'utf_16-be'
+              endian_type = 'utf-8'
 
-           count = 0  #account number of header lines for later use
+           #read lines and save into 'file_lines' list 
            for encoded_line in file_obj_stream.iter_lines():
-               count = count + 1
                line = encoded_line.decode(endian_type,errors='ignore').strip()
                file_lines.append(line)
-
+          
+           count = 0 # 1-based index of the current line; marks where the header ends
+           for line in file_lines:
+               count = count + 1
                if line.startswith('Balloon release date and time'):
-                  minTime = datetime.strptime(line.split()[-1].strip(),'%Y-%m-%dT%H:%M:%S') #i.e.,2022-01-29T13:07:23
-               elif 'n Elapsed time  TimeUTC' in line:
-                  num_header_lines = count + 1
+                  minTime = datetime.strptime(line.split()[-1],'%Y-%m-%dT%H:%M:%S') #i.e.,2022-01-29T13:07:23
+               elif line.startswith('s hh:mm:ss'):
+                  num_header_lines = count
                   break
 
            elap_sec = []
@@ -135,9 +143,6 @@ def get_ascii_metadata(self, filename, file_obj_stream):
            maxTime = minTime + timedelta(seconds = max(elap_sec))
            maxlat, minlat, maxlon, minlon = [max(lat),min(lat),max(lon),min(lon)]
 
-########
-
-
         return {
             "start": minTime,
             "end": maxTime,
diff --git a/mdx/granule_metadata_extractor/src/helpers/musondeimpacts.zip b/mdx/granule_metadata_extractor/src/helpers/musondeimpacts.zip
new file mode 100644
index 0000000000000000000000000000000000000000..b159a3e4297dbb5b55a83dad6034923cd460a34f
GIT binary patch
literal 11407
zcmd5?O>bL86ir)T1IsR1R9Ucq`e{Dj*Nz4%sy2|Sju31ti#WoN#Fd?>(5h0G{38AV
z3pPmbe~>tL#?Rw7Z{B!brKF^a+RnAp9-njXJ@?K#dVJ@-yL<HY$8V3m`|HoY)xV$a
z?a}v#I6GS`e!hC1{j^$KyxM<sc=Y+f_}S#@`SaOwI$utX4<9{wGI=q-I6?em#wUUa
z!8p$aXKeEHU_6^HXW8X1m;3d-SNp5W>5~4mPp=v=KH~Y9SG6u{#=hUbe{X+wapHvq
zt2JxF7mMZPkMsh?GREl*tHqT)kyTbz=0{9d=ARf9c_tN|yqImB<aBbnSe{SKtG@c;
z;YgbwG=Diee1M;gpDbVFHOWv5CX=8RV<u`=HCTM8CX5eb{Yl&q)B#8ZQ=D@m^>Dgc
zP5Vd%uWHPB%~b=Xh17;cE%GMpkn*hLg(0J1Rq?E-%+YK^tIW9N-tmefVdSubSTO)D
z=LN%8l7@w>`9`EIxR5xAWi^*wxYw%T7}VwTWWF%@x=vS7@f=W(#|ICW^VzD8T0Roh
znCqJ5wai2LN@4H{tfdJ%YFQ@oZo=C9bkr*3UfE7rmC4d1Xq?(oMW_woEe*AY^Ru(r
zGLE+(yt?SaTOd|*-oeY2DK)JEr3?~>S7wzo>UL6AncxXEN){2kemSPhRjHY)0H)Du
zOUgEL6{v5*j^1m}+*nBt(2E{q3e|3U1;BzotJfA@qjrR)uBBz);YI4|F1?mZM=#`D
z3SZ7VH%%!nXTc-%hVa@-l_F<!n7mef>5F#DRIOB3&R%7vek)9_I=nI~3a{g;ED5iQ
z@f(tJkt%0!8?MY&&PA=zl&$%2^#0%E%s5kN)tN!gXuN&&!hxWEYacJma%+M%eV1HN
zhh3(mujbO6#Kl|=Bp3cM&1FEWF>(=>8}4tA+@Y;kCAVHQ)}j`&uN9$mi&~0K?n>*7
zTdhcRTvb(dYL!yvC{Ayc52#gwS6rLg+}Cu)h{Hty3aAS-T^rV9F9S$*+hfp}K#FxG
z3hh;xF~N0Qj0yH_ahR$c1XFS;NyU$s(~HZO5wLO$(o`>4gk>#Jl5I*m86#&alc5`0
z39Tu=4z9>@)219+6yzeoDwjh<EVx{gnu69)D{O~pHC#{&w6K@KA*f*9Xh3K^KP55@
zV}6O^wkO5eEeH82p&9w*rw9GK3O!~}X`hv{VHrRwGiE0q$19GCPP!JBM#n6=Z<WC8
zpy^fsX_bJqghguFsK+TGC74avDfrrv#!OVlR0zG+@VB(c=$#3~2!Lp|-B8!XOB#L`
zcn9@a;I$?>1utYA!%G(G9q>D>MpfL-eEm5~jmP@M-o}GzFF4&wVd{>{*+AzhE6*HL
z=b>s9FHJHgV}BN)Qfy(Slw#y8Xw_XmVM{T<Vvo}uZ3}5yXbY@giYd8GHF5=tT2F#q
z%)>JN^4SBoXQaJ_WL)S8FKE-4{cv)M-CcXbC~3n;*$U4TsKddqt4=&)wwrFltV*xJ
z>P%P;9ZVeQ1X`IM=qGHprqBw~kdA>Fz|x8t08?#%R@L<r(%6S>L!m<*cDIVFaI*HW
zDkRXxS;fwchE;!W+_2gTw^+4HC&wy?<{ehoCTW{hIM>}Ud%9~MOh$XRsCa3{*(_XK
ztJ%QZ4!?7lhsVz*?TWNcQl$#1qBzk~RRN>x)uIL`TB_o;s5-MvZqsi=XJ{{>Gp;+O
zSEVGW4A-JQoS0nk|B2$Xd`iFOHaHOm&UNlOK3QgWOB<No$wp_THHl8M=r1YrIS&tT
zL6BTdV0|mi`$59QW5a~TqHT%sL~FyEu0lFwdONic$lSJ&7D&6ih<8yj0OY!@=&~Q=
zrmbjybJ#G+%bKpXx~mqIce^JQ7FT37dMBN<Kw5I+AZ56xgqRduOmJz!c&}X1ECkJ_
z=p{TL*Cwn~ExPMb%=v_1qgQi1XO6aEmX$ZHks~sbP8?^?dbAg(f-UML(*gksG{pd!
zO&E&R_G8#iOW|1|gVS4_<rq{u<wquoHfRw-Fv$wR5iu>JnnhQHQ`i0m$!a#Iiz*DQ
z%7(XLt}o#>bapdSWx2hlX^S?YWy&fUC5e`tpD6lJ8n#eGsr~yV9w==X1r|%tE=pky
z`Ua&T(=$Lis|w@p+BglrJ4BmaflP815|`*voG`Yyj1hq{YeieuLWg>$w5_j=(-W=6
zddXWIAvsh0d$?sDeRJHg@4SM2$M!Sgfb2*{>#?BG^TSTFJj-px+@Qo=a&C)@D?u_V
zOAjUP<d`{%v&q72_P`{`?tp0&+Yhr9ejAvcXj5ShViWyjCrl(?U{z!)5ZJWWae)h&
zR5srdX1H;N*c5WYxx@~;FEuf9VvT9;4a1`K!!0(g99yEbE8MndJ%2!Knxh?}4d0UY
z!7S(zh&W8FOZwO(nA_nSgc)Au24P<71|h#*Hq%RxaR(fm+z6X`j2hz}K=N!m3~wuZ
zgKQ2Htzy`JMPNde*xm-7II||m3*R8jWHyB;Cjbez*9Q|z>T*mN$?kP0>Tb9V)4OOv
z-^@GL50!l$z_}oLmc@989VKW$rG=$sDq%sQ;tLWNZo@QP{<<n{GE8EVD_nWU*(A}1
zmzajlR=5o_jZH({a5j@Gt%1pOBsVP%Oef*o?1tY4rui_2!<4UIAGMy6{pahC&wj_#
zvGn~BzE@Z0=hNk{>&IjVJm+eL5@c(TKxMATw&rNF-74I&U{r_<3t6`-pr>B3*`l{J
z543VDI8piCmq7I5>U^>|ot)0kW-BbtRRMgrZr{Dr{~Rztny)vv=(nw>g8eu8AB}z6
d-rk#ApM2K-ppG8D{{cmB<JStme{i4v_8;K(U)2Br

literal 0
HcmV?d00001


From 1b335dcde618193c2976a761353557519cf49813 Mon Sep 17 00:00:00 2001
From: Lucy Wang <wangl1@uah.edu>
Date: Thu, 17 Oct 2024 12:53:43 -0500
Subject: [PATCH 3/3] Added cdf file extension

---
 mdx/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mdx/main.py b/mdx/main.py
index 442942d..adbdfb8 100644
--- a/mdx/main.py
+++ b/mdx/main.py
@@ -589,7 +589,7 @@ def input_keys(self):
         return {
             'input_key': r'^(.*)\.?(nc|tsv|txt|gif|tar|zip|png|kml|dat|gz|pdf|docx|kmz|xlsx|eos|csv'
                          r'|hdf5|hdf|nc4|ict|xls|.*rest|h5|xlsx|1Hz|impacts_archive|\d{5}|ar2v|mat'
-                         r'|he5|raw|bz2|grb2f\d{4}|_\d{2}-00-00)$',
+                         r'|he5|raw|bz2|grb2f\d{4}|_\d{2}-00-00|cdf)$',
             'lookup_key': r'^(.*).*$'
         }