Merge pull request #228 from JackReevesEyre/jre_py_long_strings

Add python read_long_string function
NOAA-EMC · Aug 25, 2022 · 0997671 · 0997671
2 parents 8519a73 + 6a5c6da
commit 0997671
Show file tree

Hide file tree

Showing 3 changed files with 70 additions and 4 deletions.
diff --git a/python/_bufrlib.pyf b/python/_bufrlib.pyf
@@ -106,6 +106,11 @@ subroutine ufbint(lunin,usr,i1,i2,iret,str) ! in ufbint.f
     integer, intent(out) :: iret
     character*(*), intent(in) :: str
 end subroutine ufbint
+subroutine readlc(lunit, chr, str) ! in readlc.f
+    integer,intent(in) :: lunit ! input: integer unit number supplied by the caller
+    character*64, intent(out) :: chr ! output: fixed 64-character buffer; NOTE(review): longer values presumably cannot be returned in full - confirm against readlc.f
+    character*(*), intent(in) :: str ! input: string of any length (the Python wrapper passes the mnemonic here)
+end subroutine readlc
 subroutine ufbqcd(lunit,nemo,qcd) ! in ufbqcd.f
     integer, intent(in) :: lunit
     character*(*), intent(in) :: nemo

diff --git a/python/ncepbufr/__init__.py b/python/ncepbufr/__init__.py
@@ -469,6 +469,48 @@ def load_subset(self):
         if iret == 0:
             self.subset_loaded = True
         return iret
+    def read_long_string(self,mnemonic):
+        """
+        Decode a character string from the currently loaded message subset
+        using the specified mnemonic (a 'mnemonic' is simply a
+        descriptive, alphanumeric name for a data value, like
+        a key in a python dictionary). The mnemonic string
+        must be a single mnemonic only. If the subset contains more
+        than one occurrence of the mnemonic, you can append '#X' to
+        the mnemonic to request the character string corresponding to
+        the Xth occurrence of the mnemonic, counting from the beginning
+        of the subset. Otherwise, X is assumed to be 1.
+
+        Returns the decoded string with surrounding whitespace stripped,
+        or "MISSING" if every byte of the value is 0xff (missing data).
+
+        Raises IOError if no subset is loaded and ValueError if more
+        than one mnemonic is given.
+
+        Example:
+
+            :::python
+            >>> bufr = ncepbufr.open(filename)
+            >>> while bufr.advance() == 0:
+            >>>     while bufr.load_subset() == 0:
+            >>>         st_name = bufr.read_long_string(mnemonic='STSN')
+        """
+        if not self.subset_loaded:
+            raise IOError('subset not loaded, call load_subset first')
+        if len(mnemonic.split()) > 1:
+            raise ValueError('only one mnemonic per call to read_long_string')
+        long_string = _bufrlib.readlc(self.lunit,mnemonic)
+        try:
+            return str(long_string, encoding='ascii').strip()
+        except UnicodeDecodeError:
+            # Not plain ASCII: either the missing marker or accented text.
+            pass
+        if all(bt == 0xff for bt in long_string.strip()):
+            # All values set to 255 for missing data.
+            return 'MISSING'
+        # Extended ASCII for Roman alphabet accents; bytes that cp1252 leaves
+        # undefined are replaced instead of aborting the read.
+        return str(long_string, encoding='cp1252', errors='replace').strip()
     def read_subset(self,mnemonics,rep=False,seq=False,events=False):
         """
         decode the data from the currently loaded message subset

diff --git a/python/test/test.py b/python/test/test.py
@@ -11,7 +11,7 @@
 while bufr.advance() == 0: # loop over messages.
     while bufr.load_subset() == 0: # loop over subsets in message.
         hdr = bufr.read_subset(hdstr).squeeze()
-        station_id = hdr[0].tostring()
+        station_id = hdr[0].tobytes()
         lon = hdr[1]; lat = hdr[2]
         station_type = int(hdr[4])
         obs = bufr.read_subset(obstr)
@@ -115,7 +115,7 @@
 np.testing.assert_almost_equal(lon,-167.3253)
 obs_tst=np.array([1.4555e+02,1.4618e+02,2.1374e+02,2.4871e+02,2.4807e+02,2.3607e+02,\
  2.2802e+02,2.2255e+02,2.1699e+02,2.1880e+02,2.2440e+02,2.2970e+02,\
- 2.3407e+02,1.0000e+11,2.0008e+02],np.float)
+ 2.3407e+02,1.0000e+11,2.0008e+02],np.float64)
 np.testing.assert_array_almost_equal(obs,obs_tst)
 bufr.close()
 
@@ -158,7 +158,7 @@
 while bufr.advance() == 0:
     while bufr.load_subset() == 0:
         hdr = bufr.read_subset(hdstr).squeeze()
-        station_id = hdr[0].tostring()
+        station_id = hdr[0].tobytes()
         obs = bufr.read_subset(obstr)
         nlevs = obs.shape[-1]
         oer = bufr.read_subset(oestr)
@@ -174,7 +174,7 @@
 bufr.restore()
 bufr.load_subset()
 hdr = bufr.read_subset(hdstr).squeeze()
-station_id = hdr[0].tostring()
+station_id = hdr[0].tobytes()
 obs2 = bufr.read_subset(obstr)
 nlevs = obs2.shape[-1]
 oer2 = bufr.read_subset(oestr)
@@ -187,3 +187,22 @@
 np.testing.assert_array_almost_equal(oer_save.filled(), oer2.filled())
 np.testing.assert_array_almost_equal(qcf_save.filled(), qcf2.filled())
 bufr.close()
+
+# test reading long strings
+bufr = ncepbufr.open('data/xx103')
+test_station_names = ['BOUEE_LION', 'BOUEE_ANTILLES',
+                      'BOUEE_COTE D\'AZUR',
+                      'GULF OF MAINE', 'TENERIFE']
+test_report_ids = ['6100002', '4100300', '6100001', '4400005', '1300131']
+i_msg = 0
+# Check the first subset of each message until every expected value is used.
+while i_msg < len(test_station_names) and bufr.advance() == 0:
+    if bufr.load_subset() == 0:
+        stsn = bufr.read_long_string(mnemonic='STSN')
+        rpid = bufr.read_long_string(mnemonic='RPID')
+        assert stsn == test_station_names[i_msg]
+        assert rpid == test_report_ids[i_msg]
+        i_msg += 1
+bufr.close()
+# Guard against a vacuous pass when the file yields too few subsets.
+assert i_msg == len(test_station_names)