From 7c11dba22e7b023a59626b3e00ce116722566b9b Mon Sep 17 00:00:00 2001
From: Jack Reeves Eyre <jack.reeveseyre@gmail.com>
Date: Wed, 17 Aug 2022 14:11:48 -0400
Subject: [PATCH 1/5] First attempt at new Python feature (read_long_string).

---
 python/_bufrlib.pyf         |  5 +++++
 python/ncepbufr/__init__.py | 16 ++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/python/_bufrlib.pyf b/python/_bufrlib.pyf
index 40ee6f60..ca4efd2b 100644
--- a/python/_bufrlib.pyf
+++ b/python/_bufrlib.pyf
@@ -106,6 +106,11 @@ subroutine ufbint(lunin,usr,i1,i2,iret,str) ! in ufbint.f
     integer, intent(out) :: iret
     character*(*), intent(in) :: str
 end subroutine ufbint
+subroutine readlc(lunit, chr, str) ! in readlc.f
+    integer,intent(in) :: lunit
+    character*(*), intent(out) :: chr
+    character*(*), intent(in) :: str
+end subroutine readlc
 subroutine ufbqcd(lunit,nemo,qcd) ! in ufbqcd.f
     integer, intent(in) :: lunit
     character*(*), intent(in) :: nemo
diff --git a/python/ncepbufr/__init__.py b/python/ncepbufr/__init__.py
index dd8fb3d2..ec45b8be 100644
--- a/python/ncepbufr/__init__.py
+++ b/python/ncepbufr/__init__.py
@@ -469,6 +469,22 @@ def load_subset(self):
         if iret == 0:
             self.subset_loaded = True
         return iret
+    def read_long_string(self,mnemonic):
+        """
+        Decode character string from the currently loaded message subset
+        using the specified mnemonic (a 'mnemonic' is simply a
+        descriptive, alphanumeric name for a data value, like
+        a key in a python dictionary). The mnemonic string
+        must be a single mnemonic only.
+        
+        Returns the character string, if found, or "MISSING" if not.
+        """
+        if not self.subset_loaded:
+            raise IOError('subset not loaded, call load_subset first')
+        if len(mnemonic.split()) > 1:
+            raise ValueError('only one mnemonic per call to read_long_string')
+        data = _bufrlib.readlc(self.lunit,data,mnemonic)
+        return data
     def read_subset(self,mnemonics,rep=False,seq=False,events=False):
         """
         decode the data from the currently loaded message subset

From d398a5e3bfe2607fb24309bc9fed5f03cd26dd8f Mon Sep 17 00:00:00 2001
From: Jack Reeves Eyre <jack.reeveseyre@gmail.com>
Date: Wed, 17 Aug 2022 17:29:53 -0400
Subject: [PATCH 2/5] Troubleshooting to get read_long_string working.

---
 python/_bufrlib.pyf         | 2 +-
 python/ncepbufr/__init__.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/_bufrlib.pyf b/python/_bufrlib.pyf
index ca4efd2b..7a8fa6d7 100644
--- a/python/_bufrlib.pyf
+++ b/python/_bufrlib.pyf
@@ -108,7 +108,7 @@ subroutine ufbint(lunin,usr,i1,i2,iret,str) ! in ufbint.f
 end subroutine ufbint
 subroutine readlc(lunit, chr, str) ! in readlc.f
     integer,intent(in) :: lunit
-    character*(*), intent(out) :: chr
+    character*120, intent(out) :: chr
     character*(*), intent(in) :: str
 end subroutine readlc
 subroutine ufbqcd(lunit,nemo,qcd) ! in ufbqcd.f
diff --git a/python/ncepbufr/__init__.py b/python/ncepbufr/__init__.py
index ec45b8be..e1c8c497 100644
--- a/python/ncepbufr/__init__.py
+++ b/python/ncepbufr/__init__.py
@@ -483,8 +483,8 @@ def read_long_string(self,mnemonic):
             raise IOError('subset not loaded, call load_subset first')
         if len(mnemonic.split()) > 1:
             raise ValueError('only one mnemonic per call to read_long_string')
-        data = _bufrlib.readlc(self.lunit,data,mnemonic)
-        return data
+        long_string = _bufrlib.readlc(self.lunit,mnemonic)
+        return str(long_string, encoding='ascii').strip()
     def read_subset(self,mnemonics,rep=False,seq=False,events=False):
         """
         decode the data from the currently loaded message subset

From 74c4776c9bcf52c1de607fb2a2a5d02f55612092 Mon Sep 17 00:00:00 2001
From: Jack Reeves Eyre <jack.reeveseyre@gmail.com>
Date: Mon, 22 Aug 2022 15:40:30 -0400
Subject: [PATCH 3/5] Adds more error handling and a simple test.

If ASCII decoding fails, read_long_string now returns 'MISSING' when every
byte is 0xff, and otherwise retries with an extended-ASCII (cp1252) decoding
before reporting an error. This should help with some French-language
station names (and maybe other examples) that contain accents.
---
 python/ncepbufr/__init__.py | 24 +++++++++++++++++++++++-
 python/test/test.py         | 21 +++++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/python/ncepbufr/__init__.py b/python/ncepbufr/__init__.py
index e1c8c497..7d66d97e 100644
--- a/python/ncepbufr/__init__.py
+++ b/python/ncepbufr/__init__.py
@@ -478,13 +478,35 @@ def read_long_string(self,mnemonic):
         must be a single mnemonic only.
         
         Returns the character string, if found, or "MISSING" if not.
+
+        Example:
+
+            :::python
+            >>> bufr = ncepbufr.open(filename)
+            >>> while bufr.advance() == 0:
+            >>>     while bufr.load_subset() == 0:
+            >>>         st_name = bufr.read_long_string(mnemonic='STSN')
         """
         if not self.subset_loaded:
             raise IOError('subset not loaded, call load_subset first')
         if len(mnemonic.split()) > 1:
             raise ValueError('only one mnemonic per call to read_long_string')
         long_string = _bufrlib.readlc(self.lunit,mnemonic)
-        return str(long_string, encoding='ascii').strip()
+        try:
+            result = str(long_string, encoding='ascii').strip()
+        except UnicodeDecodeError:
+            try:
+                if all([bt == int('0xff',16) for bt in long_string.strip()]):
+                    # All values set to 255 for missing data.
+                    result = 'MISSING'
+                else:
+                    # Extended ASCII for Roman alphabet accents.
+                    result = str(long_string, encoding='cp1252').strip()
+            except Exception as error:
+                print(f"An exception occurred {error}")
+        except Exception as error:
+            print(f"An exception occurred {error}")
+        return result
     def read_subset(self,mnemonics,rep=False,seq=False,events=False):
         """
         decode the data from the currently loaded message subset
diff --git a/python/test/test.py b/python/test/test.py
index 5338090c..f27a8c59 100644
--- a/python/test/test.py
+++ b/python/test/test.py
@@ -187,3 +187,24 @@
 np.testing.assert_array_almost_equal(oer_save.filled(), oer2.filled())
 np.testing.assert_array_almost_equal(qcf_save.filled(), qcf2.filled())
 bufr.close()
+
+# test reading long strings
+bufr = ncepbufr.open('data/xx103')
+test_station_names = ['BOUEE_LION', 'BOUEE_ANTILLES',
+                      'BOUEE_COTE D\'AZUR',
+                      'GULF OF MAINE', 'TENERIFE']
+test_report_ids = ['6100002', '4100300', '6100001', '4400005', '1300131']
+i_msg = 0
+while bufr.advance() == 0:
+    # Just read the first subset from each message.
+    if bufr.load_subset() == 0:
+        stsn = bufr.read_long_string(mnemonic='STSN')
+        rpid = bufr.read_long_string(mnemonic='RPID')
+        assert stsn == test_station_names[i_msg]
+        assert rpid == test_report_ids[i_msg]
+        i_msg = i_msg + 1
+    # only loop over first 5 subsets
+    if i_msg == 5: break
+bufr.close()
+    
+    

From e6f03fcc0a3d8641ad9d4c76dd419cac91a5d281 Mon Sep 17 00:00:00 2001
From: Jack Reeves Eyre <jack.reeveseyre@gmail.com>
Date: Tue, 23 Aug 2022 16:48:07 -0400
Subject: [PATCH 4/5] Reduces max length of read_long_string; adds
 documentation.

The maximum string length was previously set to 120, which is thought to be
more than needed. This commit reduces it to 64.

Documentation improved by describing the methodology built into the Fortran
"readlc" routine -- the ability to retrieve one particular occurrence of a
long string when multiple occur in the same subset.
---
 python/_bufrlib.pyf         | 2 +-
 python/ncepbufr/__init__.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/_bufrlib.pyf b/python/_bufrlib.pyf
index 7a8fa6d7..ba09d6a0 100644
--- a/python/_bufrlib.pyf
+++ b/python/_bufrlib.pyf
@@ -108,7 +108,7 @@ subroutine ufbint(lunin,usr,i1,i2,iret,str) ! in ufbint.f
 end subroutine ufbint
 subroutine readlc(lunit, chr, str) ! in readlc.f
     integer,intent(in) :: lunit
-    character*120, intent(out) :: chr
+    character*64, intent(out) :: chr
     character*(*), intent(in) :: str
 end subroutine readlc
 subroutine ufbqcd(lunit,nemo,qcd) ! in ufbqcd.f
diff --git a/python/ncepbufr/__init__.py b/python/ncepbufr/__init__.py
index 7d66d97e..1eec0e0a 100644
--- a/python/ncepbufr/__init__.py
+++ b/python/ncepbufr/__init__.py
@@ -475,7 +475,11 @@ def read_long_string(self,mnemonic):
         using the specified mnemonic (a 'mnemonic' is simply a
         descriptive, alphanumeric name for a data value, like
         a key in a python dictionary). The mnemonic string
-        must be a single mnemonic only.
+        must be a single mnemonic only. If the subset contains more
+        than one occurrence of the mnemonic, you can append '#X' to
+        the mnemonic to request the character string corresponding to
+        the Xth occurrence of the mnemonic, counting from the beginning
+        of the subset. Otherwise, X is assumed to be 1.
         
         Returns the character string, if found, or "MISSING" if not.
 

From 6a5c6da348436d416429ea666610875c398cf4a7 Mon Sep 17 00:00:00 2001
From: Jeff Ator <Jeff.Ator@noaa.gov>
Date: Wed, 24 Aug 2022 18:44:39 +0000
Subject: [PATCH 5/5] minor updates to remove warnings for deprecated Python
 methods

---
 python/test/test.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/python/test/test.py b/python/test/test.py
index f27a8c59..d1fefb47 100644
--- a/python/test/test.py
+++ b/python/test/test.py
@@ -11,7 +11,7 @@
 while bufr.advance() == 0: # loop over messages.
     while bufr.load_subset() == 0: # loop over subsets in message.
         hdr = bufr.read_subset(hdstr).squeeze()
-        station_id = hdr[0].tostring()
+        station_id = hdr[0].tobytes()
         lon = hdr[1]; lat = hdr[2]
         station_type = int(hdr[4])
         obs = bufr.read_subset(obstr)
@@ -115,7 +115,7 @@
 np.testing.assert_almost_equal(lon,-167.3253)
 obs_tst=np.array([1.4555e+02,1.4618e+02,2.1374e+02,2.4871e+02,2.4807e+02,2.3607e+02,\
  2.2802e+02,2.2255e+02,2.1699e+02,2.1880e+02,2.2440e+02,2.2970e+02,\
- 2.3407e+02,1.0000e+11,2.0008e+02],np.float)
+ 2.3407e+02,1.0000e+11,2.0008e+02],np.float64)
 np.testing.assert_array_almost_equal(obs,obs_tst)
 bufr.close()
 
@@ -158,7 +158,7 @@
 while bufr.advance() == 0:
     while bufr.load_subset() == 0:
         hdr = bufr.read_subset(hdstr).squeeze()
-        station_id = hdr[0].tostring()
+        station_id = hdr[0].tobytes()
         obs = bufr.read_subset(obstr)
         nlevs = obs.shape[-1]
         oer = bufr.read_subset(oestr)
@@ -174,7 +174,7 @@
 bufr.restore()
 bufr.load_subset()
 hdr = bufr.read_subset(hdstr).squeeze()
-station_id = hdr[0].tostring()
+station_id = hdr[0].tobytes()
 obs2 = bufr.read_subset(obstr)
 nlevs = obs2.shape[-1]
 oer2 = bufr.read_subset(oestr)
@@ -206,5 +206,3 @@
     # only loop over first 5 subsets
     if i_msg == 5: break
 bufr.close()
-    
-