upgrade to stanza 1.3.0, incl. update of fix_ssplit processor

recski · Feb 1, 2022 · d93ae4c
1 parent 81169a2
commit d93ae4c
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 9 deletions.
diff --git a/setup.py b/setup.py
@@ -38,7 +38,7 @@ def run(self):
         'dict-recursive-update',
         'networkx',
         'penman',
-        'stanza==1.1.1',
+        'stanza==1.3.0',
         'nltk',
         "graphviz"
     ],

diff --git a/tuw_nlp/grammar/text_to_4lang.py b/tuw_nlp/grammar/text_to_4lang.py
@@ -27,7 +27,7 @@ def __init__(self, lang, nlp_cache, cache_dir=None):
             nlp = stanza.Pipeline(
                 'en', package="craft")
         assert lang, "TextTo4lang does not have lang set"
-        
+
         self.lang = lang
 
         self.nlp = CachedStanzaPipeline(nlp, nlp_cache)

diff --git a/tuw_nlp/text/segmentation.py b/tuw_nlp/text/segmentation.py
@@ -2,7 +2,6 @@
 from stanza.pipeline.processor import Processor, register_processor
 
 from tuw_nlp.text.patterns.de import ABBREV, MONTH
-from tuw_nlp.text.patterns.misc import CHAR_PATT
 
 
 @register_processor("fix_ssplit")
@@ -54,13 +53,12 @@ def process(self, document):
                         if requires_space is False:
                             char_offset -= 1
 
-                start_char, end_char = (
-                    int(c) + char_offset
-                    for c in CHAR_PATT.match(token.misc).groups())
-
                 sens[-1].append({
-                    doc.ID: (token_id + 1, ), doc.TEXT: token.text,
-                    doc.MISC: f'start_char={start_char}|end_char={end_char}'})
+                    doc.ID: (token_id + 1, ),
+                    doc.TEXT: token.text,
+                    doc.MISC: token.misc,
+                    doc.START_CHAR: token.start_char + char_offset,
+                    doc.END_CHAR: token.end_char + char_offset})
 
                 token_id += 1