diff --git a/setup.py b/setup.py index 6d9e1d1..ff6b750 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ def run(self): 'dict-recursive-update', 'networkx', 'penman', - 'stanza==1.1.1', + 'stanza==1.3.0', 'nltk', "graphviz" ], diff --git a/tuw_nlp/grammar/text_to_4lang.py b/tuw_nlp/grammar/text_to_4lang.py index 1a74960..bf24d28 100644 --- a/tuw_nlp/grammar/text_to_4lang.py +++ b/tuw_nlp/grammar/text_to_4lang.py @@ -27,7 +27,7 @@ def __init__(self, lang, nlp_cache, cache_dir=None): nlp = stanza.Pipeline( 'en', package="craft") assert lang, "TextTo4lang does not have lang set" - + self.lang = lang self.nlp = CachedStanzaPipeline(nlp, nlp_cache) diff --git a/tuw_nlp/text/segmentation.py b/tuw_nlp/text/segmentation.py index 0c1ad5e..1076bb5 100644 --- a/tuw_nlp/text/segmentation.py +++ b/tuw_nlp/text/segmentation.py @@ -2,7 +2,6 @@ from stanza.pipeline.processor import Processor, register_processor from tuw_nlp.text.patterns.de import ABBREV, MONTH -from tuw_nlp.text.patterns.misc import CHAR_PATT @register_processor("fix_ssplit") @@ -54,13 +53,12 @@ def process(self, document): if requires_space is False: char_offset -= 1 - start_char, end_char = ( - int(c) + char_offset - for c in CHAR_PATT.match(token.misc).groups()) - sens[-1].append({ - doc.ID: (token_id + 1, ), doc.TEXT: token.text, - doc.MISC: f'start_char={start_char}|end_char={end_char}'}) + doc.ID: (token_id + 1, ), + doc.TEXT: token.text, + doc.MISC: token.misc, + doc.START_CHAR: token.start_char + char_offset, + doc.END_CHAR: token.end_char + char_offset}) token_id += 1