Merge pull request #2 from dfint/1-convert-into-package

1 convert into package
dfint · Jun 19, 2022 · 94a83d6 · 94a83d6
2 parents af98cf1 + f9f6905
commit 94a83d6
Show file tree

Hide file tree

Showing 19 changed files with 2,636 additions and 2,210 deletions.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,5 @@
+[flake8]
+ignore = W293
+exclude = .git,__pycache__,venv,.venv
+max-line-length = 120
+max-complexity = 10
diff --git a/.github/workflows/test Py 3.4 compatibility.yml b/.github/workflows/test Py 3.4 compatibility.yml
@@ -23,8 +23,8 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install -r requirements.txt
-        pip install -r test_requirements.txt
+        pip install -r legacy_requirements/requirements.txt
+        pip install -r legacy_requirements/test_requirements.txt
     - name: Lint with flake8
       run: |
         # stop the build if there are Python syntax errors or undefined names
@@ -33,4 +33,4 @@ jobs:
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
     - name: Test with pytest
       run: |
-        pytest --doctest-modules changetext.py ./tests
+        pytest --doctest-modules changetext ./tests
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -23,18 +23,18 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install -r requirements.txt
-        pip install -r test_requirements.txt
+        pip install poetry
+        poetry install
         pip install coveralls==3.2.0
     - name: Lint with flake8
       run: |
         # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+        poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
     - name: Test with pytest
       run: |
-        pytest --doctest-modules changetext.py ./tests --cov=.
+        poetry run pytest --doctest-modules changetext ./tests --cov=.
     - name: Upload coverage data to coveralls.io
       run: |
         coveralls --service=github

diff --git a/.gitignore b/.gitignore
@@ -1,8 +1,9 @@
-.idea
-.venv
-venv
-htmlcov
-.pytest_cache
+.idea/
+.venv/
+venv/
+htmlcov/
+.pytest_cache/
+dist/
 *.pyc
 .coverage
 changetext.log
diff --git a/changetext/__init__.py b/changetext/__init__.py
@@ -0,0 +1,3 @@
+from .changetext import ChangeText, change_text
+from .common_state import init
+
diff --git a/changetext.py → changetext/changetext.py b/changetext.py → changetext/changetext.py
diff --git a/changetext.pyi → changetext/changetext.pyi b/changetext.pyi → changetext/changetext.pyi
diff --git a/changetext/common_state.py b/changetext/common_state.py
@@ -0,0 +1,22 @@
+class ChangeTextState:
+    def __init__(self):
+        self.prev_tail = ''
+        self.context = None
+
+
+_change_text_state = None
+
+
+def init():
+    global _change_text_state
+    _change_text_state = ChangeTextState()
+
+
+init()
+
+
+def get_state() -> ChangeTextState:
+    global _change_text_state
+    if _change_text_state is None:
+        _change_text_state = ChangeTextState()
+    return _change_text_state
diff --git a/changetext/logging_tools.py b/changetext/logging_tools.py
@@ -0,0 +1,40 @@
+import functools
+import logging
+import sys
+from logging.handlers import RotatingFileHandler
+
+
+class LoggerWrapper:
+    def __init__(self, stream=None):
+        self.logged = set()
+        self.logger = logging.Logger(name=__name__, level=logging.DEBUG)
+
+        if not stream:
+            stream = sys.stdout
+
+        file_handler = RotatingFileHandler("changetext.log", encoding="utf-8", backupCount=0, maxBytes=1024**2)
+        file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
+        stream_handler = logging.StreamHandler(stream)
+        self.logger.addHandler(file_handler)
+        self.logger.addHandler(stream_handler)
+
+    def write(self, text, output):
+        if text not in self.logged:
+            self.logger.debug("{!r} --> {!r}".format(text, output))
+            self.logged.add(text)
+
+
+@functools.lru_cache()
+def get_logger(stream=None) -> LoggerWrapper:
+    return LoggerWrapper(stream)
+
+
+def log_exceptions(func):
+    @functools.wraps(func)
+    def wrapper(text):
+        try:
+            return func(text)
+        except Exception:
+            get_logger().logger.exception("An exception occurred. Initial string: {!r}".format(text))
+
+    return wrapper
diff --git a/changetext/tag_correction.py b/changetext/tag_correction.py
@@ -0,0 +1,142 @@
+from changetext.common_state import get_state
+from changetext.utils import inflect_collocation, re_sentence, any_cyr, cut_number, inflect_enumeration, get_form, \
+    smart_join, custom_parse
+
+
+def parse_tags(text):
+    start = 0
+    for i, c in enumerate(text):
+        if c == '<':
+            if start < i:
+                yield text[start:i]
+            start = i
+        elif c == '>':
+            yield text[start:i + 1]
+            start = i + 1
+
+    if start < len(text):
+        yield text[start:]
+
+
+def corr_tags(text):
+    # print('corr_tags(%r)' % s)
+    li = []
+    get_index = None
+    set_indices = set()
+    capitalize_indices = set()
+    inflect_next = set()
+    for i, item in enumerate(parse_tags(text)):
+        # print(repr(item))
+        if not item.strip():
+            pass
+        elif item[0] == '<':
+            item = item.strip('<>')
+            if not item:
+                return None
+            tags, _, item = item.partition(':')
+            tags = set(tags.split(','))
+            # print(tags)
+
+            if 'capitalize' in tags:
+                tags.remove('capitalize')
+                capitalize_indices.add(len(li))
+
+            if item:
+                # Inflect the word inside the tag after the colon
+                word = item.strip()
+
+                if 'get-form' in tags:
+                    if get_index is not None:
+                        raise ValueError('Duplicate <get-form> tag in %r' % text)
+                    get_index = len(li)
+                    tags.remove('get-form')
+                elif 'set-form' in tags:
+                    set_indices.add(len(li))
+                    tags.remove('set-form')
+
+                if tags:
+                    if ' ' in word:
+                        item = inflect_collocation(word, tags)
+                    else:
+                        p = custom_parse(word)[0]
+                        item = p.inflect(tags).word
+                        # if not make_lower and word[0].isupper():
+                        if word[0].isupper():
+                            item = item.capitalize()
+                else:
+                    # item = word if not make_lower else word.lower()
+                    item = word
+            else:
+                # Inflect a part of text after the tag till the ending point of the sentence.
+                inflect_next = tags
+                continue
+        elif inflect_next:
+            sentence = re_sentence.search(item)
+            if sentence:
+                item = sentence.group(1)
+                tail = sentence.group(2)
+            else:
+                tail = ''
+            item = item.lstrip(' ')
+            if not any_cyr(item.split(' ')[0]):
+                if item.strip()[0].isdigit():
+                    if 'loct' in tags:  # FIXME: possible uninitialized variable tags
+                        tags.remove('loct')
+                        tags.add('loc2')  # inflect into 'году' instead of 'годе'
+                    item, tail1 = cut_number(item)
+                    item += ' ' + custom_parse('год')[0].inflect(inflect_next).word + tail1.lstrip(',')
+                elif (not li or not any_cyr(li[-1].rstrip().split(' ')[-1])) and tags == {'gent'}:
+                    li.append('of ')
+                pass
+            else:
+                if ',' in item:
+                    item = inflect_enumeration(item, inflect_next)
+                elif ' ' in item:
+                    item = inflect_collocation(item, inflect_next)
+                else:
+                    p = custom_parse(item)[0]
+                    item = p.inflect(tags).word
+            item += tail
+            inflect_next = set()
+        else:
+            pass
+        li.append(item)
+
+    delayed = ''
+    if inflect_next:
+        delayed += '<%s>' % ','.join(inflect_next)
+        # print('Delay to the next string: %r' % prev_tail)
+
+    if get_index is not None:
+        # print(get_index)
+        form = get_form(li[get_index])
+        form -= {'anim', 'inan'}  # discard these two because they doesn't matter for the nominal case
+        # print(form)
+        for i in set_indices:
+            word = li[i]
+            if ' ' in word:
+                item = inflect_collocation(word, form)
+            else:
+                p = custom_parse(word)[0]
+                item = p.inflect(form).word
+                if word[0].isupper():
+                    item = item.capitalize()
+            li[i] = item
+
+    if capitalize_indices:
+        for i in capitalize_indices:
+            if i >= len(li):
+                delayed += '<capitalize>'
+            else:
+                for part in li[i].split():
+                    if part:
+                        li[i] = li[i].replace(part, part.capitalize(), 1)
+                        break
+
+    if delayed:
+        # print('Delay to the next string: %r' % delayed)
+        state = get_state()
+        state.prev_tail += delayed
+
+    # print(li)
+    return smart_join(li)
diff --git a/changetext/utf16_codec.py b/changetext/utf16_codec.py
@@ -0,0 +1,14 @@
+import functools
+
+
+def utf16_codec(func):
+    @functools.wraps(func)
+    def wrapper(data):
+        if isinstance(data, bytes):
+            data = data.decode("utf-16-le")
+            output = func(data)
+            return output if output is None else output.encode("utf-16-le") + b"\0\0"
+        else:
+            return func(data)
+
+    return wrapper