Skip to content

Commit

Permalink
Merge pull request #2 from dfint/1-convert-into-package
Browse files Browse the repository at this point in the history
1 convert into package
  • Loading branch information
insolor authored Jun 19, 2022
2 parents af98cf1 + f9f6905 commit 94a83d6
Show file tree
Hide file tree
Showing 19 changed files with 2,636 additions and 2,210 deletions.
5 changes: 5 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[flake8]
ignore = W293
exclude = .git,__pycache__,venv,.venv
max-line-length = 120
max-complexity = 10
6 changes: 3 additions & 3 deletions .github/workflows/test Py 3.4 compatibility.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r test_requirements.txt
pip install -r legacy_requirements/requirements.txt
pip install -r legacy_requirements/test_requirements.txt
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
Expand All @@ -33,4 +33,4 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pytest --doctest-modules changetext.py ./tests
pytest --doctest-modules changetext ./tests
10 changes: 5 additions & 5 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,18 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r test_requirements.txt
pip install poetry
poetry install
pip install coveralls==3.2.0
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pytest --doctest-modules changetext.py ./tests --cov=.
poetry run pytest --doctest-modules changetext ./tests --cov=.
- name: Upload coverage data to coveralls.io
run: |
coveralls --service=github
Expand Down
11 changes: 6 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
.idea
.venv
venv
htmlcov
.pytest_cache
.idea/
.venv/
venv/
htmlcov/
.pytest_cache/
dist/
*.pyc
.coverage
changetext.log
3 changes: 3 additions & 0 deletions changetext/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .changetext import ChangeText, change_text
from .common_state import init

3,592 changes: 1,397 additions & 2,195 deletions changetext.py → changetext/changetext.py

Large diffs are not rendered by default.

File renamed without changes.
22 changes: 22 additions & 0 deletions changetext/common_state.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
class ChangeTextState:
    """Mutable state shared between successive text-processing calls."""

    def __init__(self):
        # Text carried over to a later string; starts out empty
        self.prev_tail = ''
        # Not assigned anywhere in this module; its meaning is defined by the code that uses the state
        self.context = None


# Module-wide singleton; replaced by init(), handed out by get_state()
_change_text_state = None


def init():
    """Discard the current shared state and replace it with a fresh ChangeTextState."""
    global _change_text_state
    _change_text_state = ChangeTextState()


# Make sure a state object exists as soon as the module is imported
init()


def get_state() -> ChangeTextState:
    """Return the shared ChangeTextState, creating it first if it does not exist."""
    if _change_text_state is None:
        init()
    return _change_text_state
40 changes: 40 additions & 0 deletions changetext/logging_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import functools
import logging
import sys
from logging.handlers import RotatingFileHandler


class LoggerWrapper:
    """Debug logger that reports every distinct source text only once.

    Records are sent both to the ``changetext.log`` file (with timestamp, level and logger name)
    and, unformatted, to ``stream`` (``sys.stdout`` when no stream is given).
    """

    def __init__(self, stream=None):
        self.logged = set()  # texts already reported; write() skips them
        self.logger = logging.Logger(name=__name__, level=logging.DEBUG)

        # NOTE(review): per the logging.handlers documentation, rollover never occurs when
        # backupCount is 0, so maxBytes does not actually cap the file size - confirm this is intended.
        to_file = RotatingFileHandler("changetext.log", encoding="utf-8", backupCount=0, maxBytes=1024**2)
        to_file.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
        to_stream = logging.StreamHandler(stream or sys.stdout)

        for handler in (to_file, to_stream):
            self.logger.addHandler(handler)

    def write(self, text, output):
        """Log the ``text --> output`` pair at DEBUG level unless ``text`` was logged before."""
        if text in self.logged:
            return
        self.logger.debug("{!r} --> {!r}".format(text, output))
        self.logged.add(text)


@functools.lru_cache()
def get_logger(stream=None) -> LoggerWrapper:
    """Return a LoggerWrapper for ``stream``, building it on the first call only.

    The result is memoized by ``functools.lru_cache``, which keys on the call arguments:
    repeating a call with the same arguments returns the same wrapper instance.
    """
    wrapper = LoggerWrapper(stream)
    return wrapper


def log_exceptions(func):
    """Decorate a one-argument function so that its exceptions are logged instead of propagated.

    If ``func`` raises an ``Exception``, the traceback and the offending input are written
    through ``get_logger()`` and the wrapper returns ``None``; otherwise the result of ``func``
    is returned unchanged.
    """

    @functools.wraps(func)
    def wrapper(text):
        try:
            result = func(text)
        except Exception:
            get_logger().logger.exception("An exception occurred. Initial string: {!r}".format(text))
            return None
        return result

    return wrapper
142 changes: 142 additions & 0 deletions changetext/tag_correction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
from changetext.common_state import get_state
from changetext.utils import inflect_collocation, re_sentence, any_cyr, cut_number, inflect_enumeration, get_form, \
smart_join, custom_parse


def parse_tags(text):
    """Split ``text`` into consecutive pieces, cutting before every '<' and after every '>'.

    Yields the non-empty pieces in order; joined together they give back ``text``.

    >>> list(parse_tags('<gent>wooden door'))
    ['<gent>', 'wooden door']
    """
    begin = 0
    for pos, char in enumerate(text):
        if char == '>':
            # Close the current piece, including the bracket itself
            yield text[begin:pos + 1]
            begin = pos + 1
        elif char == '<':
            # Flush whatever precedes the opening bracket, then start a new piece at it
            if begin < pos:
                yield text[begin:pos]
            begin = pos

    if begin < len(text):
        yield text[begin:]


def corr_tags(text):
    """Apply the inflection tags embedded in ``text`` and return the resulting string.

    The text is split with ``parse_tags``. A tag has the form ``<tag1,tag2[:word]>``:

    * ``<tags:word>`` - ``word`` (or a collocation, if it contains a space) is inflected with ``tags``;
    * ``<tags>`` - the tags are applied to the text that follows the tag, up to the end of the sentence
      as matched by ``re_sentence``;
    * ``capitalize`` - capitalize the first word of the piece at this position;
    * ``get-form`` / ``set-form`` - pieces marked ``set-form`` are inflected into the form of the single
      piece marked ``get-form``.

    Tags that could not be applied within this string (a trailing ``<tags>`` or ``<capitalize>``)
    are appended to ``prev_tail`` of the shared state.

    :param text: string with optional tags
    :return: the pieces combined by ``smart_join``, or ``None`` if an empty tag ``<>`` is encountered
    :raises ValueError: if ``text`` contains more than one ``get-form`` tag
    """
    li = []
    get_index = None
    set_indices = set()
    capitalize_indices = set()
    inflect_next = set()  # tags of a bare <tags> waiting for the text they apply to
    for item in parse_tags(text):
        if not item.strip():
            pass  # whitespace-only pieces are kept as they are
        elif item[0] == '<':
            item = item.strip('<>')
            if not item:
                return None
            tags, _, item = item.partition(':')
            tags = set(tags.split(','))

            if 'capitalize' in tags:
                tags.remove('capitalize')
                capitalize_indices.add(len(li))

            if item:
                # Inflect the word inside the tag after the colon
                word = item.strip()

                if 'get-form' in tags:
                    if get_index is not None:
                        raise ValueError('Duplicate <get-form> tag in %r' % text)
                    get_index = len(li)
                    tags.remove('get-form')
                elif 'set-form' in tags:
                    set_indices.add(len(li))
                    tags.remove('set-form')

                if tags:
                    if ' ' in word:
                        item = inflect_collocation(word, tags)
                    else:
                        p = custom_parse(word)[0]
                        item = p.inflect(tags).word
                    if word[0].isupper():
                        item = item.capitalize()
                else:
                    item = word
            else:
                # Inflect a part of text after the tag till the ending point of the sentence.
                inflect_next = tags
                continue
        elif inflect_next:
            # Only `inflect_next` is used in this branch: the loop-local `tags` may have been
            # rebound by a `<tags:word>` tag placed between the bare tag and this text.
            sentence = re_sentence.search(item)
            if sentence:
                item = sentence.group(1)
                tail = sentence.group(2)
            else:
                tail = ''
            item = item.lstrip(' ')
            if not any_cyr(item.split(' ')[0]):
                if item.strip()[0].isdigit():
                    if 'loct' in inflect_next:
                        inflect_next.remove('loct')
                        inflect_next.add('loc2')  # inflect into 'году' instead of 'годе'
                    item, tail1 = cut_number(item)
                    item += ' ' + custom_parse('год')[0].inflect(inflect_next).word + tail1.lstrip(',')
                elif (not li or not any_cyr(li[-1].rstrip().split(' ')[-1])) and inflect_next == {'gent'}:
                    li.append('of ')
            else:
                if ',' in item:
                    item = inflect_enumeration(item, inflect_next)
                elif ' ' in item:
                    item = inflect_collocation(item, inflect_next)
                else:
                    p = custom_parse(item)[0]
                    item = p.inflect(inflect_next).word
            item += tail
            inflect_next = set()
        li.append(item)

    delayed = ''
    if inflect_next:
        # No text followed the bare tag in this string: pass the tag on to the next one
        delayed += '<%s>' % ','.join(inflect_next)

    if get_index is not None:
        form = get_form(li[get_index])
        form -= {'anim', 'inan'}  # discard these two because they don't matter for the nominal case
        for i in set_indices:
            word = li[i]
            if ' ' in word:
                item = inflect_collocation(word, form)
            else:
                p = custom_parse(word)[0]
                item = p.inflect(form).word
            if word[0].isupper():
                item = item.capitalize()
            li[i] = item

    for i in capitalize_indices:
        if i >= len(li):
            # Nothing to capitalize in this string: pass the tag on to the next one
            delayed += '<capitalize>'
        else:
            parts = li[i].split()
            if parts:
                li[i] = li[i].replace(parts[0], parts[0].capitalize(), 1)

    if delayed:
        state = get_state()
        state.prev_tail += delayed

    return smart_join(li)
14 changes: 14 additions & 0 deletions changetext/utf16_codec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import functools


def utf16_codec(func):
    """Decorate a text-processing function so that it also accepts UTF-16-LE encoded bytes.

    For a ``bytes`` argument the data is decoded as UTF-16-LE before ``func`` is called; a non-``None``
    result is encoded back to UTF-16-LE with two zero bytes appended, while ``None`` is returned as is.
    Any other argument is passed to ``func`` untouched and its result is returned unchanged.
    """

    @functools.wraps(func)
    def wrapper(data):
        if not isinstance(data, bytes):
            return func(data)

        result = func(data.decode("utf-16-le"))
        if result is None:
            return result
        return result.encode("utf-16-le") + b"\0\0"

    return wrapper
Loading

0 comments on commit 94a83d6

Please sign in to comment.