-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from dfint/1-convert-into-package
1 convert into package
- Loading branch information
Showing
19 changed files
with
2,636 additions
and
2,210 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
[flake8] | ||
ignore = W293 | ||
exclude = .git,__pycache__,venv,.venv | ||
max-line-length = 120 | ||
max-complexity = 10 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,9 @@ | ||
.idea | ||
.venv | ||
venv | ||
htmlcov | ||
.pytest_cache | ||
.idea/ | ||
.venv/ | ||
venv/ | ||
htmlcov/ | ||
.pytest_cache/ | ||
dist/ | ||
*.pyc | ||
.coverage | ||
changetext.log |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from .changetext import ChangeText, change_text | ||
from .common_state import init | ||
|
Large diffs are not rendered by default.
Oops, something went wrong.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
class ChangeTextState:
    """Mutable state shared between successive text-correction calls."""

    def __init__(self):
        # No context is known yet, and nothing has been carried over from an
        # earlier string.
        self.context = None
        self.prev_tail = ''


# Module-wide state instance; rebound by init() and handed out by get_state().
_change_text_state = None


def init():
    """Replace the module-wide state with a freshly constructed one."""
    global _change_text_state
    _change_text_state = ChangeTextState()


# Build the initial state as soon as the module is imported.
init()


def get_state() -> ChangeTextState:
    """Return the module-wide state, constructing it first if it is missing."""
    if _change_text_state is None:
        init()
    return _change_text_state
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import functools | ||
import logging | ||
import sys | ||
from logging.handlers import RotatingFileHandler | ||
|
||
|
||
class LoggerWrapper:
    """DEBUG logger that reports each distinct source text only once.

    Every record is sent to the ``changetext.log`` file and to a stream
    (``sys.stdout`` unless another stream is supplied).

    NOTE(review): the standard library documents that rollover never occurs
    when ``backupCount`` is zero, so ``maxBytes`` probably has no effect on
    the size of the log file -- confirm whether that is intended.
    """

    def __init__(self, stream=None):
        self.logger = logging.Logger(name=__name__, level=logging.DEBUG)
        self.logged = set()  # texts that have already been reported

        # File output carries a timestamped, levelled format.
        record_format = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
        to_file = RotatingFileHandler("changetext.log", encoding="utf-8", backupCount=0, maxBytes=1024**2)
        to_file.setFormatter(logging.Formatter(record_format))

        # Stream output uses the default format; any falsy ``stream`` falls
        # back to standard output.
        to_stream = logging.StreamHandler(stream if stream else sys.stdout)

        for handler in (to_file, to_stream):
            self.logger.addHandler(handler)

    def write(self, text, output):
        """Log the ``text`` --> ``output`` pair unless ``text`` was logged before."""
        if text in self.logged:
            return
        self.logger.debug("{!r} --> {!r}".format(text, output))
        self.logged.add(text)
|
||
|
||
@functools.lru_cache(maxsize=128)
def get_logger(stream=None) -> LoggerWrapper:
    """Return a ``LoggerWrapper`` for ``stream``, building it on first request.

    Results are memoized on the call arguments, so repeated identical calls
    share one wrapper instead of attaching new handlers each time.
    """
    wrapper = LoggerWrapper(stream)
    return wrapper
|
||
|
||
def log_exceptions(func):
    """Decorate a one-argument function so that exceptions are logged, not raised.

    The returned callable accepts the single argument ``text``. When
    ``func(text)`` raises an ``Exception``, the traceback and the repr of
    ``text`` are logged through ``get_logger()`` and ``None`` is returned.
    """

    @functools.wraps(func)
    def guarded(text):
        try:
            result = func(text)
        except Exception:
            get_logger().logger.exception("An exception occurred. Initial string: {!r}".format(text))
            result = None  # the failure has been logged; the caller gets no value
        return result

    return guarded
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
from changetext.common_state import get_state | ||
from changetext.utils import inflect_collocation, re_sentence, any_cyr, cut_number, inflect_enumeration, get_form, \ | ||
smart_join, custom_parse | ||
|
||
|
||
def parse_tags(text):
    """Yield ``text`` piece by piece, cutting before each ``<`` and after each ``>``.

    Empty pieces are never produced. A ``>`` closes the current piece even
    when no ``<`` opened it; whatever is left after the last cut is yielded
    as the final piece.
    """
    piece_start = 0
    for pos, char in enumerate(text):
        if char == '<':
            cut = pos  # the '<' itself opens the next piece
        elif char == '>':
            cut = pos + 1  # the '>' belongs to the piece it closes
        else:
            continue

        if piece_start < cut:
            yield text[piece_start:cut]
        piece_start = cut

    remainder = text[piece_start:]
    if remainder:
        yield remainder
|
||
|
||
def corr_tags(text):
    """Apply the inline ``<...>`` markup found in ``text`` and return the joined result.

    Markup handled here (pieces are produced by ``parse_tags``):

    * ``<tags:word>`` -- the word or collocation after the colon is inflected
      immediately with the comma-separated ``tags``.
    * ``<tags>`` (no colon) -- the tags are applied to the text that follows,
      up to the end of the sentence as matched by ``re_sentence``.
    * ``capitalize`` -- capitalizes the first word of the piece at that
      position.
    * ``get-form`` / ``set-form`` -- pieces marked ``set-form`` are
      re-inflected to the form of the single piece marked ``get-form``.

    Tags that could not be applied because the string ended are appended, in
    tag syntax, to ``prev_tail`` of the shared state returned by
    ``get_state()``.

    Returns the pieces combined by ``smart_join``, or ``None`` when an empty
    tag (``<>``) is encountered.

    Raises ``ValueError`` when more than one ``get-form`` tag is present.
    """
    li = []
    get_index = None
    set_indices = set()
    capitalize_indices = set()
    inflect_next = set()  # tags waiting to be applied to the following text
    for item in parse_tags(text):
        if not item.strip():
            # Whitespace-only pieces are kept untouched.
            pass
        elif item[0] == '<':
            item = item.strip('<>')
            if not item:
                return None
            tags, _, item = item.partition(':')
            tags = set(tags.split(','))

            if 'capitalize' in tags:
                tags.remove('capitalize')
                # Remember the position of the piece this tag is about to produce.
                capitalize_indices.add(len(li))

            if item:
                # Inflect the word inside the tag after the colon
                word = item.strip()

                if 'get-form' in tags:
                    if get_index is not None:
                        raise ValueError('Duplicate <get-form> tag in %r' % text)
                    get_index = len(li)
                    tags.remove('get-form')
                elif 'set-form' in tags:
                    set_indices.add(len(li))
                    tags.remove('set-form')

                if tags:
                    if ' ' in word:
                        item = inflect_collocation(word, tags)
                    else:
                        p = custom_parse(word)[0]
                        item = p.inflect(tags).word
                        if word[0].isupper():
                            item = item.capitalize()
                else:
                    item = word
            else:
                # Inflect a part of text after the tag till the ending point of the sentence.
                inflect_next = tags
                continue
        elif inflect_next:
            # Only ``inflect_next`` is consulted below. The loop variable
            # ``tags`` may belong to a different, more recent ``<tags:word>``
            # tag and must not leak into the deferred inflection.
            sentence = re_sentence.search(item)
            if sentence:
                item = sentence.group(1)
                tail = sentence.group(2)
            else:
                tail = ''
            item = item.lstrip(' ')
            if not any_cyr(item.split(' ')[0]):
                if item.strip()[0].isdigit():
                    if 'loct' in inflect_next:
                        inflect_next.remove('loct')
                        inflect_next.add('loc2')  # inflect into 'году' instead of 'годе'
                    item, tail1 = cut_number(item)
                    item += ' ' + custom_parse('год')[0].inflect(inflect_next).word + tail1.lstrip(',')
                elif (not li or not any_cyr(li[-1].rstrip().split(' ')[-1])) and inflect_next == {'gent'}:
                    li.append('of ')
            else:
                if ',' in item:
                    item = inflect_enumeration(item, inflect_next)
                elif ' ' in item:
                    item = inflect_collocation(item, inflect_next)
                else:
                    p = custom_parse(item)[0]
                    item = p.inflect(inflect_next).word
            item += tail
            inflect_next = set()
        li.append(item)

    delayed = ''
    if inflect_next:
        # The string ended before the pending tags could be applied.
        delayed += '<%s>' % ','.join(inflect_next)

    if get_index is not None:
        form = get_form(li[get_index])
        form -= {'anim', 'inan'}  # discard these two because they don't matter for the nominal case
        for i in set_indices:
            word = li[i]
            if ' ' in word:
                item = inflect_collocation(word, form)
            else:
                p = custom_parse(word)[0]
                item = p.inflect(form).word
                if word[0].isupper():
                    item = item.capitalize()
            li[i] = item

    for i in capitalize_indices:
        if i >= len(li):
            # Nothing followed the tag in this string; carry it over.
            delayed += '<capitalize>'
        else:
            # Capitalize only the first whitespace-separated word of the piece.
            for part in li[i].split():
                if part:
                    li[i] = li[i].replace(part, part.capitalize(), 1)
                    break

    if delayed:
        state = get_state()
        state.prev_tail += delayed

    return smart_join(li)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
import functools | ||
|
||
|
||
def utf16_codec(func):
    """Let a text-processing function also accept UTF-16-LE encoded bytes.

    For ``bytes`` input the data is decoded as UTF-16-LE before ``func`` is
    called; a non-``None`` result is encoded back to UTF-16-LE with two zero
    bytes appended, while a ``None`` result is returned as is. Any other
    input is handed to ``func`` unchanged and its result returned directly.
    """

    @functools.wraps(func)
    def wrapper(data):
        if not isinstance(data, bytes):
            return func(data)

        result = func(data.decode("utf-16-le"))
        if result is None:
            return result
        return result.encode("utf-16-le") + b"\0\0"

    return wrapper
Oops, something went wrong.