-
Notifications
You must be signed in to change notification settings - Fork 7
/
parse_tpsv.py
70 lines (55 loc) · 3 KB
/
parse_tpsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""Functions to parse lexeme form representations from tab/pipe-separated values syntax."""
import re
from typing import List
import werkzeug.datastructures
from templates import Template
def parse_lexemes(tpsv: str, template: Template) -> List[werkzeug.datastructures.MultiDict]:
    """Parse every non-empty line of ``tpsv`` into the form data of one lexeme.

    The input is split on ``'\\n'`` and trailing ``'\\r'`` characters are
    removed from each line, so both LF and CRLF line endings are accepted.
    Lines that are empty after that are skipped, but they still count
    towards the line numbers handed to :func:`parse_lexeme`, which start at 1.

    Returns the parsed lexemes in input order; any error raised by
    :func:`parse_lexeme` propagates unchanged.
    """
    stripped_lines = (raw.rstrip('\r') for raw in tpsv.split('\n'))
    return [
        parse_lexeme(text, template, line_number=number)
        for number, text in enumerate(stripped_lines, start=1)
        if text  # skip empty lines, but keep counting them
    ]
def parse_lexeme(line: str, template: Template, line_number: int) -> werkzeug.datastructures.MultiDict:
    """Parse a single tab/pipe-separated line into the form data of one lexeme.

    Tabs and pipes both act as field separators, and surrounding whitespace
    is stripped from each field. With ``n = len(template['forms'])``:

    * ``n + 1`` fields: the first field must be a lexeme ID (``L`` followed
      by a positive integer without leading zeros). The result holds one
      ``lexeme_id`` entry followed by ``n`` ``form_representation`` entries.
    * ``n`` fields: the first field must *not* look like a lexeme ID. The
      result holds ``n`` ``form_representation`` entries.

    Raises:
        FirstFieldNotLexemeIdError: ``n + 1`` fields, first is not a lexeme ID.
        FirstFieldLexemeIdError: ``n`` fields, first looks like a lexeme ID.
        WrongNumberOfFieldsError: any other number of fields.
    """
    cells = [cell.strip() for cell in re.split(r'[\t|]', line)]
    expected = len(template['forms'])
    surplus = len(cells) - expected

    if surplus not in (0, 1):
        raise WrongNumberOfFieldsError(expected, len(cells), line_number)

    # Splitting always yields at least one cell, so cells[0] is safe here.
    head = cells[0]
    head_is_lexeme_id = re.fullmatch(r'L[1-9][0-9]*', head) is not None

    if surplus == 1:
        if not head_is_lexeme_id:
            raise FirstFieldNotLexemeIdError(expected, len(cells), head, line_number)
        pairs = [('lexeme_id', head)]
        pairs.extend(('form_representation', representation) for representation in cells[1:])
        return werkzeug.datastructures.ImmutableMultiDict(pairs)

    if head_is_lexeme_id:
        raise FirstFieldLexemeIdError(expected, len(cells), head, line_number)
    return werkzeug.datastructures.ImmutableMultiDict(
        [('form_representation', representation) for representation in cells])
class FirstFieldNotLexemeIdError(ValueError):
    """Raised for a line with n+1 fields whose first field is not a lexeme ID.

    The constructor arguments are stored unchanged as the attributes
    ``num_forms``, ``num_fields``, ``first_field`` and ``line_number``.
    """

    def __init__(self, num_forms: int, num_fields: int, first_field: str, line_number: int):
        # This error only makes sense with exactly one surplus field.
        assert num_fields - num_forms == 1
        super().__init__('n+1 fields but first field is not a lexeme ID')
        self.line_number = line_number
        self.first_field = first_field
        self.num_fields = num_fields
        self.num_forms = num_forms
class FirstFieldLexemeIdError(ValueError):
    """Raised for a line with n fields whose first field is a lexeme ID.

    The constructor arguments are stored unchanged as the attributes
    ``num_forms``, ``num_fields``, ``first_field`` and ``line_number``.
    """

    def __init__(self, num_forms: int, num_fields: int, first_field: str, line_number: int):
        # This error only makes sense when the field count matches the form count.
        assert num_forms == num_fields
        super().__init__('n fields but first field looks like a lexeme ID')
        self.line_number = line_number
        self.first_field = first_field
        self.num_fields = num_fields
        self.num_forms = num_forms
class WrongNumberOfFieldsError(ValueError):
    """Raised for a line that has neither n nor n+1 fields.

    The constructor arguments are stored unchanged as the attributes
    ``num_forms``, ``num_fields`` and ``line_number``.
    """

    def __init__(self, num_forms: int, num_fields: int, line_number: int):
        # Both accepted field counts (n and n+1) must be ruled out.
        assert num_fields != num_forms and num_fields != num_forms + 1
        super().__init__('neither n nor n+1 fields')
        self.line_number = line_number
        self.num_fields = num_fields
        self.num_forms = num_forms