-
Notifications
You must be signed in to change notification settings - Fork 2
/
lexicon.py
140 lines (98 loc) · 3.77 KB
/
lexicon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import os
import wget
from os import listdir
from zipfile import ZipFile
from collections import defaultdict
import trie
# URL of the zipped lexicon archive that is downloaded when the lexicon text
# file is not already present locally.
LEXICON_URL='https://raw.githubusercontent.com/techiaith/lecsicon-cymraeg-bangor/main/lecsicon_cc0.zip'
# Name of the lexicon text file opened by Lexicon.read_lexicon_file (resolved
# relative to the current working directory).
# NOTE(review): presumably this is the file contained in the archive above --
# confirm against the archive contents.
LEXICON_FILENAME='lecsicon_cc0.txt'
class Lexicon(object):
    """In-memory lexicon built from a tab-separated lexicon file.

    Each entry is a ``(wordform, lemma, pos, features)`` record. The lexicon
    keeps three views of the data:

    * ``lex_trie``        -- a ``trie.Trie`` holding every wordform,
    * ``wordform_lookup`` -- wordform -> list of ``(lemma, pos, features)``,
    * ``lemma_lookup``    -- lemma -> list of ``(wordform, pos, features)``,

    plus ``spellings``, a cache mapping a length (counted in letters, where a
    digraph such as ``ll`` counts as one letter) to the dot-joined spellings
    of all lemmas of that length.
    """

    # Two-character sequences treated as a single letter by generate_spelling.
    # Matching is case-sensitive: only the lowercase forms are recognised.
    _DIGRAPHS = frozenset(('ch', 'dd', 'ff', 'ng', 'll', 'ph', 'rh', 'th'))

    def __init__(self):
        """Load the lexicon file and build all lookup structures."""
        self.lex_trie = trie.Trie()
        self.wordform_lookup = defaultdict(list)
        self.lemma_lookup = defaultdict(list)
        for wordform, lemma, pos, features in self.read_lexicon_file():
            self.add(wordform, lemma, pos, features)

        self.spellings = defaultdict(list)
        self.initialise_spellings_cache()

    @staticmethod
    def _download_lexicon():
        """Download the lexicon archive and extract it into the working directory."""
        archive = LEXICON_URL.rsplit('/', 1)[-1]
        stem = archive.rsplit('.', 1)[0]

        wget.download(LEXICON_URL)
        if os.path.isfile(archive):
            # Context manager closes the archive even if extraction fails.
            with ZipFile(archive, 'r') as zip_file:
                zip_file.extractall()

        # Clean up only files that belong to this download. Removing every
        # '.zip'/'.tmp' file in the directory would destroy unrelated files.
        # NOTE(review): assumes wget names its partial-download temp files
        # after the archive -- confirm against the wget package.
        for filename in os.listdir():
            if filename.startswith(stem) and filename.endswith(('.zip', '.tmp')):
                os.remove(filename)
        print("\n")

    @staticmethod
    def _parse_ud_fields(ud_string):
        """Parse ``'Key=Value|Key=Value'`` into a dict.

        Empty items are ignored; an item without ``=`` maps to ``''``
        instead of raising.
        """
        fields = {}
        for item in ud_string.split('|'):
            if not item:
                continue
            key, _, value = item.partition('=')
            fields[key] = value
        return fields

    @staticmethod
    def _parse_entry(lex_entry):
        """Split one lexicon line into ``(wordform, lemma, pos, features)``.

        ``features`` is a dict when the line has a fourth column, otherwise
        the empty string.

        Raises:
            IndexError: if the line has fewer than three tab-separated fields.
        """
        columns = lex_entry.rstrip().split('\t')
        features = Lexicon._parse_ud_fields(columns[3]) if len(columns) > 3 else ''
        return columns[0], columns[1], columns[2], features

    def read_lexicon_file(self):
        """Yield ``(wordform, lemma, pos, features)`` for each lexicon line.

        The lexicon is downloaded first if ``LEXICON_FILENAME`` does not
        exist. Blank lines are skipped.
        """
        if not os.path.isfile(LEXICON_FILENAME):
            print("Llwytho'r lecsicon i lawr..")
            self._download_lexicon()

        print("Llwytho'r geirfa...")
        with open(LEXICON_FILENAME, 'r', encoding='utf-8') as lexicon_file:
            for line in lexicon_file:
                if not line.strip():
                    # A blank (e.g. trailing) line carries no entry.
                    continue
                yield self._parse_entry(line)

    def contains(self, word):
        """Return the result of searching the wordform trie for ``word``."""
        return self.lex_trie.search(word)

    def get_lemmas_with_info(self, wordform):
        """Return one tab-separated line per analysis of ``wordform``.

        Each line has the form
        ``<wordform>\\tLemma:<lemma>\\tPos:<pos>\\tInfo:<features>``; lines are
        joined with newlines. Returns ``''`` for an unknown wordform.
        """
        return '\n'.join(
            '{}\tLemma:{}\tPos:{}\tInfo:{}'.format(wordform, *analysis)
            for analysis in self.get_lemmas(wordform)
        )

    def get_lemmas(self, wordform):
        """Return the ``(lemma, pos, features)`` list for ``wordform``.

        Returns an empty list for an unknown wordform.
        """
        # .get() avoids the defaultdict inserting an empty entry on a miss,
        # which would otherwise inflate get_size().
        return self.wordform_lookup.get(wordform, [])

    def get_wordforms(self, lemma):
        """Return the ``(wordform, pos, features)`` list for ``lemma``.

        Returns an empty list for an unknown lemma.
        """
        return self.lemma_lookup.get(lemma, [])

    def get_size(self):
        """Return ``(number of distinct wordforms, number of distinct lemmas)``."""
        return len(self.wordform_lookup), len(self.lemma_lookup)

    def add(self, wordform, lemma, pos, features):
        """Register one entry in the trie and in both lookup tables."""
        self.lex_trie.insert(wordform)
        self.wordform_lookup[wordform].append((lemma, pos, features))
        self.lemma_lookup[lemma].append((wordform, pos, features))

    def generate_spelling(self, wordform):
        """Split ``wordform`` into letters, keeping digraphs together.

        Scans left to right; whenever the next two characters form one of
        the known digraphs they are emitted as a single unit, e.g.
        ``'llong' -> ['ll', 'o', 'ng']``.
        """
        letters = []
        position = 0
        length = len(wordform)
        while position < length:
            pair = wordform[position:position + 2]
            if pair in self._DIGRAPHS:
                letters.append(pair)
                position += 2
            else:
                letters.append(wordform[position])
                position += 1
        return letters

    def initialise_spellings_cache(self):
        """Fill ``self.spellings`` with the dot-joined spelling of every lemma."""
        for lemma in self.lemma_lookup:
            spelling = self.generate_spelling(lemma)
            self.spellings.setdefault(len(spelling), []).append(".".join(spelling))

    def is_word_length(self, word, length):
        """Return True if ``word`` has exactly ``length`` letters (digraphs count as one)."""
        return len(self.generate_spelling(word)) == length

    def get_spellings(self, length):
        """Return the cached spellings of all lemmas with ``length`` letters.

        ``length`` may be an int or anything ``int()`` accepts. Returns an
        empty list when no lemma has that length.
        """
        return self.spellings.get(int(length), [])
if __name__ == "__main__":
    # Script entry point: build the lexicon (this loads the lexicon file),
    # then report the (wordform count, lemma count) pair and a completion
    # message.
    lexicon = Lexicon()
    size = lexicon.get_size()
    print (size)
    print ("Lecsicon wedi'i llwytho")