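"""Build a small WMT14 de-en translation setup: filter raw sentence pairs,
train a BPE tokenizer on the filtered corpus, and save train/valid/test splits."""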
import os, json, yaml
from datasets import load_dataset
from tokenizers.models import BPE
from tokenizers import Tokenizer, normalizers
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import NFD, Lowercase, StripAccents
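

# train_tokenizer() below expects a config.yaml with a `tokenizer` section
# shaped like this sketch (the vocab size and token strings here are
# illustrative assumptions, not values taken from the original repo):
#
#   tokenizer:
#     vocab_size: 30000
#     pad_token: '<pad>'
#     unk_token: '<unk>'
#     bos_token: '<bos>'
#     eos_token: '<eos>'
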
def process_translation_data(data_volume):
    # Load the original WMT14 de-en dataset
    nmt_data = load_dataset('wmt14', 'de-en', split='train')['translation']

    min_len = 10
    max_len = 300
    max_diff = 50

    volume_cnt = 0
    corpus, processed = [], []

    for elem in nmt_data:
        x, y = elem['en'].strip().lower(), elem['de'].strip().lower()
        x_len, y_len = len(x), len(y)

        # Filtering conditions (character lengths, not token counts)
        min_condition = (x_len >= min_len) and (y_len >= min_len)
        max_condition = (x_len <= max_len) and (y_len <= max_len)
        dif_condition = abs(x_len - y_len) < max_diff

        if min_condition and max_condition and dif_condition:
            corpus.append(x)
            corpus.append(y)
            processed.append({'x': x, 'y': y})

            # End condition: stop once the requested volume is collected
            volume_cnt += 1
            if volume_cnt == data_volume:
                break

    # Save corpus (same path that train_tokenizer reads)
    os.makedirs('data', exist_ok=True)
    with open('data/corpus.txt', 'w') as f:
        f.write('\n'.join(corpus))

    return processed
def train_tokenizer():
    corpus_path = 'data/corpus.txt'
    assert os.path.exists(corpus_path)
    assert os.path.exists('config.yaml')

    # Read tokenizer settings from the config file
    with open('config.yaml', 'r') as f:
        tok_config = yaml.load(f, Loader=yaml.FullLoader)['tokenizer']

    # BPE tokenizer with unicode normalization and whitespace pre-tokenization
    tokenizer = Tokenizer(BPE(unk_token=tok_config['unk_token']))
    tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    tokenizer.pre_tokenizer = Whitespace()

    trainer = BpeTrainer(
        vocab_size=tok_config['vocab_size'],
        special_tokens=[
            tok_config['pad_token'],
            tok_config['unk_token'],
            tok_config['bos_token'],
            tok_config['eos_token']
        ]
    )

    tokenizer.train(files=[corpus_path], trainer=trainer)
    tokenizer.save("data/tokenizer.json")
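

# Minimal sketch of how the saved tokenizer can be reloaded elsewhere
# (standard `tokenizers` API; the sample sentence is illustrative only):
#
#   tok = Tokenizer.from_file("data/tokenizer.json")
#   ids = tok.encode("hello world").ids
#   text = tok.decode(ids)
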
def save_data(data_obj):
    # Split data into train/valid/test sets (last 5100 pairs held out)
    train, valid, test = data_obj[:-5100], data_obj[-5100:-100], data_obj[-100:]
    data_dict = {k: v for k, v in zip(['train', 'valid', 'test'], [train, valid, test])}

    for key, val in data_dict.items():
        with open(f'data/{key}.json', 'w') as f:
            json.dump(val, f)
        assert os.path.exists(f'data/{key}.json')
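

# The saved splits are plain JSON lists of {'x': src, 'y': tgt} dicts, so
# they can be read back with the standard library, e.g.:
#
#   with open('data/train.json') as f:
#       train = json.load(f)
#   train[0]  # -> {'x': '...', 'y': '...'}
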
def main():
    # Preprocess data
    data_volume = 55100
    processed = process_translation_data(data_volume)

    # Train tokenizer
    train_tokenizer()

    # Save data
    save_data(processed)


if __name__ == '__main__':
    main()