-
Notifications
You must be signed in to change notification settings - Fork 0
/
normalize.py
37 lines (30 loc) · 1.32 KB
/
normalize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import argparse
import os
import re
from pathlib import Path
from train import DATA_DIR
# Upper-case letters allowed as the first character of a name (or of each
# hyphen-separated part of a compound name).
STARTING_LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜËÏÊÉÈÑ'
# Lower-case letters allowed after the first character of a name part.
CONTINUATION_LETTERS = 'abcdefghijklmnopqrstuvwxyzäöüëïêéèñ'

# A valid entry is one capitalized word, optionally followed by more
# capitalized words joined by single hyphens ("Anna", "Hans-Peter").
# The patterns are compiled once at import time instead of being rebuilt
# from an f-string on every call to normalize().
_NAME_PART = rf'[{STARTING_LETTERS}][{CONTINUATION_LETTERS}]*'
_NAME_PATTERN = re.compile(rf'{_NAME_PART}(?:-{_NAME_PART})*')
_SEPARATOR_PATTERN = re.compile(r'[/,]')
_WHITESPACE_PATTERN = re.compile(r'\s')


def _normalize_entry(entry):
    """Validate one separator-free entry; return it with a newline or ''."""
    entry = entry.strip()
    if not entry:
        return ''
    if _NAME_PATTERN.fullmatch(entry) is None:
        # Rejected single tokens are echoed to stdout so they can be
        # inspected; rejected entries containing whitespace are dropped
        # without any output.
        if _WHITESPACE_PATTERN.search(entry) is None:
            print(entry)
        return ''
    return entry + '\n'


def normalize(line):
    """Return the valid names found in *line*, one per output line.

    The line is split on '/' and ','. Each piece is stripped of surrounding
    whitespace and kept only if it consists of capitalized words built from
    STARTING_LETTERS / CONTINUATION_LETTERS, optionally joined by hyphens.

    Args:
        line: One raw line of input text; may include a trailing newline.

    Returns:
        The accepted names, each terminated by a newline character and
        concatenated in their original order, or '' when the line contains
        no accepted name.

    Side effects:
        Prints every rejected piece that contains no whitespace to stdout.
    """
    # Splitting first and stripping each piece afterwards yields the same
    # pieces as stripping the whole line and then splitting, and removes the
    # need for normalize() to call itself for every piece.
    return ''.join(
        _normalize_entry(piece) for piece in _SEPARATOR_PATTERN.split(line)
    )
if __name__ == '__main__':
    # Script entry point: read the input file line by line, pass every line
    # through normalize(), and write whatever it returns to the output file.
    cli = argparse.ArgumentParser(description='Normalize a training data file.')
    cli.add_argument(
        '--input',
        default='input.txt',
        help='name of the text file to normalize',
    )
    cli.add_argument(
        '--output',
        default='input-normalized.txt',
        help='name of the file that the results are written to',
    )
    options = cli.parse_args()

    # Both file names are joined onto DATA_DIR.
    source_path = os.path.join(DATA_DIR, options.input)
    target_path = os.path.join(DATA_DIR, options.output)

    # The input is opened before the output, so a missing input file fails
    # before the output file is created or truncated.
    with open(source_path, 'r', encoding='utf-8') as source, \
            open(target_path, 'w', encoding='utf-8') as target:
        target.writelines(normalize(raw_line) for raw_line in source)