forked from espnet/espnet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilt.py
executable file
·46 lines (34 loc) · 1.41 KB
/
filt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/usr/bin/env python
# Apache 2.0
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import codecs
import sys
# True when running under a Python 2 interpreter (major version component is 2).
is_python2 = sys.version_info[:1] == (2,)
def get_parser():
    """Build the command-line parser for the word-filter script.

    Returns:
        argparse.ArgumentParser: parser accepting an optional
        ``--exclude``/``-v`` switch followed by two positional paths,
        ``filt`` (the filter list) and ``infile`` (the text to filter).
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='filter words in a text file',
    )
    # Boolean switch; stored as ``exclude`` and False unless given.
    cli.add_argument(
        '--exclude', '-v',
        action='store_true',
        dest='exclude',
        help='exclude filter words',
    )
    # Positional arguments, registered in the order they must be supplied.
    for name, description in (('filt', 'filter list'),
                              ('infile', 'input file')):
        cli.add_argument(name, type=str, help=description)
    return cli
def main(args):
    """Parse the given argument list and run the filter with it.

    Args:
        args: Command-line arguments without the program name
            (e.g. ``sys.argv[1:]``).
    """
    options = get_parser().parse_args(args)
    filter_file(options.infile, options.filt, options.exclude)
def filter_file(infile, filt, exclude):
    """Filter the words of a text file against a word list and print the result.

    Both files are read as UTF-8. Every line of ``infile`` is stripped, split
    on whitespace, filtered word by word, re-joined with single spaces and
    printed to standard output.

    Args:
        infile: Path of the text file to filter.
        filt: Path of the filter list; each line, stripped of surrounding
            whitespace, is one entry.
        exclude: If true, words found in the filter list are replaced by an
            empty string. The separating spaces are kept, so the output may
            contain consecutive, leading or trailing spaces. If false, words
            that are NOT in the filter list are replaced by ``<UNK>``.
    """
    with codecs.open(filt, "r", encoding="utf-8") as vocabfile:
        vocab = {line.strip() for line in vocabfile}

    # Pick a UTF-8 output stream WITHOUT rebinding the global sys.stdout.
    # Replacing sys.stdout with a codecs writer made a second call fail on
    # Python 3 (the writer exposes no ``.buffer``) and silently changed
    # stdout for every other piece of code in the process.
    if sys.version_info[0] == 2:
        # Python 2: stdout is a byte stream, so encode on the way out.
        out = codecs.getwriter("utf-8")(sys.stdout)
    elif hasattr(sys.stdout, "buffer"):
        # Flush pending text first so earlier output keeps its position
        # relative to the bytes written directly to the underlying buffer.
        sys.stdout.flush()
        out = codecs.getwriter("utf-8")(sys.stdout.buffer)
    else:
        # stdout was replaced by a text-only stream (e.g. io.StringIO);
        # write text to it directly and let it handle any encoding.
        out = sys.stdout

    with codecs.open(infile, "r", encoding="utf-8") as textfile:
        for line in textfile:
            words = line.strip().split()
            if exclude:
                # Blank out listed words but keep their slot in the join.
                filtered = ['' if word in vocab else word for word in words]
            else:
                filtered = [word if word in vocab else '<UNK>'
                            for word in words]
            print(" ".join(filtered), file=out)
# Script entry point: hand everything after the program name to main().
if __name__ == '__main__':
    cli_arguments = sys.argv[1:]
    main(cli_arguments)