This repository has been archived by the owner on May 4, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
74 lines (51 loc) · 1.6 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import click
import sys
import os
import errno
import logging
import json
import re
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm model package (imported next to spaCy above) once,
# at import time, so the loaded pipeline is shared by the whole module.
# NOTE(review): `nlp` is not referenced by any function visible in this file;
# presumably it is reserved for the "check NER" TODO in clean_text -- confirm.
nlp = en_core_web_sm.load()
# Module-wide logger registered under the name "cmput497".
# NOTE(review): no visible function logs through it yet.
logger = logging.getLogger("cmput497")
def main():
    """Run the annotation-cleaning pass over every JSON data file.

    Each file name returned by ``get_data_files`` is loaded with
    ``read_file`` and the parsed content is handed to ``track_misid``,
    which prints the cleaned sentences. Nothing is returned.
    """
    for data_file in get_data_files():
        track_misid(read_file(data_file))
def track_misid(phrases):
    """Print each sentence with its bracketed annotation text blanked out.

    Args:
        phrases: Iterable of mappings, each providing the sentence text
            under the ``'sentence'`` key.

    For every sentence, the text captured between a ``[`` and the next
    ``]]`` is collected, handed to ``clean_text`` for removal, and the
    resulting string is printed to stdout.
    """
    for sent in phrases:
        sentence = sent['sentence']
        # NOTE(review): the pattern consumes ONE opening bracket but TWO
        # closing ones, so for text written as "[[ ... ]]" the capture starts
        # with a literal "[". Confirm against the data whether that is
        # intended before changing the pattern.
        freebase_tokens = re.findall(r'\[(.*?)\]]', sentence)
        # clean_text interprets every entry as a regular expression, but these
        # entries are literal slices of the sentence. Unescaped, a capture
        # starting with "[" raises "unterminated character set at position 0"
        # and one containing "|", "." or "(" matches the wrong text.
        # Empty captures are skipped: an empty pattern matches at every
        # position and would scatter spaces through the whole sentence.
        literal_patterns = [
            re.escape(token) for token in freebase_tokens if token
        ]
        cleaned_sent = clean_text(literal_patterns, sentence)
        print(cleaned_sent)
# Adapted from:
# https://stackoverflow.com/questions/37192606/python-regex-how-to-delete-all-matches-from-a-string
def clean_text(rgx_list, text):
    """Blank out every match of each pattern in ``rgx_list`` within ``text``.

    Args:
        rgx_list: Iterable of regular-expression patterns, applied in order.
        text: The string to clean.

    Returns:
        ``text`` with each match replaced by a single space. Trailing
        whitespace is stripped after every substitution pass, so an empty
        ``rgx_list`` returns ``text`` untouched.

    Raises:
        re.error: If an entry is not a valid regular expression.
    """
    cleaned = text
    for pattern in rgx_list:
        substituted = re.compile(pattern).sub(' ', cleaned)
        cleaned = substituted.rstrip()
    # TODO fix the unterminated character set at position 0
    # TODO once clean check NER
    return cleaned
def get_data_files(data_dir=None):
    """Return the names of the ``.json`` files found in the data directory.

    Args:
        data_dir: Directory to scan. Defaults to the ``data`` directory next
            to this script, which is the same location ``read_file`` opens
            files from, so the two agree whatever the current working
            directory is.

    Returns:
        A sorted list of bare file names (no directory part) ending in
        ``.json``.

    Raises:
        OSError: If the directory does not exist or cannot be listed.
    """
    if data_dir is None:
        data_dir = os.path.join(os.path.dirname(__file__), "data")
    # os.listdir returns entries in arbitrary order; sort so that files are
    # always processed in the same sequence.
    return sorted(
        name for name in os.listdir(data_dir) if name.endswith(".json")
    )
def read_file(filename):
    """Load and return the parsed JSON content of one data file.

    Args:
        filename: Name of the file inside the ``data`` directory that sits
            next to this script. The path is assembled with ``os.path.join``,
            so an absolute ``filename`` is opened as-is.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    script_dir = os.path.dirname(__file__)  # directory holding this script
    abs_file_path = os.path.join(script_dir, "data", filename)
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which is not UTF-8 everywhere.
    with open(abs_file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
# Run the pipeline only when this file is executed as a script, so importing
# the module (e.g. to reuse clean_text) does not start processing data files.
if __name__ == "__main__":
    main()