preprocessing.py
# -*- coding: utf-8 -*-
"""
Use NLTK for preprocessing.
Feel free to switch to spaCy.
"""
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import utils

LEMMATIZER = WordNetLemmatizer()
STOPWORDS = set(stopwords.words('english'))
def tokenize(text, filter_stopwords=False, lowercase=True):
    """Split text into word tokens; optionally lowercase and drop stopwords."""
    if lowercase:
        # Lowercase before filtering: STOPWORDS entries are all lowercase.
        text = text.lower()
    words = wordpunct_tokenize(text)
    if filter_stopwords:
        words = [w for w in words if w not in STOPWORDS]
    return words
def lemmatize(text, filter_stopwords=False, lowercase=True):
    """Tokenize and lemmatize text, dropping lemmas shorter than 3 characters."""
    lemmas = [LEMMATIZER.lemmatize(w)
              for w in tokenize(text, lowercase=lowercase,
                                filter_stopwords=filter_stopwords)]
    return [w for w in lemmas if len(w) > 2]
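
# A minimal spaCy equivalent, as the module docstring suggests. This is a
# sketch, not part of the original pipeline; it assumes the en_core_web_sm
# model has been downloaded separately (python -m spacy download en_core_web_sm).
def lemmatize_spacy(text, model="en_core_web_sm"):
    import spacy  # imported lazily so the NLTK path works without spaCy
    nlp = spacy.load(model)  # for real use, load once at module level
    return [t.lemma_ for t in nlp(text) if not t.is_stop and len(t.lemma_) > 2]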
def preprocess_input(input_file, output_file, func=tokenize):
    """Apply `func` to the "text" field of every record and write the result."""
    data = []
    for d in utils.read_json_list(input_file):
        d["text"] = " ".join(func(d["text"]))
        data.append(d)
    utils.write_json_list(output_file, data)
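
# Example usage (a sketch; the file names below are assumptions, not part of
# this module). The NLTK data must be fetched once beforehand, since STOPWORDS
# is built at import time, e.g.:
#   python -c "import nltk; nltk.download('stopwords'); nltk.download('wordnet')"
if __name__ == "__main__":
    preprocess_input("input.json", "preprocessed.json", func=lemmatize)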