text_tokenizer.py
# -*- coding: utf-8 -*-
"""
Created on December 2021
@author: Amin
"""
# Import libraries and functions
import time
from multiprocessing import Pool

import spacy
from tqdm.auto import tqdm

# Load the small English pipeline once at module level
nlp = spacy.load('en_core_web_sm')

def clean(doc):
    """Return lemmatized, lowercased tokens plus named-entity lemmas from a spaCy Doc."""
    # Identify named entities
    ents = [ent.lemma_.lower() for ent in doc.ents]
    # Keep alphabetic tokens that are not stop words
    mask = lambda t: t.is_alpha and not t.is_stop
    # Alternative mask that also drops punctuation, currency, whitespace, and entity tokens:
    # mask = lambda t: not (t.is_stop or t.is_punct or t.is_currency or t.is_space or t.ent_iob_ != 'O')
    tokens = [tok.lemma_.lower() for tok in filter(mask, doc)]
    tokens.extend(ents)
    return tokens

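# Illustrative example (not in the original file); the exact output depends on the
# en_core_web_sm model version, but is approximately:
#   clean(nlp("Apple is looking at buying a U.K. startup."))
#   -> ['apple', 'look', 'buy', 'startup', 'apple', 'u.k.']
#   (the filtered token lemmas followed by the entity lemmas for "Apple" and "U.K.")
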
def split_dataframe(df, batch_size=1000):
    """Split a pandas Series/DataFrame into a list of consecutive batches."""
    chunks = []
    # Ceiling division: include the final partial batch without creating an empty one
    num_chunks = -(-len(df) // batch_size)
    for i in range(num_chunks):
        chunks.append(df[i * batch_size:(i + 1) * batch_size])
    return chunks

def tokenizer(data, n_jobs=8, batch_size=1000):
    """
    Extract tokens from spaCy doc objects.

    Parameters
    ----------
    data : pandas Series of text segments
    n_jobs : int, number of CPU cores to use
        The default is 8.
    batch_size : int, number of text segments per batch
        The default is 1000.

    Returns
    -------
    list of tokens per text segment
    """
    print(f"Process started | {time.ctime()}\n")
    # Prepare batches for parallel processing
    batches = split_dataframe(data, batch_size)
    output = []
    for batch in tqdm(batches):
        # Convert texts to spaCy Doc objects
        docs = nlp.pipe(batch.tolist())
        # Clean the docs in parallel; the with-block shuts the pool down on exit,
        # so no explicit close()/join() is needed
        with Pool(processes=n_jobs) as p:
            temp = p.map(clean, docs)
        output.extend(temp)
    print(f"Process ended | {time.ctime()}\n")
    return output
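

# Minimal usage sketch (not part of the original file). The __main__ guard matters
# because multiprocessing re-imports this module in its worker processes.
# The file name "segments.csv" and the column name "text" are hypothetical placeholders.
if __name__ == "__main__":
    import pandas as pd

    # Hypothetical input: a CSV with one raw text segment per row in a "text" column
    df = pd.read_csv("segments.csv")
    tokens = tokenizer(df["text"], n_jobs=4, batch_size=500)
    print(tokens[:3])  # inspect the first few token lists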