-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathText Preprocessing.py
executable file
·109 lines (90 loc) · 3.3 KB
/
Text Preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import collections
import re
from d2l import torch as d2l
#读取数据集
#@save
d2l.DATA_HUB['time_machine']=(d2l.DATA_URL+'timemachine.txt','090b5e7e70c295757f55df93cb0a180b9691891a')
def read_time_machine(): #@save
'''将时间机器数据集加载到文本行的列表中'''
with open(d2l.download('time_machine'),'r') as f:
lines=f.readlines()
return [re.sub('[^A-Za-z]+',' ',line).strip().lower()for line in lines]
lines=read_time_machine()
print(f'#文本总行数:{len(lines)}')
print(lines[0])
print(lines[10])
#次元化
def tokenize(lines,token='word'): #@save
'''将文本行拆分为单次或字符词元'''
if token=='word':
return [line.split() for line in lines]
elif token=='char':
return [list(line)for line in lines]
else:
print('错误:未知词元类型:'+token)
tokens=tokenize(lines)
for i in range(11):
print(tokens[i])
#词表
class Vocab: #@save
'''文本词表'''
def __init__(self,tokens=None,min_freq=0,reserved_tokens=None):
if tokens is None:
tokens=[]
if reserved_tokens is None:
reserved_tokens=[]
#按出现频率排序
counter=count_corpus(tokens)
self._token_freqs=sorted(counter.items(),key=lambda x:x[1],reverse=True)
#未知词元的索引为0
self.idx_to_token=['<unk>']+reserved_tokens
self.token_to_idx={token:idx for idx,token in enumerate(self.idx_to_token)}
for token,freq in self._token_freqs:
if freq<min_freq:
break
if token not in self.token_to_idx:
self.idx_to_token.append(token)
self.token_to_idx[token]=len(self.idx_to_token)-1
def __len__(self):
return len(self.idx_to_token)
def __getitem__(self,tokens):
if not isinstance(tokens,(list,tuple)):
return self.token_to_idx.get(tokens,self.unk)
return [self.__getitem__(token)for token in tokens]
def to_tokens(self,indices):
if not isinstance(indices,(list,tuple)):
return self.idx_to_token[indices]
return [self.idx_to_token[index]for index in indices]
@property
def unk(self): #未知词元的索引为0
return 0
@property
def token_freqs(self):
return self._token_freqs
def count_corpus(tokens): #@save
'''统计词元频率'''
#这里的tokens是1D或2D列表
if len(tokens)==0 or isinstance(tokens[0],list):
#将词元展平成一个列表
tokens=[token for line in tokens for token in line]
return collections.Counter(tokens)
vocab=Vocab(tokens)
print(list(vocab.token_to_idx.items())[:10])
for i in [0,10]:
print('文本:',tokens[i])
print('索引:',vocab[tokens[i]])
#整合所有功能
def load_corpus_time_machine(max_tokens=-1): #@save
'''返回时光机数据集的词元索引列表和词表'''
lines=read_time_machine()
tokens=tokenize(lines,'char')
vocab=Vocab(tokens)
#因为时光机器数据集中的米格文本行不一定是一个句子或一个段落
#所以将所有文本行展平到到一个列表中
corpus=[vocab[token]for line in tokens for token in line]
if max_tokens>0:
corpus=corpus[:max_tokens]
return corpus,vocab
corpus,vocab=load_corpus_time_machine()
print(len(corpus))
print(len(vocab))