import numpy as np
import math
def normalize(input_matrix):
"""
Normalizes the rows of a 2d input_matrix so they sum to 1
"""
row_sums = input_matrix.sum(axis=1)
    if np.count_nonzero(row_sums) != np.shape(row_sums)[0]:  # no row should sum to zero
        raise Exception("Error while normalizing. Row(s) sum to zero")
new_matrix = input_matrix / row_sums[:, np.newaxis]
return new_matrix
class Corpus(object):
"""
A collection of documents.
"""
def __init__(self, documents_path):
"""
Initialize empty document list.
"""
self.documents = []
self.vocabulary = []
self.likelihoods = []
self.documents_path = documents_path
self.term_doc_matrix = None
self.document_topic_prob = None # P(z | d)
self.topic_word_prob = None # P(w | z)
self.topic_prob = None # P(z | d, w)
self.number_of_documents = 0
self.vocabulary_size = 0
def build_corpus(self):
"""
        Read the documents and fill in self.documents, a list of lists of words, e.g.
self.documents = [["the", "day", "is", "nice", "the", ...], [], []...]
Update self.number_of_documents
"""
# #############################
# your code here
# #############################
pass # REMOVE THIS
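        # A minimal sketch of one possible implementation. It assumes the
        # corpus file holds one whitespace-tokenized document per line;
        # adapt the parsing if data/test.txt is formatted differently.
        with open(self.documents_path, 'r') as f:
            self.documents = [line.split() for line in f if line.strip()]
        self.number_of_documents = len(self.documents)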
def build_vocabulary(self):
"""
Construct a list of unique words in the whole corpus. Put it in self.vocabulary
for example: ["rain", "the", ...]
Update self.vocabulary_size
"""
# #############################
# your code here
# #############################
pass # REMOVE THIS
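        # A minimal sketch: collect every distinct word across the corpus.
        # Sorting is optional but keeps the term-doc matrix columns stable.
        unique_words = set()
        for document in self.documents:
            unique_words.update(document)
        self.vocabulary = sorted(unique_words)
        self.vocabulary_size = len(self.vocabulary)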
def build_term_doc_matrix(self):
"""
Construct the term-document matrix where each row represents a document,
and each column represents a vocabulary term.
self.term_doc_matrix[i][j] is the count of term j in document i
"""
# ############################
# your code here
# ############################
pass # REMOVE THIS
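        # A minimal sketch: map each word to its column index, then count
        # occurrences per document.
        word_index = {word: j for j, word in enumerate(self.vocabulary)}
        self.term_doc_matrix = np.zeros((self.number_of_documents, self.vocabulary_size))
        for i, document in enumerate(self.documents):
            for word in document:
                self.term_doc_matrix[i][word_index[word]] += 1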
def initialize_randomly(self, number_of_topics):
"""
        Randomly initialize self.document_topic_prob and self.topic_word_prob,
        which hold the probability distributions P(z | d) and P(w | z).
        Don't forget to normalize!
        HINT: you will find numpy.random.random useful [https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.random.random.html]
"""
# ############################
# your code here
# ############################
pass # REMOVE THIS
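        # A minimal sketch following the hint: draw uniform random matrices
        # and normalize each row into a probability distribution.
        self.document_topic_prob = normalize(
            np.random.random((self.number_of_documents, number_of_topics)))
        self.topic_word_prob = normalize(
            np.random.random((number_of_topics, self.vocabulary_size)))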
def initialize_uniformly(self, number_of_topics):
"""
Initializes the matrices: self.document_topic_prob and self.topic_word_prob with a uniform
probability distribution. This is used for testing purposes.
DO NOT CHANGE THIS FUNCTION
"""
self.document_topic_prob = np.ones((self.number_of_documents, number_of_topics))
self.document_topic_prob = normalize(self.document_topic_prob)
self.topic_word_prob = np.ones((number_of_topics, len(self.vocabulary)))
self.topic_word_prob = normalize(self.topic_word_prob)
def initialize(self, number_of_topics, random=False):
""" Call the functions to initialize the matrices document_topic_prob and topic_word_prob
"""
print("Initializing...")
if random:
self.initialize_randomly(number_of_topics)
else:
self.initialize_uniformly(number_of_topics)
def expectation_step(self):
""" The E-step updates P(z | w, d)
"""
print("E step:")
# ############################
# your code here
# ############################
pass # REMOVE THIS
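        # A minimal vectorized sketch of Bayes' rule:
        # P(z | d, w) ∝ P(z | d) * P(w | z), normalized over topics (axis 1).
        self.topic_prob = (self.document_topic_prob[:, :, np.newaxis]
                           * self.topic_word_prob[np.newaxis, :, :])
        topic_sums = self.topic_prob.sum(axis=1, keepdims=True)
        topic_sums[topic_sums == 0] = 1e-12  # guard against division by zero
        self.topic_prob /= topic_sums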
def maximization_step(self, number_of_topics):
""" The M-step updates P(w | z)
"""
print("M step:")
# update P(w | z)
# ############################
# your code here
# ############################
# update P(z | d)
# ############################
# your code here
# ############################
pass # REMOVE THIS
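        # A minimal vectorized sketch: weight P(z | d, w) by the term counts,
        # then sum over documents for P(w | z) and over words for P(z | d).
        weighted = self.term_doc_matrix[:, np.newaxis, :] * self.topic_prob
        self.topic_word_prob = normalize(weighted.sum(axis=0))      # P(w | z)
        self.document_topic_prob = normalize(weighted.sum(axis=2))  # P(z | d)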
def calculate_likelihood(self, number_of_topics):
""" Calculate the current log-likelihood of the model using
        the model's updated probability matrices.
        Append the calculated log-likelihood to self.likelihoods
"""
# ############################
# your code here
# ############################
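        # A minimal sketch: log-likelihood of the observed counts under the
        # mixture P(w | d) = sum_z P(z | d) P(w | z); the small constant
        # guards against log(0).
        doc_word_prob = np.dot(self.document_topic_prob, self.topic_word_prob)
        log_likelihood = np.sum(self.term_doc_matrix * np.log(doc_word_prob + 1e-12))
        self.likelihoods.append(log_likelihood)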
return
def plsa(self, number_of_topics, max_iter, epsilon):
"""
Model topics.
"""
        print("EM iteration begins...")
# build term-doc matrix
self.build_term_doc_matrix()
# Create the counter arrays.
# P(z | d, w)
        self.topic_prob = np.zeros([self.number_of_documents, number_of_topics, self.vocabulary_size], dtype=float)
# P(z | d) P(w | z)
self.initialize(number_of_topics, random=True)
# Run the EM algorithm
current_likelihood = 0.0
for iteration in range(max_iter):
print("Iteration #" + str(iteration + 1) + "...")
# ############################
# your code here
# ############################
pass # REMOVE THIS
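            # A minimal sketch of one EM iteration: E-step, M-step, then stop
            # once the log-likelihood improves by less than epsilon.
            self.expectation_step()
            self.maximization_step(number_of_topics)
            self.calculate_likelihood(number_of_topics)
            new_likelihood = self.likelihoods[-1]
            print("Likelihood: " + str(new_likelihood))
            if iteration > 0 and abs(new_likelihood - current_likelihood) < epsilon:
                print("Converged.")
                break
            current_likelihood = new_likelihood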
def main():
documents_path = 'data/test.txt'
corpus = Corpus(documents_path) # instantiate corpus
corpus.build_corpus()
corpus.build_vocabulary()
print(corpus.vocabulary)
print("Vocabulary size:" + str(len(corpus.vocabulary)))
print("Number of documents:" + str(len(corpus.documents)))
number_of_topics = 2
max_iterations = 50
epsilon = 0.001
corpus.plsa(number_of_topics, max_iterations, epsilon)
if __name__ == '__main__':
main()