First commit
iliakur committed Oct 14, 2012
0 parents commit 6b32728
Showing 2,843 changed files with 888,077 additions and 0 deletions.
111 changes: 111 additions & 0 deletions SW.py
@@ -0,0 +1,111 @@
'''A sample implementation of the Smith-Waterman algorithm, adapted to align
different members of a declension paradigm, with the end goal of separating
the morphemes they share from those in which they differ.
'''
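
# Illustrative note (added for clarity, not in the original file): for the
# pair ('kamok', 'kamka') below, the alignment should roughly line up the
# shared stem 'kam-' and leave the endings '-ok' / '-ka' as the differing
# material.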

from numpy import zeros

WORDS = [('help','unhelpful'), ('kamok','kamka')]
MATCH = 2
MISMATCH = -1
GAP = -1

def score(str1, str2):
if str1 == str2:
return MATCH
else:
return MISMATCH
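
# Hedged sketch (added for clarity): the cell update in the main loop below
# follows the Smith-Waterman recurrence, without the customary clamp at zero:
#     H[i][j] = max(H[i-1][j-1] + score(A[i-1], B[j-1]),
#                   H[i-1][j] + GAP,
#                   H[i][j-1] + GAP)
# Using the constants above, score('a', 'a') == MATCH == 2 and
# score('a', 'b') == MISMATCH == -1.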

class Alignments:

def __init__(self, matrix, s1, s2):
self.mat = matrix
self.a = s1
self.b = s2
self.Paths = []

def find_further_steps(self, x, y):
steps = [] # this will be our output
diag = self.mat[x-1][y-1]
up = self.mat[x-1][y]
left = self.mat[x][y-1]
if x == 0: # this is if we are at the top end of the matrix
steps.append((x,y-1))
elif y == 0: # this is if we are at the left end of it
steps.append((x-1,y))
else:
if diag == max(diag, up, left): # case A
steps.append((x-1,y-1))
if left == max(diag, up, left): # case B
steps.append((x,y-1))
if up == max(diag, up, left): # case C
steps.append((x-1,y))
return steps
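
    # Usage sketch (added, hypothetical values): if the neighbours of cell
    # (x, y) are diag == 4, up == 2 and left == 4, both (x-1, y-1) and
    # (x, y-1) are returned, so the traceback branches into two paths.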

    def find_path(self, starter, path=None):
        if path is None:
            path = []
        path.append(starter)
        solutions = self.find_further_steps(starter[0], starter[1])
        for solution in solutions:
            # exit clause: we have traced back to the origin
            if solution == (0,0):
                path.append(solution)
                print 'done', path
                return path
            else:
                further_path = self.find_path(solution, path=path)
                if further_path:
                    self.Paths.append(further_path)
                # backtrack: drop everything added past this branch point
                path = path[:path.index(solution)]


for pair in WORDS:
print 'working on', pair
A = pair[0]
B = pair[1]
    # extra row and column of zeros so that A[0] and B[0] are also compared
    matrix = zeros((len(A) + 1, len(B) + 1))
    maxScore = 0

    for i in range(1, len(A) + 1):
        for j in range(1, len(B) + 1):
            seq_score = score(A[i-1], B[j-1]) + matrix[i-1][j-1]
            matrix[i][j] = max(seq_score, matrix[i-1][j] + GAP, matrix[i][j-1] + GAP)
            if matrix[i][j] >= maxScore:
                imax = i
                jmax = j
                maxScore = matrix[i][j]
    print matrix

    test = Alignments(matrix, A, B)

test.find_path((imax, jmax), path=[])
print test.Paths

#maxindx = matrix.argmax()
#i = maxindx / matrix.shape[1]
#j = maxindx % matrix.shape[1]
#path = []
#i, j = imax, jmax
#while i > 0 and j > 0:
#diag = matrix[i-1][j-1]
#up = matrix[i-1][j]
#left = matrix[i][j-1]
#path.append((i,j))
#if diag == max(diag, up, left):
#i -= 1
#j -= 1
#elif up == max(diag, up, left):
#i -= 1
#elif left == max(diag, up, left):
#j -= 1
#while i > 0:
#path.append((i,j))
#i -= 1
#while j >0:
#path.append((i,j))
#j -= 1
##print imax, jmax
#print path
3 changes: 3 additions & 0 deletions SWNotes
@@ -0,0 +1,3 @@
traceback:


158 changes: 158 additions & 0 deletions bayes/classifier.py
@@ -0,0 +1,158 @@
'''
UMass Amherst
Compsci 585 - Introduction to Natural Language Processing.
Instructor: Andrew McCallum
Submission: Ilia Kurenkov
Assignment 4: Naive Bayes Classifier
This classifier, for testing purposes, uses the nltk movie review corpus.
'''
import datetime
# from operator import mul
from math import log
from nltk.corpus import movie_reviews as mr

CLASSES = mr.categories()
TRAINING_POSITIVES1 = mr.fileids(categories='pos')[:500]
TRAINING_NEGATIVES1 = mr.fileids(categories='neg')[:500]

TRAINING_POSITIVES2 = mr.fileids(categories='pos')[:700]
TRAINING_NEGATIVES2 = mr.fileids(categories='neg')[:700]

TRAINING_POSITIVES3 = mr.fileids(categories='pos')[:900]
TRAINING_NEGATIVES3 = mr.fileids(categories='neg')[:900]

TESTING1 = mr.fileids(categories='pos')[500:] + mr.fileids(categories='neg')[500:]
TESTING2 = mr.fileids(categories='pos')[700:] + mr.fileids(categories='neg')[700:]
TESTING3 = mr.fileids(categories='pos')[900:] + mr.fileids(categories='neg')[900:]

################ Training Machinery ########################
'''The way I'm doing this for now may not be very memory-efficient, since it
loads all of the texts into memory at once.
'''

class Laplace_Label():
"""this class represents a label. As it is initialized, it processes a
collection of filenames the following way:
1. texts corresponding to filenames are extracted and combined into a list
2. the vocabulary is created from a set of this list
3. Laplace smooting denominator is calculated
4. dictionary of word probabilities is created for class.
"""
def __init__(self, collection):
''' constructor takes collection of texts as arg
'''
self.rev = [word for text in collection for word in mr.words(text)]
self.V = set(self.rev)
self.N = len(self.rev) + len(self.V)
self.word_probs = dict([(w, float(self.rev.count(w)+1)/self.N) for w in self.V])
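
# Hedged illustration (added, not in the original file) of the same add-one
# estimate on a toy token list, without going through the corpus:
toy_tokens = ['the', 'cat', 'sat', 'the']               # 4 tokens, 3 types
toy_N = len(toy_tokens) + len(set(toy_tokens))          # 4 + 3 = 7
toy_p_the = float(toy_tokens.count('the') + 1) / toy_N  # (2 + 1) / 7
# Note that self.rev.count(w) in the comprehension above is quadratic in the
# corpus size; collections.Counter would give the same counts in one pass.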

################ Some Testing Machinery #########################

def prob(word, label):
    '''Returns the smoothed probability of a word under a label; unseen
    words fall back to 1/N, the add-one estimate for a zero count.'''
if label == 'pos':
try:
return pos.word_probs[word]
except KeyError:
return 1.0/pos.N
elif label == 'neg':
try:
return neg.word_probs[word]
except KeyError:
return 1.0/neg.N
else:
raise Exception('An invalid label was passed. Exiting...')


def cat_score(review, cat):
    '''Scores a class for a document by summing the log probabilities of the
    words in the document given the class (the Naive Bayes log-likelihood);
    higher means more likely.
    '''
    return sum([log(prob(word, cat)) for word in mr.words(review)])
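
# Added note (not in the original file): summing logs is the standard way to
# avoid floating-point underflow. Multiplying thousands of small word
# probabilities directly, e.g. 0.0001 ** 5000, underflows to 0.0, while the
# equivalent 5000 * log(0.0001) stays representable.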


def classify(review):
    '''Returns the most likely class for a review.'''
    return max([(cat_score(review, cat), cat) for cat in CLASSES])[1]

############ Calculating Precision and recall ############

def evaluate(classified, test):
    '''Evaluates the results: prints precision and recall for the 'pos' class
    and returns the F1 score.
    '''
classified_pos = [x for x in classified if x[0] == 'pos']
true_positives = [x for x in classified_pos if mr.categories(x[1])[0] == x[0]]
false_positives = [x for x in classified_pos if mr.categories(x[1])[0] != x[0]]
precision = 100*float(len(true_positives))/len(classified_pos)
print 'precision is:', precision
recall = 100*len(true_positives)/float(len([x for x in test if mr.categories(x)[0]=='pos']))
print 'recall is', recall

return 2*precision*recall/(precision+recall)
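
# Hedged worked example (added for clarity, hypothetical numbers): with
# precision = 80.0 and recall = 60.0 the returned value is
# 2 * 80.0 * 60.0 / (80.0 + 60.0) ~= 68.57, the harmonic mean of the two.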


############### Some Actual testing ##################
''' Round 1, with training corpus of 500 for every label.'''
print '''Round 1:\nThe training corpus is 500 reviews for every class\n
The testing corpus is 1000 texts'''
# first we train...
start = datetime.datetime.now() #start the timer
pos = Laplace_Label(TRAINING_POSITIVES1) # train positive reviews
print 'size of positive vocabulary: {}'.format(len(pos.V))
neg = Laplace_Label(TRAINING_NEGATIVES1) # train on negative reviews
print 'size of negative vocabulary: {}'.format(len(neg.V))
finish = datetime.datetime.now() # stop timer
print 'done training, it took ', finish - start # print the time it took
# then we test...
start = datetime.datetime.now() # start timer
classified = [(classify(x), x) for x in TESTING1] # classify each test review, keeping (label, fileid)
finish = datetime.datetime.now()
print 'done testing, it took ', finish - start
# then we evaluate ...
print 'the F1 value is: {}\n'.format(evaluate(classified, TESTING1))


'''Round 2 with training corpus of 700 for every label. '''
print '''Round 2:\nThe training corpus is 700 reviews for every class\n
The testing corpus is 600 texts'''
# first we train...
start = datetime.datetime.now() #start the timer
pos = Laplace_Label(TRAINING_POSITIVES2) # train positive reviews
print 'size of positive vocabulary: {}'.format(len(pos.V))
neg = Laplace_Label(TRAINING_NEGATIVES2) # train on negative reviews
print 'size of negative vocabulary: {}'.format(len(neg.V))
finish = datetime.datetime.now() # stop timer
print 'done training, it took ', finish - start # print the time it took
# then we test...
start = datetime.datetime.now() # start timer
classified = [(classify(x), x) for x in TESTING2] # classify each test review, keeping (label, fileid)
finish = datetime.datetime.now()
print 'done testing, it took ', finish - start
# then we evaluate ...
print 'the F1 value is: {}\n'.format(evaluate(classified, TESTING2))


'''Round 3 with training corpus of 900 for every label. '''
print '''Round 3:\nThe training corpus is 900 reviews for every class\n
The testing corpus is 200 texts'''
# first we train...
start = datetime.datetime.now() #start the timer
pos = Laplace_Label(TRAINING_POSITIVES3) # train positive reviews
print 'size of positive vocabulary: {}'.format(len(pos.V))
neg = Laplace_Label(TRAINING_NEGATIVES3) # train on negative reviews
print 'size of negative vocabulary: {}'.format(len(neg.V))
finish = datetime.datetime.now() # stop timer
print 'done training, it took ', finish - start # print the time it took
# then we test...
start = datetime.datetime.now() # start timer
classified = [(classify(x), x) for x in TESTING3] # classify each test review, keeping (label, fileid)
finish = datetime.datetime.now()
print 'done testing, it took ', finish - start
# then we evaluate ...
print 'the F1 value is: {}\n'.format(evaluate(classified, TESTING3))

''' Sandbox '''

def combine(list1, list2):
pass
112 changes: 112 additions & 0 deletions bayes/gt-classifier.py
@@ -0,0 +1,112 @@
'''This classifier uses the Good-Turing smoothing. I plan to compare its results
with a classifier that uses Laplace smoothing. I want to compare the performance
of these two algorithms on the movie_review corpus as well as on the spam corpus.
'''

import datetime
from math import log
from collections import defaultdict
from nltk.corpus import movie_reviews as mr

CLASSES = mr.categories()
TRAINING_POSITIVES1 = mr.fileids(categories='pos')[:500]
TRAINING_NEGATIVES1 = mr.fileids(categories='neg')[:500]

TRAINING_POSITIVES2 = mr.fileids(categories='pos')[:700]
TRAINING_NEGATIVES2 = mr.fileids(categories='neg')[:700]

TRAINING_POSITIVES3 = mr.fileids(categories='pos')[:900]
TRAINING_NEGATIVES3 = mr.fileids(categories='neg')[:900]

TESTING1 = mr.fileids(categories='pos')[500:] + mr.fileids(categories='neg')[500:]
TESTING2 = mr.fileids(categories='pos')[700:] + mr.fileids(categories='neg')[700:]
TESTING3 = mr.fileids(categories='pos')[900:] + mr.fileids(categories='neg')[900:]


################ Training Machinery ########################
'''The way I'm doing this for now may not be very memory-efficient, since it
loads all of the texts into memory at once.
'''

class GT_Label():
"""this class represents a label. As it is initialized, it processes a
collection of filenames the following way:
REDO REDO REDO REDO
1. texts corresponding to filenames are extracted and combined into a list
2. the vocabulary is created from a set of this list
3. Laplace smooting denominator is calculated
4. dictionary of word probabilities is created for class.
"""
def __init__(self, collection):
''' constructor takes collection of texts as arg
'''
self.rev = [word for text in collection for word in mr.words(text)]
self.V = set(self.rev)
self.N = len(self.rev)
self.word_counts = defaultdict(int, [(w, float(self.rev.count(w))) for w in self.V])
self.freq_counts = defaultdict(int, {0:self.N})
        for word in self.word_counts:
            self.freq_counts[self.word_counts[word]] += 1

    def get_prob(self, word):
        '''Good-Turing estimate: the adjusted count r* = (r+1) * N_{r+1} / N_r,
        divided by the total token count N. Note that N_{r+1} can be 0 for
        high counts, in which case the estimate is 0 as well.'''
        k = self.word_counts[word]
        count = (k+1)*self.freq_counts[k+1]/self.freq_counts[k]
        return count/self.N
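
    # Hedged worked example (added, using the frequency counts printed in
    # gt-counts below): if a word was seen k = 1 time, 8973 words were seen
    # once and 3548 twice, so its adjusted count is (1 + 1) * 3548 / 8973
    # ~= 0.79, and its probability is that value divided by the token count N.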


################ Some Testing Machinery #########################

def prob(word, label):
    '''Looks up the Good-Turing probability of a word under a label, falling
    back to 1/N when the adjusted count comes out zero (i.e. when no word was
    seen exactly k+1 times).
    '''
    if label == 'pos':
        model = pos
    elif label == 'neg':
        model = neg
    else:
        raise Exception('An invalid label was passed. Exiting...')
    p = model.get_prob(word)
    return p if p > 0 else 1.0 / model.N


def cat_score(review, cat):
    '''Scores a class for a document by summing the log probabilities of the
    words in the document given the class (the Naive Bayes log-likelihood);
    higher means more likely.
    '''
    return sum([log(prob(word, cat)) for word in mr.words(review)])


def classify(review):
    '''Returns the most likely class for a review.'''
    return max([(cat_score(review, cat), cat) for cat in CLASSES])[1]


############ Calculating Precision and recall ############

def evaluate(classified, test):
    '''Evaluates the results: prints precision and recall for the 'pos' class
    and returns the F1 score.
    '''
classified_pos = [x for x in classified if x[0] == 'pos']
true_positives = [x for x in classified_pos if mr.categories(x[1])[0] == x[0]]
false_positives = [x for x in classified_pos if mr.categories(x[1])[0] != x[0]]
precision = 100*float(len(true_positives))/len(classified_pos)
print 'precision is:', precision
recall = 100*len(true_positives)/float(len([x for x in test if mr.categories(x)[0]=='pos']))
print 'recall is', recall

return 2*precision*recall/(precision+recall)


############### Some Actual testing ##################
''' Round 1, with training corpus of 500 for every label.'''
print '''Trying to figure out where to introduce the quadratic function.'''
# first we train...
start = datetime.datetime.now() #start the timer
pos = GT_Label(TRAINING_POSITIVES1) # train positive reviews
print 'finished training, that took:', datetime.datetime.now() - start
print 'size of positive vocabulary: {}\n'.format(len(pos.V))
print pos.freq_counts
5 changes: 5 additions & 0 deletions bayes/gt-counts
@@ -0,0 +1,5 @@
Trying to figure out where to introduce the quadratic function.
finished training, that took: 0:01:44.335017
size of positive vocabulary: 21809

defaultdict(<type 'int'>, {0: 394541, 1.0: 8973, 2.0: 3548, 3.0: 1802, 4.0: 1193, 5.0: 825, 6.0: 648, 7.0: 551, 8.0: 411, 9.0: 341, 10.0: 296, 11.0: 262, 12.0: 188, 13.0: 211, 14.0: 144, 15.0: 150, 16.0: 127, 17.0: 107, 18.0: 87, 19.0: 88, 20.0: 78, 21.0: 77, 22.0: 68, 23.0: 66, 24.0: 65, 25.0: 53, 26.0: 56, 27.0: 48, 28.0: 46, 29.0: 36, 30.0: 37, 31.0: 37, 32.0: 41, 33.0: 30, 34.0: 29, 35.0: 32, 36.0: 25, 37.0: 21, 38.0: 31, 39.0: 24, 40.0: 24, 41.0: 29, 42.0: 17, 43.0: 17, 44.0: 13, 45.0: 20, 46.0: 17, 47.0: 15, 48.0: 13, 49.0: 16, 50.0: 22, 51.0: 23, 52.0: 13, 53.0: 12, 54.0: 15, 55.0: 14, 56.0: 8, 57.0: 11, 58.0: 10, 59.0: 11, 60.0: 15, 61.0: 17, 62.0: 7, 63.0: 13, 64.0: 9, 65.0: 10, 66.0: 10, 67.0: 8, 68.0: 7, 69.0: 11, 70.0: 7, 71.0: 12, 72.0: 6, 73.0: 6, 74.0: 5, 75.0: 5, 76.0: 5, 78.0: 11, 79.0: 4, 80.0: 6, 81.0: 3, 82.0: 5, 83.0: 5, 84.0: 5, 85.0: 4, 86.0: 9, 87.0: 8, 88.0: 5, 89.0: 4, 90.0: 6, 91.0: 3, 92.0: 8, 93.0: 3, 94.0: 4, 95.0: 2, 96.0: 4, 97.0: 1, 98.0: 2, 99.0: 4, 100.0: 6, 101.0: 4, 102.0: 4, 103.0: 4, 104.0: 8, 105.0: 3, 106.0: 7, 107.0: 2, 108.0: 3, 109.0: 3, 110.0: 2, 112.0: 2, 113.0: 2, 114.0: 3, 115.0: 1, 116.0: 2, 118.0: 2, 120.0: 5, 121.0: 3, 122.0: 6, 123.0: 3, 124.0: 2, 125.0: 2, 126.0: 1, 127.0: 4, 128.0: 1, 129.0: 1, 130.0: 1, 131.0: 2, 133.0: 2, 134.0: 3, 136.0: 3, 138.0: 4, 139.0: 1, 140.0: 2, 141.0: 1, 142.0: 2, 143.0: 2, 144.0: 2, 145.0: 2, 146.0: 1, 147.0: 1, 148.0: 2, 149.0: 4, 151.0: 1, 152.0: 3, 153.0: 1, 155.0: 3, 156.0: 3, 2205.0: 1, 158.0: 3, 159.0: 1, 160.0: 2, 161.0: 1, 162.0: 1, 163.0: 3, 164.0: 2, 165.0: 1, 166.0: 1, 167.0: 1, 168.0: 3, 169.0: 2, 170.0: 1, 171.0: 1, 172.0: 1, 174.0: 2, 176.0: 1, 177.0: 2, 178.0: 4, 181.0: 1, 182.0: 3, 183.0: 1, 184.0: 1, 185.0: 3, 186.0: 1, 187.0: 1, 189.0: 1, 193.0: 2, 194.0: 3, 195.0: 1, 196.0: 1, 197.0: 3, 200.0: 1, 201.0: 1, 202.0: 2, 205.0: 1, 207.0: 1, 208.0: 1, 209.0: 1, 213.0: 1, 215.0: 2, 216.0: 2, 217.0: 1, 218.0: 1, 219.0: 2, 222.0: 1, 226.0: 2, 227.0: 2, 229.0: 1, 232.0: 1, 234.0: 1, 235.0: 1, 236.0: 1, 239.0: 1, 241.0: 1, 242.0: 1, 243.0: 1, 244.0: 2, 249.0: 1, 250.0: 1, 251.0: 1, 257.0: 3, 258.0: 1, 259.0: 1, 262.0: 1, 263.0: 3, 264.0: 1, 265.0: 2, 266.0: 1, 269.0: 2, 271.0: 1, 273.0: 1, 274.0: 1, 277.0: 1, 282.0: 1, 287.0: 2, 289.0: 1, 291.0: 1, 294.0: 1, 2352.0: 1, 307.0: 2, 308.0: 1, 310.0: 1, 314.0: 1, 315.0: 1, 326.0: 1, 328.0: 1, 330.0: 1, 333.0: 1, 334.0: 2, 337.0: 1, 338.0: 1, 340.0: 1, 343.0: 1, 346.0: 1, 348.0: 1, 359.0: 1, 364.0: 1, 365.0: 1, 369.0: 1, 378.0: 1, 379.0: 1, 380.0: 1, 385.0: 1, 394.0: 1, 403.0: 1, 404.0: 1, 405.0: 1, 2465.0: 1, 426.0: 3, 431.0: 1, 433.0: 1, 437.0: 1, 446.0: 1, 447.0: 1, 448.0: 1, 453.0: 1, 456.0: 2, 464.0: 1, 466.0: 1, 467.0: 1, 2520.0: 1, 484.0: 1, 490.0: 1, 495.0: 1, 502.0: 1, 507.0: 1, 526.0: 1, 535.0: 1, 544.0: 1, 567.0: 1, 569.0: 1, 579.0: 1, 584.0: 1, 588.0: 1, 589.0: 1, 593.0: 1, 8786.0: 1, 597.0: 1, 610.0: 1, 611.0: 1, 619.0: 1, 6782.0: 1, 642.0: 1, 2708.0: 1, 671.0: 1, 675.0: 1, 676.0: 1, 677.0: 1, 684.0: 1, 705.0: 1, 712.0: 1, 802.0: 1, 725.0: 1, 727.0: 1, 745.0: 1, 4562.0: 1, 780.0: 1, 788.0: 1, 2850.0: 1, 806.0: 1, 811.0: 1, 830.0: 2, 2900.0: 1, 2906.0: 1, 862.0: 1, 872.0: 1, 882.0: 1, 3058.0: 1, 1028.0: 2, 7219.0: 1, 1097.0: 1, 1122.0: 1, 19586.0: 1, 1163.0: 1, 9356.0: 1, 1200.0: 1, 1207.0: 1, 1237.0: 1, 1250.0: 1, 1307.0: 1, 1337.0: 1, 1365.0: 1, 1390.0: 1, 2092.0: 1, 5490.0: 1, 1403.0: 1, 9616.0: 1, 1443.0: 1, 1446.0: 1, 3627.0: 1, 20033.0: 1, 1647.0: 1, 7792.0: 1, 1774.0: 1, 3838.0: 1, 1800.0: 1, 16199.0: 1, 3920.0: 1, 3923.0: 1, 1971.0: 1})