-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 6b32728
Showing
2,843 changed files
with
888,077 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
''' This is a sample implementation of the Smith-Waterman algorithm in an | ||
attempt to adapt it to the alignment of different members of a declension | ||
paradigm with the end goal of separating matching morphemes from those that | ||
differ. | ||
''' | ||
|
||
from numpy import zeros | ||
|
||
# Word pairs to align, and the Smith-Waterman scoring parameters.
WORDS = [('help','unhelpful'), ('kamok','kamka')]
MATCH = 2
MISMATCH = -1
GAP = -1


def score(str1, str2):
    """Substitution score for a pair of symbols.

    Identical symbols earn MATCH, differing ones MISMATCH.
    """
    return MATCH if str1 == str2 else MISMATCH
|
||
class Allignments:
    """Back-tracer over a filled Smith-Waterman score matrix.

    Collects alignment paths from a starting cell back toward the
    matrix origin (0, 0).  (Class name kept as-is, misspelling and
    all -- it is the public interface.)
    """

    def __init__(self, matrix, s1, s2):
        # matrix: 2-D score table indexed as matrix[x][y]
        self.mat = matrix
        self.a = s1
        self.b = s2
        self.Paths = []  # completed paths accumulated by find_path()

    def find_further_steps(self, x, y):
        """Return the predecessor cells reachable from (x, y).

        At the top edge only a leftward step remains, at the left edge
        only an upward one; elsewhere every direction whose score ties
        the maximum is a candidate (ties yield several steps).
        NOTE(review): at (0, 0) this returns (0, -1), a negative index;
        find_path stops at (0, 0) before recursing, so it is never
        dereferenced -- confirm before reusing this method elsewhere.
        """
        diag = self.mat[x-1][y-1]
        up = self.mat[x-1][y]
        left = self.mat[x][y-1]
        if x == 0:          # top edge of the matrix
            return [(x, y-1)]
        if y == 0:          # left edge of the matrix
            return [(x-1, y)]
        steps = []
        best = max(diag, up, left)
        if diag == best:    # case A: diagonal move
            steps.append((x-1, y-1))
        if left == best:    # case B: leftward move
            steps.append((x, y-1))
        if up == best:      # case C: upward move
            steps.append((x-1, y))
        return steps

    def find_path(self, starter, path=None):
        """Recursively extend *path* from *starter* toward (0, 0).

        Completed branches are appended to self.Paths; reaching the
        origin returns the finished path.

        Fixes over the original:
        - the default argument is no longer a shared mutable list
          (``path=[]`` persisted across calls);
        - each recursive result is computed once -- the original called
          find_path a second time just to append its value, repeating
          every side effect on *path*.
        """
        if path is None:
            path = []
        path.append(starter)
        solutions = self.find_further_steps(starter[0], starter[1])
        for solution in solutions:
            if solution == (0, 0):  # reached the origin: path complete
                path.append(solution)
                print('done', path)
                return path
            further_path = self.find_path(solution, path=path)
            if further_path:
                self.Paths.append(further_path)
                # rewind to the branch point before trying the next option
                path = path[:path.index(solution)]
|
||
|
||
# Driver: build a Smith-Waterman score matrix for each word pair, then
# trace the best alignment path(s) back from the highest-scoring cell.
for pair in WORDS:
    print 'working on', pair
    A = pair[0]
    B = pair[1]
    # NOTE(review): the loops below start at 1, so row/column 0 stay
    # zero AND the first characters A[0], B[0] are never scored --
    # confirm whether index 0 is meant to be the boundary row or a
    # real character (classic SW uses a matrix of size len+1).
    matrix= zeros((len(A),len(B)))
    maxScore = 0  # running best cell value; (imax, jmax) is its position

    for i in range(1,len(A)):
        for j in range(1,len(B)):
            #print B[j]
            #print score(A[i], B[j])
            # best of: extend diagonally with a match/mismatch score,
            # or carry a gap penalty from above or from the left
            seq_score = score(A[i], B[j]) + matrix[i-1][j-1]
            matrix[i][j] = max(seq_score, matrix[i-1][j]+GAP, matrix[i][j-1]+GAP)
            if matrix[i][j] >= maxScore:
                imax = i
                jmax = j
                maxScore = matrix[i][j]
    print matrix

    # Trace back every maximal path from the best-scoring cell.
    test = Allignments(matrix, A, B)

    test.find_path((imax, jmax), path=[])
    print test.Paths
|
||
#maxindx = matrix.argmax() | ||
#i = maxindx / matrix.shape[1] | ||
#j = maxindx % matrix.shape[1] | ||
#path = [] | ||
#i, j = imax, jmax | ||
#while i > 0 and j > 0: | ||
#diag = matrix[i-1][j-1] | ||
#up = matrix[i-1][j] | ||
#left = matrix[i][j-1] | ||
#path.append((i,j)) | ||
#if diag == max(diag, up, left): | ||
#i -= 1 | ||
#j -= 1 | ||
#elif up == max(diag, up, left): | ||
#i -= 1 | ||
#elif left == max(diag, up, left): | ||
#j -= 1 | ||
#while i > 0: | ||
#path.append((i,j)) | ||
#i -= 1 | ||
#while j >0: | ||
#path.append((i,j)) | ||
#j -= 1 | ||
##print imax, jmax | ||
#print path |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
traceback: | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
''' | ||
Umass Amherst | ||
Compsci 585 - Introduction to Natural Language Processing. | ||
Instructor: Andrew McCallum
Submission: Ilia Kurenkov | ||
Assignment 4: Naive Bayes Classifier | ||
This classifier, for testing purposes, uses the nltk movie review corpus. | ||
''' | ||
import datetime | ||
# from operator import mul | ||
from math import log | ||
from nltk.corpus import movie_reviews as mr | ||
|
||
# Train/test splits over the movie_reviews corpus at three sizes:
# 500, 700, and 900 reviews per class.  The matching test set is the
# remainder of both classes' fileids past the training cutoff.
CLASSES = mr.categories()
TRAINING_POSITIVES1 = mr.fileids(categories='pos')[:500]
TRAINING_NEGATIVES1 = mr.fileids(categories='neg')[:500]

TRAINING_POSITIVES2 = mr.fileids(categories='pos')[:700]
TRAINING_NEGATIVES2 = mr.fileids(categories='neg')[:700]

TRAINING_POSITIVES3 = mr.fileids(categories='pos')[:900]
TRAINING_NEGATIVES3 = mr.fileids(categories='neg')[:900]

TESTING1 = mr.fileids(categories='pos')[500:] + mr.fileids(categories='neg')[500:]
TESTING2 = mr.fileids(categories='pos')[700:] + mr.fileids(categories='neg')[700:]
TESTING3 = mr.fileids(categories='pos')[900:] + mr.fileids(categories='neg')[900:]
|
||
################ Training Machinery ######################## | ||
'''The way I'm doing this for now may not be very good for memory, since it
loads all of the texts into it together.
'''
|
||
class Laplace_Label():
    """A trained class model with add-one (Laplace) smoothing.

    Built from a collection of corpus fileids:
    1. the texts are concatenated into one token list (self.rev)
    2. the vocabulary self.V is its set of distinct words
    3. the Laplace denominator N = token count + vocabulary size
    4. each vocabulary word w gets probability (count(w) + 1) / N.
    Out-of-vocabulary words are handled by callers as 1/N (see prob()).
    """
    def __init__(self, collection):
        '''collection: iterable of movie_reviews fileids'''
        # Local import: this file's header does not import collections.
        from collections import Counter
        self.rev = [word for text in collection for word in mr.words(text)]
        self.V = set(self.rev)
        self.N = len(self.rev) + len(self.V)
        # One Counter pass over the tokens; the original called
        # list.count() once per vocabulary word, which is quadratic
        # (O(tokens * vocabulary)) on a corpus this size.
        counts = Counter(self.rev)
        self.word_probs = dict((w, float(counts[w] + 1) / self.N) for w in self.V)
|
||
################ Some Testing Machinery ######################### | ||
|
||
def prob(word, label):
    """Smoothed probability of *word* under the model named by *label*.

    Out-of-vocabulary words fall back to 1/N of the chosen model;
    any label other than 'pos'/'neg' raises an Exception.
    """
    if label == 'pos':
        model = pos
    elif label == 'neg':
        model = neg
    else:
        raise Exception('An invalid label was passed. Exiting...')
    return model.word_probs.get(word, 1.0 / model.N)
|
||
|
||
def cat_score(review, cat):
    """Score *review* against class *cat*.

    Accumulates the log-probability of every token under the class
    model and negates the total, so that max() picks the likeliest
    class downstream.
    """
    total = 0.0
    for word in mr.words(review):
        total += log(prob(word, cat))
    return -total
|
||
|
||
def foo(review):
    '''Return the most likely class label for *review*.'''
    scored = [(cat_score(review, cat), cat) for cat in CLASSES]
    return max(scored)[1]
|
||
############ Calculating Precision and recall ############ | ||
|
||
def evaluate(classified, test): | ||
''' function for evaluating our results. | ||
''' | ||
classified_pos = [x for x in classified if x[0] == 'pos'] | ||
true_positives = [x for x in classified_pos if mr.categories(x[1])[0] == x[0]] | ||
false_positives = [x for x in classified_pos if mr.categories(x[1])[0] != x[0]] | ||
precision = 100*float(len(true_positives))/len(classified_pos) | ||
print 'precision is:', precision | ||
recall = 100*len(true_positives)/float(len([x for x in test if mr.categories(x)[0]=='pos'])) | ||
print 'recall is', recall | ||
|
||
return 2*precision*recall/(precision+recall) | ||
|
||
|
||
############### Some Actual testing ##################
# Three train/test rounds at growing training sizes (500/700/900 per
# class); each round retrains both models, classifies the held-out
# reviews, and prints timings plus the F1 score.
''' Round 1, with training corpus of 500 for every label.'''
print '''Round 1:\nThe training corpus is 500 reviews for every class\n
The testing corpus is 1000 texts'''
# first we train...
start = datetime.datetime.now() #start the timer
pos = Laplace_Label(TRAINING_POSITIVES1) # train positive reviews
print 'size of positive vocabulary: {}'.format(len(pos.V))
neg = Laplace_Label(TRAINING_NEGATIVES1) # train on negative reviews
print 'size of negative vocabulary: {}'.format(len(neg.V))
finish = datetime.datetime.now() # stop timer
print 'done training, it took ', finish - start # print the time it took
# then we test...
start = datetime.datetime.now() # start timer
classified = [(foo(x), x) for x in TESTING1] # create list of (predicted label, fileid) pairs
finish = datetime.datetime.now()
print 'done testing, it took ', finish - start
# then we evaluate ...
print 'the F1 value is: {}\n'.format(evaluate(classified, TESTING1))


'''Round 2 with training corpus of 700 for every label. '''
print '''Round 2:\nThe training corpus is 700 reviews for every class\n
The testing corpus is 600 texts'''
# first we train...
start = datetime.datetime.now() #start the timer
pos = Laplace_Label(TRAINING_POSITIVES2) # train positive reviews
print 'size of positive vocabulary: {}'.format(len(pos.V))
neg = Laplace_Label(TRAINING_NEGATIVES2) # train on negative reviews
print 'size of negative vocabulary: {}'.format(len(neg.V))
finish = datetime.datetime.now() # stop timer
print 'done training, it took ', finish - start # print the time it took
# then we test...
start = datetime.datetime.now() # start timer
classified = [(foo(x), x) for x in TESTING2] # create list of (predicted label, fileid) pairs
finish = datetime.datetime.now()
print 'done testing, it took ', finish - start
# then we evaluate ...
print 'the F1 value is: {}\n'.format(evaluate(classified, TESTING2))


'''Round 3 with training corpus of 900 for every label. '''
print '''Round 3:\nThe training corpus is 900 reviews for every class\n
The testing corpus is 200 texts'''
# first we train...
start = datetime.datetime.now() #start the timer
pos = Laplace_Label(TRAINING_POSITIVES3) # train positive reviews
print 'size of positive vocabulary: {}'.format(len(pos.V))
neg = Laplace_Label(TRAINING_NEGATIVES3) # train on negative reviews
print 'size of negative vocabulary: {}'.format(len(neg.V))
finish = datetime.datetime.now() # stop timer
print 'done training, it took ', finish - start # print the time it took
# then we test...
start = datetime.datetime.now() # start timer
classified = [(foo(x), x) for x in TESTING3] # create list of (predicted label, fileid) pairs
finish = datetime.datetime.now()
print 'done testing, it took ', finish - start
# then we evaluate ...
print 'the F1 value is: {}\n'.format(evaluate(classified, TESTING3))


''' Sandbox '''

def combine(list1, list2):
    '''Stub -- not yet implemented.'''
    pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
'''This classifier uses the Good-Turing smoothing. I plan to compare its results | ||
with a classifier that uses Laplace smoothing. I want to compare the performance | ||
of these two algorithms on the movie_review corpus as well as on the spam corpus. | ||
''' | ||
|
||
import datetime | ||
from math import log | ||
from collections import defaultdict | ||
from nltk.corpus import movie_reviews as mr | ||
|
||
# Train/test splits at three sizes (500/700/900 per class), mirroring
# the Laplace classifier's setup.
# Fixed copy-paste bugs from the original: the 900-review block
# reassigned TRAINING_POSITIVES2 and TRAINING_NEGATIVES1 instead of
# defining the *3 names, and the third test split overwrote TESTING2
# instead of defining TESTING3.
CLASSES = mr.categories()
TRAINING_POSITIVES1 = mr.fileids(categories='pos')[:500]
TRAINING_NEGATIVES1 = mr.fileids(categories='neg')[:500]

TRAINING_POSITIVES2 = mr.fileids(categories='pos')[:700]
TRAINING_NEGATIVES2 = mr.fileids(categories='neg')[:700]

TRAINING_POSITIVES3 = mr.fileids(categories='pos')[:900]
TRAINING_NEGATIVES3 = mr.fileids(categories='neg')[:900]

TESTING1 = mr.fileids(categories='pos')[500:] + mr.fileids(categories='neg')[500:]
TESTING2 = mr.fileids(categories='pos')[700:] + mr.fileids(categories='neg')[700:]
TESTING3 = mr.fileids(categories='pos')[900:] + mr.fileids(categories='neg')[900:]
|
||
|
||
################ Training Machinery ######################## | ||
'''The way I'm doing this for now may not be very good for memory, since it
loads all of the texts into it together.
'''
|
||
class GT_Label():
    """A trained class model with (simple) Good-Turing smoothing.

    From a collection of corpus fileids it builds:
    - rev: all tokens of the collection, concatenated
    - V: the vocabulary (distinct tokens)
    - N: total token count
    - word_counts: token -> observed count (stored as float; unseen
      tokens read back as 0 via the defaultdict)
    - freq_counts: count -> number of words with that count, with
      freq_counts[0] seeded to N so unseen words get the N1/N mass.
    """
    def __init__(self, collection):
        '''collection: iterable of movie_reviews fileids'''
        # Local import: the file header imports only defaultdict
        # from collections.
        from collections import Counter
        self.rev = [word for text in collection for word in mr.words(text)]
        self.V = set(self.rev)
        self.N = len(self.rev)
        # One Counter pass replaces the original per-word list.count(),
        # which was quadratic in the corpus size.  Counts stay floats
        # so freq_counts keys match the original's.
        counts = Counter(self.rev)
        self.word_counts = defaultdict(int, ((w, float(counts[w])) for w in self.V))
        self.freq_counts = defaultdict(int, {0: self.N})
        for word in self.word_counts:
            self.freq_counts[self.word_counts[word]] += 1

    def get_prob(self, word):
        """Good-Turing probability estimate for *word*.

        Uses count* = (k+1) * N_{k+1} / N_k, where k is the observed
        count of *word* and N_k the number of words seen k times.
        NOTE(review): when no word occurs exactly k+1 times,
        N_{k+1} == 0 and the estimate is 0, which makes log() blow up
        downstream -- unchanged from the original ("REDO" markers);
        needs frequency-of-frequency smoothing (e.g. Gale & Sampson).
        """
        k = self.word_counts[word]
        count = (k+1)*self.freq_counts[k+1]/self.freq_counts[k]
        return count/self.N
|
||
|
||
################ Some Testing Machinery ######################### | ||
|
||
def prob(word, label):
    """Good-Turing probability of *word* under the model for *label*.

    Fixed: the original was copied from the Laplace classifier and read
    pos.word_probs / neg.word_probs -- attributes GT_Label never
    defines (hence its "REDO REDO" marker), so any lookup raised
    AttributeError.  GT_Label exposes get_prob(), which already handles
    unseen words through the count-0 frequency mass.
    Any label other than 'pos'/'neg' raises an Exception, as before.
    """
    if label == 'pos':
        return pos.get_prob(word)
    elif label == 'neg':
        return neg.get_prob(word)
    else:
        raise Exception('An invalid label was passed. Exiting...')
|
||
|
||
def cat_score(review, cat):
    """Negated sum of log word probabilities of *review* given *cat*.

    Lower raw log-likelihood means higher returned score, so max()
    downstream selects the likeliest class.
    """
    log_probs = (log(prob(word, cat)) for word in mr.words(review))
    return -sum(log_probs)
|
||
|
||
def foo(review):
    '''Return the class whose score for *review* is highest.'''
    best_score, best_cat = max((cat_score(review, cat), cat) for cat in CLASSES)
    return best_cat
|
||
|
||
############ Calculating Precision and recall ############ | ||
|
||
def evaluate(classified, test): | ||
''' function for evaluating our results. | ||
''' | ||
classified_pos = [x for x in classified if x[0] == 'pos'] | ||
true_positives = [x for x in classified_pos if mr.categories(x[1])[0] == x[0]] | ||
false_positives = [x for x in classified_pos if mr.categories(x[1])[0] != x[0]] | ||
precision = 100*float(len(true_positives))/len(classified_pos) | ||
print 'precision is:', precision | ||
recall = 100*len(true_positives)/float(len([x for x in test if mr.categories(x)[0]=='pos'])) | ||
print 'recall is', recall | ||
|
||
return 2*precision*recall/(precision+recall) | ||
|
||
|
||
############### Some Actual testing ##################
# Trains only the positive Good-Turing model and dumps its
# frequency-of-frequencies table; no classification/evaluation yet.
''' Round 1, with training corpus of 500 for every label.'''
print '''Trying to figure out where to introduce the quadratic function.'''
# first we train...
start = datetime.datetime.now() #start the timer
pos = GT_Label(TRAINING_POSITIVES1) # train positive reviews
print 'finished training, that took:', datetime.datetime.now() - start
print 'size of positive vocabulary: {}\n'.format(len(pos.V))
# dump the count -> number-of-words table for inspection
print pos.freq_counts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
Trying to figure out where to introduce the quadratic function. | ||
finished training, that took: 0:01:44.335017 | ||
size of positive vocabulary: 21809 | ||
|
||
defaultdict(<type 'int'>, {0: 394541, 1.0: 8973, 2.0: 3548, 3.0: 1802, 4.0: 1193, 5.0: 825, 6.0: 648, 7.0: 551, 8.0: 411, 9.0: 341, 10.0: 296, 11.0: 262, 12.0: 188, 13.0: 211, 14.0: 144, 15.0: 150, 16.0: 127, 17.0: 107, 18.0: 87, 19.0: 88, 20.0: 78, 21.0: 77, 22.0: 68, 23.0: 66, 24.0: 65, 25.0: 53, 26.0: 56, 27.0: 48, 28.0: 46, 29.0: 36, 30.0: 37, 31.0: 37, 32.0: 41, 33.0: 30, 34.0: 29, 35.0: 32, 36.0: 25, 37.0: 21, 38.0: 31, 39.0: 24, 40.0: 24, 41.0: 29, 42.0: 17, 43.0: 17, 44.0: 13, 45.0: 20, 46.0: 17, 47.0: 15, 48.0: 13, 49.0: 16, 50.0: 22, 51.0: 23, 52.0: 13, 53.0: 12, 54.0: 15, 55.0: 14, 56.0: 8, 57.0: 11, 58.0: 10, 59.0: 11, 60.0: 15, 61.0: 17, 62.0: 7, 63.0: 13, 64.0: 9, 65.0: 10, 66.0: 10, 67.0: 8, 68.0: 7, 69.0: 11, 70.0: 7, 71.0: 12, 72.0: 6, 73.0: 6, 74.0: 5, 75.0: 5, 76.0: 5, 78.0: 11, 79.0: 4, 80.0: 6, 81.0: 3, 82.0: 5, 83.0: 5, 84.0: 5, 85.0: 4, 86.0: 9, 87.0: 8, 88.0: 5, 89.0: 4, 90.0: 6, 91.0: 3, 92.0: 8, 93.0: 3, 94.0: 4, 95.0: 2, 96.0: 4, 97.0: 1, 98.0: 2, 99.0: 4, 100.0: 6, 101.0: 4, 102.0: 4, 103.0: 4, 104.0: 8, 105.0: 3, 106.0: 7, 107.0: 2, 108.0: 3, 109.0: 3, 110.0: 2, 112.0: 2, 113.0: 2, 114.0: 3, 115.0: 1, 116.0: 2, 118.0: 2, 120.0: 5, 121.0: 3, 122.0: 6, 123.0: 3, 124.0: 2, 125.0: 2, 126.0: 1, 127.0: 4, 128.0: 1, 129.0: 1, 130.0: 1, 131.0: 2, 133.0: 2, 134.0: 3, 136.0: 3, 138.0: 4, 139.0: 1, 140.0: 2, 141.0: 1, 142.0: 2, 143.0: 2, 144.0: 2, 145.0: 2, 146.0: 1, 147.0: 1, 148.0: 2, 149.0: 4, 151.0: 1, 152.0: 3, 153.0: 1, 155.0: 3, 156.0: 3, 2205.0: 1, 158.0: 3, 159.0: 1, 160.0: 2, 161.0: 1, 162.0: 1, 163.0: 3, 164.0: 2, 165.0: 1, 166.0: 1, 167.0: 1, 168.0: 3, 169.0: 2, 170.0: 1, 171.0: 1, 172.0: 1, 174.0: 2, 176.0: 1, 177.0: 2, 178.0: 4, 181.0: 1, 182.0: 3, 183.0: 1, 184.0: 1, 185.0: 3, 186.0: 1, 187.0: 1, 189.0: 1, 193.0: 2, 194.0: 3, 195.0: 1, 196.0: 1, 197.0: 3, 200.0: 1, 201.0: 1, 202.0: 2, 205.0: 1, 207.0: 1, 208.0: 1, 209.0: 1, 213.0: 1, 215.0: 2, 216.0: 2, 217.0: 1, 218.0: 1, 219.0: 2, 222.0: 1, 226.0: 2, 227.0: 2, 229.0: 1, 232.0: 1, 
234.0: 1, 235.0: 1, 236.0: 1, 239.0: 1, 241.0: 1, 242.0: 1, 243.0: 1, 244.0: 2, 249.0: 1, 250.0: 1, 251.0: 1, 257.0: 3, 258.0: 1, 259.0: 1, 262.0: 1, 263.0: 3, 264.0: 1, 265.0: 2, 266.0: 1, 269.0: 2, 271.0: 1, 273.0: 1, 274.0: 1, 277.0: 1, 282.0: 1, 287.0: 2, 289.0: 1, 291.0: 1, 294.0: 1, 2352.0: 1, 307.0: 2, 308.0: 1, 310.0: 1, 314.0: 1, 315.0: 1, 326.0: 1, 328.0: 1, 330.0: 1, 333.0: 1, 334.0: 2, 337.0: 1, 338.0: 1, 340.0: 1, 343.0: 1, 346.0: 1, 348.0: 1, 359.0: 1, 364.0: 1, 365.0: 1, 369.0: 1, 378.0: 1, 379.0: 1, 380.0: 1, 385.0: 1, 394.0: 1, 403.0: 1, 404.0: 1, 405.0: 1, 2465.0: 1, 426.0: 3, 431.0: 1, 433.0: 1, 437.0: 1, 446.0: 1, 447.0: 1, 448.0: 1, 453.0: 1, 456.0: 2, 464.0: 1, 466.0: 1, 467.0: 1, 2520.0: 1, 484.0: 1, 490.0: 1, 495.0: 1, 502.0: 1, 507.0: 1, 526.0: 1, 535.0: 1, 544.0: 1, 567.0: 1, 569.0: 1, 579.0: 1, 584.0: 1, 588.0: 1, 589.0: 1, 593.0: 1, 8786.0: 1, 597.0: 1, 610.0: 1, 611.0: 1, 619.0: 1, 6782.0: 1, 642.0: 1, 2708.0: 1, 671.0: 1, 675.0: 1, 676.0: 1, 677.0: 1, 684.0: 1, 705.0: 1, 712.0: 1, 802.0: 1, 725.0: 1, 727.0: 1, 745.0: 1, 4562.0: 1, 780.0: 1, 788.0: 1, 2850.0: 1, 806.0: 1, 811.0: 1, 830.0: 2, 2900.0: 1, 2906.0: 1, 862.0: 1, 872.0: 1, 882.0: 1, 3058.0: 1, 1028.0: 2, 7219.0: 1, 1097.0: 1, 1122.0: 1, 19586.0: 1, 1163.0: 1, 9356.0: 1, 1200.0: 1, 1207.0: 1, 1237.0: 1, 1250.0: 1, 1307.0: 1, 1337.0: 1, 1365.0: 1, 1390.0: 1, 2092.0: 1, 5490.0: 1, 1403.0: 1, 9616.0: 1, 1443.0: 1, 1446.0: 1, 3627.0: 1, 20033.0: 1, 1647.0: 1, 7792.0: 1, 1774.0: 1, 3838.0: 1, 1800.0: 1, 16199.0: 1, 3920.0: 1, 3923.0: 1, 1971.0: 1}) |
Oops, something went wrong.