First commit
iliakur committed Oct 14, 2012
0 parents commit 6b32728
Showing 2,843 changed files with 888,077 additions and 0 deletions.
111 changes: 111 additions & 0 deletions SW.py
@@ -0,0 +1,111 @@
'''A sample implementation of the Smith-Waterman algorithm, adapted to align
different members of a declension paradigm, with the end goal of separating
the morphemes they share from those in which they differ.
'''
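
# Illustrative note (added for clarity, not in the original file): for the
# pair ('kamok', 'kamka') below, the alignment should roughly line up the
# shared stem 'kam-' and leave the endings '-ok' / '-ka' as the differing
# material.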

from numpy import zeros

WORDS = [('help','unhelpful'), ('kamok','kamka')]
MATCH = 2
MISMATCH = -1
GAP = -1

def score(str1, str2):
if str1 == str2:
return MATCH
else:
return MISMATCH
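
# Hedged sketch (added for clarity): the cell update in the main loop below
# follows the Smith-Waterman recurrence, without the customary clamp at zero:
#     H[i][j] = max(H[i-1][j-1] + score(A[i-1], B[j-1]),
#                   H[i-1][j] + GAP,
#                   H[i][j-1] + GAP)
# Using the constants above, score('a', 'a') == MATCH == 2 and
# score('a', 'b') == MISMATCH == -1.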

class Alignments:

def __init__(self, matrix, s1, s2):
self.mat = matrix
self.a = s1
self.b = s2
self.Paths = []

def find_further_steps(self, x, y):
steps = [] # this will be our output
diag = self.mat[x-1][y-1]
up = self.mat[x-1][y]
left = self.mat[x][y-1]
if x == 0: # this is if we are at the top end of the matrix
steps.append((x,y-1))
elif y == 0: # this is if we are at the left end of it
steps.append((x-1,y))
else:
if diag == max(diag, up, left): # case A
steps.append((x-1,y-1))
if left == max(diag, up, left): # case B
steps.append((x,y-1))
if up == max(diag, up, left): # case C
steps.append((x-1,y))
return steps
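
    # Usage sketch (added, hypothetical values): if the neighbours of cell
    # (x, y) are diag == 4, up == 2 and left == 4, both (x-1, y-1) and
    # (x, y-1) are returned, so the traceback branches into two paths.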

    def find_path(self, starter, path=None):
        if path is None:
            path = []
        path.append(starter)
        solutions = self.find_further_steps(starter[0], starter[1])
        for solution in solutions:
            # exit clause: we have traced back to the origin
            if solution == (0,0):
                path.append(solution)
                print 'done', path
                return path
            else:
                further_path = self.find_path(solution, path=path)
                if further_path:
                    self.Paths.append(further_path)
                # backtrack: drop everything added past this branch point
                path = path[:path.index(solution)]


for pair in WORDS:
print 'working on', pair
A = pair[0]
B = pair[1]
    # extra row and column of zeros so that A[0] and B[0] are also compared
    matrix = zeros((len(A) + 1, len(B) + 1))
    maxScore = 0

    for i in range(1, len(A) + 1):
        for j in range(1, len(B) + 1):
            seq_score = score(A[i-1], B[j-1]) + matrix[i-1][j-1]
            matrix[i][j] = max(seq_score, matrix[i-1][j] + GAP, matrix[i][j-1] + GAP)
            if matrix[i][j] >= maxScore:
                imax = i
                jmax = j
                maxScore = matrix[i][j]
    print matrix

    test = Alignments(matrix, A, B)

test.find_path((imax, jmax), path=[])
print test.Paths

#maxindx = matrix.argmax()
#i = maxindx / matrix.shape[1]
#j = maxindx % matrix.shape[1]
#path = []
#i, j = imax, jmax
#while i > 0 and j > 0:
#diag = matrix[i-1][j-1]
#up = matrix[i-1][j]
#left = matrix[i][j-1]
#path.append((i,j))
#if diag == max(diag, up, left):
#i -= 1
#j -= 1
#elif up == max(diag, up, left):
#i -= 1
#elif left == max(diag, up, left):
#j -= 1
#while i > 0:
#path.append((i,j))
#i -= 1
#while j >0:
#path.append((i,j))
#j -= 1
##print imax, jmax
#print path
3 changes: 3 additions & 0 deletions SWNotes
@@ -0,0 +1,3 @@
traceback:


158 changes: 158 additions & 0 deletions bayes/classifier.py
@@ -0,0 +1,158 @@
'''
UMass Amherst
Compsci 585 - Introduction to Natural Language Processing.
Instructor: Andrew McCallum
Submission: Ilia Kurenkov
Assignment 4: Naive Bayes Classifier
This classifier, for testing purposes, uses the nltk movie review corpus.
'''
import datetime
# from operator import mul
from math import log
from nltk.corpus import movie_reviews as mr

CLASSES = mr.categories()
TRAINING_POSITIVES1 = mr.fileids(categories='pos')[:500]
TRAINING_NEGATIVES1 = mr.fileids(categories='neg')[:500]

TRAINING_POSITIVES2 = mr.fileids(categories='pos')[:700]
TRAINING_NEGATIVES2 = mr.fileids(categories='neg')[:700]

TRAINING_POSITIVES3 = mr.fileids(categories='pos')[:900]
TRAINING_NEGATIVES3 = mr.fileids(categories='neg')[:900]

TESTING1 = mr.fileids(categories='pos')[500:] + mr.fileids(categories='neg')[500:]
TESTING2 = mr.fileids(categories='pos')[700:] + mr.fileids(categories='neg')[700:]
TESTING3 = mr.fileids(categories='pos')[900:] + mr.fileids(categories='neg')[900:]

################ Training Machinery ########################
'''The way I'm doing this for now may not be very memory-efficient, since it
loads all of the texts into memory at once.
'''

class Laplace_Label():
"""this class represents a label. As it is initialized, it processes a
collection of filenames the following way:
1. texts corresponding to filenames are extracted and combined into a list
2. the vocabulary is created from a set of this list
3. Laplace smooting denominator is calculated
4. dictionary of word probabilities is created for class.
"""
def __init__(self, collection):
''' constructor takes collection of texts as arg
'''
self.rev = [word for text in collection for word in mr.words(text)]
self.V = set(self.rev)
self.N = len(self.rev) + len(self.V)
self.word_probs = dict([(w, float(self.rev.count(w)+1)/self.N) for w in self.V])
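
# Hedged illustration (added, not in the original file) of the same add-one
# estimate on a toy token list, without going through the corpus:
toy_tokens = ['the', 'cat', 'sat', 'the']               # 4 tokens, 3 types
toy_N = len(toy_tokens) + len(set(toy_tokens))          # 4 + 3 = 7
toy_p_the = float(toy_tokens.count('the') + 1) / toy_N  # (2 + 1) / 7
# Note that self.rev.count(w) in the comprehension above is quadratic in the
# corpus size; collections.Counter would give the same counts in one pass.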

################ Some Testing Machinery #########################

def prob(word, label):
    '''Returns the smoothed probability of a word under a label; unseen
    words fall back to 1/N, the add-one estimate for a zero count.'''
if label == 'pos':
try:
return pos.word_probs[word]
except KeyError:
return 1.0/pos.N
elif label == 'neg':
try:
return neg.word_probs[word]
except KeyError:
return 1.0/neg.N
else:
raise Exception('An invalid label was passed. Exiting...')


def cat_score(review, cat):
    '''Scores a class for a document by summing the log probabilities of the
    words in the document given the class (the Naive Bayes log-likelihood);
    higher means more likely.
    '''
    return sum([log(prob(word, cat)) for word in mr.words(review)])
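
# Added note (not in the original file): summing logs is the standard way to
# avoid floating-point underflow. Multiplying thousands of small word
# probabilities directly, e.g. 0.0001 ** 5000, underflows to 0.0, while the
# equivalent 5000 * log(0.0001) stays representable.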


def classify(review):
    '''Returns the most likely class for a review.'''
    return max([(cat_score(review, cat), cat) for cat in CLASSES])[1]

############ Calculating Precision and recall ############

def evaluate(classified, test):
    '''Evaluates the results: prints precision and recall for the 'pos' class
    and returns the F1 score.
    '''
classified_pos = [x for x in classified if x[0] == 'pos']
true_positives = [x for x in classified_pos if mr.categories(x[1])[0] == x[0]]
false_positives = [x for x in classified_pos if mr.categories(x[1])[0] != x[0]]
precision = 100*float(len(true_positives))/len(classified_pos)
print 'precision is:', precision
recall = 100*len(true_positives)/float(len([x for x in test if mr.categories(x)[0]=='pos']))
print 'recall is', recall

return 2*precision*recall/(precision+recall)
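
# Hedged worked example (added for clarity, hypothetical numbers): with
# precision = 80.0 and recall = 60.0 the returned value is
# 2 * 80.0 * 60.0 / (80.0 + 60.0) ~= 68.57, the harmonic mean of the two.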


############### Some Actual testing ##################
''' Round 1, with training corpus of 500 for every label.'''
print '''Round 1:\nThe training corpus is 500 reviews for every class\n
The testing corpus is 1000 texts'''
# first we train...
start = datetime.datetime.now() #start the timer
pos = Laplace_Label(TRAINING_POSITIVES1) # train positive reviews
print 'size of positive vocabulary: {}'.format(len(pos.V))
neg = Laplace_Label(TRAINING_NEGATIVES1) # train on negative reviews
print 'size of negative vocabulary: {}'.format(len(neg.V))
finish = datetime.datetime.now() # stop timer
print 'done training, it took ', finish - start # print the time it took
# then we test...
start = datetime.datetime.now() # start timer
classified = [(classify(x), x) for x in TESTING1] # classify each test review, keeping (label, fileid)
finish = datetime.datetime.now()
print 'done testing, it took ', finish - start
# then we evaluate ...
print 'the F1 value is: {}\n'.format(evaluate(classified, TESTING1))


'''Round 2 with training corpus of 700 for every label. '''
print '''Round 2:\nThe training corpus is 700 reviews for every class\n
The testing corpus is 600 texts'''
# first we train...
start = datetime.datetime.now() #start the timer
pos = Laplace_Label(TRAINING_POSITIVES2) # train positive reviews
print 'size of positive vocabulary: {}'.format(len(pos.V))
neg = Laplace_Label(TRAINING_NEGATIVES2) # train on negative reviews
print 'size of negative vocabulary: {}'.format(len(neg.V))
finish = datetime.datetime.now() # stop timer
print 'done training, it took ', finish - start # print the time it took
# then we test...
start = datetime.datetime.now() # start timer
classified = [(classify(x), x) for x in TESTING2] # classify each test review, keeping (label, fileid)
finish = datetime.datetime.now()
print 'done testing, it took ', finish - start
# then we evaluate ...
print 'the F1 value is: {}\n'.format(evaluate(classified, TESTING2))


'''Round 3 with training corpus of 900 for every label. '''
print '''Round 3:\nThe training corpus is 900 reviews for every class\n
The testing corpus is 200 texts'''
# first we train...
start = datetime.datetime.now() #start the timer
pos = Laplace_Label(TRAINING_POSITIVES3) # train positive reviews
print 'size of positive vocabulary: {}'.format(len(pos.V))
neg = Laplace_Label(TRAINING_NEGATIVES3) # train on negative reviews
print 'size of negative vocabulary: {}'.format(len(neg.V))
finish = datetime.datetime.now() # stop timer
print 'done training, it took ', finish - start # print the time it took
# then we test...
start = datetime.datetime.now() # start timer
classified = [(classify(x), x) for x in TESTING3] # classify each test review, keeping (label, fileid)
finish = datetime.datetime.now()
print 'done testing, it took ', finish - start
# then we evaluate ...
print 'the F1 value is: {}\n'.format(evaluate(classified, TESTING3))

''' Sandbox '''

def combine(list1, list2):
pass
112 changes: 112 additions & 0 deletions bayes/gt-classifier.py
@@ -0,0 +1,112 @@
'''This classifier uses the Good-Turing smoothing. I plan to compare its results
with a classifier that uses Laplace smoothing. I want to compare the performance
of these two algorithms on the movie_review corpus as well as on the spam corpus.
'''

import datetime
from math import log
from collections import defaultdict
from nltk.corpus import movie_reviews as mr

CLASSES = mr.categories()
TRAINING_POSITIVES1 = mr.fileids(categories='pos')[:500]
TRAINING_NEGATIVES1 = mr.fileids(categories='neg')[:500]

TRAINING_POSITIVES2 = mr.fileids(categories='pos')[:700]
TRAINING_NEGATIVES2 = mr.fileids(categories='neg')[:700]

TRAINING_POSITIVES3 = mr.fileids(categories='pos')[:900]
TRAINING_NEGATIVES3 = mr.fileids(categories='neg')[:900]

TESTING1 = mr.fileids(categories='pos')[500:] + mr.fileids(categories='neg')[500:]
TESTING2 = mr.fileids(categories='pos')[700:] + mr.fileids(categories='neg')[700:]
TESTING3 = mr.fileids(categories='pos')[900:] + mr.fileids(categories='neg')[900:]


################ Training Machinery ########################
'''The way I'm doing this for now may not be very memory-efficient, since it
loads all of the texts into memory at once.
'''

class GT_Label():
"""this class represents a label. As it is initialized, it processes a
collection of filenames the following way:
REDO REDO REDO REDO
1. texts corresponding to filenames are extracted and combined into a list
2. the vocabulary is created from a set of this list
3. Laplace smooting denominator is calculated
4. dictionary of word probabilities is created for class.
"""
def __init__(self, collection):
''' constructor takes collection of texts as arg
'''
self.rev = [word for text in collection for word in mr.words(text)]
self.V = set(self.rev)
self.N = len(self.rev)
self.word_counts = defaultdict(int, [(w, float(self.rev.count(w))) for w in self.V])
self.freq_counts = defaultdict(int, {0:self.N})
        for word in self.word_counts:
            self.freq_counts[self.word_counts[word]] += 1

    def get_prob(self, word):
        '''Good-Turing estimate: the adjusted count r* = (r+1) * N_{r+1} / N_r,
        divided by the total token count N. Note that N_{r+1} can be 0 for
        high counts, in which case the estimate is 0 as well.'''
        k = self.word_counts[word]
        count = (k+1)*self.freq_counts[k+1]/self.freq_counts[k]
        return count/self.N
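
    # Hedged worked example (added, using the frequency counts printed in
    # gt-counts below): if a word was seen k = 1 time, 8973 words were seen
    # once and 3548 twice, so its adjusted count is (1 + 1) * 3548 / 8973
    # ~= 0.79, and its probability is that value divided by the token count N.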


################ Some Testing Machinery #########################

def prob(word, label):
    '''Looks up the Good-Turing probability of a word under a label, falling
    back to 1/N when the adjusted count comes out zero (i.e. when no word was
    seen exactly k+1 times).
    '''
    if label == 'pos':
        model = pos
    elif label == 'neg':
        model = neg
    else:
        raise Exception('An invalid label was passed. Exiting...')
    p = model.get_prob(word)
    return p if p > 0 else 1.0 / model.N


def cat_score(review, cat):
    '''Scores a class for a document by summing the log probabilities of the
    words in the document given the class (the Naive Bayes log-likelihood);
    higher means more likely.
    '''
    return sum([log(prob(word, cat)) for word in mr.words(review)])


def classify(review):
    '''Returns the most likely class for a review.'''
    return max([(cat_score(review, cat), cat) for cat in CLASSES])[1]


############ Calculating Precision and recall ############

def evaluate(classified, test):
    '''Evaluates the results: prints precision and recall for the 'pos' class
    and returns the F1 score.
    '''
classified_pos = [x for x in classified if x[0] == 'pos']
true_positives = [x for x in classified_pos if mr.categories(x[1])[0] == x[0]]
false_positives = [x for x in classified_pos if mr.categories(x[1])[0] != x[0]]
precision = 100*float(len(true_positives))/len(classified_pos)
print 'precision is:', precision
recall = 100*len(true_positives)/float(len([x for x in test if mr.categories(x)[0]=='pos']))
print 'recall is', recall

return 2*precision*recall/(precision+recall)


############### Some Actual testing ##################
''' Round 1, with training corpus of 500 for every label.'''
print '''Trying to figure out where to introduce the quadratic function.'''
# first we train...
start = datetime.datetime.now() #start the timer
pos = GT_Label(TRAINING_POSITIVES1) # train positive reviews
print 'finished training, that took:', datetime.datetime.now() - start
print 'size of positive vocabulary: {}\n'.format(len(pos.V))
print pos.freq_counts
5 changes: 5 additions & 0 deletions bayes/gt-counts
@@ -0,0 +1,5 @@
Trying to figure out where to introduce the quadratic function.
finished training, that took: 0:01:44.335017
size of positive vocabulary: 21809

defaultdict(<type 'int'>, {0: 394541, 1.0: 8973, 2.0: 3548, 3.0: 1802, 4.0: 1193, 5.0: 825, 6.0: 648, 7.0: 551, 8.0: 411, 9.0: 341, 10.0: 296, 11.0: 262, 12.0: 188, 13.0: 211, 14.0: 144, 15.0: 150, 16.0: 127, 17.0: 107, 18.0: 87, 19.0: 88, 20.0: 78, 21.0: 77, 22.0: 68, 23.0: 66, 24.0: 65, 25.0: 53, 26.0: 56, 27.0: 48, 28.0: 46, 29.0: 36, 30.0: 37, 31.0: 37, 32.0: 41, 33.0: 30, 34.0: 29, 35.0: 32, 36.0: 25, 37.0: 21, 38.0: 31, 39.0: 24, 40.0: 24, 41.0: 29, 42.0: 17, 43.0: 17, 44.0: 13, 45.0: 20, 46.0: 17, 47.0: 15, 48.0: 13, 49.0: 16, 50.0: 22, 51.0: 23, 52.0: 13, 53.0: 12, 54.0: 15, 55.0: 14, 56.0: 8, 57.0: 11, 58.0: 10, 59.0: 11, 60.0: 15, 61.0: 17, 62.0: 7, 63.0: 13, 64.0: 9, 65.0: 10, 66.0: 10, 67.0: 8, 68.0: 7, 69.0: 11, 70.0: 7, 71.0: 12, 72.0: 6, 73.0: 6, 74.0: 5, 75.0: 5, 76.0: 5, 78.0: 11, 79.0: 4, 80.0: 6, 81.0: 3, 82.0: 5, 83.0: 5, 84.0: 5, 85.0: 4, 86.0: 9, 87.0: 8, 88.0: 5, 89.0: 4, 90.0: 6, 91.0: 3, 92.0: 8, 93.0: 3, 94.0: 4, 95.0: 2, 96.0: 4, 97.0: 1, 98.0: 2, 99.0: 4, 100.0: 6, 101.0: 4, 102.0: 4, 103.0: 4, 104.0: 8, 105.0: 3, 106.0: 7, 107.0: 2, 108.0: 3, 109.0: 3, 110.0: 2, 112.0: 2, 113.0: 2, 114.0: 3, 115.0: 1, 116.0: 2, 118.0: 2, 120.0: 5, 121.0: 3, 122.0: 6, 123.0: 3, 124.0: 2, 125.0: 2, 126.0: 1, 127.0: 4, 128.0: 1, 129.0: 1, 130.0: 1, 131.0: 2, 133.0: 2, 134.0: 3, 136.0: 3, 138.0: 4, 139.0: 1, 140.0: 2, 141.0: 1, 142.0: 2, 143.0: 2, 144.0: 2, 145.0: 2, 146.0: 1, 147.0: 1, 148.0: 2, 149.0: 4, 151.0: 1, 152.0: 3, 153.0: 1, 155.0: 3, 156.0: 3, 2205.0: 1, 158.0: 3, 159.0: 1, 160.0: 2, 161.0: 1, 162.0: 1, 163.0: 3, 164.0: 2, 165.0: 1, 166.0: 1, 167.0: 1, 168.0: 3, 169.0: 2, 170.0: 1, 171.0: 1, 172.0: 1, 174.0: 2, 176.0: 1, 177.0: 2, 178.0: 4, 181.0: 1, 182.0: 3, 183.0: 1, 184.0: 1, 185.0: 3, 186.0: 1, 187.0: 1, 189.0: 1, 193.0: 2, 194.0: 3, 195.0: 1, 196.0: 1, 197.0: 3, 200.0: 1, 201.0: 1, 202.0: 2, 205.0: 1, 207.0: 1, 208.0: 1, 209.0: 1, 213.0: 1, 215.0: 2, 216.0: 2, 217.0: 1, 218.0: 1, 219.0: 2, 222.0: 1, 226.0: 2, 227.0: 2, 229.0: 1, 232.0: 1, 234.0: 1, 235.0: 1, 236.0: 1, 239.0: 1, 241.0: 1, 242.0: 1, 243.0: 1, 244.0: 2, 249.0: 1, 250.0: 1, 251.0: 1, 257.0: 3, 258.0: 1, 259.0: 1, 262.0: 1, 263.0: 3, 264.0: 1, 265.0: 2, 266.0: 1, 269.0: 2, 271.0: 1, 273.0: 1, 274.0: 1, 277.0: 1, 282.0: 1, 287.0: 2, 289.0: 1, 291.0: 1, 294.0: 1, 2352.0: 1, 307.0: 2, 308.0: 1, 310.0: 1, 314.0: 1, 315.0: 1, 326.0: 1, 328.0: 1, 330.0: 1, 333.0: 1, 334.0: 2, 337.0: 1, 338.0: 1, 340.0: 1, 343.0: 1, 346.0: 1, 348.0: 1, 359.0: 1, 364.0: 1, 365.0: 1, 369.0: 1, 378.0: 1, 379.0: 1, 380.0: 1, 385.0: 1, 394.0: 1, 403.0: 1, 404.0: 1, 405.0: 1, 2465.0: 1, 426.0: 3, 431.0: 1, 433.0: 1, 437.0: 1, 446.0: 1, 447.0: 1, 448.0: 1, 453.0: 1, 456.0: 2, 464.0: 1, 466.0: 1, 467.0: 1, 2520.0: 1, 484.0: 1, 490.0: 1, 495.0: 1, 502.0: 1, 507.0: 1, 526.0: 1, 535.0: 1, 544.0: 1, 567.0: 1, 569.0: 1, 579.0: 1, 584.0: 1, 588.0: 1, 589.0: 1, 593.0: 1, 8786.0: 1, 597.0: 1, 610.0: 1, 611.0: 1, 619.0: 1, 6782.0: 1, 642.0: 1, 2708.0: 1, 671.0: 1, 675.0: 1, 676.0: 1, 677.0: 1, 684.0: 1, 705.0: 1, 712.0: 1, 802.0: 1, 725.0: 1, 727.0: 1, 745.0: 1, 4562.0: 1, 780.0: 1, 788.0: 1, 2850.0: 1, 806.0: 1, 811.0: 1, 830.0: 2, 2900.0: 1, 2906.0: 1, 862.0: 1, 872.0: 1, 882.0: 1, 3058.0: 1, 1028.0: 2, 7219.0: 1, 1097.0: 1, 1122.0: 1, 19586.0: 1, 1163.0: 1, 9356.0: 1, 1200.0: 1, 1207.0: 1, 1237.0: 1, 1250.0: 1, 1307.0: 1, 1337.0: 1, 1365.0: 1, 1390.0: 1, 2092.0: 1, 5490.0: 1, 1403.0: 1, 9616.0: 1, 1443.0: 1, 1446.0: 1, 3627.0: 1, 20033.0: 1, 1647.0: 1, 7792.0: 1, 1774.0: 1, 3838.0: 1, 1800.0: 1, 16199.0: 1, 3920.0: 1, 3923.0: 1, 1971.0: 1})