-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstredit.py
126 lines (117 loc) · 4.15 KB
/
stredit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import sys
def print_table(s1,s2,table,trace=None):
"""print the DP table, t, for strings s1 and s2.
If the optional 'trace' is present, print * indicators for the alignment.
Fancy formatting ensures this will also work when s1 and s2 are lists of strings"""
print " ",
for i in range(len(s1)):
print "%3.3s" % s1[i],
print
for i in range(len(table)):
if i > 0: print "%3.3s" % s2[i-1],
else: print ' ',
for j in range(len(table[i])):
if trace and trace[i][j] == "*":
print "*" + "%2d" % table[i][j],
else:
print "%3d" % table[i][j],
print
def argmin (*a):
"""Return two arguments: first the smallest value, second its offset"""
min = sys.maxint; arg = -1; i = 0
for x in a:
if (x < min):
min = x; arg = i
i += 1
return (min,arg)
def stredit (s1,s2, showtable=True):
"Calculate Levenstein edit distance for strings s1 and s2."
len1 = len(s1) # vertically
len2 = len(s2) # horizontally
# Allocate the table
table = [None]*(len2+1)
for i in range(len2+1): table[i] = [0]*(len1+1)
# Initialize the table
for i in range(1, len2+1): table[i][0] = i
for i in range(1, len1+1): table[0][i] = i
# Do dynamic programming
for i in range(1,len2+1):
for j in range(1,len1+1):
if s1[j-1] == s2[i-1]:
d = 0
else:
d = 1
table[i][j] = min(table[i-1][j-1] + d,
table[i-1][j]+1,
table[i][j-1]+1)
if showtable:
print_table(s1, s2, table)
return table[len2][len1]
def stredit2 (s1,s2, showtable=True):
"String edit distance, keeping trace of best alignment"
len1 = len(s1) # vertically
len2 = len(s2) # horizontally
# Allocate tables
table = [None]*(len2+1)
for i in range(len2+1): table[i] = [0]*(len1+1)
trace = [None]*(len2+1)
for i in range(len2+1): trace[i] = [None]*(len1+1)
# initialize table
for i in range(1, len2+1): table[i][0] = i
for i in range(1, len1+1): table[0][i] = i
# in the trace table, 0=subst, 1=insert, 2=delete
for i in range(1,len2+1): trace[i][0] = 1
for j in range(1,len1+1): trace[0][j] = 2
# Do dynamic programming
for i in range(1,len2+1):
for j in range(1,len1+1):
if s1[j-1] == s2[i-1]:
d = 0
else:
d = 1
# if true, the integer value of the first clause in the "or" is 1
table[i][j],trace[i][j] = argmin(table[i-1][j-1] + d,
table[i-1][j]+1,
table[i][j-1]+1)
if showtable:
# If you are implementing Smith-Waterman, then instead of initializing
# i=len2 and j=len1, you must initialize i and j to the indices
# of the table entry that has the miminum value (it will be negative)
i = len2
j = len1
while i != 0 or j != 0:
if trace[i][j] == 0:
nexti = i-1
nextj = j-1
elif trace[i][j] == 1:
nexti = i-1
nextj = j
elif trace[i][j] == 2:
nexti = i
nextj = j-1
else:
nexti = 0
nextj = 0
trace[i][j] = "*"
i = nexti
j = nextj
print "ij", i, j
print_table(s1, s2, table, trace)
return table[len2][len1]
#stredit2('mccallum', 'mcalllomo')
#stredit(['this', 'is', 'a', 'test'], ['this', 'will', 'be', 'another', 'test'])
#stredit2("s'allonger", "lounge")
#stredit2("lounge", "s'allonger")
#stredit2('cow over the moon', 'moon in the sky')
#stredit2('another fine day', 'anyone can dive')
#stredit2('another fine day in the park', 'anyone can see him pick the ball')
# import dicts
# argvlen = len(sys.argv)
# target = sys.argv[argvlen-2].lower()
# filename = sys.argv[argvlen-1]
# d = dicts.DefaultDict(0)
# for word in open(filename).read().split():
# if word not in d:
# word = word.lower()
# d[word] = stredit(word, target, False)
# print d.sorted(rev=False)[:20]