-
Notifications
You must be signed in to change notification settings - Fork 0
/
inefficient_min_edit_distance.py
160 lines (129 loc) · 6.58 KB
/
inefficient_min_edit_distance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
MAX_LENGTH = 15 # It is for limitation the word of length.
TOP_FIVE = 5 # It is for most similar five words.
# It holds information about the word.
# index, specifies the line number of the word inside the file.
# word, specifies the word of the file.
# cost, specifies the edit-distance cost between the main word and the word inside the file.
class MostSimilarWord:
# It is constructor of the class.
def __init__(self, index=-1, word="?", cost=100): # Default values for the properties.
# Assign the properties.
self.index = index
self.word = word
self.cost = cost
# It prints the information about the class object
def print_msw(self):
print("==================")
print("Word :", self.word)
print("Cost :", self.cost)
print("==================")
# It reads the file line by line and fill the list with the words.
# It returns the list that has words of the file.
# file_name is the path of the file that will be read.
def read_random_file(file_name):
file_handle = open(file_name, "r", encoding='utf8')
read_line = file_handle.readlines()
words = []
for line in read_line:
line = line[: -1]
if len(line) > MAX_LENGTH:
line = line[: MAX_LENGTH]
words.append(line)
file_handle.close()
return words
# It gets the main word from the user and returns it.
# It can terminate the program, if the main word is null.
# If the main word has more than 15 letters, it ignores the rest.
def get_word():
word = input("Enter main word : ")
if not word:
print("You must enter non-empty word!\nProgram is terminating...")
exit()
elif len(word) > MAX_LENGTH:
word = word[: MAX_LENGTH]
return word
# It is an inefficient version of edit distance algorithm.
# It fills each costs of the words.
# word_1 is the main word that was entered by the user.
# words_of_file holds the words that was read from the file.
# top_5_words holds the most similar five words.
# Cost of insertion is 1.
# Cost of deletion is 1.
# Cost of substitution is 2.
def min_edit_distance(word_1, words_of_file, top_5_words):
number_of_words = len(words_of_file)
table = [[[0 for x in range(MAX_LENGTH + 1)] for y in range(MAX_LENGTH + 1)] for z in range(number_of_words)]
len_word_1 = len(word_1) # vertical word |
len_word_2 = 0 # horizontal word -
used_place_count = 0 # It is counter for the used places.
total_place = 0 # It is the for the total number of all places for words.
table_initialization(table, number_of_words)
# k (1000) # It represents the total number of the words in the file.
# i (1,16) # It represents the main word.
# j (1,16) # It represents a word which is in the file.
# The loops with i and j are base form of the minimum edit distance algorithm.
# There are 1000 words to compare distance, so that the loop with k is added.
for k in range(number_of_words):
word_2 = words_of_file[k]
len_word_2 = len(word_2)
total_place = total_place + (len_word_1 * len_word_2)
for i in range(1, len_word_1 + 1):
for j in range(1, len_word_2 + 1):
if word_1[i - 1] == word_2[j - 1]:
substitution_cost = 0
else:
substitution_cost = 2
table[k][i][j] = min(table[k][i][j - 1] + 1, table[k][i - 1][j] + 1,
table[k][i - 1][j - 1] + substitution_cost)
used_place_count = used_place_count + 1 # Increment the used_place_count.
# table[k][i][j] represents the cost of kth word's cost.
# If the current cost is smaller than the top_5_words cost, then remove the word which has the greatest cost.
# After removing, add the new word into the top_5_words list.
cost = table[k][i][j]
if cost < top_5_words[TOP_FIVE - 1].cost:
top_5_words.pop()
new_word = MostSimilarWord(k, word_2, cost)
top_5_words.append(new_word)
top_5_words = sorted(top_5_words, key=lambda x: x.cost) # Sorts the similar words based on the cost.
percentage_of_used_place = calculate_percentage(used_place_count, total_place) # Calculate the percentage of used places.
return top_5_words, percentage_of_used_place # Returns the most similar five words and the percentage.
# It calculates the percentage of used places of the 3D table.
def calculate_percentage(total_usage, total_place):
percentage_of_used_place = (total_usage * 100.0) / total_place
return percentage_of_used_place
# It does initialization of the all first row and all first column of the 3D table.
def table_initialization(table, word_of_file_count):
for k in range(word_of_file_count):
for j in range(MAX_LENGTH + 1):
table[k][0][j] = j
for i in range(MAX_LENGTH + 1):
table[k][i][0] = i
# It prints the 3D table based on the 2D table version.
def print_table(table):
row = len(table)
for i in range(row):
print(table[i])
if __name__ == '__main__':
random_file_names = ["Files/random_file_1.txt", "Files/random_file_2.txt",
"Files/random_file_3.txt", "Files/random_file_4.txt",
"Files/random_file_5.txt"]
random_words = read_random_file(random_file_names[0])
print("======================================================================")
print("It is an INEFFICIENT version of minimum edit distance algorithm.")
print("======================================================================")
print("======================================================================")
print("Test File is :", random_file_names[0])
print("======================================================================")
main_word = get_word()
most_similar_five_words = [MostSimilarWord() for i in range(TOP_FIVE)]
most_similar_five_words, percentage = min_edit_distance(main_word, random_words, most_similar_five_words)
print("======================================================================")
print("The Most Similar Five Words :")
for msfw in most_similar_five_words:
msfw.print_msw()
print("======================================================================")
print("Percentage of the Table :", percentage)
print("======================================================================")
print("======================================================================")
print("Created by Ali Haydar KURBAN")
print("======================================================================")