-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhmm.py
74 lines (65 loc) · 2.88 KB
/
hmm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# This code should help get you started, but it is not guaranteed to
# be bug free! If you find problems, please report them.
import sys
from collections import defaultdict
from random import choice

from dicts import DefaultDict
def Dict(**entries):
    """Build a dictionary whose keys are the keyword-argument names
    and whose values are the corresponding argument values."""
    return entries
def hmm(file):
    """Train a bigram HMM from pre-tagged WSJ sentences.

    Given an open FILE, e.g. from the open(filename) function, read
    pre-tagged sentences of WSJ, one per line (tokens are space-separated
    "word/TAG" pairs).  Return an HMM, here represented as a tuple
    containing (1) the transition counts transitions[prevtag][tag] and
    (2) the emission counts emissions[tag][word].

    NOTE: these are still raw counts; smoothing them into conditional
    probabilities is left as a TODO below.
    """
    # One independent inner counter per outer key.  The original
    # DefaultDict(DefaultDict(0)) passed a single inner-dict instance as
    # the default, risking one shared inner dict across every key.
    transitions = defaultdict(lambda: defaultdict(int))
    emissions = defaultdict(lambda: defaultdict(int))
    wordcounts = defaultdict(int)
    # For each sentence (one per line).  Iterating the file object
    # directly replaces the Python-2-only file.xreadlines().
    for line in file:
        # for each word in the sentence (space separated)
        prevtag = 'START'  # Before each sentence, begin in START state
        for taggedword in line.split():
            # rsplit handles tokens that themselves contain '/'
            # (e.g. "1/2/CD"): the tag is whatever follows the LAST slash.
            (word, tag) = taggedword.rsplit('/', 1)
            transitions[prevtag][tag] += 1
            emissions[tag][word] += 1
            wordcounts[word] += 1
            # BUG FIX: the original never advanced the state, so every
            # transition was counted as coming from START.
            prevtag = tag
    # At test time we will need estimates for "unknown words" --- the
    # words that never occurred in the training data.  One recommended
    # way is to turn all training words occurring just once into
    # '<UNKNOWN>' and use this as the stand-in for all "unknown words"
    # at test time.  Materialize items() into lists so deleting entries
    # while looping is safe (Python 3 views forbid concurrent mutation).
    for tag, worddict in list(emissions.items()):
        for word, count in list(worddict.items()):
            if wordcounts[word] == 1:
                del worddict[word]
                worddict['<UNKNOWN>'] += 1
    # TODO: turn these dictionaries of counts into dictionaries of
    # smoothed conditional probabilities.
    return (transitions, emissions)
def viterbi_tags(untagged_sentence):
    """Given a string containing the space-separated words of a sentence
    (there should even be spaces on either side of punctuation, as in the
    WSJ training data), return an array containing the most likely
    sequence of part-of-speech tags."""
    words = untagged_sentence.split()
    # TODO: implement the Viterbi algorithm here and return the most
    # likely tag sequence for `words`.  Currently an unimplemented stub.
def true_tags(tagged_sentence):
    """Given a string containing the space-separated word/POS tokens of a
    sentence (there should even be spaces on either side of punctuation,
    as in the WSJ training data), pull out and return the tag sequence.

    Uses rsplit so the tag is whatever follows the LAST slash of each
    token: the original split('/')[1] returned the wrong piece for
    tokens containing an embedded '/' such as "1/2/CD".
    """
    wordarray = tagged_sentence.split()
    tags = [token.rsplit('/', 1)[-1] for token in wordarray]
    return tags
if __name__ == '__main__':
    # Print usage information, then train an HMM on pre-tagged sentences
    # read from standard input and dump the raw count tables.
    # (Converted from Python 2 print statements, which are syntax errors
    # under Python 3; all message strings are unchanged.)
    print("Usage:", sys.argv[0], "wsjtrainfile wsjtestfile")
    dirs = sys.argv[1:-1]    # NOTE(review): parsed but currently unused
    testfile = sys.argv[-1]  # NOTE(review): parsed but currently unused
    h = hmm(sys.stdin)
    print(h[0])
    print('------')
    print(h[1])
    print(true_tags('The/DT August/NNP deficit/NN and/CC the/DT #/# 2.2/CD billion/CD gap/NN'))