-
Notifications
You must be signed in to change notification settings - Fork 36
/
Copy pathgrammar.py
162 lines (133 loc) · 5.35 KB
/
grammar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python
"""
Implements a simple framework for defining grammars of the sort given
in table 1 of the paper. Use
python grammar.py
to run the demo, which creates the training and testing data used in
`synthesis.py`.
To see how the implementation works, it's probably easiest to study
gold_lexicon, rules, and functions below. Together, these implement
the example in table 1.
The only real difference from the paper is that this implementation
creates only binary branching structures, so that we can use the
familiar CYK parsing algorithm (in `Grammar.gen`). Thus, the
structures are like this:
three plus minus two
[('N', 'add(3)(neg(2))'),
[('B', 'add(3)'),
[('N', '3'), 'three'],
[('R', 'add'), 'plus']],
[('N', 'neg(2)'),
[('U', 'neg'), 'minus'],
[('N', '2'), 'two']]]
To create new grammars, you just need to define the following:
* A lexicon mapping strings to arrays of tuples, where the tuples
are (category, logical form) pairs.
* A set of rules, each a list [X, Y, Z, (a,b)], where X is the left
daughter category, Y is the right daughter category, Z is the
mother category, and (a,b) says which order of application to
use in the semantics: (0,1) means apply X to Y, and (1,0)
means apply Y to X.
* A set of Python functions that will interpret the logical forms
(second members of all the nonterminal node labels). If your
logical forms can be interpreted in native Python, this can be
empty.
"""
__author__ = "Christopher Potts and Percy Liang"
__credits__ = []
__license__ = "GNU general public license, version 2"
__version__ = "2.0"
__maintainer__ = "Christopher Potts"
__email__ = "See the authors' websites"
import sys
from collections import defaultdict
from itertools import product
class Grammar:
    """A minimal CYK-based grammar: a lexicon, binary rules, and a set of
    functions for interpreting the logical forms the rules build."""

    def __init__(self, lexicon, rules, functions):
        """For examples of these arguments, see below.

        lexicon -- dict mapping word strings to lists of
            (category, logical form) pairs.
        rules -- list of [X, Y, Z, (a, b)]: left daughter X, right
            daughter Y, mother Z; (0,1) applies X's semantics to Y's,
            (1,0) the reverse.
        functions -- dict mapping names used in logical forms to the
            Python callables that interpret them.
        """
        self.lexicon = lexicon
        self.rules = rules
        self.functions = functions

    def gen(self, s):
        """CYK parsing, but we just keep the full derivations. The input
        s should be a whitespace-separated string that can be parsed with
        this grammar. Returns the list of derivations spanning all of s;
        each derivation is [(category, logical form), left, right] or
        [(category, logical form), word] at the leaves."""
        words = s.split()
        n = len(words) + 1
        trace = defaultdict(list)
        # Seed the chart's diagonal with the lexical entries for each word.
        for i in range(1, n):
            word = words[i - 1]
            trace[(i - 1, i)] = [[(syntax, semantics), word]
                                 for syntax, semantics in self.lexicon[word]]
        # Standard CYK: combine every pair of adjacent sub-spans, keeping
        # whole derivations rather than just categories.
        for j in range(2, n):
            for i in range(j - 1, -1, -1):
                for k in range(i + 1, j):
                    for c1, c2 in product(trace[(i, k)], trace[(k, j)]):
                        for lfnode in self.allcombos(c1[0], c2[0]):
                            trace[(i, j)].append([lfnode, c1, c2])
        # Return only full parses, from the upper right of the chart:
        return trace[(0, n - 1)]

    def allcombos(self, c1, c2):
        """Given any two nonterminal node labels (category, logical form),
        find all the (mother, logical form) labels they can produce under
        self.rules."""
        results = []
        for left, right, mother, app_order in self.rules:
            if left == c1[0] and right == c2[0]:
                sem = [c1[1], c2[1]]
                # app_order picks which daughter's semantics is the functor.
                results.append((mother, "{}({})".format(
                    sem[app_order[0]], sem[app_order[1]])))
        return results

    def sem(self, lf):
        """Interpret, as Python code, the root of a logical form generated
        by this grammar, returning its denotation.

        Fix: the original version wrote self.functions into this module's
        namespace with setattr, permanently polluting module globals as a
        side effect of every call. We instead hand the interpretation
        functions to eval as an explicit environment, with builtins
        disabled since logical forms only use self.functions and literals.
        NOTE: eval is acceptable here only because lf is produced by this
        grammar, not taken from untrusted input.
        """
        env = dict(self.functions)
        return eval(lf[0][1], {"__builtins__": {}}, env)  # root node's semantics
# The lexicon from the paper. This is not used by any learning
# algorithm. Rather, it's here to create training and testing data.
# The number words 'one'..'nine' map to category N with their digit as
# the logical form; the operator words carry the names of functions
# defined below in `functions`.
_NUMBER_WORDS = ('one', 'two', 'three', 'four', 'five',
                 'six', 'seven', 'eight', 'nine')

gold_lexicon = {word: [('N', str(value))]
                for value, word in enumerate(_NUMBER_WORDS, start=1)}
gold_lexicon['plus'] = [('R', 'add')]
gold_lexicon['minus'] = [('R', 'subtract'), ('U', 'neg')]  # ambiguous: binary or unary
gold_lexicon['times'] = [('R', 'multiply')]
# The binarized version of the rule sets from the paper. These
# correspond to
#
#   N -> B N   semantics: apply B(N)
#   N -> U N   semantics: apply U(N)
#   B -> N R   semantics: apply R(N)
#
# Each rule is [left daughter, right daughter, mother, application order],
# where (0,1) applies the left daughter's semantics to the right's and
# (1,0) the reverse (see Grammar.allcombos).
rules = [
    ['B', 'N', 'N', (0,1)],  # partially applied binary operator takes its second argument
    ['U', 'N', 'N', (0,1)],  # unary operator applies to a number
    ['N', 'R', 'B', (1,0)]   # binary operator takes its first (left) argument
]
# These are needed to interpret our logical forms with eval. They are
# made available to the evaluation performed in Grammar.sem. The binary
# operators are curried: f(x) returns a function awaiting y, matching
# logical forms like 'add(3)(2)'.
def _neg(x):
    """Unary negation."""
    return -x

def _add(x):
    """Curried addition: _add(x)(y) == x + y."""
    return lambda y: x + y

def _subtract(x):
    """Curried subtraction: _subtract(x)(y) == x - y."""
    return lambda y: x - y

def _multiply(x):
    """Curried multiplication: _multiply(x)(y) == x * y."""
    return lambda y: x * y

functions = {
    'neg': _neg,
    'add': _add,
    'subtract': _subtract,
    'multiply': _multiply,
}
if __name__ == '__main__':
    # Demo: parse each test utterance and print every derivation found,
    # together with its denotation under the gold grammar.
    from semdata import test_utterances

    gram = Grammar(gold_lexicon, rules, functions)
    for utterance in test_utterances:
        parses = gram.gen(utterance)
        print("======================================================================")
        print('Utterance: {}'.format(utterance))
        for parse in parses:
            print("\tLF: {}".format(parse))
            print('\tDenotation: {}'.format(gram.sem(parse)))