-
Notifications
You must be signed in to change notification settings - Fork 36
/
Copy pathgrammar.py
162 lines (133 loc) · 5.35 KB
/
grammar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python
"""
Implements a simple framework for defining grammars of the sort given
in table 1 of the paper. Use
python grammar.py
to run the demo, which creates the training and testing data used in
`synthesis.py`.
To see how the implementation works, it's probably easiest to study
gold_lexicon, rules, and functions below. Together, these implement
the example in table 1.
The only real difference from the paper is that this implementation
creates only binary branching structures, so that we can use the
familiar CYK parsing algorithm (in `Grammar.gen`). Thus, the
structures are like this:
three plus minus two
[('N', 'add(3)(neg(2))'),
[('B', 'add(3)'),
[('N', '3'), 'three'],
[('R', 'add'), 'plus']],
[('N', 'neg(2)'),
[('U', 'neg'), 'minus'],
[('N', '2'), 'two']]]
To create new grammars, you just need to define the following:
* A lexicon mapping strings to arrays of tuples, where the tuples
are (category, logical form) pairs.
* A set of rules, each a list [X, Y, Z, (a,b)], where X is the left
daughter category, Y is the right daughter category, Z is the
mother category, and (a,b) says which order of application to
use in the semantics: (0,1) means apply X to Y, and (1,0)
means apply Y to X.
* A set of Python functions that will interpret the logical forms
(second members of all the nonterminal node labels). If your
logical forms can be interpreted in native Python, this can be
empty.
"""
__author__ = "Christopher Potts and Percy Liang"
__credits__ = []
__license__ = "GNU general public license, version 2"
__version__ = "2.0"
__maintainer__ = "Christopher Potts"
__email__ = "See the authors' websites"
import sys
from collections import defaultdict
from itertools import product
class Grammar:
    """A minimal CYK-based grammar: a lexicon, binary rules, and a set of
    functions for interpreting the logical forms the rules build."""

    def __init__(self, lexicon, rules, functions):
        """For examples of these arguments, see below.

        lexicon -- dict mapping word strings to lists of
            (category, logical form) pairs.
        rules -- list of [X, Y, Z, (a, b)]: left daughter X, right
            daughter Y, mother Z; (0,1) applies X's semantics to Y's,
            (1,0) the reverse.
        functions -- dict mapping names used in logical forms to the
            Python callables that interpret them.
        """
        self.lexicon = lexicon
        self.rules = rules
        self.functions = functions

    def gen(self, s):
        """CYK parsing, but we just keep the full derivations. The input
        s should be a whitespace-separated string that can be parsed with
        this grammar. Returns the list of derivations spanning all of s;
        each derivation is [(category, logical form), left, right] or
        [(category, logical form), word] at the leaves."""
        words = s.split()
        n = len(words) + 1
        trace = defaultdict(list)
        # Seed the chart's diagonal with the lexical entries for each word.
        for i in range(1, n):
            word = words[i - 1]
            trace[(i - 1, i)] = [[(syntax, semantics), word]
                                 for syntax, semantics in self.lexicon[word]]
        # Standard CYK: combine every pair of adjacent sub-spans, keeping
        # whole derivations rather than just categories.
        for j in range(2, n):
            for i in range(j - 1, -1, -1):
                for k in range(i + 1, j):
                    for c1, c2 in product(trace[(i, k)], trace[(k, j)]):
                        for lfnode in self.allcombos(c1[0], c2[0]):
                            trace[(i, j)].append([lfnode, c1, c2])
        # Return only full parses, from the upper right of the chart:
        return trace[(0, n - 1)]

    def allcombos(self, c1, c2):
        """Given any two nonterminal node labels (category, logical form),
        find all the (mother, logical form) labels they can produce under
        self.rules."""
        results = []
        for left, right, mother, app_order in self.rules:
            if left == c1[0] and right == c2[0]:
                sem = [c1[1], c2[1]]
                # app_order picks which daughter's semantics is the functor.
                results.append((mother, "{}({})".format(
                    sem[app_order[0]], sem[app_order[1]])))
        return results

    def sem(self, lf):
        """Interpret, as Python code, the root of a logical form generated
        by this grammar, returning its denotation.

        Fix: the original version wrote self.functions into this module's
        namespace with setattr, permanently polluting module globals as a
        side effect of every call. We instead hand the interpretation
        functions to eval as an explicit environment, with builtins
        disabled since logical forms only use self.functions and literals.
        NOTE: eval is acceptable here only because lf is produced by this
        grammar, not taken from untrusted input.
        """
        env = dict(self.functions)
        return eval(lf[0][1], {"__builtins__": {}}, env)  # root node's semantics
# The lexicon from the paper. This is not used by any learning
# algorithm. Rather, it's here to create training and testing data.
# The number words 'one'..'nine' map to category N with their digit as
# the logical form; the operator words carry the names of functions
# defined below in `functions`.
_NUMBER_WORDS = ('one', 'two', 'three', 'four', 'five',
                 'six', 'seven', 'eight', 'nine')

gold_lexicon = {word: [('N', str(value))]
                for value, word in enumerate(_NUMBER_WORDS, start=1)}
gold_lexicon['plus'] = [('R', 'add')]
gold_lexicon['minus'] = [('R', 'subtract'), ('U', 'neg')]  # ambiguous: binary or unary
gold_lexicon['times'] = [('R', 'multiply')]
# The binarized version of the rule sets from the paper. These
# correspond to
#
#   N -> B N   semantics: apply B(N)
#   N -> U N   semantics: apply U(N)
#   B -> N R   semantics: apply R(N)
#
# Each rule is [left daughter, right daughter, mother, application order],
# where (0,1) applies the left daughter's semantics to the right's and
# (1,0) the reverse (see Grammar.allcombos).
rules = [
    ['B', 'N', 'N', (0,1)],  # partially applied binary operator takes its second argument
    ['U', 'N', 'N', (0,1)],  # unary operator applies to a number
    ['N', 'R', 'B', (1,0)]   # binary operator takes its first (left) argument
]
# These are needed to interpret our logical forms with eval. They are
# made available to the evaluation performed in Grammar.sem. The binary
# operators are curried: f(x) returns a function awaiting y, matching
# logical forms like 'add(3)(2)'.
def _neg(x):
    """Unary negation."""
    return -x

def _add(x):
    """Curried addition: _add(x)(y) == x + y."""
    return lambda y: x + y

def _subtract(x):
    """Curried subtraction: _subtract(x)(y) == x - y."""
    return lambda y: x - y

def _multiply(x):
    """Curried multiplication: _multiply(x)(y) == x * y."""
    return lambda y: x * y

functions = {
    'neg': _neg,
    'add': _add,
    'subtract': _subtract,
    'multiply': _multiply,
}
if __name__ == '__main__':
    # Demo: parse each test utterance and print every derivation found,
    # together with its denotation under the gold grammar.
    from semdata import test_utterances

    gram = Grammar(gold_lexicon, rules, functions)
    for utterance in test_utterances:
        parses = gram.gen(utterance)
        print("======================================================================")
        print('Utterance: {}'.format(utterance))
        for parse in parses:
            print("\tLF: {}".format(parse))
            print('\tDenotation: {}'.format(gram.sem(parse)))