diff --git a/CHANGELOG.md b/CHANGELOG.md index 584992f1..17bb270e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,12 +48,33 @@ these changes are prefixed with "**BREAKING**" * `delphin.util.LookaheadIterator` for parsing with arbitrary lookahead * `delphin.commands` module to contain logic for `delphin` commands (#140) * `tests/commands_test.py` to test invocation of commands (but not results) +* `delphin.tsql` module for TSQL queries of testsuites +* `delphin.exceptions.TSQLSyntaxError` +* `delphin.itsdb` + - `Relations.find()` returns the names of tables defining a field + - `Relations.path()` returns a path of `(table, shared_field)` tuples + describing how to get from one table to another via shared keys + - `TestSuite.write()` takes an optional `relations` parameter to write + a profile with an updated relations file (#150) + - `TestSuite.exists()` (#150) + - `TestSuite.size()` (#150) + - `Record.get()` takes a `cast` parameter; when `True`, values are cast + to the field's datatype + - `select_rows()` takes a `cast` parameter as with `Record.get()` ### Changed * `delphin.tdl` now parses triple-quoted docstrings (#167); note that it no longer parses the old-style docstrings * `delphin.tdl.TdlDefinition` inherits from `delphin.tfs.FeatureStructure` +* `delphin.itsdb.TestSuite` no longer casts values by default (see note on + `Record.get()` above) +* `delphin.itsdb.TestSuite.process()` can take a `Table` as the `source` + instead of just `TestSuite`. +* **BREAKING** The `delphin` commands have all replaced the previous + method of filtering testsuites with the new TSQL queries. Applicators + are no longer available to commands. Please see `delphin -h` for + updated usage notes. 
(#138, #179) ### Deprecated @@ -65,6 +86,9 @@ these changes are prefixed with "**BREAKING**" * **BREAKING** `delphin.tdl.TdlDefinition.comment`; replaced by `TdlType.docstring` +* **BREAKING** `--filter` option on all commands +* **BREAKING** `--apply` option on all commands +* **BREAKING** `--join` option on the `select` command ## [v0.8.0][] diff --git a/README.md b/README.md index de21daf0..db503624 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ The following packages/modules are available: - `derivation`: [Derivation trees](http://moin.delph-in.net/ItsdbDerivations) - `itsdb`: [incr tsdb()] profiles +- `tsql`: TSQL testsuite queries - `mrs`: [Minimal Recursion Semantics](http://moin.delph-in.net/MrsRfc) - `tdl`: [Type-Description Language](http://moin.delph-in.net/TdlRfc) - `tfs`: Typed-Feature Structures diff --git a/delphin/commands.py b/delphin/commands.py index b9781812..ddbc3100 100644 --- a/delphin/commands.py +++ b/delphin/commands.py @@ -14,7 +14,7 @@ import json from functools import partial -from delphin import itsdb +from delphin import itsdb, tsql from delphin.mrs import xmrs from delphin.util import safe_int, SExpr @@ -59,9 +59,8 @@ def convert(path, source_fmt, target_fmt, select='result:mrs', pretty_print = True indent = 4 if indent is True else safe_int(indent) - sel_table, sel_col = itsdb.get_data_specifier(select) - if len(sel_col) != 1: - raise ValueError('Exactly 1 column must be given in data selector ' + if len(tsql.inspect_query('select ' + select)['projection']) != 1: + raise ValueError('Exactly 1 column must be given in selection query: ' '(e.g., result:mrs)') # read @@ -71,10 +70,10 @@ def convert(path, source_fmt, target_fmt, select='result:mrs', elif hasattr(path, 'read'): xs = loads(path.read()) elif os.path.isdir(path): - p = itsdb.ItsdbProfile(path) + ts = itsdb.TestSuite(path) xs = [ next(iter(loads(r[0])), None) - for r in p.select(sel_table, sel_col) + for r in tsql.select(select, ts) ] else: xs = loads(open(path, 
'r').read()) @@ -248,52 +247,33 @@ def _read_ace_parse(s): ############################################################################### ### SELECT #################################################################### -def select(dataspec, testsuite, join=None, - filters=None, applicators=None, mode='list'): +def select(dataspec, testsuite, mode='list', cast=True): """ Select data from [incr tsdb()] profiles. Args: - dataspec (str): data specifier for [incr tsdb()] profiles - (e.g., `"parse:readings"` or `"result:mrs"`) - testsuite (str, ItsdbProfile): testsuite or path to testsuite + query (str): TSQL select query (e.g., `'i-id i-input mrs'` or + `'* from item where readings > 0'`) + testsuite (str, TestSuite): testsuite or path to testsuite containing data to select - join (tuple): 2-tuple of table names to join; the *dataspec* - should then include the table name for all columns - (e.g., `"parse:i-id@result:mrs"`) - filters (list): see :class:`delphin.itsdb.ItsdbProfile` for a - description of filters - applicators (list): see :class:`delphin.itsdb.ItsdbProfile` - for a description of applicators - mode (str): see :meth:`delphin.itsdb.ItsdbProfile.select` for - a description of the *mode* parameter + mode (str): see :func:`delphin.itsdb.select_rows` for a + description of the *mode* parameter (default: `list`) + cast (bool): if `True`, cast column values to their datatype + according to the relations file (default: `True`) Returns: a generator that yields selected data """ - if not isinstance(testsuite, itsdb.ItsdbProfile): - assert os.path.isdir(testsuite) - testsuite = _prepare_input_profile( - testsuite, filters=filters, applicators=applicators) - if join: - tbl1, tbl2 = join - rows = testsuite.join(tbl1, tbl2, key_filter=True) - # Adding : is just for robustness. 
We need something like - # :table:col@table@col, but may have gotten table:col@table@col - if not dataspec.startswith(':'): - dataspec = ':' + dataspec - table, cols = itsdb.get_data_specifier(dataspec) - else: - table, cols = itsdb.get_data_specifier(dataspec) - rows = testsuite.read_table(table, key_filter=True) - - return itsdb.select_rows(cols, rows, mode=mode) + if isinstance(testsuite, itsdb.ItsdbProfile): + testsuite = itsdb.TestSuite(testsuite.root) + elif not isinstance(testsuite, itsdb.TestSuite): + testsuite = itsdb.TestSuite(testsuite) + return tsql.select(dataspec, testsuite, mode=mode, cast=cast) ############################################################################### ### MKPROF #################################################################### -def mkprof(destination, source=None, relations=None, - filters=None, applicators=None, +def mkprof(destination, source=None, relations=None, where=None, in_place=False, skeleton=False, full=False, gzip=False): """ Create [incr tsdb()] profiles or skeletons. 
@@ -316,10 +296,8 @@ def mkprof(destination, source=None, relations=None, relations (str): path to a relations file to use for the created testsuite; if `None` and *source* is given, the relations file of the source testsuite is used - filters (list): see :class:`delphin.itsdb.ItsdbProfile` for a - description of filters - applicators (list): see :class:`delphin.itsdb.ItsdbProfile` - for a description of applicators + where (str): TSQL condition to filter records by; ignored if + *source* is not a testsuite in_place (bool): if `True` and *source* is not given, use *destination* as the source for data (default: `False`) skeleton (bool): if `True`, only write tsdb-core files @@ -346,36 +324,37 @@ def mkprof(destination, source=None, relations=None, elif relations is None or not os.path.isfile(relations): raise ValueError('invalid or missing relations file: {}' .format(relations)) - + # setup destination testsuite _prepare_output_directory(destination) - o = itsdb.TestSuite(path=destination, relations=relations) + dts = itsdb.TestSuite(path=destination, relations=relations) # input is sentences on stdin if source is None: - o.write({'item': _lines_to_rows(sys.stdin)}, gzip=gzip) + dts.write({'item': _lines_to_rows(sys.stdin)}, gzip=gzip) # input is sentence file elif os.path.isfile(source): with open(source) as fh: - o.write({'item': _lines_to_rows(fh)}, gzip=gzip) - # input is profile + dts.write({'item': _lines_to_rows(fh)}, gzip=gzip) + # input is source testsuite elif os.path.isdir(source): - p = _prepare_input_profile( - source, filters=filters, applicators=applicators) - if full: - p.write_profile( - destination, - relations_filename=relations, - key_filter=True, - gzip=gzip) - else: - for table in itsdb.tsdb_core_files: - if p.size(table) > 0: - o.write({table: p.read_table(table)}, gzip=gzip) - o.reload() + sts = itsdb.TestSuite(source) + tables = dts.relations.tables if full else itsdb.tsdb_core_files + where = '' if where is None else 'where ' + where + for 
table in tables: + if sts.size(table) > 0: + # filter the data, but use all if the query fails + # (e.g., if the filter and table cannot be joined) + try: + rows = tsql.select( + '* from {} {}'.format(table, where), sts, cast=False) + except itsdb.ItsdbError: + rows = sts[table] + dts.write({table: rows}, gzip=gzip) + dts.reload() # unless a skeleton was requested, make empty files for other tables if not skeleton: - for table in o.relations: - if len(o[table]) == 0: - o.write({table: []}) + for table in dts.relations: + if len(dts[table]) == 0: + dts.write({table: []}) # summarize what was done if sys.stdout.isatty(): @@ -383,15 +362,13 @@ def mkprof(destination, source=None, relations=None, else: _red = lambda s: s fmt = '{:>8} bytes\t{}' - prof = itsdb.ItsdbProfile(destination, index=False) - relations = prof.relations - for filename in ['relations'] + list(relations.tables): - f = os.path.join(destination, filename) - if os.path.isfile(f): - stat = os.stat(f) + for filename in ['relations'] + list(dts.relations.tables): + path = os.path.join(destination, filename) + if os.path.isfile(path): + stat = os.stat(path) print(fmt.format(stat.st_size, filename)) - elif os.path.isfile(f + '.gz'): - stat = os.stat(f + '.gz') + elif os.path.isfile(path + '.gz'): + stat = os.stat(path + '.gz') print(fmt.format(stat.st_size, _red(filename + '.gz'))) @@ -406,8 +383,7 @@ def _lines_to_rows(lines): ############################################################################### ### PROCESS ################################################################### -def process(grammar, testsuite, source=None, selector=None, - filters=None, applicators=None, +def process(grammar, testsuite, source=None, select=None, generate=False, transfer=False, all_items=False, result_id=None): """ @@ -415,61 +391,89 @@ def process(grammar, testsuite, source=None, selector=None, Results are written to directly to *testsuite*. 
+ If *select* is `None`, the defaults depend on the task: + + ========== ========================= + Task Default value of *select* + ========== ========================= + Parsing `item:i-input` + Transfer `result:mrs` + Generation `result:mrs` + ========== ========================= + Args: grammar (str): path to a compiled grammar image testsuite (str): path to a [incr tsdb()] testsuite where data will be read from (see *source*) and written to - source (str): see :meth:`delphin.itsdb.TestSuite.process` for - a description of *source* - selector (str): see :meth:`delphin.itsdb.TestSuite.process` - for a description of *selector* - filters (list): see :class:`delphin.itsdb.ItsdbProfile` for a - description of filters - applicators (list): see :class:`delphin.itsdb.ItsdbProfile` - for a description of applicators + source (str): path to a [incr tsdb()] testsuite; if `None`, + *testsuite* is used as the source of data + select (str): TSQL query for selecting processor inputs + (default depends on the processor type) generate (bool): if `True`, generate instead of parse (default: `False`) transfer (bool): if `True`, transfer instead of parse (default: `False`) all_items (bool): if `True`, don't exclude ignored items - (those with `i-wf==2`) + (those with `i-wf==2`) when parsing result_id (int): if given, only keep items with the specified `result-id` """ from delphin.interfaces import ace + if generate and transfer: raise ValueError("'generate' is incompatible with 'transfer'") if source is None: source = testsuite - if filters is None: - filters = [] - if result_id is not None: - filters.append( - ('result', ['result-id'], lambda row, x: x == result_id)) + if select is None: + select = 'result:mrs' if (generate or transfer) else 'item:i-input' if generate: processor = ace.AceGenerator elif transfer: processor = ace.AceTransferer else: if not all_items: - filters.append( - ('item', ['i-wf'], lambda row, x: x != 2)) + select += ' where i-wf != 2' processor = 
ace.AceParser + if result_id is not None: + select += ' where result-id == {}'.format(result_id) - source = _prepare_input_profile( - source, - filters=filters, - applicators=applicators, - cast=True) - + source = itsdb.TestSuite(source) target = itsdb.TestSuite(testsuite) + column, tablename, condition = _interpret_selection(select, source) + table = itsdb.Table( + tablename, + source[tablename].fields, + tsql.select( + '* from {} {}'.format(tablename, condition), + source, + cast=False)) with processor(grammar) as cpu: - target.process(cpu, selector, source=source) + target.process(cpu, tablename + ':' + column, source=table) target.write() +def _interpret_selection(select, source): + queryobj = tsql.inspect_query('select ' + select) + projection = queryobj['projection'] + if projection == '*' or len(projection) != 1: + raise ValueError("'select' must return a single column") + tablename, _, column = projection[0].rpartition(':') + if not tablename: + # query could be 'i-input from item' instead of 'item:i-input' + if len(queryobj['tables']) == 1: + tablename = queryobj['tables'][0] + # otherwise guess + else: + tablename = source.relations.find(column)[0] + try: + condition = select[select.index(' where ') + 1:] + except ValueError: + condition = '' + return column, tablename, condition + + ############################################################################### ### REPP ###################################################################### @@ -555,20 +559,17 @@ def _repp(r, line, format, trace_level): ############################################################################### ### COMPARE ################################################################### -def compare(testsuite, gold, filters=None, applicators=None): +def compare(testsuite, gold, select='i-id i-input mrs'): """ Compare two [incr tsdb()] profiles. - Any filters or applicators are applied to both the test and gold - testsuites. 
- Args: - testsuite: path to the test [incr tsdb()] testsuite - gold: path to the gold [incr tsdb()] testsuite - filters (list): see :class:`delphin.itsdb.ItsdbProfile` for a - description of filters - applicators (list): see :class:`delphin.itsdb.ItsdbProfile` - for a description of applicators + testsuite (str, TestSuite): path to the test [incr tsdb()] + testsuite or a :class:`TestSuite` object + gold (str, TestSuite): path to the gold [incr tsdb()] + testsuite or a :class:`TestSuite` object + select: TSQL query to select (id, input, mrs) triples + (default: `i-id i-input mrs`) Yields: dict: Comparison results as: @@ -582,25 +583,32 @@ def compare(testsuite, gold, filters=None, applicators=None): """ from delphin.mrs import simplemrs, compare as mrs_compare - test_profile = _prepare_input_profile( - testsuite, - filters=filters, - applicators=applicators) - gold_profile = _prepare_input_profile( - gold, - filters=filters, - applicators=applicators) + if not isinstance(testsuite, itsdb.TestSuite): + if isinstance(testsuite, itsdb.ItsdbProfile): + testsuite = testsuite.root + testsuite = itsdb.TestSuite(testsuite) + if not isinstance(gold, itsdb.TestSuite): + if isinstance(gold, itsdb.ItsdbProfile): + gold = gold.root + gold = itsdb.TestSuite(gold) + + queryobj = tsql.inspect_query('select ' + select) + if len(queryobj['projection']) != 3: + raise ValueError('select does not return 3 fields: ' + select) - i_inputs = dict((row['parse:parse-id'], row['item:i-input']) - for row in test_profile.join('item', 'parse')) + input_select = '{} {}'.format(queryobj['projection'][0], + queryobj['projection'][1]) + i_inputs = dict(tsql.select(input_select, testsuite)) matched_rows = itsdb.match_rows( - test_profile.read_table('result'), gold_profile.read_table('result'), - 'parse-id') + tsql.select(select, testsuite), + tsql.select(select, gold), + 0) + for (key, testrows, goldrows) in matched_rows: (test_unique, shared, gold_unique) = mrs_compare.compare_bags( - 
[simplemrs.loads_one(row['mrs']) for row in testrows], - [simplemrs.loads_one(row['mrs']) for row in goldrows]) + [simplemrs.loads_one(row[2]) for row in testrows], + [simplemrs.loads_one(row[2]) for row in goldrows]) yield {'id': key, 'input': i_inputs[key], 'test': test_unique, @@ -611,12 +619,6 @@ def compare(testsuite, gold, filters=None, applicators=None): ############################################################################### ### HELPER FUNCTIONS ########################################################## -def _prepare_input_profile(path, filters=None, applicators=None, cast=False): - index = filters is not None and len(filters) > 0 - prof = itsdb.ItsdbProfile( - path, filters=filters, applicators=applicators, index=index, cast=cast) - return prof - def _prepare_output_directory(path): try: diff --git a/delphin/exceptions.py b/delphin/exceptions.py index 07720027..6e05f4e0 100644 --- a/delphin/exceptions.py +++ b/delphin/exceptions.py @@ -88,3 +88,35 @@ class TdlWarning(PyDelphinWarning): class REPPError(PyDelphinException): """Raised when there is an error in tokenizing with REPP.""" pass + +class TSQLSyntaxError(PyDelphinException): + def __init__(self, *args, **kwargs): + # Python2 doesn't allow parameters like: + # (*args, key=val, **kwargs) + # so do this manually. 
+ lineno = offset = 0 + text = None + if 'lineno' in kwargs: + lineno = kwargs['lineno'] + del kwargs['lineno'] + if 'offset' in kwargs: + offset = kwargs['offset'] + del kwargs['offset'] + if 'text' in kwargs: + text = kwargs['text'] + del kwargs['text'] + + super(TSQLSyntaxError, self).__init__(*args, **kwargs) + self.lineno = lineno + self.offset = offset + self.text = text + + def __str__(self): + display = '' + if self.text is not None: + display = '\n {}\n {}^'.format(self.text, ' ' * self.offset) + return ('Syntax error at line {}, position {}:{}\n{}' + .format(self.lineno or '?', + self.offset or '?', + display, + super(TSQLSyntaxError, self).__str__())) diff --git a/delphin/itsdb.py b/delphin/itsdb.py index 84f475ad..015a5ef2 100644 --- a/delphin/itsdb.py +++ b/delphin/itsdb.py @@ -92,7 +92,9 @@ from delphin.exceptions import ItsdbError -from delphin.util import safe_int, stringtypes, deprecated +from delphin.util import ( + safe_int, stringtypes, deprecated, parse_datetime +) from delphin.interfaces.base import FieldMapper ############################################################################## @@ -192,8 +194,11 @@ def __new__(cls, name, fields): tr._keys = tuple(f.name for f in fields if f.key) return tr + def __contains__(self, name): + return name in self._index + def index(self, fieldname): - """Return the Field given by *fieldname*.""" + """Return the Field index given by *fieldname*.""" return self._index[fieldname] def keys(self): @@ -201,6 +206,82 @@ def keys(self): return self._keys +class _RelationJoin(Relation): + def __new__(cls, rel1, rel2, on=None): + if set(rel1.name.split('+')).intersection(rel2.name.split('+')): + raise ItsdbError('Cannot join tables with the same name; ' + 'try renaming the table.') + + name = '{}+{}'.format(rel1.name, rel2.name) + # the relation of the joined table, merging shared columns in *on* + if isinstance(on, stringtypes): + on = _split_cols(on) + elif on is None: + on = [] + + fields = 
_prefixed_relation_fields(rel1, on, False) + fields.extend(_prefixed_relation_fields(rel2, on, True)) + r = super(_RelationJoin, cls).__new__(cls, name, fields) + + # reset _keys to be a unique tuple of column-only forms + keys = list(rel1.keys()) + seen = set(keys) + for key in rel2.keys(): + if key not in seen: + keys.append(key) + seen.add(key) + r._keys = tuple(keys) + + return r + + def __contains__(self, name): + try: + self.index(name) + except KeyError: + return False + except ItsdbError: + pass # ambiguous field name + return True + + def index(self, fieldname): + if ':' not in fieldname: + qfieldnames = [] + for table in self.name.split('+'): + qfieldname = table + ':' + fieldname + if qfieldname in self._index: + qfieldnames.append(qfieldname) + if len(qfieldnames) > 1: + raise ItsdbError( + "ambiguous field name; include the table name " + "(e.g., 'item:i-id' instead of 'i-id')") + elif len(qfieldnames) == 1: + fieldname = qfieldnames[0] + else: + pass # lookup should return KeyError + elif fieldname not in self._index: + # join keys don't get prefixed + uqfieldname = fieldname.rpartition(':')[2] + if uqfieldname in self._keys: + fieldname = uqfieldname + return self._index[fieldname] + + +def _prefixed_relation_fields(relation, on, drop): + fields = [] + already_joined = isinstance(relation, _RelationJoin) + for f in relation: + table, _, fieldname = f[0].rpartition(':') + if already_joined : + prefix = table + ':' if table else '' + else: + prefix = relation.name + ':' + if fieldname in on and not drop: + fields.append(Field(fieldname, *f[1:])) + elif fieldname not in on: + fields.append(Field(prefix + fieldname, *f[1:])) + return fields + + class Relations(object): """ A [incr tsdb()] database schema. 
@@ -214,8 +295,10 @@ class Relations(object): """ def __init__(self, tables): + tables = [(t[0], Relation(*t)) for t in tables] self.tables = tuple(t[0] for t in tables) - self._data = dict((t[0], Relation(*t)) for t in tables) + self._data = dict(tables) + self._field_map = _make_field_map(t[1] for t in tables) @classmethod def from_file(cls, source): @@ -289,6 +372,73 @@ def items(self): """Return a list of (table, :class:`Relation`) for each table.""" return [(table, self[table]) for table in self] + def find(self, fieldname): + """ + Return the list of tables that define the field *fieldname*. + """ + tablename, _, column = fieldname.rpartition(':') + if tablename and tablename in self._field_map[column]: + return tablename + else: + return self._field_map[fieldname] + + def path(self, source, target): + """ + Find the path of id fields connecting two tables. + + This is just a basic breadth-first-search. The relations file + should be small enough to not be a problem. + + Returns: + list: (table, fieldname) pairs describing the path from + the source to target tables + Raises: + :class:`ItsdbError` when no path is found + Example: + >>> relations.path('item', 'result') + [('parse', 'i-id'), ('result', 'parse-id')] + >>> relations.path('parse', 'item') + [('item', 'i-id')] + >>> relations.path('item', 'item') + [] + """ + visited = set(source.split('+')) # split on + for joins + targets = set(target.split('+')) - visited + # ensure sources and targets exists + for tablename in visited.union(targets): + self[tablename] + # base case; nothing to do + if len(targets) == 0: + return [] + paths = [[(tablename, None)] for tablename in visited] + while True: + newpaths = [] + for path in paths: + laststep, pivot = path[-1] + if laststep in targets: + return path[1:] + else: + for key in self[laststep].keys(): + for step in set(self.find(key)) - visited: + visited.add(step) + newpaths.append(path + [(step, key)]) + if newpaths: + paths = newpaths + else: + break + + 
raise ItsdbError('no relation path found from {} to {}' + .format(source, target)) + + + +def _make_field_map(rels): + g = {} + for rel in rels: + for field in rel: + g.setdefault(field.name, []).append(rel.name) + return g + ############################################################################## # Test items and test suites @@ -334,7 +484,7 @@ def __init__(self, fields, iterable): iterable[i] = value self.fields = fields - list.__init__(self, iterable) + super(Record, self).__init__(iterable) def __repr__(self): return "<{} '{}' {}>".format( @@ -357,7 +507,7 @@ def __setitem__(self, index, value): # should the value be validated against the datatype? return list.__setitem__(self, index, value) - def get(self, key, default=None): + def get(self, key, default=None, cast=False): """ Return the field data given by field name *key*. @@ -365,10 +515,26 @@ def get(self, key, default=None): key: the field name of the data to return default: the value to return if *key* is not in the row """ + tablename, _, key = key.rpartition(':') + if tablename and tablename not in self.fields.name.split('+'): + raise ItsdbError('column requested from wrong table: {}' + .format(tablename)) try: - return self[key] - except KeyError: - return default + index = self.fields.index(key) + value = list.__getitem__(self, index) + except (KeyError, IndexError): + value = default + else: + if cast: + dt = self.fields[index].datatype + if dt == ':integer': + value = int(value) + elif dt == ':float': + value = float(value) + elif dt == ':date': + value = parse_datetime(value) + # others? 
+ return value class Table(list): @@ -433,9 +599,7 @@ def from_file(cls, path, name=None, fields=None, encoding='utf-8'): records = [] with _open_table(path, encoding) as tab: - records.extend( - map((lambda s: decode_row(s, fields)), tab) - ) + records.extend(map((lambda s: decode_row(s)), tab)) return cls(name, fields, records) @@ -549,7 +713,8 @@ def select(self, arg, cols=None, mode='list'): cols = [f.name for f in self.relations[table]] return select_rows(cols, self[table], mode=mode) - def write(self, tables=None, path=None, append=False, gzip=None): + def write(self, tables=None, path=None, relations=None, + append=False, gzip=None): """ Write the testsuite to disk. @@ -559,6 +724,8 @@ def write(self, tables=None, path=None, append=False, gzip=None): all tables will be written path: the destination directory; if `None` use the path assigned to the TestSuite + relations: a :class:`Relations` object or path to a + relations file to be used when writing the tables append: if `True`, append to rather than overwrite tables gzip: compress non-empty tables with gzip Examples: @@ -577,27 +744,80 @@ def write(self, tables=None, path=None, append=False, gzip=None): pass elif isinstance(tables, Sequence): tables = dict((table, self[table]) for table in tables) + if relations is None: + relations = self.relations + elif isinstance(relations, stringtypes): + relations = Relations.from_file(relations) # prepare destination if not os.path.exists(path): os.makedirs(path) # raise error if path != self._path? 
- if not os.path.isfile(os.path.join(path, _relations_filename)): - with open(os.path.join(path, _relations_filename), 'w') as fh: - print(str(self.relations), file=fh) - - for tablename, data in tables.items(): - # reload table from disk if it is invalidated - if data is None: - data = self[tablename] - _write_table( - path, - tablename, - data, - self.relations[tablename], - gzip=gzip, - encoding=self.encoding - ) + with open(os.path.join(path, _relations_filename), 'w') as fh: + print(str(relations), file=fh) + + for tablename, relation in relations.items(): + if tablename in tables: + data = tables[tablename] + # reload table from disk if it is invalidated + if data is None: + data = self[tablename] + elif not isinstance(data, Table): + data = Table(tablename, relation, data) + _write_table( + path, + tablename, + data, + relation, + gzip=gzip, + encoding=self.encoding + ) + + def exists(self, table=None): + """ + Return `True` if the testsuite or a table exists on disk. + + If *table* is `None`, this function returns `True` if the + :attr:`TestSuite.path` is specified and points to an existing + directory containing a valid relations file. If *table* is + given, the function returns `True` if, in addition to the + above conditions, the table exists as a file (even if + empty). Otherwise it returns False. + """ + if self._path is None or not os.path.isdir(self._path): + return False + if not os.path.isfile(os.path.join(self._path, _relations_filename)): + return False + if table is not None: + try: + _table_filename(os.path.join(self._path, table)) + except ItsdbError: + return False + return True + + def size(self, table=None): + """ + Return the size, in bytes, of the testsuite or *table*. + + If *table* is `None`, return the size of the whole testsuite + (i.e., the sum of the table sizes). Otherwise, return the + size of *table*. + + Notes: + * If the file is gzipped, it returns the compressed size. + * Only tables on disk are included. 
+ """ + size = 0 + if table is None: + for table in self.relations: + size += self.size(table) + else: + try: + fn = _table_filename(os.path.join(self._path, table)) + size += os.stat(fn).st_size + except ItsdbError: + pass + return size def process(self, cpu, selector=None, source=None, fieldmapper=None): """ @@ -609,43 +829,34 @@ def process(self, cpu, selector=None, source=None, fieldmapper=None): :class:`~delphin.interfaces.ace.AceParser`) selector (str): data specifier to select a single table and column as processor input (e.g., `"item:i-input"`) - source (:class:`TestSuite`): testsuite from which input - items are taken; if `None`, use `self` + source (:class:`TestSuite`, :class:`Table`): testsuite or + table from which inputs are taken; if `None`, use `self` fieldmapper (:class:`~delphin.interfaces.base.FieldMapper`): object for mapping response fields to [incr tsdb()] fields; if `None`, use a default mapper for the standard schema Examples: - >>> ts.process(ace_parser, 'item:i-input') + >>> ts.process(ace_parser) >>> ts.process(ace_generator, 'result:mrs', source=ts2) """ if selector is None: selector = _default_task_input_selectors.get(cpu.task) - - data_table, data_col = get_data_specifier(selector) - if len(data_col) != 1: - raise ItsdbError( - 'Selector must specify exactly one data column: {}' - .format(selector) - ) - data_col = data_col[0] - key_fields = [f for f in self.relations[data_table] if f.key] - cols = [f.name for f in key_fields] - cols.append(data_col) - if source is None: source = self - if fieldmapper is None: fieldmapper = FieldMapper() + source, cols = _prepare_source(selector, source) + key_cols = cols[:-1] + tables = {} - for item in source.select(data_table, cols, mode='dict'): - datum = item.pop(data_col) - response = cpu.process_item(datum, keys=item) + for item in select_rows(cols, source, mode='list'): + datum = item.pop() + keys = dict(zip(key_cols, item)) + response = cpu.process_item(datum, keys=keys) logging.info( 
'Processed item {:>16} {:>8} results' - .format(make_row(item, key_fields), len(response['results'])) + .format(encode_row(item), len(response['results'])) ) for tablename, data in fieldmapper.map(response): _add_record(tables, tablename, data, self.relations) @@ -657,6 +868,20 @@ def process(self, cpu, selector=None, source=None, fieldmapper=None): self._data[tablename] = table +def _prepare_source(selector, source): + tablename, fields = get_data_specifier(selector) + if len(fields) != 1: + raise ItsdbError( + 'Selector must specify exactly one data column: {}' + .format(selector) + ) + if isinstance(source, TestSuite): + if not tablename: + tablename = source.relations.find(fields[0])[0] + source = source[tablename] + cols = list(source.fields.keys()) + fields + return source, cols + def _add_record(tables, tablename, data, relations): fields = relations[tablename] if tablename not in tables: @@ -732,7 +957,10 @@ def decode_row(line, fields=None): col = int(col) elif dt == ':float': col = float(col) - # other casts? date? + elif dt == ':date': + dt = parse_datetime(col) + col = dt if dt is not None else col + # other casts? :position? cols[i] = col return cols @@ -908,7 +1136,7 @@ def make_row(row, fields): return encode_row(row_fields) -def select_rows(cols, rows, mode='list'): +def select_rows(cols, rows, mode='list', cast=True): """ Yield data selected from rows. @@ -928,24 +1156,32 @@ def select_rows(cols, rows, mode='list'): cols: an iterable of column names to select data for rows: the rows to select column data from mode: the form yielded data should take + cast: if `True`, cast column values to their datatype + (requires *rows* to be :class:`Record` objects) Yields: Selected data in the form specified by *mode*. 
""" mode = mode.lower() if mode == 'list': - cast = lambda cols, data: data + modecast = lambda cols, data: data elif mode == 'dict': - cast = lambda cols, data: dict(zip(cols, data)) + modecast = lambda cols, data: dict(zip(cols, data)) elif mode == 'row': - cast = lambda cols, data: encode_row(data) + modecast = lambda cols, data: encode_row(data) else: raise ItsdbError('Invalid mode for select operation: {}\n' ' Valid options include: list, dict, row' .format(mode)) for row in rows: - data = [row.get(c) for c in cols] - yield cast(cols, data) + if cast: + try: + data = [row.get(c, cast=True) for c in cols] + except TypeError: + data = [row.get(c) for c in cols] + else: + data = [row.get(c) for c in cols] + yield modecast(cols, data) def match_rows(rows1, rows2, key, sort_keys=True): @@ -979,8 +1215,8 @@ def join(table1, table2, on=None, how='inner', name=None): Fields in the resulting table have their names prefixed with their corresponding table name. For example, when joining `item` and `parse` tables, the `i-input` field of the `item` table will be - named `item:i-input` in the resulting Table. Note that this means - the shared keys will appear twice---once for each table. + named `item:i-input` in the resulting Table. Pivot fields (those + in *on*) are only stored once without the prefix. Both inner and left joins are possible by setting the *how* parameter to `inner` and `left`, respectively. 
@@ -990,48 +1226,44 @@ def join(table1, table2, on=None, how='inner', name=None): table2 (:class:`Table`): the right table to join on (str): the shared key to use for joining; if `None`, find shared keys using the schemata of the tables - how (str): the method used for joining (`"inner"` or - `"left"`) + how (str): the method used for joining (`"inner"` or `"left"`) name (str): the name assigned to the resulting table """ if how not in ('inner', 'left'): ItsdbError('Only \'inner\' and \'left\' join methods are allowed.') - # the name of the joined table - if name is None: - name = '{}+{}'.format(table1.name, table2.name) - # the relation of the joined table - prefixes = (table1.name + ':', table2.name + ':') - fields = Relation( - name, - [Field(prefixes[0] + f.name, *f[1:]) for f in table1.fields] + - [Field(prefixes[1] + f.name, *f[1:]) for f in table2.fields] - ) # validate and normalize the pivot - if isinstance(on, stringtypes): - on = _split_cols(on) - if not on: - on = set(table1.fields.keys()).intersection(table2.fields.keys()) - if not on: - raise ItsdbError( - 'No shared key to join on in the \'{}\' and \'{}\' tables.' 
- .format(table1.name, table2.name) - ) - on = sorted(on) - key = lambda rec: tuple(rec.get(k) for k in on) + on = _join_pivot(on, table1, table2) + # the relation of the joined table + relation = _RelationJoin(table1.fields, table2.fields, on=on) # get key mappings to the right side (useful for inner and left joins) + get_key = lambda rec: tuple(rec.get(k) for k in on) + key_indices = set(table2.fields.index(k) for k in on) right = defaultdict(list) for rec in table2: - right[key(rec)].append(rec) + right[get_key(rec)].append([c for i, c in enumerate(rec) + if i not in key_indices]) # build joined table - rfill = [f.default_value() for f in table2.fields] + rfill = [f.default_value() for f in table2.fields if f.name not in on] joined = [] for lrec in table1: - k = key(lrec) + k = get_key(lrec) if how == 'left' or k in right: joined.extend(lrec + rrec for rrec in right.get(k, [rfill])) - return Table(name, fields, joined) + return Table(relation.name, relation, joined) + +def _join_pivot(on, table1, table2): + if isinstance(on, stringtypes): + on = _split_cols(on) + if not on: + on = set(table1.fields.keys()).intersection(table2.fields.keys()) + if not on: + raise ItsdbError( + 'No shared key to join on in the \'{}\' and \'{}\' tables.' + .format(table1.name, table2.name) + ) + return sorted(on) ############################################################################## @@ -1314,7 +1546,7 @@ def read_raw_table(self, table): table_path = os.path.join(self.root, table) with _open_table(table_path, self.encoding) as tbl: for line in tbl: - cols = decode_row(line, fields) + cols = decode_row(line, fields=fields) if len(cols) != field_len: # should this throw an exception instead? 
logging.error('Number of stored fields ({}) ' diff --git a/delphin/main.py b/delphin/main.py index ec63e21f..6176da97 100644 --- a/delphin/main.py +++ b/delphin/main.py @@ -63,15 +63,11 @@ def call_convert(args): def call_select(args): - if args.join is not None: - args.join = [tbl.strip() for tbl in args.join.split(',')] rows = select( - args.DATASPEC, + args.QUERY, args.TESTSUITE, - join=args.join, - filters=_make_itsdb_actions(args.filter), - applicators=_make_itsdb_actions(args.apply), - mode='row') + mode='row', + cast=False) for row in rows: print(row) @@ -81,8 +77,7 @@ def call_mkprof(args): args.DEST, source=args.source or args.input, relations=args.relations, - filters=_make_itsdb_actions(args.filter), - applicators=_make_itsdb_actions(args.apply), + where=args.where, in_place=args.in_place, skeleton=args.skeleton, full=args.full, @@ -92,11 +87,9 @@ def call_mkprof(args): def call_process(args): return process( args.grammar, - args.PROFILE, + args.TESTSUITE, source=args.source, - selector=args.input, - filters=_make_itsdb_actions(args.filter), - applicators=_make_itsdb_actions(args.apply), + select=args.select, generate=args.generate, transfer=args.transfer, all_items=args.all_items, @@ -105,13 +98,12 @@ def call_process(args): def call_compare(args): template = '{id}\t<{test},{shared},{gold}>' - if args.verbosity >= 1: + if args.verbosity > 1: template += '\t{input}' for result in compare( args.TESTSUITE, args.GOLD, - filters=_make_itsdb_actions(args.filter), - applicators=_make_itsdb_actions(args.apply)): + select=args.select): print(template.format(**result)) @@ -168,22 +160,6 @@ def redent(s): action='store_true', help='suppress output on and ') -# Arguments for commands that read profiles -profile_parser = argparse.ArgumentParser(add_help=False) -profile_parser.add_argument( - '-a', - '--apply', - action='append', - metavar='APL', - help=('apply an expression to rows/cols in the test suite; APL ' - 'is a string like \'table:col=expression\'')) 
-profile_parser.add_argument( - '-f', - '--filter', - action='append', - metavar='CND', - help=('keep rows satisfying a condition; CND is a string like ' - '\'table:col=expression\'')) # Arguments for the convert command convert_parser = argparse.ArgumentParser(add_help=False) @@ -191,9 +167,9 @@ def redent(s): convert_parser.add_argument( 'PATH', nargs='?', - help=('path to a file containing representations to convert, or ' - 'a test suite directory from which result:mrs will be selected; ' - 'if not given, is read as though it were a file')) + help=('file with representations to convert or testsuite directory ' + 'from which result:mrs will be selected; if not given, ' + ' is read as though it were a file')) convert_parser.add_argument( '-f', '--from', @@ -228,12 +204,6 @@ def redent(s): metavar='WHEN', default='auto', help='(auto|always|never) use ANSI color (default: auto)') -convert_parser.add_argument( - '--select', - metavar='DATASPEC', - default='result:mrs', - help=('table:col data specifier; ignored if PATH does not point ' - 'to a test suite directory (default: result:mrs)')) convert_parser.add_argument( '--show-status', action='store_true', @@ -242,29 +212,29 @@ def redent(s): '--predicate-modifiers', action='store_true', help='(--to=eds* only) attempt to join disconnected graphs') +convert_parser.add_argument( + '--select', + metavar='QUERY', + default='result:mrs', + help=('TSQL query for selecting MRS data when PATH points to ' + 'a testsuite directory (default: result:mrs)')) # Arguments for the select command select_parser = argparse.ArgumentParser(add_help=False) select_parser.set_defaults(func=call_select) select_parser.add_argument( - 'DATASPEC', help='table:col[@col...] data specifier (e.g. 
item:i-input)') -select_parser.add_argument( - 'TESTSUITE', help='path to the test suite directory to select data from') + 'QUERY', help='TSQL selection (e.g., \'i-input where readings = 0\')') select_parser.add_argument( - '-j', - '--join', - help=('join two tables with a shared key (e.g. parse,result); ' - 'the DATASPEC argument then requires explicit tables ' - '(e.g. parse:i-id@result:mrs)')) + 'TESTSUITE', help='path to the testsuite directory to select data from') # mkprof subparser mkprof_parser = argparse.ArgumentParser(add_help=False) mkprof_parser.set_defaults(func=call_mkprof) mkprof_parser.add_argument( - 'DEST', help='directory for the destination (output) test suite') + 'DEST', help='directory for the destination (output) testsuite') mkprof_grp1 = mkprof_parser.add_mutually_exclusive_group() mkprof_grp1.add_argument( - '-s', '--source', metavar='DIR', help='path to a test suite directory') + '-s', '--source', metavar='DIR', help='path to a testsuite directory') mkprof_grp1.add_argument( '--in-place', action='store_true', @@ -274,11 +244,15 @@ def redent(s): '--input', metavar='TXT', help='file of test sentences (* sents are ungrammatical)') +mkprof_parser.add_argument( + '--where', metavar='CONDITION', + help=('filter records in the testsuite with a TSQL condition ' + '(e.g., \'i-length <= 10 && readings > 0\')')) mkprof_parser.add_argument( '-r', '--relations', metavar='FILE', - help='relations file to use for destination test suite') + help='relations file to use for destination testsuite') mkprof_grp2 = mkprof_parser.add_mutually_exclusive_group() mkprof_grp2.add_argument( '--full', @@ -295,7 +269,7 @@ def redent(s): process_parser = argparse.ArgumentParser(add_help=False) process_parser.set_defaults(func=call_process) process_parser.add_argument( - 'PROFILE', help='target profile' + 'TESTSUITE', help='target testsuite' ) process_parser.add_argument( '-g', '--grammar', metavar='GRM', required=True, @@ -303,11 +277,12 @@ def redent(s): ) 
process_parser.add_argument( '-s', '--source', metavar='PATH', - help='source profile; if unset, set to PROFILE' + help='source testsuite; if unset, set to TESTSUITE' ) process_parser.add_argument( - '-i', '--input', metavar='DATASPEC', - help='data specifier for input items (see above)' + '--select', metavar='QUERY', + help=('TSQL query for selecting processor inputs (e.g., ' + '\'i-input where i-length < 10\'; see above for defaults)') ) process_parser.add_argument( '--all-items', action='store_true', @@ -325,7 +300,7 @@ def redent(s): process_parser.add_argument( '-p', metavar='RID', help=('transfer or generate from result with result-id=RID; ' - 'short for \'--filter result:result-id=x==RID\'') + 'short for adding \'where result-id==RID\' to --select') ) # compare subparser @@ -335,6 +310,12 @@ def redent(s): 'TESTSUITE', help='path to the current test-suite directory') compare_parser.add_argument( 'GOLD', help='path to the gold test-suite directory') +compare_parser.add_argument( + '--select', + metavar='QUERY', + default='item:i-id item:i-input result:mrs', + help=('TSQL query for selecting (id, input, mrs) triples from ' + 'TESTSUITE and GOLD (default: \'i-id i-input mrs\')')) # repp subparser repp_parser = argparse.ArgumentParser(add_help=False) @@ -383,74 +364,74 @@ def redent(s): """)) subparser.add_parser( 'select', - parents=[common_parser, select_parser, profile_parser], + parents=[common_parser, select_parser], formatter_class=argparse.RawDescriptionHelpFormatter, description=redent(""" Select data from [incr tsdb()] testsuites. """)) subparser.add_parser( 'mkprof', - parents=[common_parser, mkprof_parser, profile_parser], + parents=[common_parser, mkprof_parser], formatter_class=argparse.RawDescriptionHelpFormatter, description=redent(""" - This command creates test suites. There are four usage patterns: + This command creates testsuites. There are four usage patterns: delphin mkprof --input=sentences.txt --relations=../relations ... 
delphin mkprof --relations=../relations ... < sentences.txt - delphin mkprof --source=profile/ ... + delphin mkprof --source=testsuite/ ... delphin mkprof --in-place ... The first two read sentences (one per line; '*' in the first column - indicates ungrammaticality) from --input or and --relations is - is required. The second two use an existing profile; --relations - defaults to the source profile's; --in-place reads and overwrites DEST. + indicates ungrammaticality) from --input or and --relations + is required. The second two use an existing testsuite; --relations + defaults to that of --source; --in-place reads and overwrites DEST. - By default, test suites are skeletons as from the `mkprof` utility of + By default, testsuites are skeletons as from the `mkprof` utility of `art`, where the tsdb-core files (e.g., 'item') are non-empty but all other tables exist as empty files. The --full option, with --source, will copy a full profile, while the --skeleton option will only write the tsdb-core files and 'relations' file. - """)) + """)) subparser.add_parser( 'process', - parents=[common_parser, process_parser, profile_parser], + parents=[common_parser, process_parser], formatter_class=argparse.RawDescriptionHelpFormatter, description=redent(""" Use a processor (namely ACE) to process each item in the [incr tsdb()] - test suite given by --source (PROFILE if --source is not given). For + testsuite given by --source (TESTSUITE if --source is not given). 
For standard [incr tsdb()] schemata, input items given by the following - selectors for each task (configurable via the --input option): + selectors for each task (configurable via the --select option): - * parse: item:i-input - * transfer: result:mrs - * generate: result:mrs + * parse: i-input + * transfer: mrs + * generate: mrs - In addition, the following filter is applied if --source is a standard - [incr tsdb()] profile and --all-items is not used: + In addition, the following TSQL condition is applied if --source is a + standard [incr tsdb()] profile and --all-items is not used: - --filter=item:i-wf="x!=2" + where i-wf != 2 """)) subparser.add_parser( - 'compare', parents=[common_parser, compare_parser, profile_parser], + 'compare', parents=[common_parser, compare_parser], formatter_class=argparse.RawDescriptionHelpFormatter, description=redent(""" - Compare MRS results in test and gold [incr tsdb()] testsuites. + Compare MRS results in test and gold [incr tsdb()] testsuites. - Graph isomorphism is used to determine if two MRSs are equivalent and the - results show how many unique MRSs exist in the test and gold testsuites - and how many are shared. + Graph isomorphism is used to determine if two MRSs are equivalent and + the results show how many unique MRSs exist in the test and gold + testsuites and how many are shared. """)) subparser.add_parser( 'repp', parents=[common_parser, repp_parser], formatter_class=argparse.RawDescriptionHelpFormatter, description=redent(""" - Tokenize sentences using a Regular Expression PreProcessor (REPP). + Tokenize sentences using a Regular Expression PreProcessor (REPP). - This front-end to the delphin.repp module makes it easy to tokenize inputs - from a testsuite, a file of sentences, or sentences on stdin, and can - present the results in a variety of formats. It also visualizes the - application of REPP rules with the --trace option, which can be useful for - debugging REPP modules. 
+ This front-end to the delphin.repp module makes it easy to tokenize + inputs from a testsuite, a file of sentences, or sentences on stdin, + and can present the results in a variety of formats. It also visualizes + the application of REPP rules with the --trace option, which can be + useful for debugging REPP modules. """)) if __name__ == '__main__': diff --git a/delphin/tsql.py b/delphin/tsql.py new file mode 100644 index 00000000..beee6d79 --- /dev/null +++ b/delphin/tsql.py @@ -0,0 +1,540 @@ + +""" +TSQL -- Test Suite Query Language + +This module implements a subset of TSQL, namely the 'select' (or +'retrieve') queries for extracting data from test suites. The general +form of a select query is:: + + [select] [from ] [where ]* + +For example, the following selects item identifiers that took more +than half a second to parse:: + + select i-id from item where total > 500 + +The `select` string is necessary when querying with the generic +:func:`query` function, but is implied and thus disallowed when using +the :func:`select` function. + +The `` is a list of space-separated field names (e.g., +`i-id i-input mrs`), or the special string `*` which selects all +columns from the joined tables. + +The optional `from` clause provides a list of table names (e.g., +`item parse result`) that are joined on shared keys. The `from` +clause is required when `*` is used for the projection, but it can +also be used to select columns from non-standard tables (e.g., `i-id +from output`). Alternatively, `delphin.itsdb`-style data specifiers +(see :func:`delphin.itsdb.get_data_specifier`) may be used to specify +the table on the column name (e.g., `item:i-id`). + +The `where` clause provide conditions for filtering the list of +results. Conditions are binary operations that take a column or data +specifier on the left side and an integer (e.g., `10`), a date (e.g., +`2018-10-07`), or a string (e.g., `"sleep"`) on the right side of the +operator. 
The allowed conditions are: + + ================ ====================================== + Condition Form + ================ ====================================== + Regex match `` ~ "regex"`` + Regex fail `` !~ "regex"`` + Equality `` = (integer|date|"string")`` + Inequality `` != (integer|date|"string")`` + Less-than `` < (integer|date)`` + Less-or-equal `` <= (integer|date)`` + Greater-than `` > (integer|date)`` + Greater-or-equal `` >= (integer|date)`` + ================ ====================================== + +Boolean operators can be used to join multiple conditions or for +negation: + + =========== ===================================== + Operation Form + =========== ===================================== + Disjunction ``X | Y``, ``X || Y``, or ``X or Y`` + Conjunction ``X & Y``, ``X && Y``, or ``X and Y`` + Negation ``!X`` or ``not X`` + =========== ===================================== + +Normally, disjunction scopes over conjunction, but parentheses may be +used to group clauses, so the following are equivalent:: + + ... where i-id = 10 or i-id = 20 and i-input ~ "[Dd]og" + ... where i-id = 10 or (i-id = 20 and i-input ~ "[Dd]og") + +Multiple `where` clauses may also be used as a conjunction that scopes +over disjunction, so the following are equivalent:: + + ... where (i-id = 10 or i-id = 20) and i-input ~ "[Dd]og" + ... where i-id = 10 or i-id = 20 where i-input ~ "[Dd]og" + +This facilitates query construction, where a user may want to apply +additional global constraints by appending new conditions to the query +string. 
+ +PyDelphin has several differences to standard TSQL: + +* `select *` requires a `from` clause +* `select * from item result` does not also include columns from the + intervening `parse` table +* `select i-input from result` returns a matching `i-input` for every + row in `result`, rather than only the unique rows + +PyDelphin also adds some features to standard TSQL: + +* optional table specifications on columns (e.g., `item:i-id`) +* multiple `where` clauses (as described above) +""" + +import operator +import copy +import re + +from delphin.exceptions import TSQLSyntaxError +from delphin.util import LookaheadIterator, parse_datetime +from delphin import itsdb + + +### QUERY INSPECTION ########################################################## + +def inspect_query(query): + """ + Parse *query* and return the interpreted query object. + + Example: + >>> from delphin import tsql + >>> from pprint import pprint + >>> pprint(tsql.inspect_query( + ... 'select i-input from item where i-id < 100')) + {'querytype': 'select', + 'projection': ['i-input'], + 'tables': ['item'], + 'where': ('<', ('i-id', 100))} + """ + return _parse_query(query) + +### QUERY PROCESSING ########################################################## + +def query(query, ts, **kwargs): + """ + Perform *query* on the testsuite *ts*. + + Note: currently only 'select' queries are supported. 
+ + Args: + query (str): TSQL query string + ts (:class:`delphin.itsdb.TestSuite`): testsuite to query over + kwargs: keyword arguments passed to the more specific query + function (e.g., :func:`select`) + Example: + >>> list(tsql.query('select i-id where i-length < 4', ts)) + [[142], [1061]] + """ + queryobj = _parse_query(query) + + if queryobj['querytype'] in ('select', 'retrieve'): + return _select( + queryobj['projection'], + queryobj['tables'], + queryobj['where'], + ts, + mode=kwargs.get('mode', 'list'), + cast=kwargs.get('cast', True)) + else: + # not really a syntax error; replace with TSQLError or something + # when the proper exception class exists + raise TSQLSyntaxError(queryobj['querytype'] + + ' queries are not supported') + + +def select(query, ts, mode='list', cast=True): + """ + Perform the TSQL selection query *query* on testsuite *ts*. + + Note: The `select`/`retrieve` part of the query is not included. + + Args: + query (str): TSQL select query + ts (:class:`delphin.itsdb.TestSuite`): testsuite to query over + mode (str): how to return the results (see + :func:`delphin.itsdb.select_rows` for more information + about the *mode* parameter; default: `list`) + cast (bool): if `True`, values will be cast to their datatype + according to the testsuite's relations (default: `True`) + Example: + >>> list(tsql.select('i-id where i-length < 4', ts)) + [[142], [1061]] + """ + queryobj = _parse_select(query) + return _select( + queryobj['projection'], + queryobj['tables'], + queryobj['where'], + ts, + mode, + cast) + + +def _select(projection, tables, condition, ts, mode, cast): + table = _select_from(tables, None, ts) + table = _select_projection(projection, table, ts) + table = _select_where(condition, table, ts) + + # finally select the relevant columns from the joined table + if projection == '*': + if len(tables) == 1: + projection = [f.name for f in ts.relations[tables[0]]] + else: + projection = [] + for t in tables: + projection.extend(t + ':' + 
f.name + for f in ts.relations[t]) + return itsdb.select_rows(projection, table, mode=mode, cast=cast) + + +def _select_from(tables, table, ts): + joined = set([] if table is None else table.name.split('+')) + for tab in tables: + if tab not in joined: + joined.add(tab) + table = _transitive_join(table, ts[tab], ts, 'inner') + return table + + +def _select_projection(projection, table, ts): + if projection != '*': + for p in projection: + table = _join_if_missing(table, p, ts, 'inner') + return table + + +def _select_where(condition, table, ts): + keys = table.fields.keys() + ids = set() + if condition is not None: + func, fields = _process_condition(condition) + # join tables in the condition for filtering + tmptable = table + for field in fields: + tmptable = _join_if_missing(tmptable, field, ts, 'left') + # filter the rows and store the keys only + for record in filter(func, tmptable): + idtuple = tuple(record[key] for key in keys) + ids.add(idtuple) + # check if a matching idtuple was retained + def meta_condition(rec): + return tuple(rec[key] for key in keys) in ids + table[:] = filter(meta_condition, table) + return table + + +_operator_functions = {'==': operator.eq, + '!=': operator.ne, + '<': operator.lt, + '<=': operator.le, + '>': operator.gt, + '>=': operator.ge} + + +def _process_condition(condition): + # conditions are something like: + # ('==', ('i-id', 11)) + op, body = condition + if op in ('and', 'or'): + fields = [] + conditions = [] + for cond in body: + _func, _fields = _process_condition(cond) + fields.extend(_fields) + conditions.append(_func) + _func = all if op == 'and' else any + def func(row): + return _func(cond(row) for cond in conditions) + elif op == 'not': + nfunc, fields = _process_condition(body) + func = lambda row, nfunc=nfunc: not nfunc(row) + elif op == '~': + fields = [body[0]] + func = lambda row, body=body: re.search(body[1], row[body[0]]) + elif op == '!~': + fields = [body[0]] + func = lambda row, body=body: not 
re.search(body[1], row[body[0]]) + else: + fields = [body[0]] + compare = _operator_functions[op] + def func(row): + return compare(row.get(body[0], cast=True), body[1]) + return func, fields + + +def _join_if_missing(table, col, ts, how): + tab, _, column = col.rpartition(':') + if not tab: + # Just get the first table defining the column. This + # makes the assumption that relations are ordered and + # that the first one is 'primary' + tab = ts.relations.find(column)[0] + if table is None or column not in table.fields: + table = _transitive_join(table, ts[tab], ts, how) + return table + + +def _transitive_join(tab1, tab2, ts, how): + if tab1 is None: + table = copy.copy(tab2) + else: + table = tab1 + # the tables may not be directly joinable but could be + # joinable transitively via a 'path' of table joins + path = ts.relations.path(tab1.name, tab2.name) + for intervening, pivot in path: + table = itsdb.join(table, ts[intervening], on=pivot, how=how) + return table + + +### QUERY PARSING ############################################################# + +_keywords = list(map(re.escape, + ('info', 'set', 'retrieve', 'select', 'insert', + 'from', 'where', 'report', '*', '.'))) +_operators = list(map(re.escape, + ('==', '=', '!=', '~', '!~', '<=', '<', '>=', '>', + '&&', '&', 'and', '||', '|', 'or', '!', 'not'))) + +_tsql_lex_re = re.compile( + r'''# regex-pattern gid description + ({keywords}) # 1 keywords + |({operators}) # 2 operators + |(\(|\)) # 3 parentheses + |"([^"\\]*(?:\\.[^"\\]*)*)" # 4 double-quoted "strings" + |'([^'\\]*(?:\\.[^'\\]*)*)' # 5 single-quoted 'strings' + |({yyyy}-{m}(?:-{d})?(?:{t}|{tt})?) # 6 yyyy-mm-dd date + |((?:{d}-)?{m}-{yy}(?:{t}|{tt})?) 
# 7 dd-mm-yy date + |(:today|now) # 8 keyword date + |([+-]?\d+) # 9 integers + |((?:{id}:)?{id}(?:@(?:{id}:)?{id})*) # 10 identifier (extended def) + |([^\s]) # 11 unexpected + '''.format(keywords='|'.join(_keywords), + operators='|'.join(_operators), + d=r'[0-9]{1,2}', + m=(r'(?:[0-9]{1,2}|' + r'jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'), + yy=r'(?:[0-9]{2})?[0-9]{2}', + yyyy=r'[0-9]{4}', + t=r'\s*\([0-9]{2}:[0-9]{2}(?::[0-9]{2})?\)', + tt=r'\s+[0-9]{2}:[0-9]{2}(?::[0-9]{2})', + id=r'[a-zA-Z][-_a-zA-Z0-9]*'), + flags=re.VERBOSE|re.IGNORECASE) + + +def _lex(s): + """ + Lex the input string according to _tsql_lex_re. + + Yields + (gid, token, line_number) + """ + s += '.' # make sure there's a terminator to know when to stop parsing + lines = enumerate(s.splitlines(), 1) + lineno = pos = 0 + try: + for lineno, line in lines: + matches = _tsql_lex_re.finditer(line) + for m in matches: + gid = m.lastindex + if gid == 11: + raise TSQLSyntaxError('unexpected input', + lineno=lineno, + offset=m.start(), + text=line) + else: + token = m.group(gid) + yield (gid, token, lineno) + except StopIteration: + pass + + +def _parse_query(query): + querytype, _, querybody = query.lstrip().partition(' ') + querytype = querytype.lower() + if querytype in ('select', 'retrieve'): + result = _parse_select(querybody) + else: + raise TSQLSyntaxError("'{}' queries are not supported" + .format(querytype), lineno=1) + + return result + + +def _parse_select(query): + tokens = LookaheadIterator(_lex(query)) + _, token, lineno = tokens.peek() # maybe used in error below + + projection = _parse_select_projection(tokens) + tables = _parse_select_from(tokens) + condition = _parse_select_where(tokens) + + if projection == '*' and not tables: + raise TSQLSyntaxError( + "'select *' requires a 'from' clause", + lineno=lineno, text=token) + + # verify we're at the end of the query (the '.' 
may have been + # added in _lex()) + gid, token, lineno = tokens.next() + _expect(gid == 1 and token == '.', token, lineno, "'.'") + + return {'querytype': 'select', + 'projection': projection, + 'tables': tables, + 'where': condition} + + +def _parse_select_projection(tokens): + gid, token, lineno = tokens.next() + if token == '*': + projection = token + elif gid == 10: + projection = [token] + while tokens.peek()[0] == 10: + _, col, _ = tokens.next() + projection.append(col) + projection = _prepare_columns(projection) + else: + raise TSQLSyntaxError("expected '*' or column identifiers", + lineno=lineno, text=token) + return projection + + +def _prepare_columns(cols): + columns = [] + for col in cols: + table = '' + for part in col.split('@'): + tblname, _, colname = part.rpartition(':') + if tblname: + table = tblname + ':' + columns.append(table + colname) + return columns + + +def _parse_select_from(tokens): + tables = [] + if tokens.peek()[1] == 'from': + tokens.next() + while tokens.peek()[0] == 10: + _, table, _ = tokens.next() + tables.append(table) + return tables + + +def _parse_select_where(tokens): + conditions = [] + while tokens.peek()[1] == 'where': + tokens.next() + conditions.append(_parse_condition_disjunction(tokens)) + if len(conditions) == 1: + condition = conditions[0] + elif len(conditions) > 1: + condition = ('and', conditions) + else: + condition = None + return condition + + +def _parse_condition_disjunction(tokens): + conds = [] + while True: + cond = _parse_condition_conjunction(tokens) + if cond is not None: + conds.append(cond) + if tokens.peek()[1] in ('|', '||', 'or'): + tokens.next() + nextgid, nexttoken, nextlineno = tokens.peek() + else: + break + if len(conds) == 0: + return None + elif len(conds) == 1: + return conds[0] + else: + return ('or', tuple(conds)) + + +def _parse_condition_conjunction(tokens): + conds = [] + nextgid, nexttoken, nextlineno = tokens.peek() + while True: + if nextgid == 2 and nexttoken.lower() in ('!', 
'not'): + cond = _parse_condition_negation(tokens) + elif nextgid == 3 and nexttoken == '(': + cond = _parse_condition_group(tokens) + elif nextgid == 3 and nexttoken == ')': + break + elif nextgid == 10: + cond = _parse_condition_statement(tokens) + else: + raise TSQLSyntaxError("expected '!', 'not', '(', or a column name", + lineno=nextlineno, text=nexttoken) + conds.append(cond) + if tokens.peek()[1].lower() in ('&', '&&', 'and'): + tokens.next() + nextgid, nexttoken, nextlineno = tokens.peek() + else: + break + + if len(conds) == 0: + return None + elif len(conds) == 1: + return conds[0] + else: + return ('and', tuple(conds)) + + +def _parse_condition_negation(tokens): + gid, token, lineno = tokens.next() + _expect(gid == 2 and token in ('!', 'not'), token, lineno, "'!' or 'not'") + cond = _parse_condition_disjunction(tokens) + return ('not', cond) + + +def _parse_condition_group(tokens): + gid, token, lineno = tokens.next() + _expect(gid == 3 and token == '(', token, lineno, "'('") + cond = _parse_condition_disjunction(tokens) + gid, token, lineno = tokens.next() + _expect(gid == 3 and token == ')', token, lineno, "')'") + return tuple(cond) + + +def _parse_condition_statement(tokens): + gid, column, lineno = tokens.next() + _expect(gid == 10, column, lineno, 'a column name') + gid, op, lineno = tokens.next() + _expect(gid == 2, op, lineno, 'an operator') + if op == '=': + op = '==' # normalize = to == (I think these are equivalent) + gid, value, lineno = tokens.next() + if op in ('~', '!~') and gid not in (4, 5): + raise TSQLSyntaxError( + "the '{}' operator is only valid with strings".format(op), + lineno=lineno, text=op) + elif op in ('<', '<=', '>', '>=') and gid not in (6, 7, 8, 9): + raise TSQLSyntaxError( + "the '{}' operator is only valid with integers and dates" + .format(op), lineno=lineno, text=op) + else: + if gid in (6, 7, 8): + value = parse_datetime(value) + elif gid == 9: + value = int(value) + return (op, (column, value)) + + +def 
_expect(expected, token, lineno, msg): + msg = 'expected ' + msg + if not expected: + raise TSQLSyntaxError(msg, lineno=lineno, text=token) diff --git a/delphin/util.py b/delphin/util.py index e455e8c0..4955f168 100644 --- a/delphin/util.py +++ b/delphin/util.py @@ -1,4 +1,7 @@ + import warnings +import re +from datetime import datetime from collections import deque from functools import wraps @@ -38,6 +41,51 @@ def safe_int(x): pass return x + +def parse_datetime(s): + if re.match(r':?(today|now)', s): + return datetime.now() + + # YYYY-MM-DD HH:MM:SS + m = re.match( + r''' + (?P[0-9]{4}) + -(?P[0-9]{1,2}|\w{3}) + (?:-(?P[0-9]{1,2}))? + (?:\s*\(? + (?P[0-9]{2}):(?P[0-9]{2})(?::(?P[0-9]{2}))? + \)?)?''', s, flags=re.VERBOSE) + if m is None: + # DD-MM-YYYY HH:MM:SS + m = re.match( + r''' + (?:(?P[0-9]{1,2})-)? + (?P[0-9]{1,2}|\w{3}) + -(?P[0-9]{2}(?:[0-9]{2})?) + (?:\s*\(? + (?P[0-9]{2}):(?P[0-9]{2})(?::(?P[0-9]{2}))? + \)?)?''', s, flags=re.VERBOSE) + if m is not None: + return datetime.strptime(_date_fix(m), '%Y-%m-%d %H:%M:%S') + + return None + + +def _date_fix(mo): + y = mo.group('y') + if len(y) == 2: + y = '20' + y # buggy in ~80yrs or if using ~20yr-old data :) + m = mo.group('m') + if len(m) == 3: # assuming 3-letter abbreviations + m = str(datetime.strptime(m, '%b').month) + d = mo.group('d') or '01' + H = mo.group('H') or '00' + M = mo.group('M') or '00' + S = mo.group('S') or '00' + return '{}-{}-{} {}:{}:{}'.format(y, m, d, H, M, S) + + + # unescaping escaped strings (potentially with unicode) # (disabled but left here in case a need arises) # thanks: http://stackoverflow.com/a/24519338/1441112 @@ -64,23 +112,7 @@ def safe_int(x): # S-expressions # e.g. (:n-inputs . 
3) or (S (NP (NNS Dogs)) (VP (VBZ bark))) -import re - -from delphin.lib.pegre import ( - sequence, - choice, - literal, - regex, - nonterminal, - delimited, - bounded, - Spacing, - Integer, - Float, - DQString, - Peg, - PegreResult -) +from delphin.lib.pegre import PegreResult # escapes from https://en.wikipedia.org/wiki/S-expression#Use_in_Lisp _SExpr_escape_chars = r'"\s\(\)\[\]\{\}\\;' @@ -152,35 +184,6 @@ def parse(self, s): SExpr = SExprParser() -# _SExpr = nonterminal('SExpr') -# _Symbol = regex(r'(?:[^{ec}]+|\\.)+'.format(ec=_SExpr_escape_chars), -# value=_SExpr_unescape_symbol) -# # dummy choice around DQString just to get character unescaping -# _String = choice(DQString, value=_SExpr_unescape_string) - -# SExpr = Peg( -# grammar=dict( -# start=sequence(Spacing, _SExpr, value=lambda xs: xs[1]), -# SExpr=choice( -# # atom -# sequence(choice(Float, Integer, _String, _Symbol), Spacing, -# value=lambda xs: xs[0]), -# # Expression -# bounded( -# regex(r'\(\s*'), -# choice( -# # DotPair -# sequence(_SExpr, regex(r'\.\s*'), _SExpr, -# value=lambda xs: tuple([xs[0], xs[2]])), -# # List -# delimited(_SExpr, Spacing) -# ), -# regex(r'\)\s*') -# ) -# ) -# ) -# ) - # attach an additional method for convenience def _format_SExpr(d): if isinstance(d, tuple) and len(d) == 2: diff --git a/docs/api/delphin.itsdb.rst b/docs/api/delphin.itsdb.rst index 0110d04d..2bde19a9 100644 --- a/docs/api/delphin.itsdb.rst +++ b/docs/api/delphin.itsdb.rst @@ -34,9 +34,9 @@ databases: .. autoclass:: TestSuite :members: -.. autoclass:: delphin.itsdb.Table +.. autoclass:: Table :members: -.. autoclass:: delphin.itsdb.Record +.. 
autoclass:: Record :members: Relations Files and Field Descriptions diff --git a/docs/index.rst b/docs/index.rst index 1a1368fc..1c1579a0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -38,6 +38,7 @@ PyDelphin api/delphin.tdl.rst api/delphin.tfs.rst api/delphin.tokens.rst + api/delphin.tsql.rst Indices and tables diff --git a/docs/tutorials/walkthrough.rst b/docs/tutorials/walkthrough.rst index fdedf764..ba0e3174 100644 --- a/docs/tutorials/walkthrough.rst +++ b/docs/tutorials/walkthrough.rst @@ -336,6 +336,22 @@ testsuites: - :mod:`delphin.itsdb` module - :doc:`itsdb` tutorial + +TSQL Queries +------------ + +Partial support of the Test Suite Query Language (TSQL) allows for +easy selection of [incr tsdb()] TestSuite data. + +>>> from delphin import itsdb, tsql +>>> ts = itsdb.TestSuite('erg/tsdb/gold/mrs') +>>> next(tsql.select('i-id i-input where i-length > 5 && readings > 0', ts)) +[61, 'Abrams handed the cigarette to Browne.'] + +.. seealso:: + - TSQL documentation: http://www.delph-in.net/tsnlp/ftp/manual/volume2.ps.gz + - :mod:`delphin.tsql` module + Regular Expression Preprocessors (REPP) --------------------------------------- diff --git a/tests/commands_test.py b/tests/commands_test.py index 5695bba3..963faae3 100644 --- a/tests/commands_test.py +++ b/tests/commands_test.py @@ -35,21 +35,32 @@ def mini_testsuite(tmpdir): rel.write('item:\n' ' i-id :integer :key\n' ' i-input :string\n' + ' i-wf :integer\n' + ' i-date :date\n' '\n' 'parse:\n' ' parse-id :integer :key\n' ' i-id :integer :key\n' + ' readings :integer\n' '\n' 'result:\n' ' parse-id :integer :key\n' ' result-id :integer\n' ' mrs :string\n') - item.write('10@It rained.') - parse.write('10@10') + item.write('10@It rained.@1@1-feb-2018 15:00\n' + '20@Rained.@0@01-02-18 15:00:00\n' + '30@It snowed.@1@2018-2-1 (15:00:00)\n') + parse.write('10@10@1\n' + '20@20@0\n' + '30@30@1\n') result.write('10@0@' '[ TOP: h0 INDEX: e2 [ e TENSE: past ]' ' RELS: < [ _rain_v_1<3:9> LBL: h1 ARG0: e2 ] >' - ' 
HCONS: < h0 qeq h1 > ]') + ' HCONS: < h0 qeq h1 > ]\n' + '30@0@' + '[ TOP: h0 INDEX: e2 [ e TENSE: past ]' + ' RELS: < [ _snow_v_1<3:9> LBL: h1 ARG0: e2 ] >' + ' HCONS: < h0 qeq h1 > ]\n') return ts @@ -184,7 +195,7 @@ def test_select(mini_testsuite): select('result:mrs', ts0) from delphin import itsdb select('result:mrs', itsdb.ItsdbProfile(ts0)) - select('parse:i-id@result:mrs', ts0, join=('parse', 'result')) + select('parse:i-id@result:mrs', ts0) select('result:result-id@mrs', ts0, mode='row') diff --git a/tests/itsdb_test.py b/tests/itsdb_test.py index 03bf56ae..7fb8f96e 100644 --- a/tests/itsdb_test.py +++ b/tests/itsdb_test.py @@ -11,25 +11,44 @@ from delphin.interfaces.base import Processor, ParseResponse from delphin import itsdb -_simple_relations = ''' -item: +_simple_relations = '''item: i-id :integer :key i-input :string +fold: + fold-id :integer :key + run: - run-id :integer :key # unique test run identifier + run-id :integer :key # unique test run identifier + +parse: + parse-id :integer :key # unique parse identifier + run-id :integer :key # test run for this parse + i-id :integer :key # item parsed + +result: + parse-id :integer :key # parse for this result + result-id :integer # result identifier + mrs :string # MRS for this reading +''' + +_alt_relations = '''item: + i-id :integer :key + i-input :string + i-date :date parse: - parse-id :integer :key # unique parse identifier - run-id :integer :key # test run for this parse - i-id :integer :key # item parsed + parse-id :integer :key # unique parse identifier + run-id :integer :key # test run for this parse + i-id :integer :key # item parsed result: - parse-id :integer :key # parse for this result - result-id :integer # result identifier - mrs :string # MRS for this reading + parse-id :integer :key # parse for this result + result-id :integer # result identifier + mrs :string # MRS for this reading ''' + @pytest.fixture def parser_cpu(): class DummyParser(Processor): @@ -95,34 +114,32 @@ def 
process_item(self, datum, keys=None): return DummyParser() @pytest.fixture -def empty_profile(): - d = tempfile.mkdtemp() - print(_simple_relations, file=open(os.path.join(d, 'relations'), 'w')) - return d +def empty_profile(tmpdir): + ts = tmpdir.mkdir('empty') + ts.join('relations').write(_simple_relations) + return str(ts) @pytest.fixture -def single_item_skeleton(): - d = tempfile.mkdtemp() - print(_simple_relations, file=open(os.path.join(d, 'relations'), 'w')) - print('0@The dog barks.', file=open(os.path.join(d, 'item'), 'w')) - return d +def single_item_skeleton(tmpdir): + ts = tmpdir.mkdir('skeleton') + ts.join('relations').write(_simple_relations) + ts.join('item').write('0@The dog barks.') + return str(ts) @pytest.fixture -def single_item_profile(): - d = tempfile.mkdtemp() - print(_simple_relations, file=open(os.path.join(d, 'relations'), 'w')) - print('0@The dog barks.', file=open(os.path.join(d, 'item'), 'w')) - print('0', file=open(os.path.join(d, 'run'), 'w')) - print('0@0@0', file=open(os.path.join(d, 'parse'), 'w')) - print( +def single_item_profile(tmpdir): + ts = tmpdir.mkdir('single') + ts.join('relations').write(_simple_relations) + ts.join('item').write('0@The dog barks.') + ts.join('run').write('0') + ts.join('parse').write('0@0@0') + ts.join('result').write( '0@0@[ LTOP: h0 INDEX: e2 RELS: < ' '[ _the_q<0:3> LBL: h4 ARG0: x3 RSTR: h5 BODY: h6 ] ' '[ _dog_n_1<4:7> LBL: h7 ARG0: x3 ] ' '[ _bark_v_1<8:14> LBL: h1 ARG0: e2 ARG1: x3 ] > ' - 'HCONS: < h0 qeq h1 h5 qeq h7 > ]', - file=open(os.path.join(d, 'result'), 'w') - ) - return d + 'HCONS: < h0 qeq h1 h5 qeq h7 > ]') + return str(ts) def test_Field(): f = itsdb.Field('x', ':y', True, False, 'a comment') @@ -160,12 +177,38 @@ def test_Relations(): ' i-id :integer :key # item parsed' ) + +def test_Relations_find(): + r = itsdb.Relations.from_string(_simple_relations) + assert r.find('i-id') == ['item', 'parse'] + assert r.find('mrs') == ['result'] + with pytest.raises(KeyError): + 
r.find('foo') + + +def test_Relations_path(): + r = itsdb.Relations.from_string(_simple_relations) + assert r.path('item', 'result') == [('parse', 'i-id'), ('result', 'parse-id')] + assert r.path('parse', 'item') == [('item', 'i-id')] + assert r.path('item+parse', 'result') == [('result', 'parse-id')] + assert r.path('item', 'parse+result') == [('parse', 'i-id')] + assert r.path('parse', 'parse') == [] + assert r.path('item+parse', 'parse+result') == [('result', 'parse-id')] + with pytest.raises(KeyError): + r.path('foo', 'result') + with pytest.raises(KeyError): + r.path('item', 'bar') + with pytest.raises(itsdb.ItsdbError): + r.path('item', 'fold') + + def test_Record(): rels = itsdb.Relations.from_string(_simple_relations) - r = itsdb.Record(rels['item'], [0, 'sentence']) + r = itsdb.Record(rels['item'], ['0', 'sentence']) assert r.fields == rels['item'] assert len(r) == 2 - assert r['i-id'] == r[0] == 0 + assert r['i-id'] == r[0] == '0' + assert r.get('i-id', cast=True) == 0 assert r['i-input'] == r[1] == 'sentence' assert r.get('i-input') == 'sentence' assert r.get('unknown') == None @@ -173,8 +216,10 @@ def test_Record(): # incorrect number of fields with pytest.raises(itsdb.ItsdbError): itsdb.Record(rels['item'], [0]) - # None values get set to default + # None values get set to default, and + # non-string values are left as-is r = itsdb.Record(rels['item'], [0, None]) + assert r['i-id'] == 0 assert r['i-input'] == '' # mapped fields r = itsdb.Record(rels['item'], {'i-id': 0, 'i-input': 'sentence'}) @@ -221,7 +266,7 @@ def test_Table(single_item_skeleton): assert t.name == 'item' assert len(t) == 1 assert isinstance(t[0], itsdb.Record) - assert t[0]['i-id'] == 0 + assert t[0]['i-id'] == '0' assert t[0]['i-input'] == 'The dog barks.' assert list(t.select('i-input')) == [['The dog barks.']] @@ -259,7 +304,7 @@ def test_reload(self, single_item_profile): t.reload() assert t['item'][0]['i-input'] == 'The dog barks.' 
- def test_write(self, single_item_profile): + def test_write(self, single_item_profile, tmpdir): t = itsdb.TestSuite(single_item_profile) assert t['item'][0]['i-input'] == 'The dog barks.' t['item'][0]['i-input'] = 'The dog sleeps.' @@ -280,6 +325,14 @@ def test_write(self, single_item_profile): t.write({'item': [record]}) t.reload() assert t['item'][0]['i-input'] == 'The cat meows.' + d = tmpdir.mkdir('alt') + altrels = itsdb.Relations.from_string(_alt_relations) + t.write(path=str(d), relations=altrels) + assert d.join('relations').read() == _alt_relations + assert sorted(x.basename for x in d.listdir()) == [ + 'item', 'parse', 'relations', 'result'] + ts = itsdb.TestSuite(str(d)) + assert 'i-date' in ts['item'].fields def test_process(self, parser_cpu, single_item_skeleton): ts = itsdb.TestSuite(single_item_skeleton) @@ -362,6 +415,30 @@ def test_match_rows(): ('30', [], [{'i-id': '30', 'i-input': 'd'}]) ] +def test_join(single_item_profile): + p = itsdb.TestSuite(single_item_profile) + + j = itsdb.join(p['parse'], p['result']) + assert j.name == 'parse+result' + assert len(j) == 1 + assert len(j.fields) == len(p['parse'].fields) + len(p['result'].fields) - 1 + r = j[0] + assert r['parse:run-id'] == r['run-id'] + assert r['result:mrs'] == r['mrs'] + assert r['parse:parse-id'] == r['result:parse-id'] == r['parse-id'] + + j2 = itsdb.join(p['item'], j) + assert j2.name == 'item+parse+result' + assert len(j2) == 1 + assert len(j2.fields) == len(j.fields) + len(p['item'].fields) - 1 + r = j2[0] + assert r['item:i-input'] == r['i-input'] + assert r['item:i-id'] == r['parse:i-id'] + + j3 = itsdb.join(j, p['item']) + assert j3.name == 'parse+result+item' + + ## Deprecated def test_get_relations(empty_profile): diff --git a/tests/tsql_test.py b/tests/tsql_test.py new file mode 100644 index 00000000..d7d4a9d7 --- /dev/null +++ b/tests/tsql_test.py @@ -0,0 +1,151 @@ + +from datetime import datetime + +import pytest + +from delphin import tsql +from delphin import itsdb 
+from delphin.exceptions import TSQLSyntaxError + +from .commands_test import mini_testsuite as ts0 + + +def test_parse_query(): + parse = lambda s: tsql._parse_query(s) + with pytest.raises(TSQLSyntaxError): + parse('info relations') + with pytest.raises(TSQLSyntaxError): + parse('set max-results 5') + with pytest.raises(TSQLSyntaxError): + parse('insert into item i-id values 10') + + +def test_parse_select(): + parse = lambda s: tsql._parse_select(s) + with pytest.raises(TSQLSyntaxError): + parse('*') + # with pytest.raises(TSQLSyntaxError): + # parse('i-input from item report "%s"') + + assert parse('i-input') == { + 'querytype': 'select', + 'projection': ['i-input'], + 'tables': [], + 'where': None} + + assert parse('i-input i-wf') == { + 'querytype': 'select', + 'projection': ['i-input', 'i-wf'], + 'tables': [], + 'where': None} + + assert parse('i-input i-wf from item') == { + 'querytype': 'select', + 'projection': ['i-input', 'i-wf'], + 'tables': ['item'], + 'where': None} + + assert parse('i-input mrs from item result') == { + 'querytype': 'select', + 'projection': ['i-input', 'mrs'], + 'tables': ['item', 'result'], + 'where': None} + + +def test_parse_select_complex_identifiers(): + parse = lambda s: tsql._parse_select(s) + assert parse('item:i-input') == { + 'querytype': 'select', + 'projection': ['item:i-input'], + 'tables': [], + 'where': None} + + assert parse('item:i-id@i-input') == { + 'querytype': 'select', + 'projection': ['item:i-id', 'item:i-input'], + 'tables': [], + 'where': None} + + assert parse('item:i-id@result:mrs') == { + 'querytype': 'select', + 'projection': ['item:i-id', 'result:mrs'], + 'tables': [], + 'where': None} + + assert parse('item:i-id@i-input mrs') == { + 'querytype': 'select', + 'projection': ['item:i-id', 'item:i-input', 'mrs'], + 'tables': [], + 'where': None} + + +def test_parse_select_where(): + parse = lambda s: tsql._parse_select(s) + assert parse('i-input where i-wf = 2') == { + 'querytype': 'select', + 'projection': 
['i-input'], + 'tables': [], + 'where': ('==', ('i-wf', 2))} + + assert parse('i-input where i-date < 2018-01-15')['where'] == ( + '<', ('i-date', datetime(2018, 1, 15))) + + assert parse('i-input where i-date > 15-jan-2018(15:00:00)')['where'] == ( + '>', ('i-date', datetime(2018, 1, 15, 15, 0, 0))) + + assert parse('i-input where i-input ~ "Abrams"')['where'] == ( + '~', ('i-input', 'Abrams')) + + assert parse("i-input where i-input !~ 'Browne'")['where'] == ( + '!~', ('i-input', 'Browne')) + + assert parse('i-input ' + 'where i-wf = 2 & i-input ~ \'[Dd]og\'')['where'] == ( + 'and', (('==', ('i-wf', 2)), + ('~', ('i-input', '[Dd]og')))) + + assert parse('i-input ' + 'where i-id = 10 | i-id = 20 & i-wf = 2')['where'] == ( + 'or', (('==', ('i-id', 10)), + ('and', (('==', ('i-id', 20)), + ('==', ('i-wf', 2)))))) + + assert parse('i-input ' + 'where (i-id = 10 | i-id = 20) & !i-wf = 2')['where'] == ( + 'and', (('or', (('==', ('i-id', 10)), + ('==', ('i-id', 20)))), + ('not', ('==', ('i-wf', 2))))) + + +def test_select(ts0): + ts = itsdb.TestSuite(str(ts0)) + assert list(tsql.select('i-input', ts)) == [ + ['It rained.'], ['Rained.'], ['It snowed.']] + assert list(tsql.select('i-input from item', ts)) == [ + ['It rained.'], ['Rained.'], ['It snowed.']] + assert list(tsql.select('i-input from item item', ts)) == [ + ['It rained.'], ['Rained.'], ['It snowed.']] + assert list(tsql.select('i-input from result', ts)) == [ + ['It rained.'], ['It snowed.']] + assert list(tsql.select('i-input from item result', ts)) == [ + ['It rained.'], ['It snowed.']] + assert list(tsql.select('i-id i-input', ts)) == [ + [10, 'It rained.'], [20, 'Rained.'], [30, 'It snowed.']] + res = ts['result'] + assert list(tsql.select('i-id mrs', ts)) == [ + [10, res[0]['mrs']], [30, res[1]['mrs']]] + with pytest.raises(tsql.TSQLSyntaxError): + tsql.select('*', ts) + assert list(tsql.select('* from item', ts, cast=False)) == ts['item'] + + +def test_select_where(ts0): + ts = itsdb.TestSuite(str(ts0)) + 
assert list(tsql.select('i-input where i-input ~ "It"', ts)) == [ + ['It rained.'], ['It snowed.']] + assert list(tsql.select('i-input where i-input ~ "It" or i-id = 20', ts)) == [ + ['It rained.'], ['Rained.'], ['It snowed.']] + assert list(tsql.select('i-input where i-date >= 2018-02-01', ts)) == [ + ['It rained.'], ['Rained.'], ['It snowed.']] + assert list(tsql.select('i-input where readings > 0', ts)) == [ + ['It rained.'], ['It snowed.']]