Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Question about the implementation of the parser #7

Open
mezoni opened this issue Mar 12, 2022 · 15 comments
Open

Question about the implementation of the parser #7

mezoni opened this issue Mar 12, 2022 · 15 comments

Comments

@mezoni
Copy link

mezoni commented Mar 12, 2022

Hello, István!
For the sake of interest, I implemented a parser for your queries specification.
To be honest, your specification does not specify everything, so I took some liberties in interpreting the missing definitions.
But this is not a problem, because it can be fixed in a few minutes in the parser definition.
Most of the time was spent studying the source code in order to understand the grammar (and specification too).

It took me no more than a few hours to write the parser definition, and then only because the specification is not comprehensive.

Below is the source code for the parser definition. The source code of the tests are included (for simplicity) in the source code of the parser. By running the generated parser, you are essentially running tests.

To be able to generate this parser, you need to add the package from the Github repository into dependencies.

import 'package:parser_builder/branch.dart';
import 'package:parser_builder/bytes.dart';
import 'package:parser_builder/combinator.dart';
import 'package:parser_builder/fast_build.dart';
import 'package:parser_builder/multi.dart';
import 'package:parser_builder/parser_builder.dart';
import 'package:parser_builder/sequence.dart';
import 'package:parser_builder/transformers.dart';
import 'package:query/src/ast.dart';

Future<void> main(List<String> args) async {
  final context = Context();
  context.optimizeForSize = false;
  await fastBuild(
      context, [_parse, _expression_, _and_], 'bin/query_parser.dart',
      header: __header, publish: {'parseQuery': _parse});
}

const __header = r'''
// ignore_for_file: unused_local_variable

import 'package:query/src/ast.dart';
import 'package:source_span/source_span.dart';
import 'package:test/test.dart';

String debugQuery(String input) => parseQuery(input).toString(debug: true);

void main() {
  group('Base expressions', () {
test('missing `)` or `"`', () {
      expect(debugQuery('(field: "x'), '(field:"<x>")');
      expect(debugQuery('("x'), '("<x>")');
      expect(debugQuery('('), '(<>)');
      expect(debugQuery('(a b'), '((<a> <b>))');
      expect(debugQuery('-(-(a OR -b) -c) | -(d'),
          '(-((-((<a> OR -<b>)) -<c>)) OR -(<d>))');

      // Missing after `c`
      // The parser parses, but the result is unpredictable.
      debugQuery('-(-(a OR -b) -c | -(d)');
    });

    test('1 string', () {
      expect(debugQuery('abc'), '<abc>');
      expect(debugQuery('"abc"'), '"<abc>"');
      expect(debugQuery('abc*'), '<abc*>');
    });

    test('2 strings', () {
      expect(debugQuery('abc def'), '(<abc> <def>)');
      expect(debugQuery('"abc 1" def'), '("<abc> <1>" <def>)');
      expect(debugQuery('abc "def 2"'), '(<abc> "<def> <2>")');
      expect(debugQuery('"abc" "def"'), '("<abc>" "<def>")');
      expect(debugQuery('"abc 1" "def 2"'), '("<abc> <1>" "<def> <2>")');
    });

    test('explicit AND', () {
      expect(debugQuery('a AND b c'), '(<a> <b> <c>)');
    });

    test('negative word', () {
      expect(debugQuery('-abc'), '-<abc>');
      expect(debugQuery('-"abc"'), '-"<abc>"');
      expect(debugQuery('-"abc 1"'), '-"<abc> <1>"');

      expect(debugQuery('NOT abc'), '-<abc>');
      expect(debugQuery('NOT "abc"'), '-"<abc>"');
      expect(debugQuery('NOT "abc 1"'), '-"<abc> <1>"');
    });

    test('scoped', () {
      expect(debugQuery('a:abc'), 'a:<abc>');
      expect(debugQuery('a:"abc"'), 'a:"<abc>"');
      expect(debugQuery('a:"abc 1"'), 'a:"<abc> <1>"');
      expect(debugQuery('a:-"abc 1"'), 'a:-"<abc> <1>"');
      expect(debugQuery('NOT field:abc'), '-field:<abc>');
    });

    test('special scoped', () {
      expect(debugQuery('a*:abc'), 'a*:<abc>');
      expect(debugQuery('a%:"abc"'), 'a%:"<abc>"');
    });

    test('compare', () {
      expect(debugQuery('year < 2000'), '<year<2000>');
      expect(debugQuery('field >= "test case"'), '<field>="test case">');
    });

    test('range', () {
      expect(debugQuery('[1 TO 10]'), '<[<1> TO <10>]>');
      expect(debugQuery(']1 TO 10['), '<]<1> TO <10>[>');
      expect(debugQuery(']"1 a" TO "10 b"['), '<]"<1> <a>" TO "<10> <b>"[>');
    });
  });

  group('or', () {
    test('2 items', () {
      expect(debugQuery('a OR b'), '(<a> OR <b>)');
    });

    test('2 items pipe', () {
      expect(debugQuery('a | b'), '(<a> OR <b>)');
    });

    test('3 items', () {
      expect(debugQuery('a OR b OR c'), '(<a> OR <b> OR <c>)');
    });

    test('3 items pipe', () {
      expect(debugQuery('a | b | c'), '(<a> OR <b> OR <c>)');
    });

    test('3 items pipe mixed', () {
      expect(debugQuery('a OR b | c'), '(<a> OR <b> OR <c>)');
      expect(debugQuery('a | b OR c'), '(<a> OR <b> OR <c>)');
    });

    test('precedence of implicit AND, explicit OR', () {
      expect(debugQuery('a b OR c'), '(<a> (<b> OR <c>))');
      expect(debugQuery('a OR b c'), '(<a> OR (<b> <c>))');
      expect(debugQuery('a OR b c OR d'), '(<a> OR (<b> (<c> OR <d>)))');
    });

    test('precedence of implicit AND, explicit OR pipe', () {
      expect(debugQuery('a b | c'), '(<a> (<b> OR <c>))');
      expect(debugQuery('a | b c'), '(<a> OR (<b> <c>))');
      expect(debugQuery('a | b c | d'), '(<a> OR (<b> (<c> OR <d>)))');
    });
  });

  group('complex cases', () {
    test('#1', () {
      expect(debugQuery('a:-v1 b:(beta OR moon < Deimos OR [a TO e])'),
          '(a:-<v1> b:((<beta> OR <moon<Deimos> OR <[<a> TO <e>]>)))');
    });

    test('#2', () {
      expect(debugQuery('a = 2000 b > 2000 c'), '(<a=2000> <b>2000> <c>)');
    });

    test('#3', () {
      // TODO: fix
      // expect(debugQuery('(f:abc)'), '');
    });
  });

  group('unicode chars', () {
    test('hungarian', () {
      expect(
          debugQuery('árvíztűrő TÜKÖRFÚRÓGÉP'), '(<árvíztűrő> <TÜKÖRFÚRÓGÉP>)');
    });
  });

  group('grouping precedence', () {
    test('empty group', () {
      expect(debugQuery('()'), '(<>)');
    });
    test('empty group with space', () {
      expect(debugQuery('(  )'), '(<>)');
    });
    test('single item group', () {
      expect(debugQuery('(a)'), '(<a>)');
    });
    test('single item group with space', () {
      expect(debugQuery('( a )'), '(<a>)');
    });
    test('grouping with two items implicit AND', () {
      expect(debugQuery('(a b)'), '((<a> <b>))');
    });
    test('grouping with two items explicit AND', () {
      expect(debugQuery('(a AND b)'), '((<a> <b>))');
    });
    test('grouping with multiple items', () {
      expect(debugQuery('(a | b) c (d | e)'),
          '(((<a> OR <b>)) <c> ((<d> OR <e>)))');
    });
    test('nested grouping', () {
      expect(debugQuery('(a OR b) OR c'), '(((<a> OR <b>)) OR <c>)');
      expect(debugQuery('(a OR b) c'), '(((<a> OR <b>)) <c>)');
      expect(debugQuery('((a OR b) c) | d'), '(((((<a> OR <b>)) <c>)) OR <d>)');
    });
    test('negative grouping', () {
      expect(debugQuery('-(a OR b) OR c'), '(-((<a> OR <b>)) OR <c>)');
      expect(debugQuery('(a OR -b) c'), '(((<a> OR -<b>)) <c>)');
      expect(debugQuery('-(-(a OR -b) -c) | -(d)'),
          '(-((-((<a> OR -<b>)) -<c>)) OR -(<d>))');
    });
    test('scoped grouping', () {
      expect(debugQuery('(field:abc)'), '(field:<abc>)');
      expect(debugQuery('(field:abc AND field:def)'),
          '((field:<abc> field:<def>))');
      expect(debugQuery('(field:abc OR field:def)'),
          '((field:<abc> OR field:<def>))');
    });
  });

  group('phrase match', () {
    test('empty phrase', () {
      expect(debugQuery('""'), '""');
    });
    test('empty phrase with space', () {
      expect(debugQuery('"  "'), '""');
    });
    test('simple word phrase', () {
      expect(debugQuery('"a"'), '"<a>"');
    });
    test('single word phrase with space', () {
      expect(debugQuery('" a "'), '"<a>"');
    });
    test('two word phrase', () {
      expect(debugQuery('"a b"'), '"<a> <b>"');
    });
    test('three word phrase', () {
      expect(debugQuery('"a b c"'), '"<a> <b> <c>"');
    });
    test('three word phrase with AND', () {
      expect(debugQuery('"a AND b"'), '"<a> <AND> <b>"');
    });
    test('three word phrase with OR', () {
      expect(debugQuery('"a OR b"'), '"<a> <OR> <b>"');
    });
    test('negative phrase grouping', () {
      expect(debugQuery('-("a OR b") OR c'), '(-("<a> <OR> <b>") OR <c>)');
      expect(debugQuery('(a OR -"b") ("c")'), '(((<a> OR -"<b>")) ("<c>"))');
      expect(debugQuery('-(-"a OR -b" -c) | -"d"'),
          '(-((-"<a> <OR> <-b>" -<c>)) OR -"<d>")');
    });
    test('phrase with parenthesis', () {
      expect(debugQuery('"(a OR -b)" -("-c | []")'),
          '("<(a> <OR> <-b)>" -("<-c> <|> <[]>"))');
    });
  });
}
''';

const _and = Ref<String, Query>('_and');

const _and_ = Named(
    '_and',
    Map1(
        CombinedList1(_or, Preceded(Opt(_andOp), _or)),
        Expr<Query>([
          'children'
        ], '{{children}}.length == 1 ? {{children}}[0] : AndQuery({{children}})')));

const _andOp = Named('_andOp', Terminated(Tag('AND'), _ws));

const _closeBracket = Named('_closeBracket', Terminated(Tag(']'), _ws));

const _closeParen = Named('_closeParen', Terminated(Tag(')'), _ws));

const _colon = Named('_colon', Terminated(Tag(':'), _ws));

const _comparison = Named(
    '_comparison',
    Map3(
        _identifier,
        _comparisonOperator,
        _wordOrText,
        Expr<FieldCompareQuery>(['field', 'op', 'text'],
            'FieldCompareQuery({{field}}, {{op}}, {{text}})')));

const _comparisonOperator = Named('_comparisonOperator',
    Terminated(Tags(['<=', '<', '>=', '>', '!=', '=']), _ws));

const _exclusion = Named(
    '_exclusion',
    Alt2(
      Map1(Preceded(_exclusionOp, _expression),
          Expr<NotQuery>(['child'], 'NotQuery({{child}})')),
      _expression,
    ));

const _exclusionOp = Named(
    '_exclusionOp',
    Alt2(
      Tag('-'),
      Terminated(Tag('NOT'), _ws),
    ));

const _expression = Ref<String, Query>('_expression');

const _expression_ = Named(
    '_expression',
    Alt5(
      _group,
      _phrase,
      _range,
      _comparison,
      _word,
    ));

const _field = Named('_field', TakeWhile1(_isFieldChar));

const _group = Named(
    '_group',
    Preceded(
        _openParen,
        Alt2(
          Map1(Terminated(_and, _recoveryCloseParen),
              Expr<GroupQuery>(['child'], 'GroupQuery({{child}})')),
          Map1(_recoveryCloseParen,
              Expr<TextQuery>(['_'], "GroupQuery(TextQuery(''))")),
        )));

const _identifier =
    Named('_identifier', Terminated(TakeWhile1(_isAllowedChar), _ws));

const _isAllowedChar = NotCharClass('"[" | "]" | [():<!=>] | [#x0-#x20]');

const _isFieldChar = NotCharClass('[#x0-#x20] | [:]');

const _isPhraseWord = NotCharClass('#x9 | #xA | #xD | [" ]');

const _openBracket = Named('_openBracket', Terminated(Tag('['), _ws));

const _openOrCloseBracket =
    Named('_openOrCloseBracket', Alt2(_openBracket, _closeBracket));

const _openParen = Named('_openParen', Terminated(Tag('('), _ws));

const _or = Named(
    '_or',
    Map1(
        CombinedList1(
            Alt2(
              _group,
              _scopedExclusion,
            ),
            Preceded(_orOp, _and)),
        FuncTransformer<Query>(['List<Query> children'], '''
if (children.length == 1) return children.single;
final second = children.last;
if (children.length == 2 && second is OrQuery) {
  second.children.insert(0, children.first);
  return second;
}
return OrQuery(children);''')));

const _orOp = Named(
    '_orOp',
    Alt2(
      Terminated(Tag('|'), _ws),
      Terminated(Tag('OR'), _ws),
    ));

const _parse = Named('_parse', Delimited(_ws, _and, Eof<String>()));

const _phrase = Named(
    '_phrase',
    Map1(
        Delimited(_quote, Many0(_phraseWord), _recoveryCloseQuote),
        Expr<PhraseQuery>([
          'words'
        ], 'PhraseQuery({{words}}.join(\' \'), {{words}}.map((e) => TextQuery(e)).toList())')));

const _phraseWord =
    Named('_phraseWord', Terminated(TakeWhile1(_isPhraseWord), _ws));

const _quote = Named('_closeQuote', Terminated(Tag('"'), _ws));

const _range = Named(
    '_range',
    Map5(
        _openOrCloseBracket,
        _wordOrText,
        _to,
        _wordOrText,
        _openOrCloseBracket,
        Expr<RangeQuery>(['open', 'start', '_', 'end', 'close'], '''
RangeQuery({{start}}, {{end}}, startInclusive: {{open}} == '[', endInclusive: {{close}} == ']')
''')));

const _recoveryCloseParen = Named('_recoveryCloseParen',
    Terminated(Alt2(Tag(')'), Value<String, String>(')')), _ws));

const _recoveryCloseQuote = Named('_recoveryCloseQuote',
    Terminated(Alt2(Tag('"'), Value<String, String>('"')), _ws));

const _scopedExclusion = Named(
    '_scopedExclusion',
    Alt2(
      Map1(Preceded(_exclusionOp, _scopedExpression),
          Expr<NotQuery>(['child'], 'NotQuery({{child}})')),
      _scopedExpression,
    ));

const _scopedExpression = Named(
    '_scopedExpression',
    Alt2(
      Map3(
          _field,
          _colon,
          _exclusion,
          Expr<FieldScope>(
              ['field', '_', 'child'], 'FieldScope({{field}}, {{child}})')),
      _exclusion,
    ));

const _to = Named('_to', Terminated(Tag('TO'), _ws));

const _word = Named('_word',
    Map1(_identifier, Expr<TextQuery>(['text'], 'TextQuery({{text}})')));

const _wordOrText = Named('_wordOrText ', Alt2(_phrase, _word));

const _ws = Named('_ws', SkipWhile(CharClass('#x9 | #xA | #xD | #x20')));

typedef Expr<T> = ExprTransformer<T>;
@mezoni
Copy link
Author

mezoni commented Mar 12, 2022

If you do not want to generate it, then here is a temporary link to a ready-made parser.
https://gist.github.com/mezoni/480e584ea39ac1652b62a6d8863399bf

I am ready to answer any questions you may have.

@isoos
Copy link
Owner

isoos commented Mar 12, 2022

@mezoni: for a while now, what I was mostly interested was a lenient query parser: users may provide badly formatted queries, and rejecting them may or may not be with the desired output. Would your parser and grammar provide that feature?

@mezoni
Copy link
Author

mezoni commented Mar 12, 2022

No. Only simple cases. When a possible error is taken into account in the grammar and a special parsing rule is created for this. Not the most convenient way.
This is difficult to do automatically, but it can be done, but you will have to study the materials on how to implement it. Recovering an error is not a very easy task, because error are different.
This is a complex solution, different strategies, different approach.
There is no simple solution.
Simple solution is this.
You give up and try to recreate all possible errors and change the grammar for this purpose, creating additional parsing options, in order to recover from a parsing error.

You can read on the Internet, so many interesting articles. All of them are theoretical.
There is no universal recipe.

@mezoni
Copy link
Author

mezoni commented Mar 12, 2022

users may provide badly formatted queries

Can you provide examples?

@mezoni mezoni closed this as completed Mar 12, 2022
@mezoni
Copy link
Author

mezoni commented Mar 12, 2022

Sorry for disturbing you.

@isoos isoos reopened this Mar 12, 2022
@isoos
Copy link
Owner

isoos commented Mar 12, 2022

users may provide badly formatted queries
Can you provide examples?

Anything that mixes and does not close subqueries, e.g. (field: "x

@mezoni
Copy link
Author

mezoni commented Mar 12, 2022

Your case is very simple and not simple at the same time. A malformed string can only be recovered only at the position of an invalid character (such as a newline) and at the end of the file.

The missing ) can be recovred simply? only if it is unambiguous.

For example, it is easier to recovred it in a statements.

if (a return x;

But recovering it in expressions is not easy. I checked, even Dart analyzer does not recover in complex expressions with multiple allowed and used parentheses. Because this is unambiguous where to insert missing ).

Well formed:

-(-(a OR -b) -c) | -(d)

Malformed

-(-(a OR -b) -c | -(d) - missing after c or missing after d???

How to recover?

Only in this way.

-(-(a OR -b) -c | -(d))

But this is no correct if it was missing after c (by your logic).

-(-(a OR -b) -c) | -(d) != -(-(a OR -b) -c | -(d))

Dart analyzer also cannot recover this correctly.

@mezoni
Copy link
Author

mezoni commented Mar 12, 2022

But I changed the grammar and it doesn't throw errors in simple cases.

test('missign )', () {
      expect(debugQuery('('), '(<>)');
      expect(debugQuery('(a b'), '((<a> <b>))');
      expect(debugQuery('-(-(a OR -b) -c) | -(d'),
          '(-((-((<a> OR -<b>)) -<c>)) OR -(<d>))');

      // Missing after `c`
      // The parser parses, but the result is unpredictable.
      debugQuery('-(-(a OR -b) -c | -(d)');
    });

00:00 +0: Base expressions missign )

@mezoni
Copy link
Author

mezoni commented Mar 12, 2022

Passed

  group('Base expressions', () {
    test('missing `) or `"`', () {
      expect(debugQuery('(field: "x'), '(field:"<x>")');
      expect(debugQuery('("x'), '("<x>")');
      expect(debugQuery('('), '(<>)');
      expect(debugQuery('(a b'), '((<a> <b>))');
      expect(debugQuery('-(-(a OR -b) -c) | -(d'),
          '(-((-((<a> OR -<b>)) -<c>)) OR -(<d>))');

      // Missing after `c`
      // The parser parses, but the result is unpredictable.
      debugQuery('-(-(a OR -b) -c | -(d)');
    });
00:00 +0: Base expressions missing `) or `"`

Send you a grammar?
This is manual tuning. Nothing special.

@mezoni
Copy link
Author

mezoni commented Mar 12, 2022

Error Handling in PEG Parsers
Local Error Recovery for PetitParser
http://scg.unibe.ch/archive/masters/Ruef16a.pdf

@mezoni
Copy link
Author

mezoni commented Mar 12, 2022

I edited the grammar here:
#7 (comment)

@mezoni
Copy link
Author

mezoni commented Mar 12, 2022

Of course, for such purposes, I could even write a separate parser builder. And then the generated parser will do it instead of you and for you.

Here I posted a post about the performance test.
https://gitter.im/dart-lang/TALK-general?at=622c5c7b257a357825269532

@mezoni
Copy link
Author

mezoni commented Mar 13, 2022

Instead of this code.

const _group = Named(
    '_group',
    Preceded(
        _openParen,
        Alt2(
          Map1(Terminated(_and, _recoveryCloseParen),
              Expr<GroupQuery>(['child'], 'GroupQuery({{child}})')),
          Map1(_recoveryCloseParen,
              Expr<TextQuery>(['_'], "GroupQuery(TextQuery(''))")),
        )));

Thisi code should be used. For recovering.

const _group = Named(
    '_group',
    Map3(
        _openParen,
        Opt(_and),
        _recoveryCloseParen,
        Expr<GroupQuery>(['open', 'child', 'close'],
            ''' GroupQuery({{child}} ?? TextQuery('')) ''')));

This generates simple and understandable code.

GroupQuery? _group(State<String> state) {
  GroupQuery? $0;
  final $pos = state.pos;
  String? $1;
  $1 = _openParen(state);
  if (state.ok) {
    Query? $2;
    final $log = state.log;
    state.log = false;
    Query? $3;
    $3 = _and(state);
    if (state.ok) {
      $2 = $3!;
    } else {
      state.ok = true;
      $2 = null;
    }
    state.log = $log;
    if (state.ok) {
      String? $4;
      $4 = _recoveryCloseParen(state);
      if (state.ok) {
        $0 = GroupQuery($2 ?? TextQuery(''));
      }
    }
  }
  if (!state.ok) {
    state.pos = $pos;
  }
  return $0;
}

And it works.
And easy to debug.

8oty6vjB

@mezoni
Copy link
Author

mezoni commented Mar 13, 2022

A new version is available.
Fixed problems with incorrect work of the procedures for creating an error report.
Added new features to error hanling and reporting.

@mezoni
Copy link
Author

mezoni commented Mar 13, 2022

Initially, a trace implementation feature is available. The way you want it, because it's a code generator and nothing is impossible (almost not).

The trace will show you how the parsing process goes and where something goes wrong (in your opinion).
Useful for initial grammar development.
When it seems that the parsing should go as you would like, but the parsing does not go as it should.

And all this can be debugged (that is, use deep analysis).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants