Skip to content

Commit

Permalink
Don't output extra whitespace in YAML multiline (#993)
Browse files Browse the repository at this point in the history
This resolves a particular issue with parsing YAML multiline, for
example:
```yaml
a: |
  multiline literal
  line 2
```

The regex used would capture the amount of indentation in the third
capture group and then use that as a kind of "status" to know which
lines are part of the indented multiline. However, because it's a
capture group, it has to be assigned a token, which was `TextWhitespace`.
This meant that the indentation was output after the multiline.
Technically it should be a non-capturing group, but then it is no
longer possible to refer to it in the regex. Therefore I've gone with the
solution of adding a new token, `Ignore`, which is not emitted as a
token by the iterator and can safely be used to make use of capture
groups without having them show up in the output.

## Before

![image](https://github.com/user-attachments/assets/c29353c5-9e15-4f14-a733-57a60fb51910)

## After

![image](https://github.com/user-attachments/assets/57b5d129-a9d3-4b84-ae1f-dc05182b9ad3)
  • Loading branch information
Gusted committed Aug 22, 2024
1 parent 895a048 commit 4d11870
Show file tree
Hide file tree
Showing 6 changed files with 448 additions and 424 deletions.
2 changes: 1 addition & 1 deletion lexers/embedded/yaml.xml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
<bygroups>
<token type="Punctuation"/>
<token type="LiteralStringDoc"/>
<token type="TextWhitespace"/>
<token type="Ignore"/>
</bygroups>
</rule>
<rule pattern="(false|False|FALSE|true|True|TRUE|null|Off|off|yes|Yes|YES|OFF|On|ON|no|No|on|NO|n|N|Y|y)\b">
Expand Down
40 changes: 20 additions & 20 deletions lexers/testdata/yaml.expected
Original file line number Diff line number Diff line change
Expand Up @@ -191,69 +191,69 @@
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"|"},
{"type":"LiteralStringDoc","value":"\n This entire block of text will be the value of the 'literal_block' key,\n with line breaks being preserved.\n\n The literal continues until de-dented, and the leading indentation is\n stripped.\n\n Any lines that are 'more-indented' keep the rest of their indentation -\n these lines will be indented by 4 spaces."},
{"type":"TextWhitespace","value":" \n"},
{"type":"TextWhitespace","value":"\n"},
{"type":"NameTag","value":"folded_style"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"\u003e"},
{"type":"LiteralStringDoc","value":"\n This entire block of text will be the value of 'folded_style', but this\n time, all newlines will be replaced with a single space.\n\n Blank lines, like above, are converted to a newline character.\n\n 'More-indented' lines keep their newlines, too -\n this text will appear over two lines."},
{"type":"TextWhitespace","value":" \n"},
{"type":"TextWhitespace","value":"\n"},
{"type":"NameTag","value":"literal_block_with_strip_chomping"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"|-"},
{"type":"LiteralStringDoc","value":"\n This entire block of text will be the value of the 'literal_block' key,\n with line breaks being preserved and the strip chomping indicator.\n\n The literal continues until de-dented, and the leading indentation is\n stripped.\n\n Any lines that are 'more-indented' keep the rest of their indentation -\n these lines will be indented by 4 spaces."},
{"type":"TextWhitespace","value":" \n"},
{"type":"TextWhitespace","value":"\n"},
{"type":"NameTag","value":"literal_block_with_keep_chomping"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"|+"},
{"type":"LiteralStringDoc","value":"\n This entire block of text will be the value of the 'literal_block' key,\n with line breaks being preserved and the keep chomping indicator.\n\n The literal continues until de-dented, and the leading indentation is\n stripped."},
{"type":"TextWhitespace","value":" \n\n"},
{"type":"TextWhitespace","value":"\n\n"},
{"type":"NameTag","value":"a"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"|"},
{"type":"LiteralStringDoc","value":"\n multiline literal\n line 2"},
{"type":"TextWhitespace","value":" \n"},
{"type":"TextWhitespace","value":"\n"},
{"type":"NameTag","value":"b"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"\u003e"},
{"type":"LiteralStringDoc","value":"\n multiline: folded\n line 2"},
{"type":"TextWhitespace","value":" \n"},
{"type":"TextWhitespace","value":"\n"},
{"type":"NameTag","value":"c"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"|-"},
{"type":"LiteralStringDoc","value":"\n multiline # literal strip\n line 2"},
{"type":"TextWhitespace","value":" \n"},
{"type":"TextWhitespace","value":"\n"},
{"type":"NameTag","value":"d"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"\u003e-"},
{"type":"LiteralStringDoc","value":"\n multiline folded strip\n line 2: test\n\n # not a comment\n indented by 1"},
{"type":"TextWhitespace","value":" \n"},
{"type":"TextWhitespace","value":"\n"},
{"type":"NameTag","value":"e"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"|+"},
{"type":"LiteralStringDoc","value":"\n multiline literal keep\n line: 2"},
{"type":"TextWhitespace","value":" \n"},
{"type":"TextWhitespace","value":"\n"},
{"type":"Comment","value":"# this is a comment"},
{"type":"TextWhitespace","value":"\n"},
{"type":"NameTag","value":"f"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"\u003e+"},
{"type":"LiteralStringDoc","value":"\n multiline folded keep one space\n line 2"},
{"type":"TextWhitespace","value":" \n"},
{"type":"TextWhitespace","value":"\n"},
{"type":"NameTag","value":"g"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"|"},
{"type":"LiteralStringDoc","value":"\n multiline literal with only one line"},
{"type":"TextWhitespace","value":" \n"},
{"type":"TextWhitespace","value":"\n"},
{"type":"NameTag","value":"h"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
Expand All @@ -267,45 +267,45 @@
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"|"},
{"type":"LiteralStringDoc","value":"\n multiline literal\n line 2"},
{"type":"TextWhitespace","value":" \n "},
{"type":"TextWhitespace","value":"\n "},
{"type":"NameTag","value":"b"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"\u003e"},
{"type":"LiteralStringDoc","value":"\n multiline: folded\n line 2"},
{"type":"TextWhitespace","value":" \n "},
{"type":"TextWhitespace","value":"\n "},
{"type":"NameTag","value":"c"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"|-"},
{"type":"LiteralStringDoc","value":"\n multiline # literal strip\n line 2 6 leading spaces"},
{"type":"TextWhitespace","value":" \n\n "},
{"type":"TextWhitespace","value":"\n\n "},
{"type":"NameTag","value":"d"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"\u003e-"},
{"type":"LiteralStringDoc","value":"\n multiline folded strip\n line 2: test\n # not a comment"},
{"type":"TextWhitespace","value":" \n "},
{"type":"TextWhitespace","value":"\n "},
{"type":"NameTag","value":"e"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"|+"},
{"type":"LiteralStringDoc","value":"\n multiline literal keep\n line: 2"},
{"type":"TextWhitespace","value":" \n "},
{"type":"TextWhitespace","value":"\n "},
{"type":"Comment","value":"# this is a comment"},
{"type":"TextWhitespace","value":"\n "},
{"type":"NameTag","value":"f"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"\u003e+"},
{"type":"LiteralStringDoc","value":"\n multiline folded keep\n line 2"},
{"type":"TextWhitespace","value":" \n "},
{"type":"TextWhitespace","value":"\n "},
{"type":"NameTag","value":"g"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"|"},
{"type":"LiteralStringDoc","value":"\n multiline literal with only one line"},
{"type":"TextWhitespace","value":" \n "},
{"type":"TextWhitespace","value":"\n "},
{"type":"NameTag","value":"h"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
Expand Down Expand Up @@ -355,7 +355,7 @@
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"|"},
{"type":"LiteralStringDoc","value":"\n This is a key\n that has multiple lines"},
{"type":"TextWhitespace","value":" \n"},
{"type":"TextWhitespace","value":"\n"},
{"type":"Punctuation","value":":"},
{"type":"TextWhitespace","value":" "},
{"type":"Literal","value":"and this is its value"},
Expand Down Expand Up @@ -622,7 +622,7 @@
{"type":"TextWhitespace","value":" "},
{"type":"Punctuation","value":"|"},
{"type":"LiteralStringDoc","value":"\n R0lGODlhDAAMAIQAAP//9/X17unp5WZmZgAAAOfn515eXvPz7Y6OjuDg4J+fn5\n OTk6enp56enmlpaWNjY6Ojo4SEhP/++f/++f/++f/++f/++f/++f/++f/++f/+\n +f/++f/++f/++f/++f/++SH+Dk1hZGUgd2l0aCBHSU1QACwAAAAADAAMAAAFLC\n AgjoEwnuNAFOhpEMTRiggcz4BNJHrv/zCFcLiwMWYNG84BwwEeECcgggoBADs="},
{"type":"TextWhitespace","value":" \n\n"},
{"type":"TextWhitespace","value":"\n\n"},
{"type":"Comment","value":"# YAML also has a set type, which looks like this:"},
{"type":"TextWhitespace","value":"\n"},
{"type":"NameTag","value":"set"},
Expand Down
6 changes: 6 additions & 0 deletions regexp.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,9 @@ func (l *LexerState) Iterator() Token { // nolint: gocognit
for len(l.iteratorStack) > 0 {
n := len(l.iteratorStack) - 1
t := l.iteratorStack[n]()
if t.Type == Ignore {
continue
}
if t == EOF {
l.iteratorStack = l.iteratorStack[:n]
continue
Expand Down Expand Up @@ -243,6 +246,9 @@ func (l *LexerState) Iterator() Token { // nolint: gocognit
for len(l.iteratorStack) > 0 {
n := len(l.iteratorStack) - 1
t := l.iteratorStack[n]()
if t.Type == Ignore {
continue
}
if t == EOF {
l.iteratorStack = l.iteratorStack[:n]
continue
Expand Down
11 changes: 11 additions & 0 deletions regexp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,3 +192,14 @@ func TestByGroupNames(t *testing.T) {
assert.NoError(t, err)
assert.Equal(t, []Token{{Error, `abc=123`}}, it.Tokens())
}

func TestIgnoreToken(t *testing.T) {
l := Coalesce(mustNewLexer(t, &Config{EnsureNL: true}, Rules{ // nolint: forbidigo
"root": {
{`(\s*)(\w+)(?:\1)(\n)`, ByGroups(Ignore, Keyword, Whitespace), nil},
},
}))
it, err := l.Tokenise(nil, ` hello `)
assert.NoError(t, err)
assert.Equal(t, []Token{{Keyword, "hello"}, {TextWhitespace, "\n"}}, it.Tokens())
}
Loading

0 comments on commit 4d11870

Please sign in to comment.