Skip to content

Commit 4d11870

Browse files
authored
Don't output extra whitespace in YAML multiline (#993)
This resolves a particular issue with lexing YAML multiline scalars, for example: ```yaml a: | multiline literal line 2 ``` The regex captures the amount of indentation in the third capture group and then uses it as a kind of "status" to know which lines are part of the indented multiline block. However, because it's a captured group, it has to be assigned a token, which was `TextWhitespace`. This meant that the indentation was output again after the multiline block. Technically it should be a non-capturing group, but then it would no longer be possible to refer to it in the regex. Therefore I've gone with the solution of adding a new token, Ignore, which is not emitted by the iterator. It can safely be assigned to capture groups that are needed for matching but should not show up in the output. ## Before ![image](https://github.com/user-attachments/assets/c29353c5-9e15-4f14-a733-57a60fb51910) ## After ![image](https://github.com/user-attachments/assets/57b5d129-a9d3-4b84-ae1f-dc05182b9ad3)
1 parent 895a048 commit 4d11870

File tree

6 files changed

+448
-424
lines changed

6 files changed

+448
-424
lines changed

lexers/embedded/yaml.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353
<bygroups>
5454
<token type="Punctuation"/>
5555
<token type="LiteralStringDoc"/>
56-
<token type="TextWhitespace"/>
56+
<token type="Ignore"/>
5757
</bygroups>
5858
</rule>
5959
<rule pattern="(false|False|FALSE|true|True|TRUE|null|Off|off|yes|Yes|YES|OFF|On|ON|no|No|on|NO|n|N|Y|y)\b">

lexers/testdata/yaml.expected

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -191,69 +191,69 @@
191191
{"type":"TextWhitespace","value":" "},
192192
{"type":"Punctuation","value":"|"},
193193
{"type":"LiteralStringDoc","value":"\n This entire block of text will be the value of the 'literal_block' key,\n with line breaks being preserved.\n\n The literal continues until de-dented, and the leading indentation is\n stripped.\n\n Any lines that are 'more-indented' keep the rest of their indentation -\n these lines will be indented by 4 spaces."},
194-
{"type":"TextWhitespace","value":" \n"},
194+
{"type":"TextWhitespace","value":"\n"},
195195
{"type":"NameTag","value":"folded_style"},
196196
{"type":"Punctuation","value":":"},
197197
{"type":"TextWhitespace","value":" "},
198198
{"type":"Punctuation","value":"\u003e"},
199199
{"type":"LiteralStringDoc","value":"\n This entire block of text will be the value of 'folded_style', but this\n time, all newlines will be replaced with a single space.\n\n Blank lines, like above, are converted to a newline character.\n\n 'More-indented' lines keep their newlines, too -\n this text will appear over two lines."},
200-
{"type":"TextWhitespace","value":" \n"},
200+
{"type":"TextWhitespace","value":"\n"},
201201
{"type":"NameTag","value":"literal_block_with_strip_chomping"},
202202
{"type":"Punctuation","value":":"},
203203
{"type":"TextWhitespace","value":" "},
204204
{"type":"Punctuation","value":"|-"},
205205
{"type":"LiteralStringDoc","value":"\n This entire block of text will be the value of the 'literal_block' key,\n with line breaks being preserved and the strip chomping indicator.\n\n The literal continues until de-dented, and the leading indentation is\n stripped.\n\n Any lines that are 'more-indented' keep the rest of their indentation -\n these lines will be indented by 4 spaces."},
206-
{"type":"TextWhitespace","value":" \n"},
206+
{"type":"TextWhitespace","value":"\n"},
207207
{"type":"NameTag","value":"literal_block_with_keep_chomping"},
208208
{"type":"Punctuation","value":":"},
209209
{"type":"TextWhitespace","value":" "},
210210
{"type":"Punctuation","value":"|+"},
211211
{"type":"LiteralStringDoc","value":"\n This entire block of text will be the value of the 'literal_block' key,\n with line breaks being preserved and the keep chomping indicator.\n\n The literal continues until de-dented, and the leading indentation is\n stripped."},
212-
{"type":"TextWhitespace","value":" \n\n"},
212+
{"type":"TextWhitespace","value":"\n\n"},
213213
{"type":"NameTag","value":"a"},
214214
{"type":"Punctuation","value":":"},
215215
{"type":"TextWhitespace","value":" "},
216216
{"type":"Punctuation","value":"|"},
217217
{"type":"LiteralStringDoc","value":"\n multiline literal\n line 2"},
218-
{"type":"TextWhitespace","value":" \n"},
218+
{"type":"TextWhitespace","value":"\n"},
219219
{"type":"NameTag","value":"b"},
220220
{"type":"Punctuation","value":":"},
221221
{"type":"TextWhitespace","value":" "},
222222
{"type":"Punctuation","value":"\u003e"},
223223
{"type":"LiteralStringDoc","value":"\n multiline: folded\n line 2"},
224-
{"type":"TextWhitespace","value":" \n"},
224+
{"type":"TextWhitespace","value":"\n"},
225225
{"type":"NameTag","value":"c"},
226226
{"type":"Punctuation","value":":"},
227227
{"type":"TextWhitespace","value":" "},
228228
{"type":"Punctuation","value":"|-"},
229229
{"type":"LiteralStringDoc","value":"\n multiline # literal strip\n line 2"},
230-
{"type":"TextWhitespace","value":" \n"},
230+
{"type":"TextWhitespace","value":"\n"},
231231
{"type":"NameTag","value":"d"},
232232
{"type":"Punctuation","value":":"},
233233
{"type":"TextWhitespace","value":" "},
234234
{"type":"Punctuation","value":"\u003e-"},
235235
{"type":"LiteralStringDoc","value":"\n multiline folded strip\n line 2: test\n\n # not a comment\n indented by 1"},
236-
{"type":"TextWhitespace","value":" \n"},
236+
{"type":"TextWhitespace","value":"\n"},
237237
{"type":"NameTag","value":"e"},
238238
{"type":"Punctuation","value":":"},
239239
{"type":"TextWhitespace","value":" "},
240240
{"type":"Punctuation","value":"|+"},
241241
{"type":"LiteralStringDoc","value":"\n multiline literal keep\n line: 2"},
242-
{"type":"TextWhitespace","value":" \n"},
242+
{"type":"TextWhitespace","value":"\n"},
243243
{"type":"Comment","value":"# this is a comment"},
244244
{"type":"TextWhitespace","value":"\n"},
245245
{"type":"NameTag","value":"f"},
246246
{"type":"Punctuation","value":":"},
247247
{"type":"TextWhitespace","value":" "},
248248
{"type":"Punctuation","value":"\u003e+"},
249249
{"type":"LiteralStringDoc","value":"\n multiline folded keep one space\n line 2"},
250-
{"type":"TextWhitespace","value":" \n"},
250+
{"type":"TextWhitespace","value":"\n"},
251251
{"type":"NameTag","value":"g"},
252252
{"type":"Punctuation","value":":"},
253253
{"type":"TextWhitespace","value":" "},
254254
{"type":"Punctuation","value":"|"},
255255
{"type":"LiteralStringDoc","value":"\n multiline literal with only one line"},
256-
{"type":"TextWhitespace","value":" \n"},
256+
{"type":"TextWhitespace","value":"\n"},
257257
{"type":"NameTag","value":"h"},
258258
{"type":"Punctuation","value":":"},
259259
{"type":"TextWhitespace","value":" "},
@@ -267,45 +267,45 @@
267267
{"type":"TextWhitespace","value":" "},
268268
{"type":"Punctuation","value":"|"},
269269
{"type":"LiteralStringDoc","value":"\n multiline literal\n line 2"},
270-
{"type":"TextWhitespace","value":" \n "},
270+
{"type":"TextWhitespace","value":"\n "},
271271
{"type":"NameTag","value":"b"},
272272
{"type":"Punctuation","value":":"},
273273
{"type":"TextWhitespace","value":" "},
274274
{"type":"Punctuation","value":"\u003e"},
275275
{"type":"LiteralStringDoc","value":"\n multiline: folded\n line 2"},
276-
{"type":"TextWhitespace","value":" \n "},
276+
{"type":"TextWhitespace","value":"\n "},
277277
{"type":"NameTag","value":"c"},
278278
{"type":"Punctuation","value":":"},
279279
{"type":"TextWhitespace","value":" "},
280280
{"type":"Punctuation","value":"|-"},
281281
{"type":"LiteralStringDoc","value":"\n multiline # literal strip\n line 2 6 leading spaces"},
282-
{"type":"TextWhitespace","value":" \n\n "},
282+
{"type":"TextWhitespace","value":"\n\n "},
283283
{"type":"NameTag","value":"d"},
284284
{"type":"Punctuation","value":":"},
285285
{"type":"TextWhitespace","value":" "},
286286
{"type":"Punctuation","value":"\u003e-"},
287287
{"type":"LiteralStringDoc","value":"\n multiline folded strip\n line 2: test\n # not a comment"},
288-
{"type":"TextWhitespace","value":" \n "},
288+
{"type":"TextWhitespace","value":"\n "},
289289
{"type":"NameTag","value":"e"},
290290
{"type":"Punctuation","value":":"},
291291
{"type":"TextWhitespace","value":" "},
292292
{"type":"Punctuation","value":"|+"},
293293
{"type":"LiteralStringDoc","value":"\n multiline literal keep\n line: 2"},
294-
{"type":"TextWhitespace","value":" \n "},
294+
{"type":"TextWhitespace","value":"\n "},
295295
{"type":"Comment","value":"# this is a comment"},
296296
{"type":"TextWhitespace","value":"\n "},
297297
{"type":"NameTag","value":"f"},
298298
{"type":"Punctuation","value":":"},
299299
{"type":"TextWhitespace","value":" "},
300300
{"type":"Punctuation","value":"\u003e+"},
301301
{"type":"LiteralStringDoc","value":"\n multiline folded keep\n line 2"},
302-
{"type":"TextWhitespace","value":" \n "},
302+
{"type":"TextWhitespace","value":"\n "},
303303
{"type":"NameTag","value":"g"},
304304
{"type":"Punctuation","value":":"},
305305
{"type":"TextWhitespace","value":" "},
306306
{"type":"Punctuation","value":"|"},
307307
{"type":"LiteralStringDoc","value":"\n multiline literal with only one line"},
308-
{"type":"TextWhitespace","value":" \n "},
308+
{"type":"TextWhitespace","value":"\n "},
309309
{"type":"NameTag","value":"h"},
310310
{"type":"Punctuation","value":":"},
311311
{"type":"TextWhitespace","value":" "},
@@ -355,7 +355,7 @@
355355
{"type":"TextWhitespace","value":" "},
356356
{"type":"Punctuation","value":"|"},
357357
{"type":"LiteralStringDoc","value":"\n This is a key\n that has multiple lines"},
358-
{"type":"TextWhitespace","value":" \n"},
358+
{"type":"TextWhitespace","value":"\n"},
359359
{"type":"Punctuation","value":":"},
360360
{"type":"TextWhitespace","value":" "},
361361
{"type":"Literal","value":"and this is its value"},
@@ -622,7 +622,7 @@
622622
{"type":"TextWhitespace","value":" "},
623623
{"type":"Punctuation","value":"|"},
624624
{"type":"LiteralStringDoc","value":"\n R0lGODlhDAAMAIQAAP//9/X17unp5WZmZgAAAOfn515eXvPz7Y6OjuDg4J+fn5\n OTk6enp56enmlpaWNjY6Ojo4SEhP/++f/++f/++f/++f/++f/++f/++f/++f/+\n +f/++f/++f/++f/++f/++SH+Dk1hZGUgd2l0aCBHSU1QACwAAAAADAAMAAAFLC\n AgjoEwnuNAFOhpEMTRiggcz4BNJHrv/zCFcLiwMWYNG84BwwEeECcgggoBADs="},
625-
{"type":"TextWhitespace","value":" \n\n"},
625+
{"type":"TextWhitespace","value":"\n\n"},
626626
{"type":"Comment","value":"# YAML also has a set type, which looks like this:"},
627627
{"type":"TextWhitespace","value":"\n"},
628628
{"type":"NameTag","value":"set"},

regexp.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,9 @@ func (l *LexerState) Iterator() Token { // nolint: gocognit
194194
for len(l.iteratorStack) > 0 {
195195
n := len(l.iteratorStack) - 1
196196
t := l.iteratorStack[n]()
197+
if t.Type == Ignore {
198+
continue
199+
}
197200
if t == EOF {
198201
l.iteratorStack = l.iteratorStack[:n]
199202
continue
@@ -243,6 +246,9 @@ func (l *LexerState) Iterator() Token { // nolint: gocognit
243246
for len(l.iteratorStack) > 0 {
244247
n := len(l.iteratorStack) - 1
245248
t := l.iteratorStack[n]()
249+
if t.Type == Ignore {
250+
continue
251+
}
246252
if t == EOF {
247253
l.iteratorStack = l.iteratorStack[:n]
248254
continue

regexp_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,3 +192,14 @@ func TestByGroupNames(t *testing.T) {
192192
assert.NoError(t, err)
193193
assert.Equal(t, []Token{{Error, `abc=123`}}, it.Tokens())
194194
}
195+
196+
func TestIgnoreToken(t *testing.T) {
197+
l := Coalesce(mustNewLexer(t, &Config{EnsureNL: true}, Rules{ // nolint: forbidigo
198+
"root": {
199+
{`(\s*)(\w+)(?:\1)(\n)`, ByGroups(Ignore, Keyword, Whitespace), nil},
200+
},
201+
}))
202+
it, err := l.Tokenise(nil, ` hello `)
203+
assert.NoError(t, err)
204+
assert.Equal(t, []Token{{Keyword, "hello"}, {TextWhitespace, "\n"}}, it.Tokens())
205+
}

0 commit comments

Comments
 (0)