Don't output extra whitespace in YAML multiline

This resolves a particular issue with parsing YAML multiline blocks, for example:

```yaml
a: |
  multiline literal
  line 2
```

The regex used would capture the amount of indentation in the third capture group and then use that as a kind of "status" to know which lines are part of the indented multiline. However, because it's a capture group, it has to be assigned a token, which was `TextWhitespace`. This meant that the indentation was output again after the multiline. Technically it should be a non-capturing group, but then it would no longer be possible to refer to it in the regex. Therefore I've gone with the solution of adding a new token, `Ignore`, which is not emitted as a token by the iterator, so it can safely be assigned to capture groups without having them show up in the output.
alecthomas · Aug 22, 2024 · d446dfd · d446dfd
1 parent 895a048
commit d446dfd
Show file tree

Hide file tree

Showing 6 changed files with 448 additions and 424 deletions.
diff --git a/lexers/embedded/yaml.xml b/lexers/embedded/yaml.xml
@@ -53,7 +53,7 @@
         <bygroups>
           <token type="Punctuation"/>
           <token type="LiteralStringDoc"/>
-          <token type="TextWhitespace"/>
+          <token type="Ignore"/>
         </bygroups>
       </rule>
       <rule pattern="(false|False|FALSE|true|True|TRUE|null|Off|off|yes|Yes|YES|OFF|On|ON|no|No|on|NO|n|N|Y|y)\b">

diff --git a/lexers/testdata/yaml.expected b/lexers/testdata/yaml.expected
@@ -191,69 +191,69 @@
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"|"},
   {"type":"LiteralStringDoc","value":"\n    This entire block of text will be the value of the 'literal_block' key,\n    with line breaks being preserved.\n\n    The literal continues until de-dented, and the leading indentation is\n    stripped.\n\n        Any lines that are 'more-indented' keep the rest of their indentation -\n        these lines will be indented by 4 spaces."},
-  {"type":"TextWhitespace","value":"    \n"},
+  {"type":"TextWhitespace","value":"\n"},
   {"type":"NameTag","value":"folded_style"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"\u003e"},
   {"type":"LiteralStringDoc","value":"\n    This entire block of text will be the value of 'folded_style', but this\n    time, all newlines will be replaced with a single space.\n\n    Blank lines, like above, are converted to a newline character.\n\n        'More-indented' lines keep their newlines, too -\n        this text will appear over two lines."},
-  {"type":"TextWhitespace","value":"    \n"},
+  {"type":"TextWhitespace","value":"\n"},
   {"type":"NameTag","value":"literal_block_with_strip_chomping"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"|-"},
   {"type":"LiteralStringDoc","value":"\n    This entire block of text will be the value of the 'literal_block' key,\n    with line breaks being preserved and the strip chomping indicator.\n\n    The literal continues until de-dented, and the leading indentation is\n    stripped.\n\n        Any lines that are 'more-indented' keep the rest of their indentation -\n        these lines will be indented by 4 spaces."},
-  {"type":"TextWhitespace","value":"    \n"},
+  {"type":"TextWhitespace","value":"\n"},
   {"type":"NameTag","value":"literal_block_with_keep_chomping"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"|+"},
   {"type":"LiteralStringDoc","value":"\n    This entire block of text will be the value of the 'literal_block' key,\n    with line breaks being preserved and the keep chomping indicator.\n\n    The literal continues until de-dented, and the leading indentation is\n    stripped."},
-  {"type":"TextWhitespace","value":"    \n\n"},
+  {"type":"TextWhitespace","value":"\n\n"},
   {"type":"NameTag","value":"a"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"|"},
   {"type":"LiteralStringDoc","value":"\n  multiline literal\n  line 2"},
-  {"type":"TextWhitespace","value":"  \n"},
+  {"type":"TextWhitespace","value":"\n"},
   {"type":"NameTag","value":"b"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"\u003e"},
   {"type":"LiteralStringDoc","value":"\n  multiline: folded\n  line 2"},
-  {"type":"TextWhitespace","value":"  \n"},
+  {"type":"TextWhitespace","value":"\n"},
   {"type":"NameTag","value":"c"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"|-"},
   {"type":"LiteralStringDoc","value":"\n  multiline # literal strip\n  line 2"},
-  {"type":"TextWhitespace","value":"  \n"},
+  {"type":"TextWhitespace","value":"\n"},
   {"type":"NameTag","value":"d"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"\u003e-"},
   {"type":"LiteralStringDoc","value":"\n  multiline folded strip\n  line 2: test\n\n  # not a comment\n   indented by 1"},
-  {"type":"TextWhitespace","value":"  \n"},
+  {"type":"TextWhitespace","value":"\n"},
   {"type":"NameTag","value":"e"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"|+"},
   {"type":"LiteralStringDoc","value":"\n  multiline literal keep\n  line: 2"},
-  {"type":"TextWhitespace","value":"  \n"},
+  {"type":"TextWhitespace","value":"\n"},
   {"type":"Comment","value":"# this is a comment"},
   {"type":"TextWhitespace","value":"\n"},
   {"type":"NameTag","value":"f"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"\u003e+"},
   {"type":"LiteralStringDoc","value":"\n multiline folded keep one space\n line 2"},
-  {"type":"TextWhitespace","value":" \n"},
+  {"type":"TextWhitespace","value":"\n"},
   {"type":"NameTag","value":"g"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"|"},
   {"type":"LiteralStringDoc","value":"\n  multiline literal with only one line"},
-  {"type":"TextWhitespace","value":"  \n"},
+  {"type":"TextWhitespace","value":"\n"},
   {"type":"NameTag","value":"h"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
@@ -267,45 +267,45 @@
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"|"},
   {"type":"LiteralStringDoc","value":"\n    multiline literal\n    line 2"},
-  {"type":"TextWhitespace","value":"    \n  "},
+  {"type":"TextWhitespace","value":"\n  "},
   {"type":"NameTag","value":"b"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"\u003e"},
   {"type":"LiteralStringDoc","value":"\n    multiline: folded\n    line 2"},
-  {"type":"TextWhitespace","value":"    \n  "},
+  {"type":"TextWhitespace","value":"\n  "},
   {"type":"NameTag","value":"c"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"|-"},
   {"type":"LiteralStringDoc","value":"\n      multiline # literal strip\n      line 2 6 leading spaces"},
-  {"type":"TextWhitespace","value":"      \n\n  "},
+  {"type":"TextWhitespace","value":"\n\n  "},
   {"type":"NameTag","value":"d"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"\u003e-"},
   {"type":"LiteralStringDoc","value":"\n    multiline folded strip\n    line 2: test\n    # not a comment"},
-  {"type":"TextWhitespace","value":"    \n  "},
+  {"type":"TextWhitespace","value":"\n  "},
   {"type":"NameTag","value":"e"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"|+"},
   {"type":"LiteralStringDoc","value":"\n    multiline literal keep\n    line: 2"},
-  {"type":"TextWhitespace","value":"    \n  "},
+  {"type":"TextWhitespace","value":"\n  "},
   {"type":"Comment","value":"# this is a comment"},
   {"type":"TextWhitespace","value":"\n  "},
   {"type":"NameTag","value":"f"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"\u003e+"},
   {"type":"LiteralStringDoc","value":"\n    multiline folded keep\n    line 2"},
-  {"type":"TextWhitespace","value":"    \n  "},
+  {"type":"TextWhitespace","value":"\n  "},
   {"type":"NameTag","value":"g"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"|"},
   {"type":"LiteralStringDoc","value":"\n    multiline literal with only one line"},
-  {"type":"TextWhitespace","value":"    \n  "},
+  {"type":"TextWhitespace","value":"\n  "},
   {"type":"NameTag","value":"h"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
@@ -355,7 +355,7 @@
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"|"},
   {"type":"LiteralStringDoc","value":"\n  This is a key\n  that has multiple lines"},
-  {"type":"TextWhitespace","value":"  \n"},
+  {"type":"TextWhitespace","value":"\n"},
   {"type":"Punctuation","value":":"},
   {"type":"TextWhitespace","value":" "},
   {"type":"Literal","value":"and this is its value"},
@@ -622,7 +622,7 @@
   {"type":"TextWhitespace","value":" "},
   {"type":"Punctuation","value":"|"},
   {"type":"LiteralStringDoc","value":"\n  R0lGODlhDAAMAIQAAP//9/X17unp5WZmZgAAAOfn515eXvPz7Y6OjuDg4J+fn5\n  OTk6enp56enmlpaWNjY6Ojo4SEhP/++f/++f/++f/++f/++f/++f/++f/++f/+\n  +f/++f/++f/++f/++f/++SH+Dk1hZGUgd2l0aCBHSU1QACwAAAAADAAMAAAFLC\n  AgjoEwnuNAFOhpEMTRiggcz4BNJHrv/zCFcLiwMWYNG84BwwEeECcgggoBADs="},
-  {"type":"TextWhitespace","value":"  \n\n"},
+  {"type":"TextWhitespace","value":"\n\n"},
   {"type":"Comment","value":"# YAML also has a set type, which looks like this:"},
   {"type":"TextWhitespace","value":"\n"},
   {"type":"NameTag","value":"set"},

diff --git a/regexp.go b/regexp.go
@@ -194,6 +194,9 @@ func (l *LexerState) Iterator() Token { // nolint: gocognit
 		for len(l.iteratorStack) > 0 {
 			n := len(l.iteratorStack) - 1
 			t := l.iteratorStack[n]()
+			if t.Type == Ignore {
+				continue
+			}
 			if t == EOF {
 				l.iteratorStack = l.iteratorStack[:n]
 				continue
@@ -243,6 +246,9 @@ func (l *LexerState) Iterator() Token { // nolint: gocognit
 	for len(l.iteratorStack) > 0 {
 		n := len(l.iteratorStack) - 1
 		t := l.iteratorStack[n]()
+		if t.Type == Ignore {
+			continue
+		}
 		if t == EOF {
 			l.iteratorStack = l.iteratorStack[:n]
 			continue

diff --git a/regexp_test.go b/regexp_test.go
@@ -192,3 +192,14 @@ func TestByGroupNames(t *testing.T) {
 	assert.NoError(t, err)
 	assert.Equal(t, []Token{{Error, `abc=123`}}, it.Tokens())
 }
+
+func TestIgnoreToken(t *testing.T) {
+	l := Coalesce(mustNewLexer(t, &Config{EnsureNL: true}, Rules{ // nolint: forbidigo
+		"root": {
+			{`(\s*)(\w+)(?:\1)(\n)`, ByGroups(Ignore, Keyword, Whitespace), nil},
+		},
+	}))
+	it, err := l.Tokenise(nil, `  hello  `)
+	assert.NoError(t, err)
+	assert.Equal(t, []Token{{Keyword, "hello"}, {TextWhitespace, "\n"}}, it.Tokens())
+}